正規表現によるWord文書の分割 (split documents using regular expression)

arithmer · August 19, 2020, 3:47am

正規表現を用いて、Word文書を分割するスクリプトを書いてみたのですが、
下記のエラーが発生します。

System.ArgumentException: ‘Start node and end node must be a child or descendant of a body’

デバッグの方針をご教示いただけたら幸いです。
書いたスクリプトは下記の通りです。
また、Commonに属するメソッドは下記のリンクのファイルから呼び出しております。

aspose-words/Aspose.Words-for-.NET/blob/master/Examples/DocsExamples/DocsExamples/Programming with Documents/Contents Management/Extract content.cs

using System;
using System.Collections.Generic;
using System.Text;
using Aspose.Words;
using Aspose.Words.Drawing;
using Aspose.Words.Fields;
using Aspose.Words.Tables;
using NUnit.Framework;

namespace DocsExamples.Programming_with_Documents.Contents_Management
{
    public class ExtractContent : DocsExamplesBase
    {
        [Test]
        public void ExtractContentBetweenBlockLevelNodes()
        {
            //ExStart:ExtractContentBetweenBlockLevelNodes
            Document doc = new Document(MyDir + "Extract content.docx");

            Paragraph startPara = (Paragraph) doc.LastSection.GetChild(NodeType.Paragraph, 2, true);

This file has been truncated. show original

using System;
using System.Collections;
using System.IO;
using System.Text.RegularExpressions;
using Aspose.Words;
using Aspose.Words.Examples.CSharp.Programming_Documents.Working_With_Document;
using Aspose.Words.Replacing;
namespace Split_document02
{
    /// <summary>
    /// Console program that splits a Word document into several DOCX files,
    /// using the matches of a regular expression as split points.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Loads the input document, inserts marker bookmarks ("BM_0", "BM_1", ...)
        /// at the document start and at every regex match, then extracts the content
        /// between consecutive markers and saves each piece as "Out_{i}.docx".
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            var fileInfo = new FileInfo(@"C:\Users\user\...\sample2.docx");
            // Load the document.
            Document doc = new Document(fileInfo.FullName);
            // Insert an empty marker bookmark at the start of the document.
            DocumentBuilder builder = new DocumentBuilder(doc);
            builder.MoveToDocumentStart();
            builder.StartBookmark("BM_0");
            builder.EndBookmark("BM_0");
            // Find matches of the regular expression and insert a marker bookmark at each one.
            // Here \d (a single digit) is used as the dividing pattern.
            Regex rx = new Regex(@"\d", RegexOptions.Compiled);
            FindReplaceOptions options = new FindReplaceOptions();
            options.ReplacingCallback = new FindAndInsertBookmark();
            // NOTE(review): doc.Range spans the whole document. If it also covers header/footer
            // stories, bookmarks inserted there are presumably what makes Common.ExtractContent
            // throw "Start node and end node must be a child or descendant of a body";
            // searching each section.Body.Range instead would avoid that - confirm.
            doc.Range.Replace(rx, "", options);
            // Collect every bookmark whose name carries the marker prefix.
            ArrayList bookmarks = new ArrayList();
            for (int i = 0; i < doc.Range.Bookmarks.Count; i++)
            {
                if (doc.Range.Bookmarks[i].Name.StartsWith("BM_"))
                    bookmarks.Add(doc.Range.Bookmarks[i]);
            }
            // Insert a closing marker bookmark at the end of the document.
            // NOTE(review): this bookmark is created after the list above was filled and is
            // never added to `bookmarks`, and the loop below stops at Count - 1, so the
            // content after the last match is not written to any output file.
            builder.MoveToDocumentEnd();
            builder.StartBookmark("BM_" + bookmarks.Count);
            builder.EndBookmark("BM_" + bookmarks.Count);
            // Extract the content between each pair of consecutive markers and save it.
            for (int i = 0; i < bookmarks.Count - 1; i++)
            {
                BookmarkStart bStart = ((Bookmark)bookmarks[i]).BookmarkStart;
                BookmarkEnd bEnd = ((Bookmark)bookmarks[i + 1]).BookmarkEnd;
                ArrayList nodes = Common.ExtractContent(bStart, bEnd, true);
                Document newdoc = Common.GenerateDocument(doc, nodes);
                // Relative path: the file is written to the current working directory.
                newdoc.Save("Out_" + i + ".docx");
            }
            Console.WriteLine("Process Finished!");
        }

        /// <summary>
        /// Replace callback that inserts an empty bookmark named "BM_{n}" in front of
        /// every match and leaves the matched text itself untouched.
        /// </summary>
        public class FindAndInsertBookmark : IReplacingCallback
        {
            // Number used for the next bookmark name; incremented after every match.
            int i = 1;
            /// <summary>
            /// Called for each match: splits runs so the match starts on a run boundary,
            /// then inserts a paragraph break, an empty bookmark and another paragraph
            /// break at the first run of the match.
            /// </summary>
            /// <param name="e">Match information supplied by the replace operation.</param>
            /// <returns>Always <c>ReplaceAction.Skip</c>, so no text is replaced.</returns>
            ReplaceAction IReplacingCallback.Replacing(ReplacingArgs e)
            {
                // This is a Run node that contains either the beginning or the complete match.
                Node currentNode = e.MatchNode;
                // The first (and may be the only) run can contain text before the match,
                // in this case it is necessary to split the run.
                if (e.MatchOffset > 0)
                    currentNode = SplitRun((Run)currentNode, e.MatchOffset);
                // Collects the runs that make up the match; only the first entry is used below.
                ArrayList runs = new ArrayList();
                // Find all runs that contain parts of the match string.
                int remainingLength = e.Match.Value.Length;
                while (
                        (remainingLength > 0) &&
                        (currentNode != null) &&
                        (currentNode.GetText().Length <= remainingLength))
                {
                    runs.Add(currentNode);
                    remainingLength = remainingLength - currentNode.GetText().Length;
                    // Select the next Run node.
                    // Have to loop because there could be other nodes such as BookmarkStart etc.
                    do
                    {
                        currentNode = currentNode.NextSibling;
                    }
                    while ((currentNode != null) && (currentNode.NodeType != NodeType.Run));
                }
                // Split the last run that contains the match if there is any text left.
                if ((currentNode != null) && (remainingLength > 0))
                {
                    SplitRun((Run)currentNode, remainingLength);
                    runs.Add(currentNode);
                }

                // NOTE(review): currentNode may be null here (the walk above ends with null when
                // no further Run sibling exists), in which case currentNode.Document throws a
                // NullReferenceException; e.MatchNode.Document would not have that problem.
                DocumentBuilder builder = new DocumentBuilder((Document)currentNode.Document);
                // Put the marker bookmark between two paragraph breaks at the start of the match.
                builder.MoveTo((Run)runs[0]);
                builder.InsertParagraph();
                builder.StartBookmark("BM_" + i);
                builder.EndBookmark("BM_" + i);
                builder.InsertParagraph();
                i++;

                // Signal to the replace engine to do nothing because we have already done all what we wanted.
                return ReplaceAction.Skip;
            }
        }

        /// <summary>
        /// Splits text of the specified run into two runs.
        /// Inserts the new run just after the specified run.
        /// </summary>
        /// <param name="run">Run to split; it keeps the text before <paramref name="position"/>.</param>
        /// <param name="position">Character index at which the text is divided.</param>
        /// <returns>The newly inserted run holding the text from <paramref name="position"/> onwards.</returns>
        private static Run SplitRun(Run run, int position)
        {
            // Clone first so the new run carries the same formatting as the original.
            Run afterRun = (Run)run.Clone(true);
            afterRun.Text = run.Text.Substring(position);
            run.Text = run.Text.Substring(0, position);
            run.ParentNode.InsertAfter(afterRun, run);
            return afterRun;
        }
    }
}

どうぞよろしくお願い申し上げます。

awais.hafeez · August 19, 2020, 11:43am

@shun1985,

タイムリーで正確な応答を確実にするために、ZIPで送信し、テスト用に以下のリソースをここに添付してください。

簡単な入力Word文書
スタンドアロンのシンプルなコンソールアプリケーション（コンパイルエラーのないソースコード）も作成してください。これにより、現在の問題を再現し、テストのためにここに添付できます。ファイルサイズを小さくするために、Aspose.Words DLLファイルを含めないでください。

これらの情報の準備が整い次第、シナリオの調査を開始し、より多くの情報を提供します。

arithmer · August 19, 2020, 2:17pm

Split_document_forAspose.zip (848.9 KB)
＠awais.hafeez

こちらにアプリケーション一式を添付いたします。
解凍したディレクトリ内のinputディレクトリに入力ファイルがございます。

awais.hafeez · August 20, 2020, 5:54am

@shun1985,

ソースのWord DOCX文書にはヘッダーとフッターがあり、検索と置換操作中にブックマークがヘッダーとフッターストーリー内に挿入されています。

単に次のコード行を置き換えてください

doc.Range.Replace(rx, "", options);

上記の行を、次のコードに置き換えます。

foreach (Section section in doc.Sections)
    section.Body.Range.Replace(rx, "", options);

arithmer · August 20, 2020, 7:22am

＠awais.hafeez,

ありがとうございます！
エラーなく、プログラムが動くようになりました。

いくつか出力されるファイルで気になる点がございます。
１．分割されるべき元ファイルの最後の部分が出力されない。
２．元ファイルにあるはずの文字列を正規表現にセットしても、何も出力されないことがある。
３．正規表現にマッチする文字列が元ファイルにたくさんある場合でも、1-2個などの少ない数の分割ファイルしか出力されない。

以上、ご確認の程よろしくお願いいたします。

awais.hafeez · August 20, 2020, 3:04pm

@shun1985,

正規表現の\dメタ文字は、「sample2.docx」ドキュメントから[0-9]の範囲の1桁の文字の88個のオカレンスを検出しますが、55個の出力DOCXファイルを生成します。

ユースケースの完全な詳細を提供することにより、さらにお問い合わせを詳しく説明してください。 sample2.docxからいくつのDOCXファイルを作成しますか？参照用に、ここに簡略化されたソースドキュメントと対応するDOCXファイルを提供してください。これらの簡略化されたドキュメントは、MS Wordを使用して手動で作成できます。

DOCXファイルを作成するソースドキュメントのターゲット領域を強調表示（囲み）するスクリーンショットも作成して添付してください。

arithmer · August 21, 2020, 3:41am

@awais.hafeez

ご返信ありがとうございます。

正規表現の\dメタ文字は、「sample2.docx」ドキュメントから[0-9]の範囲の1桁の文字の88個のオカレンスを検出しますが、55個の出力DOCXファイルを生成します。

元ファイルに88個の正規表現マッチがある場合、正規表現のマッチが元ファイルの最初に出現しない場合は89個の出力DOCXファイル、最初に出現する場合は88個の出力DOCXファイルを生成してほしいです。

１．分割されるべき元ファイルの最後の部分が出力されない。

入力ファイルを「sample2.docx」とし、正規表現を"連携"とすると、24個の出力DOCXファイル（分割ファイル）が生成されます。
生成された24個の分割ファイルと、生成されるべき最後の部分を示したWordファイル「last_part.docx」を添付しております。
分割ファイル24番目の「Out_23.docx」ではやはり、最後の部分が抜けており、「last_part.docx」のファイル中に赤枠で生成されるべき最後の部分を示しました。

２．元ファイルにあるはずの文字列を正規表現にセットしても、何も出力されないことがある。

sample2に中に存在する文字列、"○○○○○○○○○○○○○○○するため、以下を確認し、対応を行う。"を正規表現にセットした場合、プログラムは正常に動くけれども、出力ファイルが出てこないです。material_for_Aspose01.zip (434.3 KB)

arithmer · August 21, 2020, 4:05am

Thank you for your reply.
For ease of communication, I put English translation below for my comments just above.

正規表現の\dメタ文字は、「sample2.docx」ドキュメントから[0-9]の範囲の1桁の文字の88個のオカレンスを検出しますが、55個の出力DOCXファイルを生成します。

If there are 88 matches for the regular expression, we should get 89 split outputs when the first match does not occur at the beginning of the original Word file, and 88 split outputs when the first match occurs at the very beginning.

１．分割されるべき元ファイルの最後の部分が出力されない。

When you set “sample2.docx” as the input file and “連携” as the regular expression, 24 outputs are generated.
I attached these 24 outputs (split files) and a Word file, “last_part.docx”, which shows the last part that was not generated as an output.
Looking at the 24th split file, “Out_23.docx”, the last part of the original Word file is missing, as I suspected. In “last_part.docx”, I marked with a red frame the last part that should be generated.

２．元ファイルにあるはずの文字列を正規表現にセットしても、何も出力されないことがある。

Although the string “○○○○○○○○○○○○○○○するため、以下を確認し、対応を行う。” exists in the original input file, when I set it as the regular expression, the program does not show any errors but generates no output.

awais.hafeez · August 21, 2020, 2:42pm

@shun1985,

Please try the following code and see how it goes on your end?

// Splits sample2.docx into one DOCX file per segment, where the segments are
// delimited by the matches of a regular expression.
Document doc = new Document("C:\\Temp\\Split_document_forAspose\\sample2.docx");

// Insert an empty marker bookmark at the start of the document so that the
// content in front of the first match becomes the first segment.
DocumentBuilder builder = new DocumentBuilder(doc);
builder.MoveToDocumentStart();
builder.StartBookmark("BM_0");
builder.EndBookmark("BM_0");

// Find matches and insert a marker bookmark at each of them.
// Alternative patterns that can be tried instead:
//Regex rx = new Regex(@"避難所の開設から撤収", RegexOptions.Compiled);
//Regex rx = new Regex(@"\d", RegexOptions.Compiled);
Regex rx = new Regex(@"○○○○○○○○○○○○○○○するため、以下を確認し、対応を行う。", RegexOptions.Compiled);
FindReplaceOptions options = new FindReplaceOptions();
options.ReplacingCallback = new FindAndInsertBookmark();

// Search only the body of each section: markers placed in headers/footers
// cannot be used as extraction boundaries.
foreach (Section section in doc.Sections)
{
    section.Body.Range.Replace(rx, "", options);
}

// Collect the marker bookmarks. Enumerating the collection once avoids
// re-evaluating doc.Range.Bookmarks for every index.
ArrayList bookmarks = new ArrayList();
foreach (Bookmark bookmark in doc.Range.Bookmarks)
{
    // Ordinal comparison: the prefix is a technical identifier, not linguistic text.
    if (bookmark.Name.StartsWith("BM_", System.StringComparison.Ordinal))
    {
        bookmarks.Add(bookmark);
    }
}

// Insert a closing marker at the end of the document and add it to the list,
// otherwise the content after the last match would never be extracted.
builder.MoveToDocumentEnd();
BookmarkStart bmStart = builder.StartBookmark("BM_" + bookmarks.Count);
builder.EndBookmark("BM_" + bookmarks.Count);
bookmarks.Add(bmStart.Bookmark);

// The output location does not change per segment, so resolve it once and make
// sure it exists before the first Save call.
string outDir = @"C:\Temp\Split_document_forAspose\output\";
System.IO.Directory.CreateDirectory(outDir);

// Extract the content between each pair of consecutive markers into its own file.
for (int i = 0; i < bookmarks.Count - 1; i++)
{
    BookmarkStart bStart = ((Bookmark)bookmarks[i]).BookmarkStart;
    BookmarkEnd bEnd = ((Bookmark)bookmarks[i + 1]).BookmarkEnd;
    ArrayList nodes = Common.ExtractContent(bStart, bEnd, true);
    Document newdoc = Common.GenerateDocument(doc, nodes);

    newdoc.Save(outDir + "Out_" + i + ".docx");
}

/// <summary>
/// Replace callback that inserts an empty bookmark named "BM_1", "BM_2", ...
/// in front of every match and leaves the matched text itself untouched.
/// </summary>
public class FindAndInsertBookmark : IReplacingCallback
{
    // Number used for the next bookmark name; incremented after every insertion.
    private int _nextIndex = 1;

    /// <summary>
    /// Called for each match: splits runs so that the match begins and ends on a
    /// run boundary, then inserts a paragraph break, an empty bookmark and another
    /// paragraph break at the first node of the match.
    /// </summary>
    /// <param name="e">Match information supplied by the replace operation.</param>
    /// <returns>Always <c>ReplaceAction.Skip</c>, so no text is replaced.</returns>
    ReplaceAction IReplacingCallback.Replacing(ReplacingArgs e)
    {
        int remainingLength = e.Match.Value.Length;

        // A pattern that can match the empty string gives no run to anchor the
        // bookmark to; skip such matches instead of failing on an empty run list.
        if (remainingLength == 0)
        {
            return ReplaceAction.Skip;
        }

        // This is a Run node that contains either the beginning or the complete match.
        Node currentNode = e.MatchNode;

        // The first (and may be the only) run can contain text before the match,
        // in this case it is necessary to split the run.
        if (e.MatchOffset > 0)
        {
            currentNode = SplitRun((Run)currentNode, e.MatchOffset);
        }

        // The bookmark goes in front of the first node of the match; the nodes that
        // follow are only walked to find where the match ends.
        Node matchStart = currentNode;

        // Walk over all runs that are completely covered by the match.
        while (
                (remainingLength > 0) &&
                (currentNode != null) &&
                (currentNode.GetText().Length <= remainingLength))
        {
            remainingLength = remainingLength - currentNode.GetText().Length;

            // Select the next Run node.
            // Have to loop because there could be other nodes such as BookmarkStart etc.
            do
            {
                currentNode = currentNode.NextSibling;
            }
            while ((currentNode != null) && (currentNode.NodeType != NodeType.Run));
        }

        // Split the last run that contains the match if there is any text left.
        if ((currentNode != null) && (remainingLength > 0))
        {
            SplitRun((Run)currentNode, remainingLength);
        }

        // Use e.MatchNode for the document lookup: currentNode may be null here when
        // the match ends at the last run of its parent.
        DocumentBuilder builder = new DocumentBuilder((Document)e.MatchNode.Document);

        builder.MoveTo(matchStart);
        builder.InsertParagraph();
        builder.StartBookmark("BM_" + _nextIndex);
        builder.EndBookmark("BM_" + _nextIndex);
        builder.InsertParagraph();
        _nextIndex++;

        // Signal to the replace engine to do nothing because we have already done all what we wanted.
        return ReplaceAction.Skip;
    }
}


/// <summary>
/// Splits text of the specified run into two runs.
/// Inserts the new run just after the specified run.
/// </summary>
/// <param name="run">Run to split; it keeps the text before <paramref name="position"/>.</param>
/// <param name="position">
/// Character index at which the text is divided. A value at or beyond the end of
/// the text leaves <paramref name="run"/> unchanged and yields an empty new run.
/// </param>
/// <returns>The newly inserted run holding the text from <paramref name="position"/> onwards.</returns>
private static Run SplitRun(Run run, int position)
{
    string text = run.Text;

    // Clamp to the end of the text. Copying the full text into the new run when the
    // position is out of range would duplicate the run's text in the document.
    int splitAt = System.Math.Min(position, text.Length);

    // Clone first so the new run carries the same formatting as the original.
    Run afterRun = (Run)run.Clone(true);
    afterRun.Text = text.Substring(splitAt);
    run.Text = text.Substring(0, splitAt);

    run.ParentNode.InsertAfter(afterRun, run);
    return afterRun;
}

arithmer · August 24, 2020, 5:49am

＠awais.hafeez
Thank you very much !!
It works fine and meets my needs.
You have been of great assistance to me !