using System;
using System.Collections;
using System.IO;
using System.Text.RegularExpressions;
using Aspose.Words;
using Aspose.Words.Examples.CSharp.Programming_Documents.Working_With_Document;
using Aspose.Words.Replacing;
namespace Split_document02
{
class Program
{
static void Main(string[] args)
{
var fileInfo = new FileInfo(@"C:\Users\user\...\sample2.docx");
//Load in the document
Document doc = new Document(fileInfo.FullName);
//insert bookmark at the start of document.
DocumentBuilder builder = new DocumentBuilder(doc);
builder.MoveToDocumentStart();
builder.StartBookmark("BM_0");
builder.EndBookmark("BM_0");
//Find matches using regular expression and insert bookmark
//This time tried to use \d as deviding regular expression
Regex rx = new Regex(@"\d", RegexOptions.Compiled);
FindReplaceOptions options = new FindReplaceOptions();
options.ReplacingCallback = new FindAndInsertBookmark();
doc.Range.Replace(rx, "", options);
ArrayList bookmarks = new ArrayList();
for (int i = 0; i < doc.Range.Bookmarks.Count; i++)
{
if (doc.Range.Bookmarks[i].Name.StartsWith("BM_"))
bookmarks.Add(doc.Range.Bookmarks[i]);
}
builder.MoveToDocumentEnd();
builder.StartBookmark("BM_" + bookmarks.Count);
builder.EndBookmark("BM_" + bookmarks.Count);
for (int i = 0; i < bookmarks.Count - 1; i++)
{
BookmarkStart bStart = ((Bookmark)bookmarks[i]).BookmarkStart;
BookmarkEnd bEnd = ((Bookmark)bookmarks[i + 1]).BookmarkEnd;
ArrayList nodes = Common.ExtractContent(bStart, bEnd, true);
Document newdoc = Common.GenerateDocument(doc, nodes);
newdoc.Save("Out_" + i + ".docx");
}
Console.WriteLine("Process Finished!");
}
public class FindAndInsertBookmark : IReplacingCallback
{
int i = 1;
ReplaceAction IReplacingCallback.Replacing(ReplacingArgs e)
{
// This is a Run node that contains either the beginning or the complete match.
Node currentNode = e.MatchNode;
// The first (and may be the only) run can contain text before the match,
// in this case it is necessary to split the run.
if (e.MatchOffset > 0)
currentNode = SplitRun((Run)currentNode, e.MatchOffset);
// This array is used to store all nodes of the match for further highlighting.
ArrayList runs = new ArrayList();
// Find all runs that contain parts of the match string.
int remainingLength = e.Match.Value.Length;
while (
(remainingLength > 0) &&
(currentNode != null) &&
(currentNode.GetText().Length <= remainingLength))
{
runs.Add(currentNode);
remainingLength = remainingLength - currentNode.GetText().Length;
// Select the next Run node.
// Have to loop because there could be other nodes such as BookmarkStart etc.
do
{
currentNode = currentNode.NextSibling;
}
while ((currentNode != null) && (currentNode.NodeType != NodeType.Run));
}
// Split the last run that contains the match if there is any text left.
if ((currentNode != null) && (remainingLength > 0))
{
SplitRun((Run)currentNode, remainingLength);
runs.Add(currentNode);
}
DocumentBuilder builder = new DocumentBuilder((Document)currentNode.Document);
builder.MoveTo((Run)runs[0]);
builder.InsertParagraph();
builder.StartBookmark("BM_" + i);
builder.EndBookmark("BM_" + i);
builder.InsertParagraph();
i++;
// Signal to the replace engine to do nothing because we have already done all what we wanted.
return ReplaceAction.Skip;
}
}
/// <summary>
/// Splits text of the specified run into two runs.
/// Inserts the new run just after the specified run.
/// </summary>
private static Run SplitRun(Run run, int position)
{
Run afterRun = (Run)run.Clone(true);
afterRun.Text = run.Text.Substring(position);
run.Text = run.Text.Substring(0, position);
run.ParentNode.InsertAfter(afterRun, run);
return afterRun;
}
}
}
If we have 88 matches for the regular expression, we should have 89 splitted outputs when the first match doesn’t occur in the beginning of the original Word file, and we have 88 splitted outputs when the first match occurs in the beginning of the original.
1.分割されるべき元ファイルの最後の部分が出力されない。
When you set “sample2.docx” as input file and set “連携” as regular expression、24 outputs are generated.
I attached these 24 outputs (spliited file) and Word file “last_part.docx” which shows the last part which were not generated as an output.
When seeing 24th splitted file “out_23.docx”、the last part of the original Word file is omitted as expected. In “last_part.docx”, I marked by red frame the last part which should be generated.
2.元ファイルにあるはずの文字列を正規表現にセットしても、何も出力されないことがある。
Although the string “○○○○○○○○○○○○○○○するため、以下を確認し、対応を行う。” exists in the original input file, when setting it as the regular expression, the program doesn’t show any errors but generated no output.
Please try the following code and see how it goes on your end?
Document doc = new Document("C:\\Temp\\Split_document_forAspose\\sample2.docx");
//insert bookmark at the start of document.
DocumentBuilder builder = new DocumentBuilder(doc);
builder.MoveToDocumentStart();
builder.StartBookmark("BM_0");
builder.EndBookmark("BM_0");
//Find matches and insert bookmarks
//Regex rx = new Regex(@"避難所の開設から撤収", RegexOptions.Compiled);
//Regex rx = new Regex(@"\d", RegexOptions.Compiled);
Regex rx = new Regex(@"○○○○○○○○○○○○○○○するため、以下を確認し、対応を行う。", RegexOptions.Compiled);
FindReplaceOptions options = new FindReplaceOptions();
options.ReplacingCallback = new FindAndInsertBookmark();
foreach (Section section in doc.Sections)
section.Body.Range.Replace(rx, "", options);
ArrayList bookmarks = new ArrayList();
for (int i = 0; i < doc.Range.Bookmarks.Count; i++)
{
if (doc.Range.Bookmarks[i].Name.StartsWith("BM_"))
bookmarks.Add(doc.Range.Bookmarks[i]);
}
builder.MoveToDocumentEnd();
BookmarkStart bmStart = builder.StartBookmark("BM_" + bookmarks.Count);
builder.EndBookmark("BM_" + bookmarks.Count);
bookmarks.Add(bmStart.Bookmark);
for (int i = 0; i < bookmarks.Count - 1; i++)
{
BookmarkStart bStart = ((Bookmark)bookmarks[i]).BookmarkStart;
BookmarkEnd bEnd = ((Bookmark)bookmarks[i + 1]).BookmarkEnd;
ArrayList nodes = Common.ExtractContent(bStart, bEnd, true);
Document newdoc = Common.GenerateDocument(doc, nodes);
var Outdir = @"C:\Temp\Split_document_forAspose\output\";
newdoc.Save(Outdir + "Out_" + i + ".docx");
}
public class FindAndInsertBookmark : IReplacingCallback
{
int i = 1;
ReplaceAction IReplacingCallback.Replacing(ReplacingArgs e)
{
// This is a Run node that contains either the beginning or the complete match.
Node currentNode = e.MatchNode;
// The first (and may be the only) run can contain text before the match,
// in this case it is necessary to split the run.
if (e.MatchOffset > 0)
currentNode = SplitRun((Run)currentNode, e.MatchOffset);
// This array is used to store all nodes of the match for further highlighting.
ArrayList runs = new ArrayList();
// Find all runs that contain parts of the match string.
int remainingLength = e.Match.Value.Length;
while (
(remainingLength > 0) &&
(currentNode != null) &&
(currentNode.GetText().Length <= remainingLength))
{
runs.Add(currentNode);
remainingLength = remainingLength - currentNode.GetText().Length;
// Select the next Run node.
// Have to loop because there could be other nodes such as BookmarkStart etc.
do
{
currentNode = currentNode.NextSibling;
}
while ((currentNode != null) && (currentNode.NodeType != NodeType.Run));
}
// Split the last run that contains the match if there is any text left.
if ((currentNode != null) && (remainingLength > 0))
{
SplitRun((Run)currentNode, remainingLength);
runs.Add(currentNode);
}
//if (currentNode != null)
//{
DocumentBuilder builder = new DocumentBuilder((Document)e.MatchNode.Document);
builder.MoveTo((Run)runs[0]);
builder.InsertParagraph();
builder.StartBookmark("BM_" + i);
builder.EndBookmark("BM_" + i);
builder.InsertParagraph();
i++;
//}
// Signal to the replace engine to do nothing because we have already done all what we wanted.
return ReplaceAction.Skip;
}
}
/// <summary>
/// Splits text of the specified run into two runs.
/// Inserts the new run just after the specified run.
/// </summary>
private static Run SplitRun(Run run, int position)
{
Run afterRun = (Run)run.Clone(true);
if (position < run.Text.Length)
afterRun.Text = run.Text.Substring(position);
else
afterRun.Text = run.Text;
if (position < run.Text.Length)
run.Text = run.Text.Substring(0, position);
run.ParentNode.InsertAfter(afterRun, run);
return afterRun;
}