Hi.
I have a Word document (“master.docx”) that needs to be split into multiple documents, based on a separator string that I define (e.g. %doc%).
Sample scenario: master.docx
---------------------------------------------------------------------------------
%doc%
this is my first document
%doc%
this is my second document
%doc%
this is my third document.
---------------------------------------------------------------------------------
This document should be split into 3 new documents.
- doc1.docx (“this is my first document”)
- doc2.docx (“this is my second document”)
- doc3.docx (“this is my third document”)
How could I achieve this without using bookmarks, merge fields or other Word objects?
/// <summary>
/// Splits a document into several documents at every body-level paragraph
/// that contains a user-defined separator string (e.g. "%doc%").
/// </summary>
public class DocumentSpliter
{
    /// <summary>
    /// Creates a splitter for the given document.
    /// </summary>
    /// <param name="doc">The document to split. Must not be null.</param>
    /// <exception cref="System.ArgumentNullException">Thrown when <paramref name="doc"/> is null.</exception>
    public DocumentSpliter(Document doc)
    {
        if (doc == null)
        {
            throw new System.ArgumentNullException("doc");
        }

        mDoc = doc;
    }

    /// <summary>
    /// Splits the document at each paragraph containing <paramref name="separator"/>.
    /// The separator paragraphs themselves are not copied. Content that appears before
    /// the first separator belongs to no sub document and is skipped.
    /// </summary>
    /// <param name="separator">Literal separator text (it is regex-escaped before searching).</param>
    /// <returns>
    /// The sub documents in document order; an empty list when the separator does not occur.
    /// A separator at the very end of the document does not produce a trailing empty document.
    /// </returns>
    /// <exception cref="System.ArgumentNullException">Thrown when <paramref name="separator"/> is null.</exception>
    /// <exception cref="System.ArgumentException">Thrown when <paramref name="separator"/> is empty.</exception>
    public List<Document> SplitDocument(string separator)
    {
        if (separator == null)
        {
            throw new System.ArgumentNullException("separator");
        }

        if (separator.Length == 0)
        {
            throw new System.ArgumentException("Separator must be a non-empty string.", "separator");
        }

        // Start from an empty list on every call: this avoids a null list when nothing
        // matches and avoids stale separators from a previous call.
        mSeparators = new ArrayList();

        // The evaluator only records the paragraphs that contain a match; nothing is replaced.
        Regex regex = new Regex(Regex.Escape(separator));
        mDoc.Range.Replace(regex, new ReplaceEvaluator(ReplaceGetSeparator), false);

        List<Document> subDocuments = new List<Document>();

        // Sub document currently being filled; null until the first separator is seen.
        Document subDoc = null;
        bool subDocHasContent = false;

        foreach (Section sect in mDoc.Sections)
        {
            if (sect.Body == null)
            {
                continue;
            }

            foreach (Node node in sect.Body.ChildNodes)
            {
                if (mSeparators.Contains(node))
                {
                    // A separator closes the current sub document (if any) and opens a new one.
                    if (subDoc != null)
                    {
                        subDocuments.Add(subDoc);
                    }

                    subDoc = new Document();
                    subDoc.FirstSection.Body.RemoveAllChildren();
                    subDocHasContent = false;
                }
                else if (subDoc != null)
                {
                    // Import the node into the current sub document.
                    Node dstNode = subDoc.ImportNode(node, true, ImportFormatMode.KeepSourceFormatting);
                    subDoc.FirstSection.Body.AppendChild(dstNode);
                    subDocHasContent = true;
                }
                // Otherwise no separator has been seen yet, so there is no target document.
            }
        }

        // Flush the last sub document once, after the walk, instead of trying to
        // recognise the last node of the document inside the loop.
        if (subDoc != null && subDocHasContent)
        {
            subDocuments.Add(subDoc);
        }

        return subDocuments;
    }

    /// <summary>
    /// Replace callback: records the paragraph that contains the match and skips the replacement.
    /// </summary>
    private ReplaceAction ReplaceGetSeparator(object sender, ReplaceEvaluatorArgs e)
    {
        // Get paragraph with matched word
        Paragraph par = (Paragraph)e.MatchNode.GetAncestor(NodeType.Paragraph);

        if (mSeparators == null)
        {
            mSeparators = new ArrayList();
        }

        // A paragraph holding several matches is recorded only once.
        if (par != null && !mSeparators.Contains(par))
        {
            mSeparators.Add(par);
        }

        return ReplaceAction.Skip;
    }

    // Document being split.
    private Document mDoc;

    // Paragraphs that contain the separator text, filled by ReplaceGetSeparator.
    private ArrayList mSeparators;
}
Hi,
We have used the above code for splitting documents, but the resulting documents contain lots of formatting mismatches.
We use a modified version of the above code; our code is as follows:
/// <summary>
/// Splits a document into several documents at every body-level paragraph
/// that contains a user-defined separator string. Nodes are copied with one
/// NodeImporter per sub document.
/// </summary>
public class DocumentSpliter
{
    /// <summary>
    /// Creates a splitter for the given document.
    /// </summary>
    /// <param name="doc">The document to split. Must not be null.</param>
    /// <exception cref="System.ArgumentNullException">Thrown when <paramref name="doc"/> is null.</exception>
    public DocumentSpliter(Document doc)
    {
        if (doc == null)
        {
            throw new System.ArgumentNullException("doc");
        }

        mDoc = doc;
    }

    /// <summary>
    /// Splits the document at each paragraph containing <paramref name="separator"/>.
    /// The separator paragraphs themselves are not copied. Content that appears before
    /// the first separator belongs to no sub document and is skipped.
    /// </summary>
    /// <param name="separator">Literal separator text (it is regex-escaped before searching).</param>
    /// <returns>
    /// The sub documents in document order; an empty list when the separator does not occur.
    /// A separator at the very end of the document yields a trailing empty document.
    /// </returns>
    /// <exception cref="System.ArgumentNullException">Thrown when <paramref name="separator"/> is null.</exception>
    /// <exception cref="System.ArgumentException">Thrown when <paramref name="separator"/> is empty.</exception>
    public List<Document> SplitDocument(string separator)
    {
        if (separator == null)
        {
            throw new System.ArgumentNullException("separator");
        }

        if (separator.Length == 0)
        {
            throw new System.ArgumentException("Separator must be a non-empty string.", "separator");
        }

        // Start from an empty list on every call: this avoids a null list when nothing
        // matches and avoids stale separators from a previous call.
        mSeparators = new ArrayList();

        // The evaluator only records the paragraphs that contain a match; nothing is replaced.
        System.Text.RegularExpressions.Regex regex =
            new System.Text.RegularExpressions.Regex(System.Text.RegularExpressions.Regex.Escape(separator));
        mDoc.Range.Replace(regex, new ReplaceEvaluator(ReplaceGetSeparator), false);

        List<Document> subDocuments = new List<Document>();

        // Sub document currently being filled and the importer bound to it;
        // both stay null until the first separator is seen.
        Document subDoc = null;
        NodeImporter importer = null;

        foreach (Section sect in mDoc.Sections)
        {
            if (sect.Body == null)
            {
                continue;
            }

            foreach (Node node in sect.Body.ChildNodes)
            {
                if (mSeparators.Contains(node))
                {
                    // A separator closes the current sub document (if any) and opens a new one.
                    if (subDoc != null)
                    {
                        subDocuments.Add(subDoc);
                    }

                    subDoc = new Document();
                    subDoc.FirstSection.Body.RemoveAllChildren();

                    // One importer per destination document, reused for all of its nodes.
                    importer = new NodeImporter(mDoc, subDoc, ImportFormatMode.KeepSourceFormatting);
                }
                else if (subDoc != null)
                {
                    // Import the node into the current sub document.
                    Node newNode = importer.ImportNode(node, true);
                    subDoc.FirstSection.Body.AppendChild(newNode);
                }
                // Otherwise no separator has been seen yet, so there is no target document.
            }
        }

        // Flush the last sub document once, after the walk. The null check keeps
        // a null entry out of the result when the separator never occurred.
        if (subDoc != null)
        {
            subDocuments.Add(subDoc);
        }

        return subDocuments;
    }

    /// <summary>
    /// Replace callback: records the paragraph that contains the match and skips the replacement.
    /// </summary>
    private ReplaceAction ReplaceGetSeparator(object sender, ReplaceEvaluatorArgs e)
    {
        // Get paragraph with matched word
        Paragraph par = (Paragraph)e.MatchNode.GetAncestor(NodeType.Paragraph);

        if (mSeparators == null)
        {
            mSeparators = new ArrayList();
        }

        // A paragraph holding several matches is recorded only once.
        if (par != null && !mSeparators.Contains(par))
        {
            mSeparators.Add(par);
        }

        return ReplaceAction.Skip;
    }

    // Document being split.
    private Document mDoc;

    // Paragraphs that contain the separator text, filled by ReplaceGetSeparator.
    private ArrayList mSeparators;
}
Thanks for your inquiry. Could you please attach your document for testing? I will investigate the issue and provide you with more information.
What kind of formatting is lost?
Best regards.
Hi,
Thanks for your response. I am preparing the sample document and other details; I will send them soon.
I have one more query. The code you gave for extracting content does not fetch other resources in the document, such as images and footers.
Can you please post sample code to extract content (including text, images, headers/footers, etc.) from a document between occurrences of a user-defined string?
We really appreciate your help.
Thanks,
Babu Kannan
Thanks for your request. Before creating sample code for you, I would like to ask a question. Why do you use strings as splitters between document parts? Maybe it is better to use sections. In case of using section, you don’t need any custom code to split your document, you can just import section from one document to another. Please see the following link to learn more about sections: https://docs.aspose.com/words/net/working-with-sections/
Best regards.
Hi,
Thanks for your suggestion.
But our business logic requires the document to be split at a user-defined string.
It would really help us if you can post that sample code.
Awaiting your response.
Thanks in Advance.
Babu Kannan
Thanks for your request. You can try using the following code to copy content between two paragraphs into a separate document.
/// <summary>
/// Extracts the content between two nodes into a new document.
/// Both nodes must be direct children of the main story (body).
/// The start node is copied; the end node is not.
/// </summary>
/// <param name="startNode">First node to copy.</param>
/// <param name="endNode">
/// Node at which copying stops (exclusive). If it is never reached while walking
/// forward from <paramref name="startNode"/>, copying continues to the end of the document.
/// </param>
/// <returns>
/// A clone of the source document whose content is replaced by the extracted nodes.
/// Each source section visited is imported as a whole (so its headers/footers come along)
/// and then has its body emptied before the extracted nodes are appended.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown when either node is null.</exception>
/// <exception cref="ArgumentException">Thrown when either node is not a direct child of a body.</exception>
public Document ExtractContentBetweenNodes(Node startNode, Node endNode)
{
    if (startNode == null)
    {
        throw new ArgumentNullException("startNode");
    }

    if (endNode == null)
    {
        throw new ArgumentNullException("endNode");
    }

    // Check whether start and end nodes are children of body
    // (a detached node has no parent at all).
    if (startNode.ParentNode == null || endNode.ParentNode == null ||
        startNode.ParentNode.NodeType != NodeType.Body || endNode.ParentNode.NodeType != NodeType.Body)
    {
        throw new ArgumentException("Start and end nodes should be children of main story(body)");
    }

    // Clone the original document,
    // this is needed to preserve styles of the original document
    Document srcDoc = (Document)startNode.Document;
    Document dstDoc = srcDoc.Clone();
    dstDoc.RemoveAllChildren();

    // Copy the section that holds the start node, then empty its body so that
    // only the section itself and its headers/footers remain.
    Node firstSect = dstDoc.ImportNode(startNode.GetAncestor(NodeType.Section), true, ImportFormatMode.UseDestinationStyles);
    dstDoc.AppendChild(firstSect);
    dstDoc.LastSection.Body.RemoveAllChildren();

    Node currNode = startNode;

    // Copy content. currNode becomes null when the document runs out of nodes.
    while (currNode != null && !currNode.Equals(endNode))
    {
        Node dstNode = dstDoc.ImportNode(currNode, true, ImportFormatMode.UseDestinationStyles);
        dstDoc.LastSection.Body.AppendChild(dstNode);

        // Move to the next node in the same body
        if (currNode.NextSibling != null)
        {
            currNode = currNode.NextSibling;
            continue;
        }

        // End of this body: advance through the following sections until one
        // provides a node to continue with. Sections with a missing or empty
        // body are still imported, but yield no node.
        Node nextSect = currNode.GetAncestor(NodeType.Section).NextSibling;
        currNode = null;

        while (nextSect != null && currNode == null)
        {
            Section srcSect = (Section)nextSect;

            dstDoc.AppendChild(dstDoc.ImportNode(srcSect, true, ImportFormatMode.UseDestinationStyles));

            if (dstDoc.LastSection.Body != null)
            {
                dstDoc.LastSection.Body.RemoveAllChildren();
            }

            if (srcSect.Body != null)
            {
                currNode = srcSect.Body.FirstChild;
            }

            nextSect = nextSect.NextSibling;
        }
    }

    return dstDoc;
}
Hi Seth,
Thanks for your inquiry. The code should work with any recent version of Aspose.Words as there are no members in the code snippet that were changed in the refactored API.
Please find the VB version of the code below.
''' <summary>
''' Extracts the content between two nodes into a new document.
''' Both nodes must be direct children of the main story (body).
''' The start node is copied; the end node is not.
''' </summary>
''' <param name="startNode">First node to copy.</param>
''' <param name="endNode">
''' Node at which copying stops (exclusive). If it is never reached while walking
''' forward from the start node, copying continues to the end of the document.
''' </param>
''' <returns>
''' A clone of the source document whose content is replaced by the extracted nodes.
''' Each source section visited is imported as a whole (so its headers/footers come along)
''' and then has its body emptied before the extracted nodes are appended.
''' </returns>
''' <exception cref="ArgumentNullException">Thrown when either node is Nothing.</exception>
''' <exception cref="ArgumentException">Thrown when either node is not a direct child of a body.</exception>
Public Function ExtractContentBetweenNodes(ByVal startNode As Node, ByVal endNode As Node) As Document
    If startNode Is Nothing Then
        Throw New ArgumentNullException("startNode")
    End If

    If endNode Is Nothing Then
        Throw New ArgumentNullException("endNode")
    End If

    ' Check whether start and end nodes are children of body
    ' (a detached node has no parent at all).
    If startNode.ParentNode Is Nothing OrElse endNode.ParentNode Is Nothing OrElse _
       startNode.ParentNode.NodeType <> NodeType.Body OrElse endNode.ParentNode.NodeType <> NodeType.Body Then
        Throw New ArgumentException("Start and end nodes should be children of main story(body)")
    End If

    ' Clone the original document,
    ' this is needed to preserve styles of the original document
    Dim srcDoc As Document = CType(startNode.Document, Document)
    Dim dstDoc As Document = srcDoc.Clone()
    dstDoc.RemoveAllChildren()

    ' Copy the section that holds the start node, then empty its body so that
    ' only the section itself and its headers/footers remain.
    Dim firstSect As Node = dstDoc.ImportNode(startNode.GetAncestor(NodeType.Section), True, ImportFormatMode.UseDestinationStyles)
    dstDoc.AppendChild(firstSect)
    dstDoc.LastSection.Body.RemoveAllChildren()

    Dim currNode As Node = startNode
    Dim dstNode As Node

    ' Copy content. currNode becomes Nothing when the document runs out of nodes.
    Do While currNode IsNot Nothing AndAlso Not currNode.Equals(endNode)
        ' Import node
        dstNode = dstDoc.ImportNode(currNode, True, ImportFormatMode.UseDestinationStyles)
        dstDoc.LastSection.Body.AppendChild(dstNode)

        If currNode.NextSibling IsNot Nothing Then
            ' Move to the next node in the same body
            currNode = currNode.NextSibling
        Else
            ' End of this body: advance through the following sections until one
            ' provides a node to continue with. Sections with a missing or empty
            ' body are still imported, but yield no node.
            Dim nextSect As Node = currNode.GetAncestor(NodeType.Section).NextSibling
            currNode = Nothing

            Do While nextSect IsNot Nothing AndAlso currNode Is Nothing
                Dim srcSect As Section = CType(nextSect, Section)

                dstDoc.AppendChild(dstDoc.ImportNode(srcSect, True, ImportFormatMode.UseDestinationStyles))

                If dstDoc.LastSection.Body IsNot Nothing Then
                    dstDoc.LastSection.Body.RemoveAllChildren()
                End If

                If srcSect.Body IsNot Nothing Then
                    currNode = srcSect.Body.FirstChild
                End If

                nextSect = nextSect.NextSibling
            Loop
        End If
    Loop

    Return dstDoc
End Function
If you have any trouble, please feel free to ask for assistance.
Thanks,