Hi Team,
How to extract the text from between the field codes?
Input: Input.docx (58.9 KB)
Expected output: FX7.docx (52.6 KB)
Regards,
Mahi
Hi Team,
How to extract the text from between the field codes?
Input: Input.docx (58.9 KB)
Expected output: FX7.docx (52.6 KB)
Regards,
Mahi
@Mahi39 You can use the technique described here to extract content between nodes:
https://docs.aspose.com/words/java/extract-selected-content-between-nodes/
In your case, you can use the start field’s end and the end field’s start as the start and end nodes for extraction:
// Load the document that contains the two delimiting fields.
Document sourceDocument = new Document("C:\\Temp\\in.docx");
// The first field marks where the extraction begins, the second where it stops.
Field openingField = sourceDocument.getRange().getFields().get(0);
Field closingField = sourceDocument.getRange().getFields().get(1);
// Extract everything from the end of the first field up to the start of the second one;
// "false" means the marker nodes themselves are not part of the result.
ArrayList<Node> extractedNodes = ExtractContentHelper.extractContent(openingField.getEnd(), closingField.getStart(), false);
// Build a standalone document from the extracted nodes and save it.
Document extractedContent = ExtractContentHelper.generateDocument(sourceDocument, extractedNodes);
extractedContent.save("C:\\Temp\\out.docx");
@alexey.noskov, I’ve been unable to import the ExtractContentHelper class. I got the below error message.
ExtractContentHelper cannot be resolved
@Mahi39 The method can be found in the code examples on our GitHub:
https://github.com/aspose-words/Aspose.Words-for-Java/
For your convenience here is the class:
public class ExtractContentHelper {
//ExStart:CommonExtractContent
/**
 * Collects copies of the block-level content located between two marker nodes.
 *
 * The returned list starts with the section that contains the start marker. It is followed by
 * clones of the block-level nodes (direct children of a body) from the start marker's block to
 * the end marker's block. A deep clone of every further section entered during the traversal is
 * inserted where that section begins. The first and last blocks are trimmed by
 * {@code processMarker} when the markers are nested inside them.
 *
 * @param startNode   marker where extraction begins; must be inside a body
 * @param endNode     marker where extraction stops; must be in the same document, at or after
 *                    the start marker
 * @param isInclusive whether the marker nodes themselves belong to the extracted content
 * @return the section nodes and cloned block-level nodes, in document order
 * @throws IllegalArgumentException if the markers fail the checks in {@code verifyParameterNodes}
 * @throws IllegalStateException    if the document runs out of sections before the end marker's
 *                                  block is reached
 */
public static ArrayList<Node> extractContent(Node startNode, Node endNode, boolean isInclusive)
{
    // First, check that the nodes passed to this method are valid for use.
    verifyParameterNodes(startNode, endNode);

    // Create a list to store the extracted nodes.
    ArrayList<Node> nodes = new ArrayList<Node>();

    // If the end marker is a CommentRangeEnd and it has to be included, move the end marker
    // forward to the Comment node found after it, so the comment itself is extracted too.
    if (endNode.getNodeType() == NodeType.COMMENT_RANGE_END && isInclusive)
    {
        Node node = findNextNode(NodeType.COMMENT, endNode.getNextSibling());
        if (node != null)
            endNode = node;
    }

    // Keep a record of the original nodes passed to this method to split marker nodes if needed.
    Node originalStartNode = startNode;
    Node originalEndNode = endNode;

    // Add the section where the start node is placed.
    nodes.add(startNode.getAncestor(NodeType.SECTION));

    // Extract content based on block-level nodes. Traverse through parent nodes to find them.
    // The first and last blocks are split later, depending on whether the markers are nested in them.
    startNode = getAncestorInBody(startNode);
    endNode = getAncestorInBody(endNode);

    boolean isExtracting = true;
    boolean isStartingNode = true;
    // The current node we are extracting from the document.
    Node currNode = startNode;

    // Process all block-level nodes and specifically split the first and last nodes when needed.
    while (isExtracting)
    {
        // Clone the current node and its children to obtain a copy.
        Node cloneNode = currNode.deepClone(true);
        boolean isEndingNode = currNode.equals(endNode);

        if (isStartingNode || isEndingNode)
        {
            // Each marker is processed separately.
            // The end marker is processed first to keep node indexes valid for the start marker.
            if (isEndingNode)
            {
                // !isStartingNode: don't add the node twice if the markers are in the same block.
                processMarker(cloneNode, nodes, originalEndNode, currNode, isInclusive,
                    false, !isStartingNode, false);
                isExtracting = false;
            }

            // Separate conditional: the block-level start and end markers may be the same node.
            if (isStartingNode)
            {
                processMarker(cloneNode, nodes, originalStartNode, currNode, isInclusive,
                    true, true, false);
                isStartingNode = false;
            }
        }
        else
        {
            // Node is not a start or end marker, simply add the copy to the list.
            nodes.add(cloneNode);
        }

        if (isExtracting)
        {
            // Move to the next block. When the current section is exhausted, continue with the
            // first block of the following section, skipping sections that have no body content
            // instead of failing with a NullPointerException.
            Node nextNode = currNode.getNextSibling();
            Section section = (Section)currNode.getAncestor(NodeType.SECTION);
            while (nextNode == null)
            {
                section = (Section)section.getNextSibling();
                if (section == null)
                    throw new IllegalStateException(
                        "Reached the end of the document before reaching the end node");

                // Record every section that is entered so the section structure is preserved.
                nodes.add(section.deepClone(true));
                nextNode = section.getBody() != null ? section.getBody().getFirstChild() : null;
            }
            currNode = nextNode;
        }
    }

    // For compatibility with mode with inline bookmarks, add the next paragraph (empty).
    if (isInclusive && originalEndNode == endNode && !originalEndNode.isComposite())
        includeNextParagraph(endNode, nodes);

    // Return the nodes between the node markers.
    return nodes;
}
//ExEnd:CommonExtractContent
//ExStart:CommonGenerateDocument
/**
 * Builds a new document from a list of extracted nodes.
 *
 * Section nodes in the list start a new section in the result (with an empty body and copies
 * of the headers/footers); every other node is appended to the body of the most recently
 * created section.
 *
 * @param srcDoc document the nodes were extracted from
 * @param nodes  section and block-level nodes, in the order they should appear
 * @return the newly assembled document
 */
public static Document generateDocument(Document srcDoc, ArrayList<Node> nodes)
{
    // A shallow clone keeps the document-level data of the source (e.g. styles) but no content.
    Document dstDoc = (Document)srcDoc.deepClone(false);

    // Importer configuration: destination styles, smart style behavior, source numbering kept.
    ImportFormatOptions importOptions = new ImportFormatOptions();
    importOptions.setSmartStyleBehavior(true);
    importOptions.setKeepSourceNumbering(true);
    NodeImporter importer = new NodeImporter(srcDoc, dstDoc, ImportFormatMode.USE_DESTINATION_STYLES, importOptions);

    for (Node node : nodes)
    {
        if (node.getNodeType() != NodeType.SECTION)
        {
            // Regular content goes into the body of the last section created so far.
            Node importedContent = importer.importNode(node, true);
            dstDoc.getLastSection().getBody().appendChild(importedContent);
            continue;
        }

        // A section is imported without its children; only an empty body and the
        // headers/footers are re-created, the body content follows as separate list items.
        Section sourceSection = (Section)node;
        Section targetSection = (Section)importer.importNode(sourceSection, false);
        targetSection.appendChild(importer.importNode(sourceSection.getBody(), false));

        for (HeaderFooter headerFooter : sourceSection.getHeadersFooters())
            targetSection.getHeadersFooters().add(importer.importNode(headerFooter, true));

        dstDoc.appendChild(targetSection);
    }

    return dstDoc;
}
//ExEnd:CommonGenerateDocument
//ExStart:CommonExtractContentHelperMethods
/**
 * Validates the marker nodes handed to the extractor.
 *
 * @param startNode marker where extraction begins
 * @param endNode   marker where extraction ends
 * @throws IllegalArgumentException if a marker is null, the markers belong to different
 *                                  documents, a marker is not inside a body, or the end marker
 *                                  is positioned before the start marker
 */
private static void verifyParameterNodes(Node startNode, Node endNode)
{
    // The order of these checks matters: later checks dereference what earlier ones validate.
    if (startNode == null)
        throw new IllegalArgumentException("Start node cannot be null");

    if (endNode == null)
        throw new IllegalArgumentException("End node cannot be null");

    boolean sameDocument = startNode.getDocument().equals(endNode.getDocument());
    if (!sameDocument)
        throw new IllegalArgumentException("Start node and end node must belong to the same document");

    boolean bothInsideBody = startNode.getAncestor(NodeType.BODY) != null
        && endNode.getAncestor(NodeType.BODY) != null;
    if (!bothInsideBody)
        throw new IllegalArgumentException("Start node and end node must be a child or descendant of a body");

    // Compare positions in the DOM tree: sections first, then block-level position in the body.
    Section sectionOfStart = (Section)startNode.getAncestor(NodeType.SECTION);
    Section sectionOfEnd = (Section)endNode.getAncestor(NodeType.SECTION);

    int startSectionPos = sectionOfStart.getParentNode().indexOf(sectionOfStart);
    int endSectionPos = sectionOfEnd.getParentNode().indexOf(sectionOfEnd);

    if (startSectionPos > endSectionPos)
        throw new IllegalArgumentException("The section of end node must be after the section start node");

    // End marker lives in a later section: nothing more to compare.
    if (startSectionPos < endSectionPos)
        return;

    // Same section index: the block containing the end marker must not precede the start block.
    int startBlockPos = sectionOfStart.getBody().indexOf(getAncestorInBody(startNode));
    int endBlockPos = sectionOfEnd.getBody().indexOf(getAncestorInBody(endNode));
    if (startBlockPos > endBlockPos)
        throw new IllegalArgumentException("The end node must be after the start node in the body");
}
/**
 * Finds the first node of the given type, searching {@code fromNode}, its descendants, and
 * then each following sibling (with its descendants) in document order.
 *
 * The search never climbs to the parent of {@code fromNode}; it ends with the last sibling.
 * Siblings are walked iteratively, so the recursion depth depends only on how deeply the
 * nodes are nested, not on how many siblings have to be scanned.
 *
 * @param nodeType node type constant to look for (compared against {@code getNodeType()})
 * @param fromNode node to start from; may be null
 * @return the first matching node, or null when there is none (or {@code fromNode} is null)
 */
private static Node findNextNode(int nodeType, Node fromNode)
{
    for (Node current = fromNode; current != null; current = current.getNextSibling())
    {
        if (current.getNodeType() == nodeType)
            return current;

        // Look inside composite nodes before moving on to the next sibling.
        if (current.isComposite())
        {
            Node match = findNextNode(nodeType, ((CompositeNode)current).getFirstChild());
            if (match != null)
                return match;
        }
    }

    return null;
}
/**
 * Trims the clone of a block-level node so it only keeps the content on the wanted side of a
 * marker, then adds the clone to the result list when allowed.
 *
 * @param cloneNode          deep clone of {@code blockLevelAncestor}; modified in place
 * @param nodes              result list the clone may be appended to
 * @param node               the marker node, located in the original document
 * @param blockLevelAncestor the original block-level node that contains (or is) the marker
 * @param isInclusive        whether the marker itself is kept
 * @param isStartMarker      true to drop content before the marker, false to drop content after it
 * @param canAdd             whether this call is allowed to append the clone to {@code nodes}
 * @param forceAdd           append the clone even if trimming left it without children
 */
private static void processMarker(Node cloneNode, ArrayList<Node> nodes, Node node, Node blockLevelAncestor,
    boolean isInclusive, boolean isStartMarker, boolean canAdd, boolean forceAdd)
{
    // If the marker is the block-level node itself, there is nothing to trim:
    // just decide whether the whole block is included.
    if (node == blockLevelAncestor)
    {
        if (canAdd && isInclusive)
            nodes.add(cloneNode);
        return;
    }

    // If a marker is a FieldStart node, check if the field is to be included or not.
    // We assume for simplicity that the FieldStart and FieldEnd are siblings.
    if (node.getNodeType() == NodeType.FIELD_START)
    {
        // If the marker is a start node and is not included, skip to the end of the field.
        // If the marker is an end node and is to be included, move to the end of the field
        // so the field will not be cut in half.
        if (isStartMarker && !isInclusive || !isStartMarker && isInclusive)
        {
            // Track nesting so that the FieldEnd of an inner (nested) field is not mistaken
            // for the end of the marker field.
            int fieldDepth = 0;
            while (node.getNextSibling() != null)
            {
                int currentType = node.getNodeType();
                if (currentType == NodeType.FIELD_START)
                    fieldDepth++;
                else if (currentType == NodeType.FIELD_END && --fieldDepth == 0)
                    break;

                node = node.getNextSibling();
            }
        }
    }

    // Support a case if the marker node is on the third level of the document body or lower:
    // collect the chain from the marker up to (excluding) the block-level ancestor.
    ArrayList<Node> nodeBranch = fillSelfAndParents(node, blockLevelAncestor);

    // Walk the chain top-down and process the corresponding node in the clone by child index.
    Node currentCloneNode = cloneNode;
    for (int i = nodeBranch.size() - 1; i >= 0; i--)
    {
        Node currentNode = nodeBranch.get(i);
        int nodeIndex = currentNode.getParentNode().indexOf(currentNode);
        currentCloneNode = ((CompositeNode)currentCloneNode).getChildNodes(NodeType.ANY, false).get(nodeIndex);

        // Ancestors of the marker (i > 0) are always kept; only the marker itself (i == 0)
        // is subject to the inclusive flag.
        removeNodesOutsideOfRange(currentCloneNode, isInclusive || (i > 0), isStartMarker);
    }

    // After processing, the composite node may have become empty; in that case it is only
    // added when forced.
    if (canAdd &&
        (forceAdd || ((CompositeNode)cloneNode).hasChildNodes()))
    {
        nodes.add(cloneNode);
    }
}
/**
 * Removes the siblings of a marker node that lie outside the extracted range.
 *
 * For a start marker, every sibling before the marker is removed; for an end marker, every
 * sibling after it. The marker itself is removed unless {@code isInclusive} is true.
 *
 * @param markerNode    node that delimits the range among its siblings
 * @param isInclusive   whether the marker node is kept
 * @param isStartMarker true when the marker opens the range, false when it closes it
 */
private static void removeNodesOutsideOfRange(Node markerNode, boolean isInclusive, boolean isStartMarker)
{
    Node cursor = markerNode.getParentNode().getFirstChild();

    if (isStartMarker)
    {
        // Drop everything up to the marker, then stop; what follows the marker is kept.
        while (cursor != null)
        {
            Node candidate = cursor;
            // Advance before a possible removal, which would detach the candidate.
            cursor = cursor.getNextSibling();

            boolean reachedMarker = candidate.equals(markerNode);
            if (!reachedMarker || !isInclusive)
                candidate.remove();
            if (reachedMarker)
                break;
        }
        return;
    }

    // End marker: keep everything before it, drop everything after it.
    boolean pastMarker = false;
    while (cursor != null)
    {
        Node candidate = cursor;
        // Advance before a possible removal, which would detach the candidate.
        cursor = cursor.getNextSibling();

        if (candidate.equals(markerNode))
        {
            pastMarker = true;
            if (!isInclusive)
                candidate.remove();
        }
        else if (pastMarker)
        {
            candidate.remove();
        }
    }
}
/**
 * Lists a node followed by its successive parents, stopping before {@code tillNode}.
 *
 * @param node     node the chain starts with
 * @param tillNode ancestor at which the walk stops; it is not part of the result
 * @return the chain ordered from {@code node} upwards
 */
private static ArrayList<Node> fillSelfAndParents(Node node, Node tillNode)
{
    ArrayList<Node> branch = new ArrayList<Node>();
    for (Node cursor = node; cursor != tillNode; cursor = cursor.getParentNode())
        branch.add(cursor);
    return branch;
}
/**
 * Adds to the result list a trimmed copy of the block that holds the next paragraph found
 * after the given node, if there is such a paragraph.
 *
 * @param node  node after which the paragraph is searched (starting at its next sibling)
 * @param nodes result list that receives the copy
 */
private static void includeNextParagraph(Node node, ArrayList<Node> nodes)
{
    Node found = findNextNode(NodeType.PARAGRAPH, node.getNextSibling());
    if (found == null)
        return;

    Paragraph nextParagraph = (Paragraph)found;

    // Use the first child as the marker so that everything after it is trimmed away;
    // a paragraph without content serves as the marker itself.
    Node marker;
    if (nextParagraph.hasChildNodes())
        marker = nextParagraph.getFirstChild();
    else
        marker = nextParagraph;

    Node blockLevelRoot = getAncestorInBody(nextParagraph);
    Node rootCopy = blockLevelRoot.deepClone(true);

    // Processed as an end marker; the copy is added even if trimming leaves it empty.
    processMarker(rootCopy, nodes, marker, blockLevelRoot,
        marker == nextParagraph, false, true, true);
}
/**
 * Climbs from the given node to the node whose parent is a body.
 *
 * @param startNode node located somewhere below a body
 * @return {@code startNode} itself or the ancestor that is a direct child of a body
 */
private static Node getAncestorInBody(Node startNode)
{
    Node current = startNode;
    Node parent = current.getParentNode();
    while (parent.getNodeType() != NodeType.BODY)
    {
        current = parent;
        parent = current.getParentNode();
    }
    return current;
}
//ExEnd:CommonExtractContentHelperMethods
}