Extracting Text from Field Codes

Mahi39 · December 19, 2024, 6:03am

Hi Team,

How can I extract the text between the field codes?

Input: Input.docx (58.9 KB)

Expected output: FX7.docx (52.6 KB)

Regards,
Mahi

alexey.noskov · December 19, 2024, 6:22am

@Mahi39 You can use the technique described here to extract content between nodes:
https://docs.aspose.com/words/java/extract-selected-content-between-nodes/
In your case, you can use the first field’s end node and the second field’s start node as the start and end nodes for extraction:

// Load the source document that contains the two fields.
Document doc = new Document("C:\\Temp\\in.docx");

// The range to extract is bounded by the end of the first field and the start of the second one.
Field firstField = doc.getRange().getFields().get(0);
Field secondField = doc.getRange().getFields().get(1);
Node rangeStart = firstField.getEnd();
Node rangeEnd = secondField.getStart();

// Collect the content between the two boundary nodes (the boundaries themselves are excluded),
// then build a new document from it and save the result.
ArrayList<Node> betweenFields = ExtractContentHelper.extractContent(rangeStart, rangeEnd, false);
Document extractedContent = ExtractContentHelper.generateDocument(doc, betweenFields);
extractedContent.save("C:\\Temp\\out.docx");

Mahi39 · December 19, 2024, 6:39am

@alexey.noskov, I’ve been unable to import the ExtractContentHelper class. I got the below error message.

ExtractContentHelper cannot be resolved

alexey.noskov · December 19, 2024, 6:51am

@Mahi39 The method can be found in the code examples on our GitHub:
https://github.com/aspose-words/Aspose.Words-for-Java/
For your convenience here is the class:

/**
 * Helper routines that copy the content located between two nodes of a document into a list
 * of nodes ({@link #extractContent}) and assemble a new document from such a list
 * ({@link #generateDocument}).
 */
public class ExtractContentHelper {

    //ExStart:CommonExtractContent
    /**
     * Collects the content located between two marker nodes.
     *
     * <p>The returned list starts with the section that contains the start node (the section itself,
     * not a copy). It is followed by deep clones of the block-level nodes (direct children of a body)
     * in the range; the first and last of them are trimmed so that only the part inside the range
     * remains. Whenever the range crosses into another section, a deep clone of that section is added
     * to the list before the clones of its block-level nodes.</p>
     *
     * @param startNode   node at which the range starts; must be a descendant of a body.
     * @param endNode     node at which the range ends; must be in the same document as
     *                    {@code startNode} and must not precede it.
     * @param isInclusive {@code true} to keep the marker nodes themselves in the result,
     *                    {@code false} to keep only what lies between them.
     * @return the section node followed by the extracted block-level node clones, in document order.
     * @throws IllegalArgumentException if the markers fail the checks in {@code verifyParameterNodes}.
     * @throws IllegalStateException    if the end of the document is reached before the end marker.
     */
    public static ArrayList<Node> extractContent(Node startNode, Node endNode, boolean isInclusive)
    {
        // First, check that the nodes passed to this method are valid for use.
        verifyParameterNodes(startNode, endNode);

        // Create a list to store the extracted nodes.
        ArrayList<Node> nodes = new ArrayList<Node>();

        // If the end marker is a CommentRangeEnd and the extraction is inclusive, move the end marker
        // forward to the Comment node found after the CommentRangeEnd node so the comment is included.
        if (endNode.getNodeType() == NodeType.COMMENT_RANGE_END && isInclusive)
        {
            Node node = findNextNode(NodeType.COMMENT, endNode.getNextSibling());
            if (node != null)
                endNode = node;
        }

        // Keep a record of the original nodes passed to this method to split marker nodes if needed.
        Node originalStartNode = startNode;
        Node originalEndNode = endNode;

        // Add the section where the start node is placed.
        nodes.add(startNode.getAncestor(NodeType.SECTION));

        // Extract content based on block-level nodes (direct children of the body). Traverse through
        // parent nodes to find them. The content of the first and last block is split below when the
        // marker nodes are located inside them.
        startNode = getAncestorInBody(startNode);
        endNode = getAncestorInBody(endNode);

        boolean isExtracting = true;
        boolean isStartingNode = true;
        // The current node we are extracting from the document.
        Node currNode = startNode;

        // Begin extracting content. Process all block-level nodes and specifically split the first
        // and last nodes when needed, so paragraph formatting is retained.
        while (isExtracting)
        {
            // Clone the current node and its children to obtain a copy.
            Node cloneNode = currNode.deepClone(true);
            boolean isEndingNode = currNode.equals(endNode);

            if (isStartingNode || isEndingNode)
            {
                // Each marker is processed separately.
                // The end marker is processed first to keep child indexes valid for the start marker.
                if (isEndingNode)
                {
                    // !isStartingNode: don't add the node twice if both markers share the same block.
                    processMarker(cloneNode, nodes, originalEndNode, currNode, isInclusive,
                            false, !isStartingNode, false);
                    isExtracting = false;
                }

                // This check is separate because the start and end markers may share the same block.
                if (isStartingNode)
                {
                    processMarker(cloneNode, nodes, originalStartNode, currNode, isInclusive,
                            true, true, false);
                    isStartingNode = false;
                }
            }
            else
            {
                // Node is not a start or end marker, simply add the copy to the list.
                nodes.add(cloneNode);
            }

            if (isExtracting)
            {
                // Move to the next block-level node. If there is none, the rest of the content
                // is found in a following section.
                Node nextNode = currNode.getNextSibling();
                Section section = (Section)currNode.getAncestor(NodeType.SECTION);
                while (nextNode == null)
                {
                    section = (Section)section.getNextSibling();
                    if (section == null)
                        throw new IllegalStateException(
                                "The end node was not reached before the end of the document");

                    // Record every section that is entered, even if its body turns out to be empty.
                    nodes.add(section.deepClone(true));
                    if (section.getBody() != null)
                        nextNode = section.getBody().getFirstChild();
                }
                currNode = nextNode;
            }
        }

        // For compatibility with mode with inline bookmarks, add the next paragraph (empty).
        if (isInclusive && originalEndNode == endNode && !originalEndNode.isComposite())
            includeNextParagraph(endNode, nodes);

        // Return the nodes between the node markers.
        return nodes;
    }
    //ExEnd:CommonExtractContent

    //ExStart:CommonGenerateDocument
    /**
     * Builds a new document from a list of nodes such as the one returned by
     * {@link #extractContent}.
     *
     * <p>The destination is a shallow clone of {@code srcDoc}. Each section node in the list
     * starts a new section in the destination (with an empty body and copies of the section's
     * headers/footers); every other node is imported and appended to the body of the last
     * section of the destination.</p>
     *
     * @param srcDoc document the nodes originate from.
     * @param nodes  nodes to import, in the order they should appear; a non-section node must be
     *               preceded by at least one section node, otherwise the destination has no
     *               section to append to.
     * @return the newly created document.
     */
    public static Document generateDocument(Document srcDoc, ArrayList<Node> nodes)
    {
        // Clone source document (without children) to preserve source styles.
        Document dstDoc = (Document)srcDoc.deepClone(false);

        // Import each node from the list into the new document. Keep the original formatting of the node.
        ImportFormatOptions formattingOptions = new ImportFormatOptions();
        formattingOptions.setKeepSourceNumbering(true);
        formattingOptions.setSmartStyleBehavior(true);
        NodeImporter importer = new NodeImporter(srcDoc, dstDoc, ImportFormatMode.USE_DESTINATION_STYLES, formattingOptions);

        for (Node node : nodes)
        {
            if (node.getNodeType() == NodeType.SECTION)
            {
                // Recreate the section shell: the section itself, an empty body and its headers/footers.
                // The body content is supplied by the non-section nodes that follow in the list.
                Section srcSection = (Section)node;
                Section importedSection = (Section)importer.importNode(srcSection, false);
                importedSection.appendChild(importer.importNode(srcSection.getBody(), false));
                for (HeaderFooter hf : srcSection.getHeadersFooters())
                {
                    importedSection.getHeadersFooters().add(importer.importNode(hf, true));
                }

                dstDoc.appendChild(importedSection);
            }
            else
            {
                Node importNode = importer.importNode(node, true);
                dstDoc.getLastSection().getBody().appendChild(importNode);
            }
        }

        return dstDoc;
    }
    //ExEnd:CommonGenerateDocument

    //ExStart:CommonExtractContentHelperMethods
    /**
     * Validates the marker nodes passed to {@link #extractContent}.
     *
     * @param startNode start marker.
     * @param endNode   end marker.
     * @throws IllegalArgumentException if either node is {@code null}, the nodes belong to different
     *                                  documents, either node is not inside a body, or the block-level
     *                                  position of the end node precedes that of the start node.
     */
    private static void verifyParameterNodes(Node startNode, Node endNode)
    {
        // The order in which these checks are done is important.
        if (startNode == null)
            throw new IllegalArgumentException("Start node cannot be null");
        if (endNode == null)
            throw new IllegalArgumentException("End node cannot be null");

        if (!startNode.getDocument().equals(endNode.getDocument()))
            throw new IllegalArgumentException("Start node and end node must belong to the same document");

        if (startNode.getAncestor(NodeType.BODY) == null || endNode.getAncestor(NodeType.BODY) == null)
            throw new IllegalArgumentException("Start node and end node must be a child or descendant of a body");

        // Check the end node is after the start node in the DOM tree.
        // First, check if they are in different sections, then if they're not,
        // check their position in the body of the same section.
        // Only block-level positions are compared; the order of two markers inside the same
        // block-level node is not verified here.
        Section startSection = (Section)startNode.getAncestor(NodeType.SECTION);
        Section endSection = (Section)endNode.getAncestor(NodeType.SECTION);

        int startIndex = startSection.getParentNode().indexOf(startSection);
        int endIndex = endSection.getParentNode().indexOf(endSection);

        if (startIndex == endIndex)
        {
            if (startSection.getBody().indexOf(getAncestorInBody(startNode)) >
                    endSection.getBody().indexOf(getAncestorInBody(endNode)))
                throw new IllegalArgumentException("The end node must be after the start node in the body");
        }
        else if (startIndex > endIndex)
            throw new IllegalArgumentException("The section of end node must be after the section start node");
    }

    /**
     * Finds the first node of the given type among {@code fromNode}, its following siblings and
     * the descendants of each of them, searching depth-first in document order.
     *
     * <p>The search does not climb to the parent of {@code fromNode}. Siblings are walked with a
     * loop so the recursion depth depends only on how deeply nodes are nested, not on how many
     * siblings there are.</p>
     *
     * @param nodeType {@code NodeType} constant to look for.
     * @param fromNode node to start from; may be {@code null}.
     * @return the first matching node, or {@code null} if there is none.
     */
    private static Node findNextNode(int nodeType, Node fromNode)
    {
        for (Node candidate = fromNode; candidate != null; candidate = candidate.getNextSibling())
        {
            if (candidate.getNodeType() == nodeType)
                return candidate;

            if (candidate.isComposite())
            {
                Node nested = findNextNode(nodeType, ((CompositeNode)candidate).getFirstChild());
                if (nested != null)
                    return nested;
            }
        }

        return null;
    }

    /**
     * Trims the clone of a block-level node so that only the part on the inner side of a marker
     * remains, and adds the clone to the result list when requested.
     *
     * @param cloneNode          deep clone of {@code blockLevelAncestor}; modified in place.
     * @param nodes              result list the clone may be added to.
     * @param node               the marker node in the original document.
     * @param blockLevelAncestor the original block-level node that is, or contains, the marker.
     * @param isInclusive        whether the marker itself is kept.
     * @param isStartMarker      {@code true} for the start marker, {@code false} for the end marker.
     * @param canAdd             whether the clone may be added to {@code nodes} at all.
     * @param forceAdd           add the clone even if it has no children left after trimming.
     */
    private static void processMarker(Node cloneNode, ArrayList<Node> nodes, Node node, Node blockLevelAncestor,
                                      boolean isInclusive, boolean isStartMarker, boolean canAdd, boolean forceAdd)
    {
        // If we are dealing with a block-level node, see if it should be included and add it to the list.
        if (node == blockLevelAncestor)
        {
            if (canAdd && isInclusive)
                nodes.add(cloneNode);
            return;
        }

        // If a marker is a FieldStart node check if it's to be included or not.
        // We assume for simplicity that the FieldStart and FieldEnd appear in the same paragraph.
        // NOTE: the scan stops at the first FieldEnd sibling (or at the last sibling), so with nested
        // fields it may stop at the end of an inner field.
        if (node.getNodeType() == NodeType.FIELD_START)
        {
            // If the marker is a start node and is not included, skip to the end of the field.
            // If the marker is an end node and is to be included, then move to the end field so the field will not be removed.
            if (isStartMarker && !isInclusive || !isStartMarker && isInclusive)
            {
                while (node.getNextSibling() != null && node.getNodeType() != NodeType.FIELD_END)
                    node = node.getNextSibling();
            }
        }

        // Support a case if the marker node is on the third level of the document body or lower.
        ArrayList<Node> nodeBranch = fillSelfAndParents(node, blockLevelAncestor);

        // Walk from the outermost ancestor down to the marker, locating the corresponding node in
        // the clone by child index and trimming its siblings on every level.
        Node currentCloneNode = cloneNode;
        for (int i = nodeBranch.size() - 1; i >= 0; i--)
        {
            Node currentNode = nodeBranch.get(i);
            int nodeIndex = currentNode.getParentNode().indexOf(currentNode);
            currentCloneNode = ((CompositeNode)currentCloneNode).getChildNodes(NodeType.ANY, false).get(nodeIndex);

            // Ancestors of the marker (i > 0) are always kept; only the marker itself (i == 0)
            // is subject to the inclusive flag.
            removeNodesOutsideOfRange(currentCloneNode, isInclusive || (i > 0), isStartMarker);
        }

        // After trimming, the clone may have no children left; in that case it is added only when forced.
        if (canAdd &&
                (forceAdd || ((CompositeNode)cloneNode).hasChildNodes()))
        {
            nodes.add(cloneNode);
        }
    }

    /**
     * Removes the siblings of {@code markerNode} that lie outside the extracted range.
     *
     * <p>For a start marker, the siblings before the marker are removed; for an end marker, the
     * siblings after it are removed. The marker itself is removed as well unless
     * {@code isInclusive} is {@code true}.</p>
     *
     * @param markerNode    node (inside the clone) that marks the boundary on its level.
     * @param isInclusive   whether {@code markerNode} itself is kept.
     * @param isStartMarker {@code true} when trimming for the start marker, {@code false} for the end marker.
     */
    private static void removeNodesOutsideOfRange(Node markerNode, boolean isInclusive, boolean isStartMarker)
    {
        boolean isProcessing = true;
        // For a start marker removal begins immediately; for an end marker it begins at the marker.
        boolean isRemoving = isStartMarker;
        Node nextNode = markerNode.getParentNode().getFirstChild();

        while (isProcessing && nextNode != null)
        {
            Node currentNode = nextNode;
            boolean isSkip = false;

            if (currentNode.equals(markerNode))
            {
                if (isStartMarker)
                {
                    // Everything after the start marker is inside the range: stop after this node.
                    isProcessing = false;
                    if (isInclusive)
                        isRemoving = false;
                }
                else
                {
                    // Everything from the end marker onwards is outside the range.
                    isRemoving = true;
                    if (isInclusive)
                        isSkip = true;
                }
            }

            // Advance before removing, because a removed node no longer has a next sibling.
            nextNode = nextNode.getNextSibling();
            if (isRemoving && !isSkip)
                currentNode.remove();
        }
    }

    /**
     * Builds the chain of nodes from {@code node} up to, but not including, {@code tillNode}.
     *
     * @param node     node to start from; expected to be a descendant of {@code tillNode}.
     * @param tillNode ancestor at which the chain stops.
     * @return the chain, starting with {@code node} and ending with the direct child of {@code tillNode}.
     */
    private static ArrayList<Node> fillSelfAndParents(Node node, Node tillNode)
    {
        ArrayList<Node> list = new ArrayList<Node>();
        Node currentNode = node;

        while (currentNode != tillNode)
        {
            list.add(currentNode);
            currentNode = currentNode.getParentNode();
        }

        return list;
    }

    /**
     * Adds a trimmed clone of the block-level node containing the next paragraph found after
     * {@code node} to the result list. If that paragraph has children, they are removed from
     * the clone; a paragraph without children is added as is.
     *
     * @param node  block-level node after which the next paragraph is searched.
     * @param nodes result list to append to.
     */
    private static void includeNextParagraph(Node node, ArrayList<Node> nodes)
    {
        Paragraph paragraph = (Paragraph)findNextNode(NodeType.PARAGRAPH, node.getNextSibling());
        if (paragraph != null)
        {
            // Move to the first child to include paragraphs without content.
            Node markerNode = paragraph.hasChildNodes() ? paragraph.getFirstChild() : paragraph;
            Node rootNode = getAncestorInBody(paragraph);

            processMarker(rootNode.deepClone(true), nodes, markerNode, rootNode,
                    markerNode == paragraph, false, true, true);
        }
    }

    /**
     * Climbs from {@code startNode} to the ancestor-or-self whose parent is a body.
     *
     * @param startNode a descendant of a body.
     * @return the block-level node (direct child of the body) that is or contains {@code startNode}.
     */
    private static Node getAncestorInBody(Node startNode)
    {
        while (startNode.getParentNode().getNodeType() != NodeType.BODY)
            startNode = startNode.getParentNode();
        return startNode;
    }
    //ExEnd:CommonExtractContentHelperMethods
}