Extract text between 2 marker strings

ansar2024 · April 3, 2024, 1:20am

Hello,
I want to extract the text between 2 markers (see attached screenshot)

I tried like in the following code:

import sys
import aspose.words as aw

def extract_text_between_marks(filename, mark_start, mark_end):
    """Return the text found between two marker strings in a document.

    The document is loaded with Aspose.Words, every
    ``mark_start ... mark_end`` match is replaced with itself so that it is
    represented as a single run, and the text of the first run that begins
    with ``mark_start`` is returned with both markers stripped.

    Args:
        filename: Path of the document to load.
        mark_start: Marker string that opens the region.
        mark_end: Marker string that closes the region.

    Returns:
        The text between the markers, or ``None`` when no run starts with
        ``mark_start`` or when loading/processing fails (the error is
        printed).

    NOTE(review): the pattern only allows word and whitespace characters
    between the markers, and presumably a match cannot span several
    paragraphs, so multi-paragraph content is not returned as a whole.
    Confirm against the Aspose.Words find/replace documentation.
    """
    # Local import: ``re`` is not imported at the top of the file.
    import re

    try:
        # Load the document
        doc = aw.Document(filename)

        # Replace markers with themselves to represent them as single runs.
        # The pattern is built from the actual marker values (escaped so they
        # are matched literally) instead of the literal parameter names.
        opt = aw.replacing.FindReplaceOptions()
        opt.use_substitutions = True
        pattern = re.escape(mark_start) + r"[\w\s]+?" + re.escape(mark_end)
        doc.range.replace_regex(pattern, "$0", opt)

        # Scan the runs without removing any of them: the document is never
        # saved, and removing nodes while iterating the collection is unsafe.
        for r in doc.get_child_nodes(aw.NodeType.RUN, True):
            run = r.as_run()
            if run.text.startswith(mark_start):
                return run.text.replace(mark_start, "").replace(mark_end, "")

        # No run starts with the start marker.
        return None
    except Exception as e:
        # Best-effort boundary: report the problem and signal failure with None.
        print("Error:", e)
        return None

if __name__ == "__main__":
    # Command-line entry point: exactly three arguments are required
    # (document path, start marker, end marker).
    if len(sys.argv) != 4:
        print("Usage: python recipe_render.py <filename> <mark_start> <mark_end>")
        sys.exit(1)

    filename, mark_start, mark_end = sys.argv[1:4]

    extracted_text = extract_text_between_marks(filename, mark_start, mark_end)
    # None means nothing was extracted; in that case nothing is printed here.
    if extracted_text is not None:
        print("Text between marks:", extracted_text, sep="\n")

The issue is that only the first line between the markers is extracted.
Any help is appreciated.

alexey.noskov · April 3, 2024, 5:32am

@ansar2024 Your scenario is a bit more complicated than extracting simple text. In your case, it is required to extract content between nodes. You can achieve this as described here:
https://docs.aspose.com/words/python-net/how-to-extract-selected-content-between-nodes-in-a-document/

Here is the code:

import aspose.words as aw

class ExtractContentHelper:
    """Static helpers that copy the content lying between two document nodes.

    ``extract_content`` returns clones of the block-level nodes found between
    a start node and an end node, and ``generate_document`` assembles such
    clones into a new document. The other methods are supporting utilities.
    """

    @staticmethod
    def extract_content(start_node: aw.Node, end_node: aw.Node, is_inclusive: bool):
        """Return clones of the nodes between ``start_node`` and ``end_node``.

        Args:
            start_node: Node marking the beginning of the range.
            end_node: Node marking the end of the range.
            is_inclusive: Whether the marker nodes themselves are kept in the
                extracted content.

        Returns:
            A list of cloned nodes; the source document is not modified.

        Raises:
            ValueError: If the marker nodes fail ``verify_parameter_nodes``.
        """

        # First, check that the nodes passed to this method are valid for use.
        ExtractContentHelper.verify_parameter_nodes(start_node, end_node)

        # Create a list to store the extracted nodes.
        nodes = []

        # If either marker is part of a comment, including the comment itself, we need to move the pointer
        # forward to the Comment Node found after the CommentRangeEnd node.
        if end_node.node_type == aw.NodeType.COMMENT_RANGE_END and is_inclusive:
            node = ExtractContentHelper.find_next_node(aw.NodeType.COMMENT, end_node.next_sibling)
            if node is not None:
                end_node = node

        # Keep a record of the original nodes passed to this method to split marker nodes if needed.
        original_start_node = start_node
        original_end_node = end_node

        # Extract content based on block-level nodes (paragraphs and tables). Traverse through parent nodes to find them.
        # We will split the first and last nodes' content, depending if the marker nodes are inline.
        start_node = ExtractContentHelper.get_ancestor_in_body(start_node)
        end_node = ExtractContentHelper.get_ancestor_in_body(end_node)

        is_extracting = True
        is_starting_node = True
        # The current node we are extracting from the document.
        curr_node = start_node

        # Begin extracting content. Process all block-level nodes and specifically split the first
        # and last nodes when needed, so paragraph formatting is retained.
        # Method is a little more complicated than a regular extractor as we need to factor
        # in extracting using inline nodes, fields, bookmarks, etc. to make it useful.
        while is_extracting:

            # Clone the current node and its children to obtain a copy.
            clone_node = curr_node.clone(True)
            is_ending_node = curr_node == end_node

            if is_starting_node or is_ending_node:

                # We need to process each marker separately, so pass it off to a separate method instead.
                # End should be processed at first to keep node indexes.
                if is_ending_node:
                    # not is_starting_node: don't add the node twice if the markers are the same node.
                    ExtractContentHelper.process_marker(clone_node, nodes, original_end_node, curr_node, is_inclusive, False, not is_starting_node, False)
                    is_extracting = False

                # Conditional needs to be separate as the block-level start and end markers may be the same node.
                if is_starting_node:
                    ExtractContentHelper.process_marker(clone_node, nodes, original_start_node, curr_node, is_inclusive, True, True, False)
                    is_starting_node = False
            else:
                # Node is not a start or end marker, simply add the copy to the list.
                nodes.append(clone_node)

            # Move to the next node and extract it. If the next node is None,
            # the rest of the content is found in a different section.
            if curr_node.next_sibling is None and is_extracting:
                # Move to the next section.
                next_section = curr_node.get_ancestor(aw.NodeType.SECTION).next_sibling.as_section()
                curr_node = next_section.body.first_child
            else:
                # Move to the next node in the body.
                curr_node = curr_node.next_sibling

        # For compatibility with mode with inline bookmarks, add the next paragraph (empty).
        if is_inclusive and original_end_node == end_node and not original_end_node.is_composite:
            ExtractContentHelper.include_next_paragraph(end_node, nodes)

        # Return the nodes between the node markers.
        return nodes

    #ExStart:CommonGenerateDocument
    @staticmethod
    def generate_document(src_doc: aw.Document, nodes):
        """Build a new document that contains the given nodes.

        Args:
            src_doc: Document the nodes originate from; it is shallow-cloned
                and its last section is imported into the result.
            nodes: Nodes to import and append to the body of the new document.

        Returns:
            The newly created document.
        """

        dst_doc = src_doc.clone(False).as_document()

        # Import each node from the list into the new document using the destination styles mode.
        importer = aw.NodeImporter(src_doc, dst_doc, aw.ImportFormatMode.USE_DESTINATION_STYLES)

        # Put the section from the source document to retain original section page setup.
        dst_doc.append_child(importer.import_node(src_doc.last_section, True))

        # Remove all children from the imported section.
        dst_doc.first_section.body.remove_all_children()

        for node in nodes:
            import_node = importer.import_node(node, True)
            dst_doc.first_section.body.append_child(import_node)

        return dst_doc
    #ExEnd:CommonGenerateDocument

    #ExStart:CommonExtractContentHelperMethods
    @staticmethod
    def verify_parameter_nodes(start_node: aw.Node, end_node: aw.Node):
        """Validate the marker nodes before extraction.

        Raises:
            ValueError: If either node is ``None``, the nodes belong to
                different documents, either node is not inside a body, or the
                end node is located before the start node.
        """

        # The order in which these checks are done is important.
        if start_node is None:
            raise ValueError("Start node cannot be None")
        if end_node is None:
            raise ValueError("End node cannot be None")

        if start_node.document != end_node.document:
            raise ValueError("Start node and end node must belong to the same document")

        if start_node.get_ancestor(aw.NodeType.BODY) is None or end_node.get_ancestor(aw.NodeType.BODY) is None:
            raise ValueError("Start node and end node must be a child or descendant of a body")

        # Check the end node is after the start node in the DOM tree.
        # First, check if they are in different sections, then if they're not,
        # check their position in the body of the same section.
        start_section = start_node.get_ancestor(aw.NodeType.SECTION).as_section()
        end_section = end_node.get_ancestor(aw.NodeType.SECTION).as_section()

        start_index = start_section.parent_node.index_of(start_section)
        end_index = end_section.parent_node.index_of(end_section)

        if start_index == end_index:

            if (start_section.body.index_of(ExtractContentHelper.get_ancestor_in_body(start_node)) >
                end_section.body.index_of(ExtractContentHelper.get_ancestor_in_body(end_node))):
                raise ValueError("The end node must be after the start node in the body")

        elif start_index > end_index:
            raise ValueError("The section of end node must be after the section start node")

    @staticmethod
    def find_next_node(node_type: aw.NodeType, from_node: aw.Node):
        """Return the first node of ``node_type`` at or after ``from_node``.

        The search checks ``from_node`` itself, then its descendants
        (depth-first), then its following siblings. Returns ``None`` when
        ``from_node`` is ``None`` or no matching node is found.
        """

        if from_node is None or from_node.node_type == node_type:
            return from_node

        if from_node.is_composite:

            node = ExtractContentHelper.find_next_node(node_type, from_node.as_composite_node().first_child)
            if node is not None:
                return node

        return ExtractContentHelper.find_next_node(node_type, from_node.next_sibling)


    @staticmethod
    def process_marker(clone_node: aw.Node, nodes, node: aw.Node, block_level_ancestor: aw.Node,
        is_inclusive: bool, is_start_marker: bool, can_add: bool, force_add: bool):
        """Trim a cloned block-level node around a marker and collect it.

        Args:
            clone_node: Clone of ``block_level_ancestor`` that gets trimmed.
            nodes: Output list the clone is appended to.
            node: The marker node in the original document.
            block_level_ancestor: Body-level node that is, or contains, ``node``.
            is_inclusive: Whether the marker itself is kept.
            is_start_marker: ``True`` for the start marker, ``False`` for the end.
            can_add: Whether the clone may be appended to ``nodes``.
            force_add: Append the clone even if it has no children left.
        """

        # If we are dealing with a block-level node, see if it should be included and add it to the list.
        if node == block_level_ancestor:
            if can_add and is_inclusive:
                nodes.append(clone_node)
            return

        # clone_node is a clone of block_level_ancestor. If node != block_level_ancestor, block_level_ancestor
        # is the node's ancestor, which means it is a composite node.
        assert clone_node.is_composite

        # If a marker is a FieldStart node check if it's to be included or not.
        # We assume for simplicity that the FieldStart and FieldEnd appear in the same paragraph.
        if node.node_type == aw.NodeType.FIELD_START:
            # If the marker is a start node and is not included, skip to the end of the field.
            # If the marker is an end node and is to be included, then move to the end field so the field will not be removed.
            if (is_start_marker and not is_inclusive) or (not is_start_marker and is_inclusive):
                while node.next_sibling is not None and node.node_type != aw.NodeType.FIELD_END:
                    node = node.next_sibling

        # Support a case if the marker node is on the third level of the document body or lower.
        node_branch = ExtractContentHelper.fill_self_and_parents(node, block_level_ancestor)

        # Process the corresponding node in our cloned node by index.
        # The branch is walked from the outermost ancestor down to the marker itself.
        current_clone_node = clone_node
        for i in range(len(node_branch) - 1, -1, -1):

            current_node = node_branch[i]
            node_index = current_node.parent_node.index_of(current_node)
            current_clone_node = current_clone_node.as_composite_node().get_child(aw.NodeType.ANY, node_index, False)

            # Ancestors of the marker (i > 0) are always kept; only the marker itself honours is_inclusive.
            ExtractContentHelper.remove_nodes_outside_of_range(current_clone_node, is_inclusive or (i > 0), is_start_marker)

        # After processing, the composite node may have become empty if the marker was not included.
        if can_add and (force_add or clone_node.as_composite_node().has_child_nodes):
            nodes.append(clone_node)

    @staticmethod
    def remove_nodes_outside_of_range(marker_node: aw.Node, is_inclusive: bool, is_start_marker: bool):
        """Remove the siblings of ``marker_node`` that lie outside the range.

        For a start marker the siblings before it are removed; for an end
        marker the siblings after it are removed. The marker itself is removed
        too unless ``is_inclusive`` is true.
        """

        is_processing = True
        is_removing = is_start_marker
        next_node = marker_node.parent_node.first_child

        while is_processing and next_node is not None:

            current_node = next_node
            is_skip = False

            if current_node == marker_node:
                if is_start_marker:
                    is_processing = False
                    if is_inclusive:
                        is_removing = False
                else:
                    is_removing = True
                    if is_inclusive:
                        is_skip = True

            # Advance before removing so the traversal does not depend on a detached node.
            next_node = next_node.next_sibling
            if is_removing and not is_skip:
                current_node.remove()

    @staticmethod
    def fill_self_and_parents(node: aw.Node, till_node: aw.Node):
        """Return ``node`` and its ancestors up to, but excluding, ``till_node``.

        The list is ordered from ``node`` outwards.
        """

        nodes = []
        current_node = node

        while current_node != till_node:
            nodes.append(current_node)
            current_node = current_node.parent_node

        return nodes

    @staticmethod
    def include_next_paragraph(node: aw.Node, nodes):
        """Append the paragraph that follows ``node`` (if any) to ``nodes``.

        Nothing is added when no paragraph is found after ``node``.
        """

        # find_next_node returns None when there is no following paragraph; check that
        # before converting, otherwise calling as_paragraph() on None raises AttributeError.
        next_node = ExtractContentHelper.find_next_node(aw.NodeType.PARAGRAPH, node.next_sibling)
        if next_node is None:
            return

        paragraph = next_node.as_paragraph()

        # Move to the first child to include paragraphs without content.
        marker_node = paragraph.first_child if paragraph.has_child_nodes else paragraph
        root_node = ExtractContentHelper.get_ancestor_in_body(paragraph)

        ExtractContentHelper.process_marker(root_node.clone(True), nodes, marker_node, root_node,
            marker_node == paragraph, False, True, True)

    @staticmethod
    def get_ancestor_in_body(start_node: aw.Node):
        """Return the ancestor of ``start_node`` (or the node itself) whose parent is a body."""

        while start_node.parent_node.node_type != aw.NodeType.BODY:
            start_node = start_node.parent_node
        return start_node

# Load the source document that contains the markers.
doc = aw.Document("C:\\Temp\\in.docx")

# Marker strings that delimit the content to extract.
start_marker = "#stfas"
end_marker = "#enfas"

# Replace markers with themselves to represent them as single runs.
opt = aw.replacing.FindReplaceOptions()
opt.use_substitutions = True
doc.range.replace_regex("|".join((start_marker, end_marker)), "$0", opt)

# Get start and end runs of the range. When several runs start with a marker,
# the last one in document order is used.
runs = [node.as_run() for node in doc.get_child_nodes(aw.NodeType.RUN, True)]
start_run = next((run for run in reversed(runs) if run.text.startswith(start_marker)), None)
end_run = next((run for run in reversed(runs) if run.text.startswith(end_marker)), None)

# Extract content between start and end markers (the markers themselves are excluded).
nodes = ExtractContentHelper.extract_content(start_run, end_run, False)

# Create a document from the extracted nodes and write it to disk.
extracted_doc = ExtractContentHelper.generate_document(doc, nodes)
extracted_doc.save("C:\\Temp\\out.docx")

ansar2024 · April 3, 2024, 6:09am

The code you provided is working fine. Thanks a lot for your support.

ansar2024 · April 3, 2024, 8:28am

After the content between the 2 markers was extracted, I wanted to replace a specific string in another document with this extracted content, but I got an error:

An error occurred during conversion: [‘function takes at most 2 arguments (3 given)’, “can’t build String from ‘list’”]
here is what I added to the code:

def find_replace(word_in, word_out, filename, output_file):
    """Replace ``word_in`` with ``word_out`` in a document and save the result.

    Args:
        word_in: Text to search for.
        word_out: Replacement passed to ``Range.replace``.
        filename: Path of the document to load.
        output_file: Path the modified document is saved to.

    On any exception the error is printed and the process exits with status 1.
    """
    try:
        document = aw.Document(filename)
        # Search the document in the forward direction.
        forward = aw.replacing.FindReplaceDirection.FORWARD
        options = aw.replacing.FindReplaceOptions(forward)
        document.range.replace(word_in, word_out, options)
        document.save(output_file)
        print(f"Find Replace complete. File saved as: {output_file}")
    except Exception as error:
        print(f"An error occurred during conversion: {str(error)}")
        sys.exit(1)

and after the content was extracted :

# Extract content between start and end markers.
# extract_content returns a list of cloned nodes, not a string.
nodes = ExtractContentHelper.extract_content(start_run, end_run, False)

# NOTE(review): find_replace forwards its second argument to Range.replace as the
# replacement, so the node list is passed where a replacement string is expected;
# this appears to be what triggers the error quoted above.
find_replace("phasexx0o1z", nodes, "recipe1.docx", "output2.docx")

alexey.noskov · April 3, 2024, 9:43am

@ansar2024 ExtractContentHelper.extract_content returns a list of nodes, while Range.replace expects a string as a replacement. In your case, you should use the DocumentBuilder.insert_document method to insert the extracted content at the specific node in the target document. Please try using the following code:

# Load the source document that contains the markers.
doc = aw.Document("C:\\Temp\\in.docx")

# Replace markers with themselves to represent them as single runs.
opt = aw.replacing.FindReplaceOptions()
opt.use_substitutions = True
start_marker = "#stfas"
end_marker = "#enfas"
doc.range.replace_regex(start_marker + "|" + end_marker, "$0", opt)

# Get start and end runs of the range.
start_run = None
end_run = None
for r in doc.get_child_nodes(aw.NodeType.RUN, True):
    run = r.as_run()
    if run.text.startswith(start_marker):
        start_run = run
    if run.text.startswith(end_marker):
        end_run = run

# Extract content between start and end markers.
nodes = ExtractContentHelper.extract_content(start_run, end_run, False)
# Create a document from the extracted nodes
extracted_doc = ExtractContentHelper.generate_document(doc, nodes)

# Open the target document and replace placeholder with the extracted content.
dst = aw.Document("C:\\Temp\\dst.docx")
builder = aw.DocumentBuilder(dst)

# Replace placeholder with itself to represent them as single runs.
placeholder = "phasexx0o1z"
dst.range.replace(placeholder, "$0", opt)
# Replace placeholder with the content extracted earlier.
# Take a snapshot of the runs first: insert_document adds nodes to the tree,
# and the node collection should not be modified while it is being iterated.
dst_runs = [r.as_run() for r in dst.get_child_nodes(aw.NodeType.RUN, True)]
for run in dst_runs:
    if run.text.startswith(placeholder):
        builder.move_to(run)
        builder.insert_document(extracted_doc, aw.ImportFormatMode.KEEP_SOURCE_FORMATTING)
        # Remove placeholder
        run.text = ""

dst.save("C:\\Temp\\out.docx")

dst.docx (12.6 KB)
in.docx (14.3 KB)
out.docx (11.6 KB)

ansar2024 · April 3, 2024, 11:15am

alexey.noskov:

# Replace placeholder with itself to represent them as single runs.
placeholder = "phasexx0o1z"
dst.range.replace(placeholder, "$0", opt)
# Replace placeholder with the content extracted earlier.
# Take a snapshot of the runs first: insert_document adds nodes to the tree,
# and the node collection should not be modified while it is being iterated.
dst_runs = [r.as_run() for r in dst.get_child_nodes(aw.NodeType.RUN, True)]
for run in dst_runs:
    if run.text.startswith(placeholder):
        builder.move_to(run)
        builder.insert_document(extracted_doc, aw.ImportFormatMode.KEEP_SOURCE_FORMATTING)
        # Remove placeholder
        run.text = ""

dst.save("C:\\Temp\\out.docx")

Thank you very much for your support. The issue is solved.