Issue with builder.move_to_document_start()

ansar2024 · April 4, 2024, 1:29am

Hello,
In a loop, I am trying to extract string between 2 markers. After every iteration of the loop, I want to move to the document start using builder.move_to_document_start(). However, it does not seem to work. Here is my code:

def main(raw_file, template_file, final_file, start_markers, end_markers, matched_strings):
    """Copy marker-delimited content from ``raw_file`` into a template document.

    The three marker sequences are walked in lockstep. For every
    (start marker, end marker, placeholder) triple, the content between the
    two markers is extracted from ``raw_file`` and inserted at each run of the
    template whose text starts with the placeholder.

    Args:
        raw_file: Source document, passed to ``aw.Document``.
        template_file: Template document, passed to ``aw.Document``.
        final_file: Destination passed to ``dst.save``.
        start_markers: Start markers, one per triple.
        end_markers: End markers, one per triple.
        matched_strings: Placeholder texts to look for in the template.
    """
    doc = aw.Document(raw_file)

    for start_marker, end_marker, matched_string in zip(start_markers, end_markers, matched_strings):

        # Replace markers with themselves to represent them as single runs.
        opt = aw.replacing.FindReplaceOptions()
        opt.use_substitutions = True
        # The markers are joined into a regex alternation here, but compared as
        # plain prefixes below.
        doc.range.replace_regex(start_marker + "|" + end_marker, "$0", opt)

        # Get start and end runs of the range.
        # The loop never breaks, so when several runs match, the last one wins.
        start_run = None
        end_run = None
        for r in doc.get_child_nodes(aw.NodeType.RUN, True):
            run = r.as_run()
            if run.text.startswith(start_marker):
                start_run = run
            if run.text.startswith(end_marker):
                end_run = run

        # Extract content between start and end markers.
        # NOTE(review): the trailing False presumably excludes the marker runs
        # themselves -- confirm against ExtractContentHelper.
        nodes = ExtractContentHelper.extract_content(start_run, end_run, False)

        # Create a document from the extracted nodes
        extracted_doc = ExtractContentHelper.generate_document(doc, nodes)
        # NOTE(review): the template is reloaded and a new builder is created on
        # every iteration, so this `dst` does not contain the insertions made
        # in earlier iterations.
        dst = aw.Document(template_file)
        builder = aw.DocumentBuilder(dst)

        # Replace placeholder with itself to represent them as single runs.
        placeholder = matched_string
        dst.range.replace(placeholder, "$0", opt)
        # Replace placeholder with the content extracted earlier.
        for r in dst.get_child_nodes(aw.NodeType.RUN, True):
            run = r.as_run()
            if run.text.startswith(placeholder):
                builder.move_to(run)
                builder.insert_document(extracted_doc, aw.ImportFormatMode.KEEP_SOURCE_FORMATTING)
                # Remove placeholder
                run.text = ""
        # Acts on this iteration's builder only; the next iteration constructs
        # a new one.
        builder.move_to_document_start()
    # Runs once, after the loop: only the `dst` of the last iteration is saved,
    # and `dst` is unbound if the marker sequences are empty.
    dst.save(final_file)

Any help or advice is appreciated.

alexey.noskov · April 4, 2024, 4:54am

@ansar2024 Could you please explain what you expect from moving the DocumentBuilder’s cursor to the document start? In the provided code, you create a new instance of DocumentBuilder on each iteration, so moving the cursor to the document start does not have any effect on the following iterations.

ansar2024 · April 4, 2024, 5:56am

I initially had the following function, which works fine: it extracts the text between 2 markers from src_file, inserts the extracted text into template_file in place of the placeholder, and saves the result in final_file.

def main(src_file, template_file, final_file, start_markers, end_markers, matched_string):
    """Insert the content found between two markers of a source document into a template.

    Args:
        src_file: Source document, passed to ``aw.Document``.
        template_file: Template document, passed to ``aw.Document``.
        final_file: Destination passed to ``dst.save``.
        start_markers: The start marker (a single string despite the plural name).
        end_markers: The end marker (a single string despite the plural name).
        matched_string: Placeholder text to look for in the template.
    """
    source = aw.Document(src_file)

    # Rewriting every marker with itself ("$0") leaves each marker
    # represented as a single run.
    replace_options = aw.replacing.FindReplaceOptions()
    replace_options.use_substitutions = True
    marker_pattern = "|".join((start_markers, end_markers))
    source.range.replace_regex(marker_pattern, "$0", replace_options)

    # Locate the runs that open with each marker; the scan does not stop at
    # the first hit, so the last matching run is the one kept.
    range_start = None
    range_end = None
    for node in source.get_child_nodes(aw.NodeType.RUN, True):
        candidate = node.as_run()
        if candidate.text.startswith(start_markers):
            range_start = candidate
        if candidate.text.startswith(end_markers):
            range_end = candidate

    # Extract what lies between the two marker runs and wrap it in a
    # document of its own.
    extracted_doc = ExtractContentHelper.generate_document(
        source,
        ExtractContentHelper.extract_content(range_start, range_end, False),
    )

    dst = aw.Document(template_file)
    builder = aw.DocumentBuilder(dst)

    # Same single-run trick, this time for the placeholder in the template.
    dst.range.replace(matched_string, "$0", replace_options)

    # Insert the extracted content at each placeholder run.
    for node in dst.get_child_nodes(aw.NodeType.RUN, True):
        target = node.as_run()
        if not target.text.startswith(matched_string):
            continue
        builder.move_to(target)
        builder.insert_document(extracted_doc, aw.ImportFormatMode.KEEP_SOURCE_FORMATTING)
        # Blank the placeholder text now that the content is in place.
        target.text = ""

    dst.save(final_file)

if __name__ == "__main__":
    # Command-line entry point: six positional string arguments, forwarded
    # to main() in declaration order.
    cli = argparse.ArgumentParser(description="Process raw document and template document to generate final document.")
    positional_arguments = (
        ("src_file", "Path to raw document"),
        ("template_file", "Path to template document"),
        ("final_file", "Path to save final document"),
        ("start_markers", "Start marker"),
        ("end_markers", "End marker"),
        ("matched_string", "String to be matched"),
    )
    for argument_name, argument_help in positional_arguments:
        cli.add_argument(argument_name, type=str, help=argument_help)
    options = cli.parse_args()

    main(
        options.src_file,
        options.template_file,
        options.final_file,
        options.start_markers,
        options.end_markers,
        options.matched_string,
    )

Since I have many marker pairs that extract many texts, I want to do the text search and replacement for all the marker pairs before I save the final_file (dst.save(final_file)). I thought that one way to do that is to put the trios (start_markers, end_markers, matched_strings) in arrays and loop over them. Hence, after each iteration of the loop, the cursor should move to the document start.

the following is an example of src_file:
source_file.docx (17.4 KB)

and template_file:
recipe1.docx (3.3 MB)

alexey.noskov · April 4, 2024, 7:41am

@ansar2024 Thank you for the additional information. Actually, there is no need to move the DocumentBuilder to the document start to achieve what you need. I would wrap the content between the markers in the source document in bookmarks to make it more convenient to work with, and use the placeholder names from the destination document as the bookmark names. This allows extracting a bookmark’s content when its placeholder is found. For example, see the following code:

def get_placeholder_name(placeholders, marker):
    """Return the placeholder name whose start or end marker equals ``marker``.

    ``placeholders`` maps each placeholder name to a pair whose first item is
    the start marker and whose second item is the end marker. Keys are checked
    in iteration order and the first match is returned; an empty string is
    returned when no pair contains ``marker``.
    """
    matching_names = (
        name
        for name, pair in placeholders.items()
        if pair[0] == marker or pair[1] == marker
    )
    return next(matching_names, "")

# Placeholder text used in the template -> [start marker, end marker] that
# delimit the matching content in the source document.
placeholders = {
    "titrxx0o1z": ["#stlev", "#enlev"],
    "servxx0o1z": ["#stportz", "#enportz"],
    "prepxx0o1z": ["#stpreptim", "#enpreptim"],
    "cookxx0o1z": ["#stcooktim", "#encooktim"],
    "introxx0o1z": ["#stintro", "#enintro"],
    "ingredxx0o1z": ["#stingr", "#eningr"],
    "phasexx0o1z": ["#stfas", "#enfas"],
    "nutrixx0o1z": ["#stnutr", "#ennutr"],
    "suggesxx0o1z": ["#stsug", "#ensug"],
    "pairingxx0o1z": ["#stzgen", "#enzgen"],
}

# STAGE 1
# Load the source document and bookmark the content between each marker pair.
src = aw.Document("C:\\Temp\\source_file.docx")

# Replace markers with themselves to represent them as single runs.
# Since all markers have a similar structure like `#sometext`, one regular
# expression covers them all. The pattern is a raw string so that `\w` reaches
# the regex engine without being treated as a (invalid) Python string escape.
opt = aw.replacing.FindReplaceOptions()
opt.use_substitutions = True
src.range.replace_regex(r"#\w+", "$0", opt)

# Wrap the content between markers with bookmarks to make it more convenient
# to work with. Each bookmark is named after the placeholder its markers
# belong to; markers that are not listed in `placeholders` are left alone.
for r in src.get_child_nodes(aw.NodeType.RUN, True):
    run = r.as_run()
    if run.text.startswith("#st"):
        bk_name = get_placeholder_name(placeholders, run.text)
        if bk_name:
            # The bookmark starts right after the start marker...
            run.parent_node.insert_after(aw.BookmarkStart(src, bk_name), run)
    elif run.text.startswith("#en"):
        bk_name = get_placeholder_name(placeholders, run.text)
        if bk_name:
            # ...and ends right before the end marker, so neither marker is
            # inside the bookmark.
            run.parent_node.insert_before(aw.BookmarkEnd(src, bk_name), run)

# STAGE 2
# Open template and replace placeholders with themselves to represent them as single runs.
dst = aw.Document("C:\\Temp\\recipe1.docx")
dst_builder = aw.DocumentBuilder(dst)
# Compose a regular expression that matches any of the placeholder names.
placeholder_regex = "|".join(placeholders)
dst.range.replace_regex(placeholder_regex, "$0", opt)

# Replace placeholders with content from the source document.
for r in dst.get_child_nodes(aw.NodeType.RUN, True):
    run = r.as_run()
    if run.text not in placeholders:
        continue
    # Check whether there is a bookmark with the placeholder name in the source
    # document; placeholders without one are left untouched.
    bk = src.range.bookmarks.get_by_name(run.text)
    if bk is None:
        continue
    # Extract content of the bookmark.
    nodes = ExtractContentHelper.extract_content(bk.bookmark_start, bk.bookmark_end, False)
    # Create a document from the extracted nodes.
    extracted_doc = ExtractContentHelper.generate_document(src, nodes)
    # Move the document builder to the run and insert the extracted content.
    dst_builder.move_to(run)
    dst_builder.insert_document(extracted_doc, aw.ImportFormatMode.KEEP_SOURCE_FORMATTING)
    # Remove the placeholder text now that its content has been inserted.
    run.text = ""

dst.save("C:\\Temp\\out.docx")

ansar2024 · April 4, 2024, 8:16am

It works! I’m so grateful for your support.

ansar2024 · April 8, 2024, 7:09am

Hello,
I noticed that the style is lost after insertion is done with:
dst_builder.insert_document(extracted_doc, aw.ImportFormatMode.KEEP_SOURCE_FORMATTING)
For example, in this template:
recipe1.docx (3.3 MB)

the style of the title is “Heading 1”, however the created document out.docx has this title in “normal”
Is this related to the ImportFormatMode ?

alexey.noskov · April 8, 2024, 8:03am

@ansar2024 Yes, when ImportFormatMode.KEEP_SOURCE_FORMATTING is used, Aspose.Words tries to preserve formatting but not styles. If you need to preserve style names, you can use ImportFormatMode.USE_DESTINATION_STYLES. Please see our documentation for more information:
https://reference.aspose.com/words/python-net/aspose.words/importformatmode/

ansar2024 · April 8, 2024, 8:35am

Thank you very much.