Paste back shape data correctly

Hi,

I need help pasting the content back into shapes after it has been corrected, while maintaining the styles of the document.
We also need to keep the references correct across the complete document.

# The extraction helper returns three values: the document object, the list of
# content blocks, and the list of paragraph texts.
# NOTE(review): extract_all_content_with_styles is not shown in this snippet.
doc, content_blocks, clean_text_paragraphs = extract_all_content_with_styles(file_path)

# Matches an http:// or https:// URL up to the next whitespace or ")" character.
URL_PATTERN = re.compile(r'https?://[^\s)]+')

def build_word_offsets(text, words):
    """Locate each word of ``words`` in ``text``, scanning left to right.

    Args:
        text: The string to search.
        words: Words to locate, in the order they are expected to occur.

    Returns:
        A list with one ``(start, end)`` tuple per word such that
        ``text[start:end] == word``. A word that cannot be found at or after
        the current scan position yields ``(-1, -1)``.
    """
    offsets = []
    cursor = 0
    for word in words:
        start = text.find(word, cursor)
        if start == -1:
            # Leave the cursor untouched: rewinding it on a miss would let
            # later words match an earlier occurrence in the text.
            offsets.append((-1, -1))
            continue
        end = start + len(word)
        offsets.append((start, end))
        cursor = end
    return offsets

def rgb_string_to_color(rgb_str):
    """Convert a string of the form ``"RGB(r, g, b)"`` into a colour object.

    Args:
        rgb_str: The colour string, or a falsy value such as ``None``.

    Returns:
        The result of ``Color.from_argb(r, g, b)``, or ``None`` when the
        input is falsy, does not start with ``"RGB("``, or cannot be
        converted (any exception raised during conversion is swallowed).
    """
    # NOTE(review): ``Color`` is presumably aspose.pydrawing.Color -- confirm
    # against the file's imports.
    if not rgb_str or not rgb_str.startswith("RGB("):
        return None
    try:
        # Drop the leading "RGB(" and the final character, then split the rest.
        red, green, blue = (int(part) for part in rgb_str[4:-1].split(","))
        return Color.from_argb(red, green, blue)
    except Exception:
        return None

def apply_builder_style(builder, style):
   replacing the styles
# Need help with references and hyperlinks


def style_last_hyperlink_field(paragraph, style):  # non clickable format
    """Apply ``style`` to the font of the last run of ``paragraph``.

    Best-effort: any exception raised along the way is swallowed, so the font
    keeps whichever attributes were assigned before the failure.

    Args:
        paragraph: Object exposing ``runs``; the last run's ``font`` is styled.
        style: Mapping read for ``bold``, ``italic``, ``underline``,
            ``font_name``, ``font_size``, ``color`` and ``highlight_color``.
    """
    try:
        target_font = paragraph.runs[-1].font
        target_font.bold = style.get("bold", False)
        target_font.italic = style.get("italic", False)
        # Only the exact string "1" switches underlining on.
        if style.get("underline") == "1":
            target_font.underline = aw.Underline.SINGLE
        else:
            target_font.underline = aw.Underline.NONE
        target_font.name = style.get("font_name", "Calibri")
        target_font.size = style.get("font_size", 11.0)
        text_color = rgb_string_to_color(style.get("color"))
        highlight = rgb_string_to_color(style.get("highlight_color"))
        # Colours are assigned only when the conversion produced a value.
        if text_color:
            target_font.color = text_color
        if highlight:
            target_font.highlight_color = highlight
    except Exception:
        pass


def tag_and_flatten_paragraph_blocks(content_blocks):
    """Flatten nested content blocks into one list of paragraph dicts.

    Each paragraph dict is tagged in place with ``source_type``:
    ``"body"`` for top-level paragraphs, ``"table"`` for paragraphs in table
    cells, ``"shape"`` for paragraphs of top-level shapes and
    ``"shape_in_table"`` for paragraphs of shapes found in table cells.
    Shape paragraphs additionally receive the owning block's ``shape_node``.

    Args:
        content_blocks: Iterable of block dicts with a ``type`` key.

    Returns:
        The tagged paragraph dicts (the same objects, not copies), in the
        order they are encountered.
    """
    flat = []

    def _take_shape(shape_block, origin):
        # Tag every paragraph of one shape and remember which shape owns it.
        for para in shape_block.get("paras", []):
            para["source_type"] = origin
            para["shape_node"] = shape_block.get("shape_node")
            flat.append(para)

    for block in content_blocks:
        kind = block.get("type")
        if kind == "paragraph":
            block["source_type"] = "body"
            flat.append(block)
        elif kind == "shape":
            _take_shape(block, "shape")
        elif kind == "table":
            cell_items = (
                item
                for row in block.get("rows", [])
                for cell in row.get("cells", [])
                for item in cell.get("content", [])
            )
            for item in cell_items:
                item_kind = item.get("type")
                if item_kind == "paragraph":
                    item["source_type"] = "table"
                    flat.append(item)
                elif item_kind == "shape":
                    _take_shape(item, "shape_in_table")
    return flat

def _write_styled_runs(builder, runs):
    """Write each run's text at the builder cursor after applying its style."""
    for run in runs:
        apply_builder_style(builder, run)
        builder.write(run["text"])


def pasteback(doc, content_blocks, corrected_paragraphs):
    """Rewrite the document's paragraphs in place from their captured runs.

    Paragraphs are matched by position: the entry at index ``i`` of
    ``corrected_paragraphs`` corresponds to the block whose ``para_id`` is
    ``i + 1``. Entries that are empty or whitespace-only, and ids without a
    matching block, are skipped. A failure on one paragraph is reported with
    ``print`` and does not stop the remaining paragraphs.

    Args:
        doc: The document the paragraph and shape nodes belong to.
        content_blocks: Extracted blocks, as accepted by
            ``tag_and_flatten_paragraph_blocks``.
        corrected_paragraphs: Corrected paragraph texts, in ``para_id`` order.
    """
    builder = aw.DocumentBuilder(doc)

    para_map = {
        block.get("para_id"): block
        for block in tag_and_flatten_paragraph_blocks(content_blocks)
    }

    # Shapes whose original paragraphs were already removed by the fallback
    # branch below, so a shape is wiped at most once per call.
    cleared_shapes = set()

    for para_id, clean_text in enumerate(corrected_paragraphs, start=1):
        if not clean_text or not clean_text.strip():
            continue

        block = para_map.get(para_id)
        if not block:
            continue

        source_type = block.get("source_type")
        para_node = block.get("paragraph_node")
        original_runs = block.get("runs") or []
        # NOTE(review): clean_text only decides whether the paragraph is
        # processed; the text written back comes from block["runs"]. Confirm
        # that the runs already carry the corrected text.

        try:
            if para_node is not None and source_type in (
                "body", "table", "shape", "shape_in_table"
            ):
                # Rewrite the existing paragraph node itself. For shapes this
                # keeps every paragraph of the shape (and its paragraph
                # formatting) instead of replacing the whole shape content
                # with a single new paragraph.
                para_node.remove_all_children()
                builder.move_to(para_node)
                _write_styled_runs(builder, original_runs)

            elif source_type in ("shape", "shape_in_table"):
                # Fallback when no paragraph node was captured for the shape.
                shape_node = block.get("shape_node")
                if shape_node is None:
                    continue

                shape = shape_node.as_shape()

                if id(shape_node) not in cleared_shapes:
                    # Snapshot the collection before removing nodes from it.
                    stale = list(shape.get_child_nodes(aw.NodeType.PARAGRAPH, True))
                    for old_para in stale:
                        old_para.remove()
                    cleared_shapes.add(id(shape_node))

                new_para = aw.Paragraph(doc)
                shape.append_child(new_para)
                builder.move_to(new_para)
                _write_styled_runs(builder, original_runs)

        except Exception as e:
            print(f"Skipping para {para_id} due to error: {e}")
            continue

We are using this approach to paste back the shape data.
Sample content blocks:

[{'type': 'paragraph',
  'para_id': 1,
  'text': '',
  'runs': [],
  'empty': True,
  'paragraph_node': <aspose.words.Paragraph object at 0x00000219CCF04750>},
 {'type': 'paragraph',
  'para_id': 2,
  'text': '',
  'runs': [],
  'empty': True,
  'paragraph_node': <aspose.words.Paragraph object at 0x00000219CCF046F0>},
 {'type': 'paragraph',
  'para_id': 3,
  'text': '',
  'runs': [],
  'empty': True,
  'paragraph_node': <aspose.words.Paragraph object at 0x00000219CCF04770>},
 {'block_index': 1,
  'type': 'shape',
  'shape_type': 'TEXT_BOX',
  'paras': [{'type': 'paragraph',
    'para_id': 4,
    'text': 'Key issues',
    'runs': [{'text': 'Key issues',
      'bold': True,
      'italic': False,
      'underline': '0',
      'font_name': 'Arial',
      'font_size': 9.5,
      'color': None,
      'highlight_color': None,
      'shading_color': None,
      'all_caps': False,
      'strike_through': False,
      'superscript': False,
      'subscript': False,
      'hyperlink': False,
      'hyperlink_url': None,
      'reference_type': None,
      'reference_target': None,
      'reference_field_code': None}],
    'empty': False,
    'paragraph_node': <aspose.words.Paragraph object at 0x00000219CCF04670>},

Shape_only.docx (796.0 KB)

@tesapsoe It could go something like this:

# Sample element list: three empty paragraphs followed by one TEXT_BOX shape
# that holds a single paragraph with one bold Arial run.
elements = [
    {'type': 'paragraph', 'para_id': 1, 'text': '', 'runs': [], 'empty': True},
    {'type': 'paragraph', 'para_id': 2, 'text': '', 'runs': [], 'empty': True},
    {'type': 'paragraph', 'para_id': 3, 'text': '', 'runs': [], 'empty': True},
    {
        'block_index': 1,
        'type': 'shape',
        'shape_type': 'TEXT_BOX',
        'paras': [{
            'type': 'paragraph',
            'para_id': 4,
            'text': 'Key issues',
            'runs': [{
                'text': 'Key issues',
                'bold': True,
                'italic': False,
                'underline': '0',
                'font_name': 'Arial',
                'font_size': 9.5,
                'color': None,
                'highlight_color': None,
                'shading_color': None,
                'all_caps': False,
                'strike_through': False,
                'superscript': False,
                'subscript': False,
                'hyperlink': False,
                'hyperlink_url': None,
                'reference_type': None,
                'reference_target': None,
                'reference_field_code': None
            }],
            'empty': False
        }]
    }
]

# Create a new document, populate it from the element list, and save it.
# NOTE(review): `self` is not defined at this level of the snippet; presumably
# these lines run inside a method of the class that owns
# process_document_elements -- confirm.
doc = aw.Document()
self.process_document_elements(doc, elements)
doc.save("document_with_header_textbox.docx")

def process_document_elements(self, doc, elements):
    """Write ``elements`` into the primary header of ``doc``.

    Shape elements are passed to ``process_shape_element``. Paragraph
    elements are passed to ``process_paragraph_element`` only when their
    ``empty`` flag is false; empty paragraphs and any other element type are
    skipped.

    Args:
        doc: Document that a new ``aw.DocumentBuilder`` is created for.
        elements: Iterable of element dicts with a ``type`` key; paragraph
            elements must also carry ``empty``.
    """
    builder = aw.DocumentBuilder(doc)

    # All content produced below is written into the primary header.
    builder.move_to_header_footer(aw.HeaderFooterType.HEADER_PRIMARY)

    for element in elements:
        kind = element['type']
        if kind == 'shape':
            self.process_shape_element(builder, element)
            continue
        if kind == 'paragraph' and not element['empty']:
            self.process_paragraph_element(builder, element)

def process_shape_element(self, builder, shape_data):
    """Insert the shape described by ``shape_data`` at the builder cursor.

    Only ``shape_type == 'TEXT_BOX'`` is handled; any other shape type is
    ignored. The text box is inserted with a size of 100 x 40, a black
    outline and a white fill, and each entry of ``shape_data['paras']`` is
    written into it via ``process_paragraph_element``.

    Args:
        builder: Document builder positioned where the shape should go.
        shape_data: Shape dict with ``shape_type`` and, optionally, ``paras``.
    """
    if shape_data['shape_type'] != 'TEXT_BOX':
        return

    # Create the text box and set its basic appearance.
    shape = builder.insert_shape(aw.drawing.ShapeType.TEXT_BOX, 100, 40)
    shape.stroke_color = aspose.pydrawing.Color.black
    shape.fill.color = aspose.pydrawing.Color.white

    # Move the builder inside the text box to add its content.
    builder.move_to(shape.first_paragraph)
    for para_data in shape_data.get('paras', []):
        self.process_paragraph_element(builder, para_data)

    # Step back out to the paragraph that hosts the shape; otherwise every
    # element processed after this one would be written inside the text box.
    builder.move_to(shape.parent_paragraph)

def process_paragraph_element(self, builder, para_data):
    """Write one paragraph dict through ``builder``, run by run.

    A paragraph break is emitted first, unconditionally. Each run then sets
    the builder font's name, size, bold, italic and underline from the run
    dict, writes the run text, and clears the font formatting again. If the
    paragraph's ``empty`` flag is true, ``builder.end_paragraph()`` is called
    at the end.

    Args:
        builder: Document builder to write with.
        para_data: Paragraph dict; ``runs`` and ``empty`` are optional.
    """
    builder.writeln()

    for run in para_data.get('runs', []):
        # Formatting has to be in place before the text is written.
        run_font = builder.font
        run_font.name = run.get('font_name', 'Arial')
        run_font.size = run.get('font_size', 9.0)
        run_font.bold = run.get('bold', False)
        run_font.italic = run.get('italic', False)

        # Any underline value other than '0' is rendered as a single underline.
        if run.get('underline', '0') == '0':
            run_font.underline = aw.Underline.NONE
        else:
            run_font.underline = aw.Underline.SINGLE

        builder.write(run['text'])

        # Reset so the next run starts from unformatted font settings.
        run_font.clear_formatting()

    is_empty = para_data.get('empty', False)
    if is_empty:
        builder.end_paragraph()

Thanks for the quick response.

We use another service in between to correct the text from the document. That is why we pass the corrected paragraphs (we correct entire sentences, e.g. by restructuring them) as input during pasteback. Could you help us with that approach?

@tesapsoe Your code in pasteback is almost correct, just try changing it to write your text in the correct place:

new_para = aw.Paragraph(doc)
shape.append_child(new_para)
builder.move_to(shape.first_paragraph)

Thanks for the quick response.

With the logic you shared, styles are missing after the paste-back, or the shapes are not pasted back completely.

@tesapsoe Unfortunately, it’s hard to tell. Are you passing all the necessary style information to the apply_builder_style method? You can try to analyze the code I gave above and find the differences. You don’t need to do anything special, just create a form and insert text using the document builder.

Hi,

Yes, we are capturing the following style information for each token
'bold': False,
'italic': False,
'underline': '0',
'font_name': 'Arial',
'font_size': 8.5,
'color': 'RGB(54, 54, 54)',
'highlight_color': None,
'shading_color': None,
'all_caps': False,
'strike_through': False,
'superscript': False,
'subscript': False,
'hyperlink': False,
'hyperlink_url': None,
'reference_type': None,
'reference_target': None,
'reference_field_code': None

and each of these is applied to the new/corrected token in apply_builder_style

@tesapsoe Unfortunately, in this case you are in a better position to debug your program.