获取 docx 内容时，如何删除文档中的浮动标签样式？

hhh1111 · November 27, 2024, 3:36am

请在下面这段代码的基础上进行修改：

def aw_extract_headings_and_contents_table_dict_id(file):
    """Map every heading of a Word document to the content that follows it.

    A paragraph whose outline level is 0-5 starts a new entry. Its key is the
    heading text, prefixed with the list label when the paragraph is a list
    item. Every later non-empty paragraph that is not inside a table is
    appended to the most recent entry (one line each), and every table is
    appended as whatever ``aw_read_table_id`` returns for it. Comment nodes
    are removed from each paragraph before its text is read, so the loaded
    document object is mutated.

    Args:
        file: Passed straight to ``aw.Document``.
            NOTE(review): presumably a path or stream -- confirm with caller.

    Returns:
        A dict of ``heading -> accumulated content`` in document order.
        Entries with an empty heading are dropped; when two headings produce
        the same key, the later content replaces the earlier one.
    """
    # Field-code residue stripped from body text, applied in this order.
    # Raw strings keep the backslashes literal without relying on invalid
    # escape sequences (a SyntaxWarning on Python 3.12+).
    field_code_residue = (
        r'  SEQ 表 \* ARABIC ',
        r'TOC \h \c "表" HYPERLINK \l "_Toc14741"',
        '\u0013 SEQ 图 \\* ARABIC \u00141\u0015 ',
    )
    heading_outline_levels = [0, 1, 2, 3, 4, 5]

    doc = aw.Document(file)
    # List labels must be computed before ``list_label.label_string`` is read.
    doc.update_list_labels()

    # ``[heading_key, accumulated_text]`` pairs in document order. The last
    # pair is always the heading currently collecting content.
    entries = []

    for s in doc.sections:
        sect = s.as_section()
        for node in sect.body.get_child_nodes(aw.NodeType.ANY, True):
            if node.node_type == aw.NodeType.PARAGRAPH:
                para = node.as_paragraph()
                # Drop comments first so their text never leaks into the
                # extracted content.
                para.get_child_nodes(aw.NodeType.COMMENT, True).clear()
                text = para.get_text().strip()

                if para.paragraph_format.outline_level in heading_outline_levels:
                    label = ''
                    if para.list_format.is_list_item:
                        label = para.list_label.label_string
                    entries.append([label + text, ''])
                    continue

                # Body text is only meaningful once a heading exists to
                # attach it to.
                if not text or not entries:
                    continue
                # Table paragraphs are skipped here; tables are handled as a
                # whole in the TABLE branch below.
                if para.get_ancestor(aw.NodeType.TABLE):
                    continue
                if para.get_ancestor(aw.NodeType.FIELD_START):
                    continue

                for residue in field_code_residue:
                    text = text.replace(residue, '')
                entries[-1][1] += text + "\n"

            elif node.node_type == aw.NodeType.TABLE:
                table_content = aw_read_table_id(node.as_table())
                if entries:
                    entries[-1][1] += table_content

    merged_dict = {}
    for key, value in entries:
        # Skip entries whose heading key is empty.
        if key:
            merged_dict[key] = value
    return merged_dict

vyacheslav.deryushev · November 27, 2024, 6:59am

@hhh1111 我不太明白你指的是什么风格。请提供有关此问题的更多信息。