请问怎么使用docx结构化读取标题和标题下的内容,如果标题下面有图片怎么判断读取呢

图 14.docx (112.2 KB)
以下是代码

def get_col_span(current_cell):
    """Return how many cells a horizontally merged cell spans.

    Args:
        current_cell: An Aspose.Words table cell, or ``None``.

    Returns:
        int: ``1`` when ``current_cell`` is ``None`` or does not start a
        horizontal merge (``CellMerge.FIRST``); otherwise ``1`` plus the number
        of consecutive cells, reached through ``next_cell``, whose horizontal
        merge state is ``CellMerge.PREVIOUS``.
    """
    col_span = 1
    if current_cell is None:
        return col_span
    if current_cell.cell_format.horizontal_merge != aw.tables.CellMerge.FIRST:
        return col_span

    # Walk over the continuation cells and stop at the first cell that is not
    # part of this merge.
    follower = current_cell.next_cell
    while (follower is not None
           and follower.cell_format.horizontal_merge == aw.tables.CellMerge.PREVIOUS):
        col_span += 1
        follower = follower.next_cell
    return col_span


def get_row_span(current_cell, row_index, cell_index, table_row_data):
    """Return how many rows a vertically merged cell spans.

    Args:
        current_cell: The cell being inspected.
        row_index: Index of the row that contains ``current_cell``.
        cell_index: Index of ``current_cell`` within its row.
        table_row_data: Iterable of the table's row nodes (the caller passes
            ``table.rows``); each item must support ``as_row()``.

    Returns:
        int: ``1`` when ``current_cell`` does not start a vertical merge
        (``CellMerge.FIRST``); otherwise ``1`` plus the number of directly
        following rows whose cell at ``cell_index`` is ``CellMerge.PREVIOUS``.
    """
    row_span = 1
    if current_cell.cell_format.vertical_merge != aw.tables.CellMerge.FIRST:
        return row_span

    for row in list(table_row_data)[row_index + 1:]:
        row_cells = row.as_row().cells
        # A row with fewer cells has nothing at this index, so the merge
        # cannot continue into it; stop instead of indexing out of range.
        if cell_index >= row_cells.count:
            break
        if row_cells[cell_index].cell_format.vertical_merge != aw.tables.CellMerge.PREVIOUS:
            break
        row_span += 1
    return row_span

def aw_read_table_id(table=None):
    """Convert an Aspose.Words table into a nested table/row/cell dictionary.

    Cells whose horizontal or vertical merge state is ``CellMerge.PREVIOUS``
    (continuations of a merged cell) are left out; the cell that starts the
    merge carries the ``colspan`` / ``rowspan`` values instead.

    Args:
        table: The Aspose.Words table to read. Despite the ``None`` default a
            table must be supplied, because it is dereferenced immediately.

    Returns:
        dict: ``{"type": "table", "contents": {"type": "table", "content": rows}}``
        where each row is ``{"type": "tableRow", "content": cells}`` and each
        cell is a ``tableCell`` dict holding ``colspan``, ``rowspan``,
        ``colwidth`` (the cell format width), ``cell_index`` and one paragraph
        with the cell text (``'\\x07'`` removed, surrounding whitespace
        stripped).
    """
    # Per the Aspose.Words documentation this expresses width-based horizontal
    # merges through the horizontal_merge flags that are inspected below.
    table.convert_to_horizontally_merged_cells()
    merge_previous = aw.tables.CellMerge.PREVIOUS

    table_contents_lis = []
    row_data = table.rows
    for r_index, r in enumerate(row_data):
        current_cell = r.as_row().first_cell
        cell_index = 0
        table_row_data = []
        while current_cell is not None:
            cell_format = current_cell.cell_format
            is_continuation = (cell_format.horizontal_merge == merge_previous
                               or cell_format.vertical_merge == merge_previous)
            if not is_continuation:
                current_cell_text = current_cell.get_text().replace('\x07', '').strip()
                table_row_data.append({
                    "type": "tableCell",
                    "attrs": {
                        "colspan": get_col_span(current_cell),
                        "rowspan": get_row_span(current_cell, r_index, cell_index, row_data),
                        "colwidth": cell_format.width,
                        "cell_index": cell_index,
                    },
                    "content": [
                        {
                            "type": "paragraph",
                            "content": [{
                                "type": "text",
                                "text": current_cell_text,
                            }],
                        }
                    ],
                })
            # cell_index counts every cell, including skipped continuations.
            cell_index += 1
            current_cell = current_cell.next_cell
        table_contents_lis.append({
            "type": "tableRow",
            "content": table_row_data,
        })

    return {
        "type": "table",
        "contents": {
            "type": "table",
            "content": table_contents_lis,
        },
    }
def aw_extract_headings_and_contents_table_dict_id(file):
    """Collect the headings of a document and the content that follows each one.

    Only the direct children of every section body are visited. A paragraph
    whose outline level is 0-5 opens a new heading entry; every other
    non-blank paragraph and every table is appended to the ``Content`` list of
    the most recent heading. Nodes met before the first heading are ignored.

    Args:
        file: Passed to ``aw.Document``.

    Returns:
        list[dict]: Heading entries in document order. The level stack below
        only parks and re-joins slices of that list, so the result is flat.
        Each entry has the keys ``Title``, ``Content``, ``Level`` (outline
        level + 1), ``Table`` and ``Tbale_name`` (sic); the last two are
        always empty lists here.
    """
    import aspose.words as aw
    lic = aw.License()
    lic_path = os.path.join(settings.BASE_PATH, "core/Aspose.Total.Product.Family.lic")
    lic.set_license(lic_path)
    doc = aw.Document(file)
    current_level = 0
    data = []
    stack = []
    for s in doc.sections:
        sect = s.as_section()
        for node in sect.body.get_child_nodes(aw.NodeType.ANY, False):
            if node.node_type == aw.NodeType.PARAGRAPH:
                node = node.as_paragraph()
                if node.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
                    level = int(node.paragraph_format.outline_level) + 1
                    if level > current_level:
                        # Deeper heading: park the entries gathered so far.
                        stack.append((current_level, data))
                        data = []
                        current_level = level
                    elif level < current_level:
                        # Shallower heading: put the parked entries back in
                        # front of the current ones.
                        while stack and stack[-1][0] >= level:
                            old_level, old_data = stack.pop()
                            data = old_data + data
                            current_level = old_level
                    data.append(
                        {
                            "Title": node.get_text(),
                            "Content": [],
                            "Level": level,
                            "Table": [],
                            "Tbale_name": [],
                        }
                    )
                elif data:
                    text = node.get_text().strip()
                    if text:
                        data[-1]["Content"].append({"type": "text", "content": text})
            elif data and node.node_type == aw.NodeType.TABLE:
                _able_content = aw_read_table_id(node.as_table())
                data[-1]["Content"].append(
                    {"type": "table",
                     "content": _able_content}
                )
    # Re-join whatever is still parked so the caller gets one list.
    while stack:
        old_level, old_data = stack.pop()
        data = old_data + data
    return data

@hhh1111 您只需要

  1. 查找段落
  2. 查找段落下的形状,因为在 Aspose.Words 中,图像是作为带有一些图像数据的形状存储的。
  3. 检查形状是否有图像数据 - ImageData class | Aspose.Words for Python

下面是一段代码:

# Load the document and look at every paragraph, at any depth.
doc = aw.Document("input.docx")

for para_node in doc.get_child_nodes(aw.NodeType.PARAGRAPH, True):
    paragraph = para_node.as_paragraph()
    # Images are stored as shapes, so inspect the shapes below the paragraph.
    for shape_node in paragraph.get_child_nodes(aw.NodeType.SHAPE, True):
        shape = shape_node.as_shape()
        if not shape.has_image:
            continue
        # The image payload of this shape.
        image_data = shape.image_data

async def aw_extract_headings_and_contents_table_dict_id(file):
    """Collect headings and the content that follows each one, with block ids.

    Only the direct children of every section body are visited. A paragraph
    whose outline level is 0-5 opens a new heading entry; every other
    non-blank paragraph and every table is appended to the ``Content`` list of
    the most recent heading, tagged with a ``block_id`` derived from that
    heading's ``block_id``. Nodes met before the first heading are ignored.

    NOTE(review): nothing in this coroutine is awaited, so the document is
    processed synchronously inside the caller's event loop.

    Args:
        file: Passed to ``aw.Document``.

    Returns:
        list[dict]: Heading entries in document order (a flat list).
    """
    doc = aw.Document(file)
    current_level = 0
    data = []
    stack = []
    for s in doc.sections:
        sect = s.as_section()
        for node in sect.body.get_child_nodes(aw.NodeType.ANY, False):
            # Both ids are generated for every node, whether or not they are used.
            block_id = generate_unique_id()
            block_id1 = generate_unique_id()
            if node.node_type == aw.NodeType.PARAGRAPH:
                node = node.as_paragraph()
                if node.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
                    level = int(node.paragraph_format.outline_level) + 1
                    if level > current_level:
                        # Deeper heading: park the entries gathered so far.
                        stack.append((current_level, data))
                        data = []
                        current_level = level
                    elif level < current_level:
                        # Shallower heading: put the parked entries back in
                        # front of the current ones.
                        while stack and stack[-1][0] >= level:
                            old_level, old_data = stack.pop()
                            data = old_data + data
                            current_level = old_level
                    data.append(
                        {
                            "Title": node.get_text(),
                            "block_id": str(block_id),
                            "Content": [],
                            "Level": level,
                            "Table": [],
                            "Tbale_name": [],
                        }
                    )
                elif data:
                    text = node.get_text().strip()
                    if text:
                        parent_id = data[-1]["block_id"]
                        data[-1]["Content"].append(
                            {"type": "text",
                             # Raw string: the SEQ field code contains a literal backslash.
                             "content": text.replace(r"  SEQ 表 \* ARABIC ", ''),
                             "block_id": parent_id + '&&' + str(block_id1),
                             "parent_block_id": parent_id})
            elif data and node.node_type == aw.NodeType.TABLE:
                parent_id = data[-1]["block_id"]
                table_block_id = parent_id + '&&' + str(block_id1)
                _able_content = aw_read_table_id(node.as_table(), table_block_id)
                data[-1]["Content"].append(
                    {"type": "table",
                     "content": _able_content,
                     "block_id": table_block_id,
                     "parent_block_id": parent_id})
    # Re-join whatever is still parked so the caller gets one list.
    while stack:
        old_level, old_data = stack.pop()
        data = old_data + data
    return data
请问怎么按照读取的顺序,展示图片呢

image.png (74.5 KB)

为什么我这样写不生效呢

@hhh1111 您需要将 sect.body.get_child_nodes(aw.NodeType.ANY, False) 改为 sect.body.get_child_nodes(aw.NodeType.ANY, True)

如果我改成 True,为什么表格信息会以段落形式展示一次,又以表格格式展示一次呢?
image.jpg (195.8 KB)

以下是代码

def aw_extract_headings_and_contents_table_dict_id(file):
    """Collect headings and the text, images and tables that follow each one.

    Every node below each section body is visited recursively, in the order
    returned by ``get_child_nodes(aw.NodeType.ANY, True)``.

    * A paragraph whose outline level is 0-5 opens a new heading entry.
    * Every other non-blank paragraph, every shape that has an image and every
      table is appended to the ``Content`` list of the most recent heading.
      Nodes met before the first heading are ignored.

    Because the traversal is recursive, paragraphs located inside table cells
    are visited too and are recorded as ``text`` items in addition to the
    ``table`` item produced for their table.

    Args:
        file: Passed to ``aw.Document``.

    Returns:
        list[dict]: Heading entries in document order (a flat list; the level
        stack only parks and re-joins slices of it).
    """
    doc = aw.Document(file)
    current_level = 0
    data = []
    stack = []
    for s in doc.sections:
        sect = s.as_section()
        for node in sect.body.get_child_nodes(aw.NodeType.ANY, True):
            # Both ids are generated for every node, whether or not they are used.
            block_id = generate_unique_id()
            block_id1 = generate_unique_id()
            if node.node_type == aw.NodeType.PARAGRAPH:
                node = node.as_paragraph()
                if node.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
                    level = int(node.paragraph_format.outline_level) + 1
                    if level > current_level:
                        # Deeper heading: park the entries gathered so far.
                        stack.append((current_level, data))
                        data = []
                        current_level = level
                    elif level < current_level:
                        # Shallower heading: put the parked entries back in
                        # front of the current ones.
                        while stack and stack[-1][0] >= level:
                            old_level, old_data = stack.pop()
                            data = old_data + data
                            current_level = old_level

                    data.append(
                        {
                            "Title": node.get_text(),
                            "block_id": str(block_id),
                            "Content": [],
                            "Level": level,
                            "Table": [],
                            "Tbale_name": [],
                        }
                    )
                elif data:
                    text = node.get_text().strip()
                    if text:
                        parent_id = data[-1]["block_id"]
                        data[-1]["Content"].append(
                            {"type": "text",
                             # Raw strings: the field codes contain literal
                             # backslashes (\*, \h, \c, \l are not valid escapes).
                             "content": text.replace(r"  SEQ 表 \* ARABIC ", '').replace(
                                 r'TOC \h \c "表" HYPERLINK \l "_Toc14741"', ''),
                             "block_id": parent_id + '&&' + str(block_id1),
                             "parent_block_id": parent_id})
            elif data and node.node_type == aw.NodeType.SHAPE:
                shape = node.as_shape()
                if shape.has_image:
                    image_id = str(uuid.uuid1())  # 36 characters long
                    try:
                        image_extension = aw.FileFormatUtil.image_type_to_extension(shape.image_data.image_type)
                        image_file_name = f"{image_id}{image_extension}"
                        image_path = os.path.join(settings.IMAGES_PATH, image_file_name)
                        shape.image_data.save(image_path)
                        data[-1]["Content"].append(
                            {"type": "image",
                             "content": [{"type": "image", "attrs": {
                                 "src": image_file_name,
                                 "alt": "tips", "title": ''}}],
                             "block_id": data[-1]["block_id"] + '&&' + str(block_id1),
                             "parent_block_id": data[-1]["block_id"]}
                        )
                    except Exception as e:
                        # Best effort: an image whose type cannot be converted
                        # or saved is reported and skipped.
                        print(f"Error saving image: {e}. Skipping this image.")
            elif data and node.node_type == aw.NodeType.TABLE:
                parent_id = data[-1]["block_id"]
                table_block_id = parent_id + '&&' + str(block_id1)
                _able_content = aw_read_table_id(node.as_table(), table_block_id)
                data[-1]["Content"].append(
                    {"type": "table",
                     "content": _able_content,
                     "block_id": table_block_id,
                     "parent_block_id": parent_id})
    # Re-join whatever is still parked so the caller gets one list.
    while stack:
        old_level, old_data = stack.pop()
        data = old_data + data
    return data


def aw_read_table_id(table=None, id=None):
    """Convert an Aspose.Words table into a table/row/cell dictionary.

    Args:
        table: The Aspose.Words table to read. Despite the ``None`` default a
            table must be supplied, because it is dereferenced immediately.
        id: Value stored unchanged under ``attrs.id`` of the result.

    Returns:
        dict: ``{"type": "table", "attrs": {"id": id}, "content": rows}`` where
        each row is ``{"type": "tableRow", "content": cells}`` and each cell is
        a ``tableCell`` dict with ``colspan``, ``rowspan``, ``colwidth``
        (always ``None``) and one paragraph holding the cell text.
    """
    # Per the Aspose.Words documentation this expresses width-based horizontal
    # merges through the horizontal_merge flags that are inspected below.
    table.convert_to_horizontally_merged_cells()

    merge_first = aw.tables.CellMerge.FIRST
    merge_previous = aw.tables.CellMerge.PREVIOUS
    row_count = table.rows.count

    def count_row_span(row_index, cell_index):
        # Count only the contiguous run of continuation cells directly below
        # the merge origin, so that a second, separate vertical merge further
        # down the same column is not added to this cell's span.
        span = 1
        for i in range(row_index + 1, row_count):
            cells = table.rows[i].cells
            if cell_index >= cells.count:
                break
            if cells[cell_index].cell_format.vertical_merge != merge_previous:
                break
            span += 1
        return span

    table_data = []
    for row_index, row in enumerate(table.rows):
        content = {
            "type": "tableRow",
            "content": []
        }
        current_cell = row.as_row().first_cell
        while current_cell is not None:
            cell_format = current_cell.cell_format
            # NOTE(review): the text is stored exactly as get_text() returns
            # it; confirm whether the end-of-cell marker should be stripped.
            cell_text = current_cell.get_text()
            row_span = 1
            col_span = 1

            if cell_format.vertical_merge == merge_first:
                cell_index = current_cell.parent_row.index_of(current_cell)
                row_span = count_row_span(row_index, cell_index)

            starts_horizontal_merge = cell_format.horizontal_merge == merge_first
            current_cell = current_cell.next_cell
            if starts_horizontal_merge:
                # Swallow the continuation cells; they are represented by colspan.
                while (current_cell is not None
                       and current_cell.cell_format.horizontal_merge == merge_previous):
                    col_span += 1
                    current_cell = current_cell.next_cell

            # NOTE(review): cells that continue a vertical merge are still
            # emitted as ordinary 1x1 cells; confirm the consumer expects that.
            content["content"].append({
                "type": "tableCell",
                "attrs": {
                    "colspan": col_span,
                    "rowspan": row_span,
                    "colwidth": None
                },
                "content": [
                    {
                        "type": "paragraph",
                        "content": [
                            {
                                "type": "text",
                                "text": cell_text,
                            }
                        ]
                    }
                ]
            })

        table_data.append(content)

    return {
        "type": "table",
        "attrs": {
            "id": id
        },
        "content": table_data
    }

@hhh1111 您能否以 docx 文件的形式提供这部分文件?

可以的 顺便帮我看下,读取出来的 图表目录 信息出来是乱码
5 伦理学.docx (63.4 KB)

@hhh1111 在这种情况下,我们会以递归方式获得所有子节点,而且由于每个单元格都有段落,我们会分别获得表格和表格中的段落,并彼此重复。要避免这种情况,可以使用

elif data:
    if node.get_text().strip() and not node.get_ancestor(aw.NodeType.TABLE):
        data[-1]["Content"].append(
            {"type": "text",
             "content": node.get_text().strip().replace("  SEQ 表 \* ARABIC ", '').replace(
                 'TOC \h \c "表" HYPERLINK \l "_Toc14741"', ''),
             "block_id": data[-1]["block_id"] + '&&' + str(block_id1),
             "parent_block_id": data[-1]["block_id"]})

好的谢谢。 帮我看下图标读取出来有乱码

@hhh1111 能否提供一份带有图标的文件?请说明问题出在哪里。保存到图像路径之后?

图表。 3.1表格目录段落下面的内容

@hhh1111 在提供的文档中,我看不到任何带图标的形状。

这个段落内容 乱码 不是图标

@hhh1111 很难理解这个问题。在 3.1 段之后有一个 TOC 字段。我在另一个帖子中回答了这个问题。

您可以在打开文档后添加doc.unlink_fields(),然后将 TOC 作为文本读取,而不使用字段;也可以标记需要添加 TOC 的位置,然后使用builder.insert_table_of_contents("\\o \"1-3\" \\h \\z \\u")将 TOC 插入结果文件中。最后,在保存结果之前,您需要更新字段 doc.update_fields()

你好,我已经这样设置了,但还是会出现这个问题。

async def aw_extract_headings_and_contents_table_dict_id(file):
    """Collect headings and the text, images and tables that follow each one.

    Every node below each section body is visited recursively, in the order
    returned by ``get_child_nodes(aw.NodeType.ANY, True)``.

    * A paragraph whose outline level is 0-5 opens a new heading entry.
    * Every other non-blank paragraph that has no table ancestor, every shape
      that has an image and every table is appended to the ``Content`` list of
      the most recent heading. Nodes met before the first heading are ignored.

    NOTE(review): nothing in this coroutine is awaited, so the document is
    processed synchronously inside the caller's event loop.

    Args:
        file: Passed to ``aw.Document``.

    Returns:
        list[dict]: Heading entries in document order (a flat list; the level
        stack only parks and re-joins slices of it).
    """
    doc = aw.Document(file)
    current_level = 0
    data = []
    stack = []
    for s in doc.sections:
        sect = s.as_section()
        for node in sect.body.get_child_nodes(aw.NodeType.ANY, True):
            # Both ids are generated for every node, whether or not they are used.
            block_id = generate_unique_id()
            block_id1 = generate_unique_id()
            if node.node_type == aw.NodeType.PARAGRAPH:
                node = node.as_paragraph()
                if node.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
                    level = int(node.paragraph_format.outline_level) + 1
                    if level > current_level:
                        # Deeper heading: park the entries gathered so far.
                        stack.append((current_level, data))
                        data = []
                        current_level = level
                    elif level < current_level:
                        # Shallower heading: put the parked entries back in
                        # front of the current ones.
                        while stack and stack[-1][0] >= level:
                            old_level, old_data = stack.pop()
                            data = old_data + data
                            current_level = old_level

                    data.append(
                        {
                            "Title": node.get_text(),
                            "block_id": str(block_id),
                            "Content": [],
                            "Level": level,
                            "Table": [],
                            "Tbale_name": [],
                        }
                    )
                elif data:
                    text = node.get_text().strip()
                    # Paragraphs inside a table are covered by the "table" item.
                    if text and not node.get_ancestor(aw.NodeType.TABLE):
                        parent_id = data[-1]["block_id"]
                        data[-1]["Content"].append(
                            {"type": "text",
                             # Raw strings: the field codes contain literal
                             # backslashes (\*, \h, \c, \l are not valid escapes).
                             "content": text.replace(r"  SEQ 表 \* ARABIC ", '').replace(
                                 r'TOC \h \c "表" HYPERLINK \l "_Toc14741"', ''),
                             "block_id": parent_id + '&&' + str(block_id1),
                             "parent_block_id": parent_id})
            elif data and node.node_type == aw.NodeType.SHAPE:
                shape = node.as_shape()
                if shape.has_image:
                    image_id = str(uuid.uuid1())  # 36 characters long
                    try:
                        image_extension = aw.FileFormatUtil.image_type_to_extension(shape.image_data.image_type)
                        image_file_name = f"{image_id}{image_extension}"
                        image_path = os.path.join(settings.IMAGES_PATH, image_file_name)
                        shape.image_data.save(image_path)
                        data[-1]["Content"].append(
                            {"type": "image",
                             "content": [{"type": "image", "attrs": {
                                 "src": image_file_name,
                                 "alt": "tips", "title": ''}}],
                             "block_id": data[-1]["block_id"] + '&&' + str(block_id1),
                             "parent_block_id": data[-1]["block_id"]}
                        )
                    except Exception as e:
                        # Best effort: an image whose type cannot be converted
                        # or saved is reported and skipped.
                        print(f"Error saving image: {e}. Skipping this image.")
            elif data and node.node_type == aw.NodeType.TABLE:
                parent_id = data[-1]["block_id"]
                table_block_id = parent_id + '&&' + str(block_id1)
                _able_content = aw_read_table_id(node.as_table(), table_block_id)
                data[-1]["Content"].append(
                    {"type": "table",
                     "content": _able_content,
                     "block_id": table_block_id,
                     "parent_block_id": parent_id})
    # Re-join whatever is still parked so the caller gets one list.
    while stack:
        old_level, old_data = stack.pop()
        data = old_data + data
    return data

读取的文件
5 伦理学.docx (19.5 KB)

写入结果文件
写入结果.docx (12.0 KB)

请问怎么把3标题下的内容设置wpsoffice手动目录
把图标目录和表格目录标题下面的内容设置成图表目录
5 伦理学.docx (22.8 KB)

@hhh1111 将代码更改为

doc = aw.Document(file)
current_level = 0
data = []
stack = []
for s in doc.sections:
    sect = s.as_section()
    for node in sect.body.get_child_nodes(aw.NodeType.ANY, True):
        block_id = uuid.uuid4()
        block_id1 = uuid.uuid4()
        if node.node_type == aw.NodeType.PARAGRAPH:
            node = node.as_paragraph()
            if not node.get_ancestor(aw.NodeType.TABLE):
                if node.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
                    level = int(node.paragraph_format.outline_level) + 1
                    if level > current_level:

                        # 如果级别更深,将当前标题添加到堆栈
                        stack.append((current_level, data))
                        data = []
                        current_level = level
                    elif level < current_level:
                        # 如果级别更浅,将堆栈中的项添加回数据
                        while stack and stack[-1][0] >= level:
                            old_level, old_data = stack.pop()
                            data = old_data + data
                            current_level = old_level

                    data.append(
                        {
                            "Title": node.get_text(),
                            "block_id": str(block_id),
                            "Content": [],
                            "Level": level,
                            "Table": [],
                            "Tbale_name": [],
                        }
                    )
                else:
                    if data:
                        if node.get_text().strip():
                            data[-1]["Content"].append(
                                {"type": "text",
                                 "content": node.get_text().strip().replace("  SEQ 表 \* ARABIC ", '').replace(
                                     'TOC \h \c "表" HYPERLINK \l "_Toc14741"', ''),
                                 "block_id": data[-1]["block_id"] + '&&' + str(block_id1),
                                 "parent_block_id": data[-1]["block_id"]})
        if data:
            if node.node_type == aw.NodeType.SHAPE:
                shape = node.as_shape()
                if shape.has_image:
                    image_id = str(uuid.uuid1())  # 长度是36
                    try:
                        image_extension = aw.FileFormatUtil.image_type_to_extension(shape.image_data.image_type)
                        image_file_name = f"{image_id}{image_extension}"
                        image_path = os.path.join(IMAGE_DIR, image_file_name)
                        shape.image_data.save(image_path)
                        data[-1]["Content"].append(
                            {"type": "image",
                             "content": [{"type": "image", "attrs": {
                                 "src": image_file_name,
                                 "alt": "tips", "title": ''}}],
                             "block_id": data[-1]["block_id"] + '&&' + str(block_id1),
                             "parent_block_id": data[-1]["block_id"]}
                        )
                    except Exception as e:
                        # 捕获并处理无法转换图像类型的错误
                        print(f"Error saving image: {e}. Skipping this image.")
                        continue

            if node.node_type == aw.NodeType.TABLE:
                parent_node = node.as_table()
                _able_content = self.aw_read_table_id(parent_node, data[-1]["block_id"] + '&&' + str(block_id1))
                data[-1]["Content"].append(
                    {"type": "table",
                     "content": _able_content,
                     "block_id": data[-1]["block_id"] + '&&' + str(block_id1),
                     "parent_block_id": data[-1]["block_id"]})
while stack:
    old_level, old_data = stack.pop()
    data = old_data + data
	
return data

@hhh1111 由于翻译问题,我无法理解您的目的。您是指要更改目录(TOC)的样式吗?

至于字段:您不应该将其解析为文本。您需要添加段落文本,并在需要添加目录的地方使用 builder.insert_table_of_contents("...")

如果用 MS Word 打开原始文件并按 Alt+F9,就会看到有一个 TOC 字段:

下面是这 4 个目录(TOC)字段:

TOC \o "1-3" \h \z \u 
TOC \h \z \c "表格" 
TOC \h \c "表"
TOC \h \z \c "图表"