请问怎么使用docx结构化读取标题和标题下的内容，如果标题下面有图片怎么判断读取呢

hhh1111 · May 21, 2024, 7:40am

图 14.docx (112.2 KB)
以下是代码

def get_col_span(current_cell):
    """Return how many columns a horizontally merged cell spans.

    A cell whose ``horizontal_merge`` equals ``CellMerge.FIRST`` opens a
    merged range; every directly following sibling cell marked
    ``CellMerge.PREVIOUS`` widens that range by one column.

    Args:
        current_cell: The table cell to inspect, or ``None``.

    Returns:
        The column span as an int. It is 1 when ``current_cell`` is ``None``
        or when the cell does not open a horizontal merge.
    """
    if current_cell is None:
        return 1
    if not (current_cell.cell_format.horizontal_merge == aw.tables.CellMerge.FIRST):
        return 1

    col_span = 1
    follower = current_cell.next_cell
    # Count consecutive continuation cells and stop at the first cell that
    # is not part of this merge.
    while (
        follower is not None
        and follower.cell_format.horizontal_merge == aw.tables.CellMerge.PREVIOUS
    ):
        col_span += 1
        follower = follower.next_cell
    return col_span


def get_row_span(current_cell, row_index, cell_index, table_row_data):
    """Return how many rows a vertically merged cell spans.

    A cell whose ``vertical_merge`` equals ``CellMerge.FIRST`` opens a merged
    range. The rows after ``row_index`` are scanned in order and the span
    grows while the cell at ``cell_index`` in each of them is marked
    ``CellMerge.PREVIOUS``.

    Args:
        current_cell: The table cell to inspect, or ``None``.
        row_index: Index of the row that contains ``current_cell``.
        cell_index: Index of ``current_cell`` inside its row.
        table_row_data: Iterable of row nodes of the table; each item must
            provide ``as_row()``.

    Returns:
        The row span as an int. It is 1 when ``current_cell`` is ``None`` or
        when the cell does not open a vertical merge.
    """
    from itertools import islice

    # Mirror get_col_span: a missing cell simply spans one row.
    if current_cell is None:
        return 1
    if not (current_cell.cell_format.vertical_merge == aw.tables.CellMerge.FIRST):
        return 1

    row_span = 1
    # islice avoids copying the whole row collection just to skip a prefix.
    for row_node in islice(table_row_data, row_index + 1, None):
        cells = row_node.as_row().cells
        # A shorter row has no cell at this index, so the merge cannot
        # continue into it.
        if cell_index >= cells.count:
            break
        if cells[cell_index].cell_format.vertical_merge == aw.tables.CellMerge.PREVIOUS:
            row_span += 1
        else:
            break
    return row_span

def aw_read_table_id(table=None):
    """Convert a table into a nested dict of rows and cells.

    The table is first normalised with
    ``convert_to_horizontally_merged_cells()``. Cells that merely continue a
    horizontal or vertical merge (``CellMerge.PREVIOUS``) are not emitted;
    the cell that opens the merge carries the ``colspan``/``rowspan`` values
    computed by ``get_col_span`` and ``get_row_span``.

    Args:
        table: The table object to read. Despite the ``None`` default a
            table is required; the first statement dereferences it.

    Returns:
        ``{"type": "table", "contents": {"type": "table", "content": rows}}``
        where ``rows`` holds one ``tableRow`` dict per table row and each row
        holds its ``tableCell`` dicts.
    """
    table.convert_to_horizontally_merged_cells()
    table_rows = table.rows
    table_contents = []
    for r_index, row_node in enumerate(table_rows):
        current_cell = row_node.as_row().first_cell
        # cell_index counts every physical cell, including skipped
        # continuation cells, so it stays a valid position inside the row.
        cell_index = 0
        row_cells = []
        while current_cell is not None:
            cell_format = current_cell.cell_format
            continues_merge = (
                cell_format.horizontal_merge == aw.tables.CellMerge.PREVIOUS
                or cell_format.vertical_merge == aw.tables.CellMerge.PREVIOUS
            )
            if not continues_merge:
                # '\x07' is removed from the cell text before it is stored.
                cell_text = current_cell.get_text().replace('\x07', '').strip()
                col_span = get_col_span(current_cell)
                row_span = get_row_span(current_cell, r_index, cell_index, table_rows)
                row_cells.append({
                    "type": "tableCell",
                    "attrs": {
                        "colspan": col_span,
                        "rowspan": row_span,
                        "colwidth": cell_format.width,
                        "cell_index": cell_index,
                    },
                    "content": [
                        {
                            "type": "paragraph",
                            "content": [{
                                "type": "text",
                                "text": cell_text,
                            }],
                        }
                    ],
                })
            cell_index += 1
            current_cell = current_cell.next_cell
        table_contents.append({
            "type": "tableRow",
            "content": row_cells,
        })
    return {
        "type": "table",
        "contents": {
            "type": "table",
            "content": table_contents,
        },
    }
def aw_extract_headings_and_contents_table_dict_id(file):
    """Extract headings and the text/tables that follow each heading.

    Only the direct children of every section body are visited. A paragraph
    whose outline level is 0-5 starts a new heading entry; any other
    non-empty paragraph and every table is appended to the ``Content`` list
    of the most recent heading. Nodes that appear before the first heading
    are ignored.

    Args:
        file: Passed unchanged to ``aw.Document``.

    Returns:
        A list of heading dicts with the keys ``Title``, ``Content``,
        ``Level`` (outline level + 1), ``Table`` and ``Tbale_name`` (the
        misspelt key is kept as-is because it is part of the output format).
    """
    import aspose.words as aw

    lic = aw.License()
    lic_path = os.path.join(settings.BASE_PATH, "core/Aspose.Total.Product.Family.lic")
    lic.set_license(lic_path)
    doc = aw.Document(file)
    current_level = 0
    data = []
    stack = []
    for s in doc.sections:
        sect = s.as_section()
        for node in sect.body.get_child_nodes(aw.NodeType.ANY, False):
            if node.node_type == aw.NodeType.PARAGRAPH:
                node = node.as_paragraph()
                if node.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
                    level = int(node.paragraph_format.outline_level) + 1
                    if level > current_level:
                        # Deeper level: park the headings collected so far
                        # on the stack and start a fresh list.
                        stack.append((current_level, data))
                        data = []
                        current_level = level
                    elif level < current_level:
                        # Shallower level: put the parked headings back in
                        # front of the current list.
                        while stack and stack[-1][0] >= level:
                            old_level, old_data = stack.pop()
                            data = old_data + data
                            current_level = old_level
                    data.append(
                        {
                            "Title": node.get_text(),
                            "Content": [],
                            "Level": level,
                            "Table": [],
                            "Tbale_name": [],
                        }
                    )
                elif data:
                    paragraph_text = node.get_text().strip()
                    if paragraph_text:
                        data[-1]["Content"].append(
                            {"type": "text", "content": paragraph_text}
                        )
            elif data and node.node_type == aw.NodeType.TABLE:
                table_content = aw_read_table_id(node.as_table())
                data[-1]["Content"].append(
                    {"type": "table", "content": table_content}
                )
    # Flush whatever is still parked on the stack, keeping document order.
    while stack:
        old_level, old_data = stack.pop()
        data = old_data + data
    return data

vyacheslav.deryushev · May 21, 2024, 8:20am

@hhh1111 您只需要以下三步：

1. 查找段落。
2. 查找段落下的形状——在 Aspose.Words 中，图像是作为带有图像数据的形状存储的。
3. 检查形状是否有图像数据（参见 ImageData class | Aspose.Words for Python）。

下面是一段代码:

# Load the document whose pictures should be located.
doc = aw.Document("input.docx")

# Visit every paragraph of the document and look at the shapes nested in it.
for para_node in doc.get_child_nodes(aw.NodeType.PARAGRAPH, True):
    paragraph = para_node.as_paragraph()
    for shape_node in paragraph.get_child_nodes(aw.NodeType.SHAPE, True):
        shape = shape_node.as_shape()
        if not shape.has_image:
            continue
        # This shape holds a picture; its data is available here.
        image_data = shape.image_data

hhh1111 · May 27, 2024, 9:17am

vyacheslav.deryushev:

# Visit every paragraph of the document and check each shape nested inside
# it for image data.
all_paragraphs = doc.get_child_nodes(aw.NodeType.PARAGRAPH, True)
for para in all_paragraphs:
    shapes = para.as_paragraph().get_child_nodes(aw.NodeType.SHAPE, True)
    for shape in shapes:
        shape = shape.as_shape()
        # Only shapes that actually hold a picture are of interest.
        if shape.has_image:
            image_data = shape.image_data

async def aw_extract_headings_and_contents_table_dict_id(file):
    """Extract headings and the text/tables that follow each heading.

    Only the direct children of every section body are visited. A paragraph
    whose outline level is 0-5 starts a new heading entry; any other
    non-empty paragraph and every table is appended to the ``Content`` list
    of the most recent heading, tagged with a block id derived from that
    heading's id. Nodes that appear before the first heading are ignored.

    Args:
        file: Passed unchanged to ``aw.Document``.

    Returns:
        A list of heading dicts with the keys ``Title``, ``block_id``,
        ``Content``, ``Level`` (outline level + 1), ``Table`` and
        ``Tbale_name`` (the misspelt key is kept as-is because it is part of
        the output format).
    """
    doc = aw.Document(file)
    current_level = 0
    data = []
    stack = []
    for s in doc.sections:
        sect = s.as_section()
        for node in sect.body.get_child_nodes(aw.NodeType.ANY, False):
            # Two ids are drawn per node: one for a heading, one for content.
            block_id = generate_unique_id()
            block_id1 = generate_unique_id()
            if node.node_type == aw.NodeType.PARAGRAPH:
                node = node.as_paragraph()
                if node.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
                    level = int(node.paragraph_format.outline_level) + 1
                    if level > current_level:
                        # Deeper level: park the headings collected so far
                        # on the stack and start a fresh list.
                        stack.append((current_level, data))
                        data = []
                        current_level = level
                    elif level < current_level:
                        # Shallower level: put the parked headings back in
                        # front of the current list.
                        while stack and stack[-1][0] >= level:
                            old_level, old_data = stack.pop()
                            data = old_data + data
                            current_level = old_level
                    data.append(
                        {
                            "Title": node.get_text(),
                            "block_id": str(block_id),
                            "Content": [],
                            "Level": level,
                            "Table": [],
                            "Tbale_name": [],
                        }
                    )
                elif data:
                    paragraph_text = node.get_text().strip()
                    if paragraph_text:
                        parent_id = data[-1]["block_id"]
                        data[-1]["Content"].append(
                            {"type": "text",
                             # Strip the SEQ field code that precedes table
                             # captions. The literal is written as in the
                             # later copies of this function; the rendered
                             # paste had lost its backslash and one space.
                             "content": paragraph_text.replace(r"  SEQ 表 \* ARABIC ", ''),
                             "block_id": parent_id + '&&' + str(block_id1),
                             "parent_block_id": parent_id})
            elif data and node.node_type == aw.NodeType.TABLE:
                parent_id = data[-1]["block_id"]
                table_block_id = parent_id + '&&' + str(block_id1)
                table_content = aw_read_table_id(node.as_table(), table_block_id)
                data[-1]["Content"].append(
                    {"type": "table",
                     "content": table_content,
                     "block_id": table_block_id,
                     "parent_block_id": parent_id})
    # Flush whatever is still parked on the stack, keeping document order.
    while stack:
        old_level, old_data = stack.pop()
        data = old_data + data
    return data
请问怎么按照读取的顺序，展示图片呢

hhh1111 · May 27, 2024, 9:25am

image.png (74.5 KB)

为什么我这样写不生效呢

vyacheslav.deryushev · May 27, 2024, 10:12am

@hhh1111 您需要将 sect.body.get_child_nodes(aw.NodeType.ANY, False) 改为 sect.body.get_child_nodes(aw.NodeType.ANY, True)。形状（图片）是段落的子节点，只有递归遍历才能访问到它们。

hhh1111 · May 28, 2024, 3:00am

如果我改成 True，为什么表格信息会以段落形式展示一次，又以表格格式展示一次呢？
image.jpg (195.8 KB)

以下是代码

def aw_extract_headings_and_contents_table_dict_id(file):
    """Extract headings and the text, images and tables below each heading.

    Every section body is traversed recursively so that shapes nested in
    paragraphs are reached in document order. A paragraph whose outline
    level is 0-5 starts a new heading entry; other non-empty paragraphs,
    shapes that hold an image and tables are appended to the ``Content``
    list of the most recent heading. Paragraphs located inside a table are
    skipped, because their text is already exported through the table
    itself. Nodes that appear before the first heading are ignored.

    Args:
        file: Passed unchanged to ``aw.Document``.

    Returns:
        A list of heading dicts with the keys ``Title``, ``block_id``,
        ``Content``, ``Level`` (outline level + 1), ``Table`` and
        ``Tbale_name`` (the misspelt key is kept as-is because it is part of
        the output format).
    """
    doc = aw.Document(file)
    current_level = 0
    data = []
    stack = []
    for s in doc.sections:
        sect = s.as_section()
        for node in sect.body.get_child_nodes(aw.NodeType.ANY, True):
            # Two ids are drawn per node: one for a heading, one for content.
            block_id = generate_unique_id()
            block_id1 = generate_unique_id()
            if node.node_type == aw.NodeType.PARAGRAPH:
                node = node.as_paragraph()
                if node.get_ancestor(aw.NodeType.TABLE):
                    # The recursive traversal also yields the paragraphs of
                    # every table cell; emitting them would duplicate the
                    # table's content as plain text.
                    continue
                if node.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
                    level = int(node.paragraph_format.outline_level) + 1
                    if level > current_level:
                        # Deeper level: park the headings collected so far
                        # on the stack and start a fresh list.
                        stack.append((current_level, data))
                        data = []
                        current_level = level
                    elif level < current_level:
                        # Shallower level: put the parked headings back in
                        # front of the current list.
                        while stack and stack[-1][0] >= level:
                            old_level, old_data = stack.pop()
                            data = old_data + data
                            current_level = old_level
                    data.append(
                        {
                            "Title": node.get_text(),
                            "block_id": str(block_id),
                            "Content": [],
                            "Level": level,
                            "Table": [],
                            "Tbale_name": [],
                        }
                    )
                elif data:
                    paragraph_text = node.get_text().strip()
                    if paragraph_text:
                        parent_id = data[-1]["block_id"]
                        # Raw strings keep the backslashes of the Word field
                        # codes without relying on invalid escape sequences.
                        cleaned_text = paragraph_text.replace(
                            r"  SEQ 表 \* ARABIC ", ''
                        ).replace(r'TOC \h \c "表" HYPERLINK \l "_Toc14741"', '')
                        data[-1]["Content"].append(
                            {"type": "text",
                             "content": cleaned_text,
                             "block_id": parent_id + '&&' + str(block_id1),
                             "parent_block_id": parent_id})
            elif data and node.node_type == aw.NodeType.SHAPE:
                shape = node.as_shape()
                if shape.has_image:
                    image_id = str(uuid.uuid1())  # 36 characters long
                    try:
                        image_extension = aw.FileFormatUtil.image_type_to_extension(shape.image_data.image_type)
                        image_file_name = f"{image_id}{image_extension}"
                        image_path = os.path.join(settings.IMAGES_PATH, image_file_name)
                        shape.image_data.save(image_path)
                        parent_id = data[-1]["block_id"]
                        data[-1]["Content"].append(
                            {"type": "image",
                             "content": [{"type": "image", "attrs": {
                                 "src": image_file_name,
                                 "alt": "tips", "title": ''}}],
                             "block_id": parent_id + '&&' + str(block_id1),
                             "parent_block_id": parent_id}
                        )
                    except Exception as e:
                        # Best effort: an image whose type cannot be
                        # converted or saved is reported and skipped.
                        print(f"Error saving image: {e}. Skipping this image.")
            elif data and node.node_type == aw.NodeType.TABLE:
                parent_id = data[-1]["block_id"]
                table_block_id = parent_id + '&&' + str(block_id1)
                table_content = aw_read_table_id(node.as_table(), table_block_id)
                data[-1]["Content"].append(
                    {"type": "table",
                     "content": table_content,
                     "block_id": table_block_id,
                     "parent_block_id": parent_id})
    # Flush whatever is still parked on the stack, keeping document order.
    while stack:
        old_level, old_data = stack.pop()
        data = old_data + data
    return data


def aw_read_table_id(table=None, id=None):
    """Convert a table into a nested dict of rows and cells.

    The table is first normalised with
    ``convert_to_horizontally_merged_cells()``. For a cell that opens a
    horizontal merge, the following ``CellMerge.PREVIOUS`` cells of the row
    are folded into its ``colspan``. For a cell that opens a vertical merge,
    ``rowspan`` counts the directly following rows whose cell at the same
    index is ``CellMerge.PREVIOUS``.

    NOTE(review): cells that continue a vertical merge are still emitted as
    ordinary 1x1 cells, exactly as before - confirm that the consumer of
    this structure expects them.

    Args:
        table: The table object to read. Despite the ``None`` default a
            table is required; the first statement dereferences it.
        id: Value stored under ``attrs.id`` of the result. The name shadows
            the builtin but is kept for callers that pass it by keyword.

    Returns:
        ``{"type": "table", "attrs": {"id": id}, "content": rows}`` where
        ``rows`` holds one ``tableRow`` dict per table row.
    """
    table.convert_to_horizontally_merged_cells()

    table_data = []
    for row_index, row in enumerate(table.rows):
        row_content = {
            "type": "tableRow",
            "content": [],
        }
        current_cell = row.as_row().first_cell
        while current_cell is not None:
            cell_format = current_cell.cell_format
            cell_index = current_cell.parent_row.index_of(current_cell)
            cell_text = current_cell.get_text()
            opens_vertical = cell_format.vertical_merge == aw.tables.CellMerge.FIRST
            opens_horizontal = cell_format.horizontal_merge == aw.tables.CellMerge.FIRST

            row_span = 1
            if opens_vertical:
                # Walk the rows below and stop at the first one that does not
                # continue this merge; otherwise a later, separate merge in
                # the same column would be counted as well.
                for i in range(row_index + 1, table.rows.count):
                    cells_below = table.rows[i].cells
                    if (
                        cell_index < cells_below.count
                        and cells_below[cell_index].cell_format.vertical_merge
                        == aw.tables.CellMerge.PREVIOUS
                    ):
                        row_span += 1
                    else:
                        break

            current_cell = current_cell.next_cell
            col_span = 1
            if opens_horizontal:
                # Swallow the continuation cells so they are not emitted as
                # cells of their own.
                while (
                    current_cell is not None
                    and current_cell.cell_format.horizontal_merge
                    == aw.tables.CellMerge.PREVIOUS
                ):
                    col_span += 1
                    current_cell = current_cell.next_cell

            row_content["content"].append({
                "type": "tableCell",
                "attrs": {
                    "colspan": col_span,
                    "rowspan": row_span,
                    "colwidth": None,
                },
                "content": [
                    {
                        "type": "paragraph",
                        "content": [
                            {
                                "type": "text",
                                "text": cell_text,
                            }
                        ],
                    }
                ],
            })

        table_data.append(row_content)
    return {
        "type": "table",
        "attrs": {
            "id": id,
        },
        "content": table_data,
    }

vyacheslav.deryushev · May 28, 2024, 6:55am

@hhh1111 您能否以 docx 文件的形式提供这部分文件？

hhh1111 · May 28, 2024, 7:01am

可以的顺便帮我看下，读取出来的图表目录信息出来是乱码
5 伦理学.docx (63.4 KB)

vyacheslav.deryushev · May 28, 2024, 8:03am

@hhh1111 在这种情况下，我们会以递归方式获得所有子节点，而且由于每个单元格都有段落，我们会分别获得表格和表格中的段落，并彼此重复。要避免这种情况，可以使用

# Fragment of the paragraph branch: body text is only recorded for
# paragraphs that have no TABLE ancestor, so the paragraphs of table cells
# are not emitted a second time next to the table itself.
elif data:
    if node.get_text().strip() and not node.get_ancestor(aw.NodeType.TABLE):
        data[-1]["Content"].append(
            {"type": "text",
             "content": node.get_text().strip().replace("  SEQ 表 \* ARABIC ", '').replace(
                 'TOC \h \c "表" HYPERLINK \l "_Toc14741"', ''),
             "block_id": data[-1]["block_id"] + '&&' + str(block_id1),
             "parent_block_id": data[-1]["block_id"]})

hhh1111 · May 28, 2024, 8:04am

好的谢谢。帮我看下图标读取出来有乱码

vyacheslav.deryushev · May 28, 2024, 8:10am

@hhh1111 能否提供一份带有图标的文件？请说明问题出在哪里。保存到图像路径之后？

hhh1111 · May 28, 2024, 8:12am

图表。 3.1表格目录段落下面的内容

vyacheslav.deryushev · May 28, 2024, 8:36am

@hhh1111 在提供的文档中，我看不到任何带图标的形状。

hhh1111 · May 28, 2024, 9:24am

这个段落内容乱码不是图标

vyacheslav.deryushev · May 28, 2024, 10:13am

@hhh1111 很难理解这个问题。在 3.1 段之后有一个 TOC 字段。我在另一个帖子中回答了这个问题。

您可以在打开文档后添加doc.unlink_fields()，然后将 TOC 作为文本读取，而不使用字段；也可以标记需要添加 TOC 的位置，然后使用builder.insert_table_of_contents("\\o \"1-3\" \\h \\z \\u")将 TOC 插入结果文件中。最后，在保存结果之前，您需要更新字段 doc.update_fields()。

hhh1111 · May 29, 2024, 1:25am

你好我已经设置还是会出现这个问题。

async def aw_extract_headings_and_contents_table_dict_id(file):
    """Extract headings and the text, images and tables below each heading.

    Every section body is traversed recursively so that shapes nested in
    paragraphs are reached in document order. A paragraph whose outline
    level is 0-5 starts a new heading entry; other non-empty paragraphs,
    shapes that hold an image and tables are appended to the ``Content``
    list of the most recent heading. Paragraphs located inside a table are
    skipped entirely - both as body text and as headings - because their
    text is already exported through the table itself. Nodes that appear
    before the first heading are ignored.

    Args:
        file: Passed unchanged to ``aw.Document``.

    Returns:
        A list of heading dicts with the keys ``Title``, ``block_id``,
        ``Content``, ``Level`` (outline level + 1), ``Table`` and
        ``Tbale_name`` (the misspelt key is kept as-is because it is part of
        the output format).
    """
    doc = aw.Document(file)
    current_level = 0
    data = []
    stack = []
    for s in doc.sections:
        sect = s.as_section()
        for node in sect.body.get_child_nodes(aw.NodeType.ANY, True):
            # Two ids are drawn per node: one for a heading, one for content.
            block_id = generate_unique_id()
            block_id1 = generate_unique_id()
            if node.node_type == aw.NodeType.PARAGRAPH:
                node = node.as_paragraph()
                if node.get_ancestor(aw.NodeType.TABLE):
                    # Cell paragraphs must neither become body text nor open
                    # a heading; the table node already carries them.
                    continue
                if node.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
                    level = int(node.paragraph_format.outline_level) + 1
                    if level > current_level:
                        # Deeper level: park the headings collected so far
                        # on the stack and start a fresh list.
                        stack.append((current_level, data))
                        data = []
                        current_level = level
                    elif level < current_level:
                        # Shallower level: put the parked headings back in
                        # front of the current list.
                        while stack and stack[-1][0] >= level:
                            old_level, old_data = stack.pop()
                            data = old_data + data
                            current_level = old_level
                    data.append(
                        {
                            "Title": node.get_text(),
                            "block_id": str(block_id),
                            "Content": [],
                            "Level": level,
                            "Table": [],
                            "Tbale_name": [],
                        }
                    )
                elif data:
                    paragraph_text = node.get_text().strip()
                    if paragraph_text:
                        parent_id = data[-1]["block_id"]
                        # Raw strings keep the backslashes of the Word field
                        # codes without relying on invalid escape sequences.
                        cleaned_text = paragraph_text.replace(
                            r"  SEQ 表 \* ARABIC ", ''
                        ).replace(r'TOC \h \c "表" HYPERLINK \l "_Toc14741"', '')
                        data[-1]["Content"].append(
                            {"type": "text",
                             "content": cleaned_text,
                             "block_id": parent_id + '&&' + str(block_id1),
                             "parent_block_id": parent_id})
            elif data and node.node_type == aw.NodeType.SHAPE:
                shape = node.as_shape()
                if shape.has_image:
                    image_id = str(uuid.uuid1())  # 36 characters long
                    try:
                        image_extension = aw.FileFormatUtil.image_type_to_extension(shape.image_data.image_type)
                        image_file_name = f"{image_id}{image_extension}"
                        image_path = os.path.join(settings.IMAGES_PATH, image_file_name)
                        shape.image_data.save(image_path)
                        parent_id = data[-1]["block_id"]
                        data[-1]["Content"].append(
                            {"type": "image",
                             "content": [{"type": "image", "attrs": {
                                 "src": image_file_name,
                                 "alt": "tips", "title": ''}}],
                             "block_id": parent_id + '&&' + str(block_id1),
                             "parent_block_id": parent_id}
                        )
                    except Exception as e:
                        # Best effort: an image whose type cannot be
                        # converted or saved is reported and skipped.
                        print(f"Error saving image: {e}. Skipping this image.")
            elif data and node.node_type == aw.NodeType.TABLE:
                parent_id = data[-1]["block_id"]
                table_block_id = parent_id + '&&' + str(block_id1)
                table_content = aw_read_table_id(node.as_table(), table_block_id)
                data[-1]["Content"].append(
                    {"type": "table",
                     "content": table_content,
                     "block_id": table_block_id,
                     "parent_block_id": parent_id})
    # Flush whatever is still parked on the stack, keeping document order.
    while stack:
        old_level, old_data = stack.pop()
        data = old_data + data
    return data

hhh1111 · May 29, 2024, 1:32am

读取的文件
5 伦理学.docx (19.5 KB)

写入结果文件
写入结果.docx (12.0 KB)

hhh1111 · May 29, 2024, 1:58am

请问怎么把3标题下的内容设置wpsoffice手动目录
把图标目录和表格目录标题下面的内容设置成图表目录
5 伦理学.docx (22.8 KB)

vyacheslav.deryushev · May 29, 2024, 5:11am

@hhh1111 将代码更改为

doc = aw.Document(file)
current_level = 0
data = []
stack = []
for s in doc.sections:
    sect = s.as_section()
    for node in sect.body.get_child_nodes(aw.NodeType.ANY, True):
        block_id = uuid.uuid4()
        block_id1 = uuid.uuid4()
        if node.node_type == aw.NodeType.PARAGRAPH:
            node = node.as_paragraph()
            if not node.get_ancestor(aw.NodeType.TABLE):
                if node.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
                    level = int(node.paragraph_format.outline_level) + 1
                    if level > current_level:

                        # 如果级别更深，将当前标题添加到堆栈
                        stack.append((current_level, data))
                        data = []
                        current_level = level
                    elif level < current_level:
                        # 如果级别更浅，将堆栈中的项添加回数据
                        while stack and stack[-1][0] >= level:
                            old_level, old_data = stack.pop()
                            data = old_data + data
                            current_level = old_level

                    data.append(
                        {
                            "Title": node.get_text(),
                            "block_id": str(block_id),
                            "Content": [],
                            "Level": level,
                            "Table": [],
                            "Tbale_name": [],
                        }
                    )
                else:
                    if data:
                        if node.get_text().strip():
                            data[-1]["Content"].append(
                                {"type": "text",
                                 "content": node.get_text().strip().replace("  SEQ 表 \* ARABIC ", '').replace(
                                     'TOC \h \c "表" HYPERLINK \l "_Toc14741"', ''),
                                 "block_id": data[-1]["block_id"] + '&&' + str(block_id1),
                                 "parent_block_id": data[-1]["block_id"]})
        if data:
            if node.node_type == aw.NodeType.SHAPE:
                shape = node.as_shape()
                if shape.has_image:
                    image_id = str(uuid.uuid1())  # 长度是36
                    try:
                        image_extension = aw.FileFormatUtil.image_type_to_extension(shape.image_data.image_type)
                        image_file_name = f"{image_id}{image_extension}"
                        image_path = os.path.join(IMAGE_DIR, image_file_name)
                        shape.image_data.save(image_path)
                        data[-1]["Content"].append(
                            {"type": "image",
                             "content": [{"type": "image", "attrs": {
                                 "src": image_file_name,
                                 "alt": "tips", "title": ''}}],
                             "block_id": data[-1]["block_id"] + '&&' + str(block_id1),
                             "parent_block_id": data[-1]["block_id"]}
                        )
                    except Exception as e:
                        # 捕获并处理无法转换图像类型的错误
                        print(f"Error saving image: {e}. Skipping this image.")
                        continue

            if node.node_type == aw.NodeType.TABLE:
                parent_node = node.as_table()
                _able_content = self.aw_read_table_id(parent_node, data[-1]["block_id"] + '&&' + str(block_id1))
                data[-1]["Content"].append(
                    {"type": "table",
                     "content": _able_content,
                     "block_id": data[-1]["block_id"] + '&&' + str(block_id1),
                     "parent_block_id": data[-1]["block_id"]})
while stack:
    old_level, old_data = stack.pop()
    data = old_data + data
	
return data

vyacheslav.deryushev · May 29, 2024, 5:28am

@hhh1111 由于翻译问题，我无法理解您的目的。您是指要改变目录（TOC）的样式吗？

至于字段：您不应该将其解析为文本。您需要添加段落文本，并在需要添加目录的地方使用 builder.insert_table_of_contents("...")。

如果用 MS Word 打开原始文件并按 Alt+F9，就会看到有一个 TOC 字段：

下面是文档中的 4 个目录（TOC）字段：

TOC \o "1-3" \h \z \u 
TOC \h \z \c "表格" 
TOC \h \c "表"
TOC \h \z \c "图表"