怎么查询docx表格内容以及合并规则

vyacheslav.deryushev · May 8, 2024, 7:39am

@hhh1111 根据 Microsoft Word 的设计，Microsoft Word 文档中表格的行是完全独立的。这意味着每一行可以有任意数量、任意宽度的单元格。因此，如果您想象第一行有一个宽单元格，第二行有两个窄单元格，那么查看此文档时，第一行的单元格会出现水平合并。但这并不是合并单元格，它只是一个宽单元格。另一种完全正确的情况是第一行有两个单元格。第一个单元格有 CellMerge.First，第二个单元格有 CellMerge.Previous，在这种情况下，它就是一个合并单元格。在这两种情况下，MS Word 中的视觉效果是完全一样的。这两种情况都是有效的。

您可以使用下面的代码，它可以计算出合并单元格跨越了多少列或多少行：

def aw_read_table(self, table, tables):
    """Flatten ``table`` into a list of ``tableCell`` dicts with merge spans.

    For every cell the column span and row span are derived from the cell's
    ``CellFormat.horizontal_merge`` / ``vertical_merge`` flags: a cell flagged
    ``CellMerge.FIRST`` spans itself plus the contiguous run of
    ``CellMerge.PREVIOUS`` cells that directly follows it (to the right for
    columns, in the rows below for rows). Every other cell, including the
    ``PREVIOUS`` cells themselves, is reported with a span of 1.

    Args:
        self: Unused; kept because the function is called as a method.
        table: The table whose rows and cells are read.
        tables: Collection used to look up the index of ``table`` (via
            ``tables.index_of``) for the cell ids.

    Returns:
        A flat list with one dict per cell, in row-major order. Each dict has
        ``"type": "tableCell"``, ``"attrs"`` (``colspan``, ``rowspan`` and an
        ``id`` of the form ``"<table>.<row>.<cell>"``) and a ``"content"``
        list holding one paragraph with the text from ``cell.get_text()``.

    NOTE(review): a wide cell that is not stored as merged cells is reported
    with colspan 1; the caller presumably has to invoke
    ``table.convert_to_horizontally_merged_cells()`` beforehand -- confirm.
    """
    merge_first = aw.tables.CellMerge.FIRST
    merge_previous = aw.tables.CellMerge.PREVIOUS

    # Loop-invariant values, computed once instead of once per cell.
    table_index = tables.index_of(table)
    row_count = table.rows.count

    table_data = []
    for row_index, row_node in enumerate(table.rows):
        row = row_node.as_row()
        cell_count = row.cells.count
        for cell_index, cell_node in enumerate(row.cells):
            cell = cell_node.as_cell()

            col_span = 1
            if cell.cell_format.horizontal_merge == merge_first:
                # Stop at the first cell that does not continue this merge so
                # that a later, unrelated merge group in the row is not counted.
                for i in range(cell_index + 1, cell_count):
                    if row.cells[i].cell_format.horizontal_merge != merge_previous:
                        break
                    col_span += 1

            row_span = 1
            if cell.cell_format.vertical_merge == merge_first:
                # NOTE(review): rows are matched by cell index, which assumes
                # the rows below use the same cell layout -- confirm.
                for i in range(row_index + 1, row_count):
                    cells_below = table.rows[i].cells
                    # A shorter row has no cell at this index: the merge ends.
                    if cell_index >= cells_below.count:
                        break
                    if cells_below[cell_index].cell_format.vertical_merge != merge_previous:
                        break
                    row_span += 1

            table_data.append({
                "type": "tableCell",
                "attrs": {
                    "colspan": col_span,
                    "rowspan": row_span,
                    "id": f"{table_index}.{row_index}.{cell_index}",
                },
                "content": [
                    {
                        "type": "paragraph",
                        "content": [
                            {
                                "type": "text",
                                "text": cell.get_text(),
                            }
                        ],
                    }
                ],
            })

    return table_data

# Walk the direct children of every section body of "CSR.docx" and collect one
# entry per heading paragraph (outline level 0-5). Later body paragraphs and
# tables are attached to the most recently collected heading.
# NOTE(review): `self` and the trailing `return data` indicate this fragment is
# an excerpt from a method body; it is not runnable at module level as shown.
doc = aw.Document("CSR.docx")
current_level = 0
data = []
stack = []
for s in doc.sections:
    sect = s.as_section()
    for node in sect.body.get_child_nodes(aw.NodeType.ANY, False):
        if node.node_type == aw.NodeType.PARAGRAPH:
            node = node.as_paragraph()
            # Outline levels 0..5 are treated as headings; "Level" is stored 1-based.
            if node.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
                level = int(node.paragraph_format.outline_level) + 1
                if level > current_level:
                    # Deeper level: push the entries collected so far onto the stack.
                    stack.append((current_level, data))
                    data = []
                    current_level = level
                elif level < current_level:
                    # Shallower level: merge stacked entries back in front of data.
                    while stack and stack[-1][0] >= level:
                        old_level, old_data = stack.pop()
                        data = old_data + data
                        current_level = old_level
                data.append(
                    {
                        "Title": node.get_text(),
                        "Content": [],
                        "Level": level,
                        "Table": [],
                        "Tbale_name": [],
                    }
                )
            else:
                # Non-heading paragraph: only recorded once a heading exists.
                if data:
                    # Paragraphs starting with "表" ("Table") are stored as table captions.
                    if node.get_text().startswith("表"):
                        # NOTE(review): str.strip() takes a set of characters, not a
                        # substring, so this removes any of those characters from both
                        # ends; "\*" is also an unrecognized escape sequence -- confirm
                        # that this is the intended clean-up of the caption text.
                        data[-1]["Tbale_name"].append(
                            node.get_text().strip("SEQ \* ARABIC").strip("SEQ")
                        )
                    # Captions ("表" = table, "图" = figure) and source lines
                    # ("来源：" = "Source:") are kept out of "Content".
                    if (
                            node.get_text().startswith("表")
                            or node.get_text().startswith("来源：")
                            or node.get_text().startswith("图")
                    ):
                        pass
                    else:

                        data[-1]["Content"].append(node.get_text())
        # Tables are attached to the latest heading; tables before any heading are skipped.
        if data:
            if node.node_type == aw.NodeType.TABLE:
                parent_node = node.as_table()
                # All tables of the document (deep search), used for the table index.
                tables = doc.get_child_nodes(aw.NodeType.TABLE, True)
                able_content = self.aw_read_table(parent_node, tables)
                data[-1]["Table"].append(able_content)

# Merge whatever is still on the stack back in front of the current entries.
while stack:
    old_level, old_data = stack.pop()
    data = old_data + data

return data

希望能帮到你。

hhh1111 · May 9, 2024, 3:04am

vyacheslav.deryushev:

parent_node = node.as_table()
                tables = doc.get_child_nodes(aw.NodeType.TABLE, True)
                able_content = self.aw_read_table(parent_node, tables)

表格出来合并的数据不对呀

这里 rowspan不可能是1啊

def aw_extract_headings_and_contents_table_dict_id(file):
    """Collect the headings of a document together with their body blocks.

    The direct children of every section body are visited in order. A
    paragraph whose outline level is 0-5 starts a new heading entry; any other
    non-blank paragraph and every table is appended to the ``"Content"`` list
    of the most recent heading. Nodes that appear before the first heading
    are ignored.

    Args:
        file: Passed unchanged to ``aw.Document``.

    Returns:
        A list of heading dicts with the keys ``Title``, ``block_id``,
        ``Content``, ``Level`` (outline level + 1), ``Table`` and
        ``Tbale_name``. Content items carry ``block_id`` in the form
        ``"<heading block_id>&&<own id>"`` and ``parent_block_id``.
    """
    import aspose.words as aw
    lic = aw.License()
    lic_path = os.path.join(settings.BASE_PATH, "core/Aspose.Total.Product.Family.lic")
    lic.set_license(lic_path)

    doc = aw.Document(file)
    heading_outline_levels = (0, 1, 2, 3, 4, 5)
    current_level = 0
    data = []
    stack = []

    for section_node in doc.sections:
        body = section_node.as_section().body
        for node in body.get_child_nodes(aw.NodeType.ANY, False):
            # Two ids are drawn for every node, whether or not both are used.
            block_id = generate_unique_id()
            block_id1 = generate_unique_id()

            if node.node_type == aw.NodeType.PARAGRAPH:
                paragraph = node.as_paragraph()
                outline = paragraph.paragraph_format.outline_level
                if outline in heading_outline_levels:
                    level = int(outline) + 1
                    if level > current_level:
                        # Going deeper: park what has been collected so far.
                        stack.append((current_level, data))
                        data = []
                        current_level = level
                    elif level < current_level:
                        # Going shallower: fold parked entries back in front.
                        while stack and stack[-1][0] >= level:
                            parked_level, parked = stack.pop()
                            data = parked + data
                            current_level = parked_level
                    data.append(
                        {
                            "Title": paragraph.get_text(),
                            "block_id": str(block_id),
                            "Content": [],
                            "Level": level,
                            "Table": [],
                            "Tbale_name": [],
                        }
                    )
                elif data:
                    text = paragraph.get_text().strip()
                    if text:
                        heading_id = data[-1]["block_id"]
                        data[-1]["Content"].append(
                            {
                                "type": "text",
                                "content": text,
                                "block_id": heading_id + '&&' + str(block_id1),
                                "parent_block_id": heading_id,
                            }
                        )
            elif data and node.node_type == aw.NodeType.TABLE:
                all_tables = doc.get_child_nodes(aw.NodeType.TABLE, True)
                table_rows = aw_read_table_id(node.as_table(), all_tables)
                heading_id = data[-1]["block_id"]
                table_block_id = heading_id + '&&' + str(block_id1)
                data[-1]["Content"].append(
                    {
                        "type": "table",
                        "content": {
                            "type": "table",
                            "attrs": {"id": table_block_id},
                            "content": table_rows,
                        },
                        "block_id": table_block_id,
                        "parent_block_id": heading_id,
                    }
                )

    # Fold everything that is still parked back in front of the result.
    while stack:
        _, parked = stack.pop()
        data = parked + data
    return data
def aw_read_table_id(table, tables):
    """Convert ``table`` into a list of ``tableRow`` dicts with merge spans.

    A cell flagged ``CellMerge.FIRST`` spans itself plus the contiguous run of
    ``CellMerge.PREVIOUS`` cells directly after it (to the right for
    ``colspan``, in the rows below for ``rowspan``). Every other cell,
    including the ``PREVIOUS`` cells themselves, gets a span of 1.

    Args:
        table: The table whose rows and cells are read.
        tables: Not used by this function; kept so existing callers that pass
            the document's table collection keep working.

    Returns:
        A list with one ``{"type": "tableRow", "content": [...]}`` dict per
        row. Each row's content holds one ``tableCell`` dict per cell with
        ``attrs`` (``colspan``, ``rowspan``, ``colwidth`` set to ``None``) and
        a single paragraph containing the text from ``cell.get_text()``.

    NOTE(review): a wide cell that is not stored as merged cells is reported
    with colspan 1; the caller presumably has to invoke
    ``table.convert_to_horizontally_merged_cells()`` beforehand -- confirm.
    """
    import aspose.words as aw
    lic = aw.License()
    lic_path = os.path.join(settings.BASE_PATH, "core/Aspose.Total.Product.Family.lic")
    lic.set_license(lic_path)

    merge_first = aw.tables.CellMerge.FIRST
    merge_previous = aw.tables.CellMerge.PREVIOUS
    row_count = table.rows.count

    table_data = []
    for row_index, row_node in enumerate(table.rows):
        row = row_node.as_row()
        cell_count = row.cells.count
        content = {
            "type": "tableRow",
            "content": []
        }
        for cell_index, cell_node in enumerate(row.cells):
            cell = cell_node.as_cell()

            col_span = 1
            if cell.cell_format.horizontal_merge == merge_first:
                # Stop at the first cell that does not continue this merge so
                # that a later, unrelated merge group in the row is not counted.
                for i in range(cell_index + 1, cell_count):
                    if row.cells[i].cell_format.horizontal_merge != merge_previous:
                        break
                    col_span += 1

            row_span = 1
            if cell.cell_format.vertical_merge == merge_first:
                # NOTE(review): rows are matched by cell index, which assumes
                # the rows below use the same cell layout -- confirm.
                for i in range(row_index + 1, row_count):
                    cells_below = table.rows[i].cells
                    # A shorter row has no cell at this index: the merge ends.
                    if cell_index >= cells_below.count:
                        break
                    if cells_below[cell_index].cell_format.vertical_merge != merge_previous:
                        break
                    row_span += 1

            cell_content = {
                "type": "tableCell",
                "attrs": {
                    "colspan": col_span,
                    "rowspan": row_span,
                    "colwidth": None
                },
                "content": [
                    # The cell text is wrapped in a single paragraph node.
                    {
                        "type": "paragraph",
                        "content": [
                            {
                                "type": "text",
                                "text": cell.get_text(),
                            }
                        ]
                    }
                ]
            }
            content["content"].append(cell_content)
        table_data.append(content)
    return table_data

hhh1111 · May 9, 2024, 3:19am

上面图片通过以上代码出来的效果不对呀
下面原文件格式

alexey.noskov · May 9, 2024, 1:36pm

@hhh1111 正如 Vyacheslav 所提到的，在 MS Word 表格中没有“列”概念，表格中的每一行都是独立的，可以包含任意数量的单元格。请参阅我们的文档以获取更多信息：
https://docs.aspose.com/words/python-net/working-with-columns-and-rows/#work-with-columns

因此，在 MS Word 表格中，看起来像水平合并的单元格实际上可能只是一个较宽的普通单元格。您可以使用 Table.convert_to_horizontally_merged_cells 方法，将这类“单纯较宽”的单元格转换为真正的水平合并单元格。

hhh1111 · May 10, 2024, 1:18am

可以帮我进行更改一下嘛，我不太理解

hhh1111 · May 10, 2024, 1:19am

可以通过上面代码帮我简单更改一下嘛

hhh1111 · May 10, 2024, 5:46am

可以通过上面代码帮我简单更改一下嘛

alexey.noskov · May 10, 2024, 2:03pm

@hhh1111 不太清楚您具体希望做哪些更改。请注意，技术支持的职责并不是替您编写代码，而是为您指明正确的方向。为了使讨论更有成效，请提供一份简化后的代码，这样您和我们都能更容易地理解您的需求。

如果您的目标是获取合并单元格的数量，您可以使用以下简单代码：

# Print the column span of every cell (or merged group of cells) in each row
# of every table of the document.
doc = aw.Document("C:\\Temp\\in.docx")

for t in doc.get_child_nodes(aw.NodeType.TABLE, True):
    table = t.as_table()
    # Convert simply wide cells to horizontally merged cells so that the
    # FIRST/PREVIOUS merge flags checked below are present.
    table.convert_to_horizontally_merged_cells()
    for r in table.rows:
        row = r.as_row()
        print("---------=========Row Start========------------")
        current_cell = row.first_cell
        # Loop through the cells in the row and calculate the col span.
        while current_cell is not None:
            # Every cell / merged group starts with a span of 1.
            col_span = 1
            if current_cell.cell_format.horizontal_merge == aw.tables.CellMerge.FIRST:
                # Move to the next cell and count the cells merged into this one.
                current_cell = current_cell.next_cell
                while (current_cell is not None
                       and current_cell.cell_format.horizontal_merge == aw.tables.CellMerge.PREVIOUS):
                    col_span += 1
                    current_cell = current_cell.next_cell
            else:
                # Not the start of a merge: move to the next cell in the row.
                current_cell = current_cell.next_cell

            print(f"col_span: {col_span}")

hhh1111 · May 11, 2024, 1:25am

hhh1111:

def aw_read_table_id(table, tables):
    """Convert ``table`` into a list of ``tableRow`` dicts with merge spans.

    A cell flagged ``CellMerge.FIRST`` spans itself plus the contiguous run of
    ``CellMerge.PREVIOUS`` cells directly after it (to the right for
    ``colspan``, in the rows below for ``rowspan``). Every other cell,
    including the ``PREVIOUS`` cells themselves, gets a span of 1.

    Args:
        table: The table whose rows and cells are read.
        tables: Not used by this function; kept so existing callers that pass
            the document's table collection keep working.

    Returns:
        A list with one ``{"type": "tableRow", "content": [...]}`` dict per
        row. Each row's content holds one ``tableCell`` dict per cell with
        ``attrs`` (``colspan``, ``rowspan``, ``colwidth`` set to ``None``) and
        a single paragraph containing the text from ``cell.get_text()``.

    NOTE(review): a wide cell that is not stored as merged cells is reported
    with colspan 1; the caller presumably has to invoke
    ``table.convert_to_horizontally_merged_cells()`` beforehand -- confirm.
    """
    import aspose.words as aw
    lic = aw.License()
    lic_path = os.path.join(settings.BASE_PATH, "core/Aspose.Total.Product.Family.lic")
    lic.set_license(lic_path)

    merge_first = aw.tables.CellMerge.FIRST
    merge_previous = aw.tables.CellMerge.PREVIOUS
    row_count = table.rows.count

    table_data = []
    for row_index, row_node in enumerate(table.rows):
        row = row_node.as_row()
        cell_count = row.cells.count
        content = {
            "type": "tableRow",
            "content": []
        }
        for cell_index, cell_node in enumerate(row.cells):
            cell = cell_node.as_cell()

            col_span = 1
            if cell.cell_format.horizontal_merge == merge_first:
                # Stop at the first cell that does not continue this merge so
                # that a later, unrelated merge group in the row is not counted.
                for i in range(cell_index + 1, cell_count):
                    if row.cells[i].cell_format.horizontal_merge != merge_previous:
                        break
                    col_span += 1

            row_span = 1
            if cell.cell_format.vertical_merge == merge_first:
                # NOTE(review): rows are matched by cell index, which assumes
                # the rows below use the same cell layout -- confirm.
                for i in range(row_index + 1, row_count):
                    cells_below = table.rows[i].cells
                    # A shorter row has no cell at this index: the merge ends.
                    if cell_index >= cells_below.count:
                        break
                    if cells_below[cell_index].cell_format.vertical_merge != merge_previous:
                        break
                    row_span += 1

            cell_content = {
                "type": "tableCell",
                "attrs": {
                    "colspan": col_span,
                    "rowspan": row_span,
                    "colwidth": None
                },
                "content": [
                    # The cell text is wrapped in a single paragraph node.
                    {
                        "type": "paragraph",
                        "content": [
                            {
                                "type": "text",
                                "text": cell.get_text(),
                            }
                        ]
                    }
                ]
            }
            content["content"].append(cell_content)
        table_data.append(content)
    return table_data

需要使用以下代码进行修改，我需要知道row_span 和col_span 的结果

def aw_read_table_id(table, tables):
    """Convert ``table`` into a list of ``tableRow`` dicts with merge spans.

    A cell flagged ``CellMerge.FIRST`` spans itself plus the contiguous run of
    ``CellMerge.PREVIOUS`` cells directly after it (to the right for
    ``colspan``, in the rows below for ``rowspan``). Every other cell,
    including the ``PREVIOUS`` cells themselves, gets a span of 1.

    Args:
        table: The table whose rows and cells are read.
        tables: Not used by this function; kept so existing callers that pass
            the document's table collection keep working.

    Returns:
        A list with one ``{"type": "tableRow", "content": [...]}`` dict per
        row. Each row's content holds one ``tableCell`` dict per cell with
        ``attrs`` (``colspan``, ``rowspan``, ``colwidth`` set to ``None``) and
        a single paragraph containing the text from ``cell.get_text()``.

    NOTE(review): a wide cell that is not stored as merged cells is reported
    with colspan 1; the caller presumably has to invoke
    ``table.convert_to_horizontally_merged_cells()`` beforehand -- confirm.
    """
    import aspose.words as aw
    lic = aw.License()
    lic_path = os.path.join(settings.BASE_PATH, "core/Aspose.Total.Product.Family.lic")
    lic.set_license(lic_path)

    merge_first = aw.tables.CellMerge.FIRST
    merge_previous = aw.tables.CellMerge.PREVIOUS
    row_count = table.rows.count

    table_data = []
    for row_index, row_node in enumerate(table.rows):
        row = row_node.as_row()
        cell_count = row.cells.count
        content = {
            "type": "tableRow",
            "content": []
        }
        for cell_index, cell_node in enumerate(row.cells):
            cell = cell_node.as_cell()

            col_span = 1
            if cell.cell_format.horizontal_merge == merge_first:
                # Stop at the first cell that does not continue this merge so
                # that a later, unrelated merge group in the row is not counted.
                for i in range(cell_index + 1, cell_count):
                    if row.cells[i].cell_format.horizontal_merge != merge_previous:
                        break
                    col_span += 1

            row_span = 1
            if cell.cell_format.vertical_merge == merge_first:
                # NOTE(review): rows are matched by cell index, which assumes
                # the rows below use the same cell layout -- confirm.
                for i in range(row_index + 1, row_count):
                    cells_below = table.rows[i].cells
                    # A shorter row has no cell at this index: the merge ends.
                    if cell_index >= cells_below.count:
                        break
                    if cells_below[cell_index].cell_format.vertical_merge != merge_previous:
                        break
                    row_span += 1

            cell_content = {
                "type": "tableCell",
                "attrs": {
                    "colspan": col_span,
                    "rowspan": row_span,
                    "colwidth": None
                },
                "content": [
                    # The cell text is wrapped in a single paragraph node.
                    {
                        "type": "paragraph",
                        "content": [
                            {
                                "type": "text",
                                "text": cell.get_text(),
                            }
                        ]
                    }
                ]
            }
            content["content"].append(cell_content)
        table_data.append(content)
    return table_data

hhh1111 · May 11, 2024, 1:26am

主要是有一些用法不太理解

hhh1111 · May 11, 2024, 2:17am

我需要知道 col_span和row_span的结果

alexey.noskov · May 11, 2024, 11:21am

@hhh1111 请参阅我们的文档以了解如何使用合并单元格：
https://docs.aspose.com/words/python-net/working-with-merged-cells/