怎么查询docx表格内容以及合并规则

hhh1111 · May 7, 2024, 2:46am

可以给我写一个通用的demo吗，因为每个表格合并的规则都不一样

hhh1111 · May 7, 2024, 5:45am

hhh1111 · May 7, 2024, 5:46am

# Build one "tableCell" node (span attributes plus a single text paragraph)
# and attach it to the row that is currently being assembled.
text_node = {"type": "text", "text": cell}
paragraph_node = {"type": "paragraph", "content": [text_node]}
cell_attrs = {
    "colspan": colspan,
    "rowspan": rowspan,
    "id": item["block_id"],
}
table_row["content"].append(
    {"type": "tableCell", "attrs": cell_attrs, "content": [paragraph_node]}
)

我想处理成这样的格式

hhh1111 · May 7, 2024, 5:44am

可以看下我发布的帖子吗？

hhh1111 · May 7, 2024, 6:11am

怎么查询docx表格内容以及合并规则 - #6 by hhh1111 可以看下我的嘛

hhh1111 · May 7, 2024, 6:23am

看下我的帖子？？？？？？？？？

hhh1111 · May 7, 2024, 6:24am

帮我看下我的帖子？？？？

hhh1111 · May 7, 2024, 7:34am

测试.docx (21.0 KB)

处理后的格式需要如下
{
    "type": "tableCell",
    "attrs": {
        "colspan": colspan,
        "rowspan": rowspan,
        "id": item["block_id"],
    },
    "content": [
        {
            "type": "paragraph",
            "content": [
                {
                    "type": "text",
                    "text": cell,
                }
            ],
        }
    ],
}

hhh1111 · May 7, 2024, 8:09am

我的帖子大家看不见嘛？？？？？

hhh1111 · May 7, 2024, 8:10am

你可以看见我的帖子嘛？？？？？

hhh1111 · May 7, 2024, 8:11am

请问可以看见我的帖子嘛？？？？？？？？？？？？？

vyacheslav.deryushev · May 7, 2024, 8:47am

@hhh1111 要获取行跨度和列跨度，您需要将 docx 文件转换为 HTML 文件，然后像这样收集以下信息：

def test_merged_cells(self):
    """Record the colspan/rowspan of every HTML table cell of ``merged.docx``.

    The document is saved as HTML into an in-memory stream, the stream is
    parsed with ``xml_tree``, and for each ``<table>`` a list of rows (each a
    list of ``CellInfo``) is appended to ``self.mTables``.  A missing or empty
    ``colspan``/``rowspan`` attribute is recorded as ``0``.
    """
    document = aw.Document("merged.docx")

    with BytesIO() as buffer:
        options = aw.saving.HtmlSaveOptions()
        options.images_folder = ARTIFACTS_DIR
        document.save(buffer, options)
        buffer.seek(0)  # rewind so the parser reads from the start
        html = xml_tree.parse(buffer)

    def span_of(td, attribute):
        # Absent (None) and empty attributes both fall back to 0.
        raw = td.get(attribute)
        return int(raw) if raw else 0

    for table in html.findall(".//table"):
        collected = [
            [
                self.CellInfo(span_of(td, "colspan"), span_of(td, "rowspan"))
                for td in tr.findall(".//td")
            ]
            for tr in table.findall(".//tr")
        ]
        self.mTables.append(collected)

    print("test")

class CellInfo:
    """Column/row span values recorded for one table cell."""

    def __init__(self, colSpan, rowSpan):
        # Values are stored exactly as given; nothing is validated or converted.
        self.ColSpan, self.RowSpan = colSpan, rowSpan


# Starts empty; presumably the list that ``test_merged_cells`` fills through
# ``self.mTables`` (one entry per table) -- confirm against the enclosing class.
mTables = []

hhh1111 · May 7, 2024, 8:51am

那如果我的文件既有表格信息又有段落信息呢？我只想把表格信息处理成上面的格式：
def aw_read_table(tables):
    """Render a table as plain text: cells joined by tabs, rows by newlines.

    Args:
        tables: A single table object.  It must expose ``rows``; each row must
            provide ``as_row().cells`` and each cell ``as_cell().get_text()``.

    Returns:
        str: One line per row, with the stripped cell texts separated by
        ``"\\t"``.  An empty table yields ``""``.

    Every cell keeps its own column, including empty ones.  The previous
    implementation used ``if _row:`` to decide whether to add a separator,
    which silently dropped empty leading cells and shifted later columns left.
    """
    # NOTE(review): Aspose's get_text() may include an end-of-cell control
    # character that str.strip() does not remove -- confirm whether callers
    # expect it in the output.
    lines = []
    for row in tables.rows:
        texts = [cell.as_cell().get_text().strip() for cell in row.as_row().cells]
        lines.append("\t".join(texts))
    return "\n".join(lines)

def aw_extract_headings_and_contents_table_dict(file):
    """Group a Word document's body text, table captions and tables by heading.

    Args:
        file: Passed straight to ``aw.Document``.

    Returns:
        list[dict]: One dict per heading paragraph (outline level 0-5) with
        the keys ``"Title"``, ``"Content"`` (paragraph texts), ``"Level"``
        (outline level + 1), ``"Table"`` (output of ``aw_read_table``) and
        ``"Tbale_name"`` (caption paragraphs starting with "表").
        Paragraphs and tables that appear before the first heading are skipped.

    Only direct children of each section body are visited.
    """
    import aspose.words as aw

    lic = aw.License()
    lic_path = os.path.join(settings.BASE_PATH, "core/Aspose.Total.Product.Family.lic")
    lic.set_license(lic_path)

    doc = aw.Document(file)
    current_level = 0
    data = []
    stack = []
    for s in doc.sections:
        sect = s.as_section()
        for node in sect.body.get_child_nodes(aw.NodeType.ANY, False):
            if node.node_type == aw.NodeType.PARAGRAPH:
                node = node.as_paragraph()
                if node.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
                    level = int(node.paragraph_format.outline_level) + 1
                    if level > current_level:
                        # Deeper heading: park the entries collected so far.
                        stack.append((current_level, data))
                        data = []
                        current_level = level
                    elif level < current_level:
                        # Shallower heading: splice parked entries back in
                        # front, which keeps document order.
                        while stack and stack[-1][0] >= level:
                            old_level, old_data = stack.pop()
                            data = old_data + data
                            current_level = old_level
                    data.append(
                        {
                            "Title": node.get_text(),
                            "Content": [],
                            "Level": level,
                            "Table": [],
                            "Tbale_name": [],
                        }
                    )
                elif data:
                    text = node.get_text()
                    if text.startswith("表"):
                        # NOTE(review): str.strip() treats its argument as a
                        # set of characters, not a substring, so this trims
                        # any of those characters from both ends.  Confirm
                        # whether removing the field code text was intended.
                        data[-1]["Tbale_name"].append(
                            text.strip("SEQ \\* ARABIC").strip("SEQ")
                        )
                    elif not text.startswith(("来源：", "图")):
                        # Captions, source lines and figure titles are not
                        # body content.
                        data[-1]["Content"].append(text)
            elif data and node.node_type == aw.NodeType.TABLE:
                data[-1]["Table"].append(aw_read_table(node.as_table()))

    # Re-attach whatever is still parked so the result is one flat list.
    while stack:
        old_level, old_data = stack.pop()
        data = old_data + data
    return data

vyacheslav.deryushev · May 7, 2024, 9:03am

@hhh1111 这不会有影响，因为您只解析表格信息。

hhh1111 · May 7, 2024, 9:05am

不是的如果我文件是这样的呢
CSR-20240410-h63D4QaGoR.docx (838.1 KB)

我需要table数据为这种格式
def aw_read_table(tables):
    """Render a table as plain text: cells joined by tabs, rows by newlines.

    Args:
        tables: A single table object.  It must expose ``rows``; each row must
            provide ``as_row().cells`` and each cell ``as_cell().get_text()``.

    Returns:
        str: One line per row, with the stripped cell texts separated by
        ``"\\t"``.  An empty table yields ``""``.

    Every cell keeps its own column, including empty ones.  The previous
    implementation used ``if _row:`` to decide whether to add a separator,
    which silently dropped empty leading cells and shifted later columns left.
    """
    # NOTE(review): Aspose's get_text() may include an end-of-cell control
    # character that str.strip() does not remove -- confirm whether callers
    # expect it in the output.
    lines = []
    for row in tables.rows:
        texts = [cell.as_cell().get_text().strip() for cell in row.as_row().cells]
        lines.append("\t".join(texts))
    return "\n".join(lines)

def aw_extract_headings_and_contents_table_dict(file):
    """Group a Word document's body text, table captions and tables by heading.

    Args:
        file: Passed straight to ``aw.Document``.

    Returns:
        list[dict]: One dict per heading paragraph (outline level 0-5) with
        the keys ``"Title"``, ``"Content"`` (paragraph texts), ``"Level"``
        (outline level + 1), ``"Table"`` (output of ``aw_read_table``) and
        ``"Tbale_name"`` (caption paragraphs starting with "表").
        Paragraphs and tables that appear before the first heading are skipped.

    Only direct children of each section body are visited.
    """
    import aspose.words as aw

    lic = aw.License()
    lic_path = os.path.join(settings.BASE_PATH, "core/Aspose.Total.Product.Family.lic")
    lic.set_license(lic_path)

    doc = aw.Document(file)
    current_level = 0
    data = []
    stack = []
    for s in doc.sections:
        sect = s.as_section()
        for node in sect.body.get_child_nodes(aw.NodeType.ANY, False):
            if node.node_type == aw.NodeType.PARAGRAPH:
                node = node.as_paragraph()
                if node.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
                    level = int(node.paragraph_format.outline_level) + 1
                    if level > current_level:
                        # Deeper heading: park the entries collected so far.
                        stack.append((current_level, data))
                        data = []
                        current_level = level
                    elif level < current_level:
                        # Shallower heading: splice parked entries back in
                        # front, which keeps document order.
                        while stack and stack[-1][0] >= level:
                            old_level, old_data = stack.pop()
                            data = old_data + data
                            current_level = old_level
                    data.append(
                        {
                            "Title": node.get_text(),
                            "Content": [],
                            "Level": level,
                            "Table": [],
                            "Tbale_name": [],
                        }
                    )
                elif data:
                    text = node.get_text()
                    if text.startswith("表"):
                        # NOTE(review): str.strip() treats its argument as a
                        # set of characters, not a substring, so this trims
                        # any of those characters from both ends.  Confirm
                        # whether removing the field code text was intended.
                        data[-1]["Tbale_name"].append(
                            text.strip("SEQ \\* ARABIC").strip("SEQ")
                        )
                    elif not text.startswith(("来源：", "图")):
                        # Captions, source lines and figure titles are not
                        # body content.
                        data[-1]["Content"].append(text)
            elif data and node.node_type == aw.NodeType.TABLE:
                data[-1]["Table"].append(aw_read_table(node.as_table()))

    # Re-attach whatever is still parked so the result is one flat list.
    while stack:
        old_level, old_data = stack.pop()
        data = old_data + data
    return data

def test_merged_cells(self):
    """Record the colspan/rowspan of every HTML table cell of ``merged.docx``.

    The document is saved as HTML into an in-memory stream, the stream is
    parsed with ``xml_tree``, and for each ``<table>`` a list of rows (each a
    list of ``CellInfo``) is appended to ``self.mTables``.  A missing or empty
    ``colspan``/``rowspan`` attribute is recorded as ``0``.
    """
    document = aw.Document("merged.docx")

    with BytesIO() as buffer:
        options = aw.saving.HtmlSaveOptions()
        options.images_folder = ARTIFACTS_DIR
        document.save(buffer, options)
        buffer.seek(0)  # rewind so the parser reads from the start
        html = xml_tree.parse(buffer)

    def span_of(td, attribute):
        # Absent (None) and empty attributes both fall back to 0.
        raw = td.get(attribute)
        return int(raw) if raw else 0

    for table in html.findall(".//table"):
        collected = [
            [
                self.CellInfo(span_of(td, "colspan"), span_of(td, "rowspan"))
                for td in tr.findall(".//td")
            ]
            for tr in table.findall(".//tr")
        ]
        self.mTables.append(collected)

    print("test")

class CellInfo:
    """Column/row span values recorded for one table cell."""

    def __init__(self, colSpan, rowSpan):
        # Values are stored exactly as given; nothing is validated or converted.
        self.ColSpan, self.RowSpan = colSpan, rowSpan


# Starts empty; presumably the list that ``test_merged_cells`` fills through
# ``self.mTables`` (one entry per table) -- confirm against the enclosing class.
mTables = []

这个为table数据  if data:
            if node.node_type == aw.NodeType.TABLE:
                parent_node = node.as_table()
                able_content = aw_read_table(parent_node)
                data[-1]["Table"].append(able_content)

vyacheslav.deryushev · May 7, 2024, 10:52am

@hhh1111 使用此代码，您可以获得有关所拥有的表的信息。并以您需要的任何格式收集。

class TableInfo:
    """Span information of one HTML table: a list of ``RowInfo``."""

    def __init__(self):
        self.Rows = []


class RowInfo:
    """Span information of one HTML table row: a list of ``CellInfo``."""

    def __init__(self):
        self.Cells = []


class CellInfo:
    """Column/row span values read from one HTML ``<td>``."""

    def __init__(self, col_span, row_span):
        self.ColSpan = col_span
        self.RowSpan = row_span


# The helper classes are defined first and referenced directly: this snippet
# runs at module level, where no ``self`` exists.
doc = aw.Document("input.docx")
word_tables = doc.get_child_nodes(aw.NodeType.TABLE, True)

# Save the document as HTML in memory and parse it to read colspan/rowspan.
with BytesIO() as html_stream:
    save_options = aw.saving.HtmlSaveOptions()
    save_options.images_folder = ARTIFACTS_DIR
    doc.save(html_stream, save_options)
    html_stream.seek(0)
    xml_doc = xml_tree.parse(html_stream)

m_tables = []
for table in xml_doc.findall(".//table"):
    table_inf = TableInfo()
    for row in table.findall(".//tr"):
        row_inf = RowInfo()
        for cell in row.findall(".//td"):
            col_span_attr = cell.get("colspan")
            row_span_attr = cell.get("rowspan")
            # A missing or empty attribute is recorded as 0.
            col_span = int(col_span_attr) if col_span_attr else 0
            row_span = int(row_span_attr) if row_span_attr else 0
            row_inf.Cells.append(CellInfo(col_span, row_span))
        table_inf.Rows.append(row_inf)
    m_tables.append(table_inf)

# Pair every Word cell with the HTML cell at the same table/row/cell position.
# NOTE(review): this assumes the HTML export lists tables, rows and cells in
# the same order and count as the Word model; nested tables or cells hidden
# by a merge would shift the indices -- confirm on real documents.
for tab_id, table in enumerate(word_tables):
    for row_id, row in enumerate(table.as_table().rows):
        for cell_id, cell in enumerate(row.as_row().cells):
            cell = cell.as_cell()

            col_span = 0
            row_span = 0
            if (tab_id < len(m_tables)
                    and row_id < len(m_tables[tab_id].Rows)
                    and cell_id < len(m_tables[tab_id].Rows[row_id].Cells)):
                html_cell = m_tables[tab_id].Rows[row_id].Cells[cell_id]
                col_span = html_cell.ColSpan
                row_span = html_cell.RowSpan

            print("{0}.{1}.{2} colspan={3}\t rowspan={4}\t text={5}".format(
                tab_id, row_id, cell_id, col_span, row_span, cell.get_text()))

hhh1111 · May 8, 2024, 1:25am

不对呀，我要通过以下代码，更改表格数据格式
lic = aw.License()
lic_path = os.path.join(settings.BASE_PATH, "core/Aspose.Total.Product.Family.lic")
lic.set_license(lic_path)

doc = aw.Document(file)
current_level = 0
data = []
stack = []
for s in doc.sections:
    sect = s.as_section()
    for node in sect.body.get_child_nodes(aw.NodeType.ANY, False):
        if node.node_type == aw.NodeType.PARAGRAPH:
            node = node.as_paragraph()
            if node.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
                level = int(node.paragraph_format.outline_level) + 1
                if level > current_level:
                    # 如果级别更深，将当前标题添加到堆栈
                    stack.append((current_level, data))
                    data = []
                    current_level = level
                elif level < current_level:
                    # 如果级别更浅，将堆栈中的项添加回数据
                    while stack and stack[-1][0] >= level:
                        old_level, old_data = stack.pop()
                        data = old_data + data
                        current_level = old_level
                data.append(
                    {
                        "Title": node.get_text(),
                        "Content": [],
                        "Level": level,
                        "Table": [],
                        "Tbale_name": [],
                    }
                )
            else:
                if data:
                    if node.get_text().startswith("表"):
                        data[-1]["Tbale_name"].append(
                            node.get_text().strip("SEQ \* ARABIC").strip("SEQ")
                        )
                    if (
                            node.get_text().startswith("表")
                            or node.get_text().startswith("来源：")
                            or node.get_text().startswith("图")
                    ):
                        pass
                    else:

                        data[-1]["Content"].append(node.get_text())
        if data:
            if node.node_type == aw.NodeType.TABLE:
                parent_node = node.as_table()
                able_content = aw_read_table(parent_node)
                data[-1]["Table"].append(able_content)

while stack:
    old_level, old_data = stack.pop()
    data = old_data + data
return data

if data:
    if node.node_type == aw.NodeType.TABLE:
        parent_node = node.as_table()
        able_content = aw_read_table(parent_node)
        data[-1]["Table"].append(able_content)

hhh1111 · May 8, 2024, 1:27am

if data:
    if node.node_type == aw.NodeType.TABLE:
        parent_node = node.as_table()
        able_content = aw_read_table(parent_node)
        data[-1]["Table"].append(able_content)

这个为读取的表格信息，我需要这个 table 数据知道合并规则：

if node.node_type == aw.NodeType.TABLE:
    parent_node = node.as_table()
    able_content = aw_read_table(parent_node)
    data[-1]["Table"].append(able_content)

vyacheslav.deryushev · May 8, 2024, 7:39am

@hhh1111 根据 Microsoft Word 的设计，Microsoft Word 文档中表格的行是完全独立的。这意味着每一行可以有任意数量、任意宽度的单元格。因此，如果您想象第一行有一个宽单元格，第二行有两个窄单元格，那么查看此文档时，第一行的单元格会出现水平合并。但这并不是合并单元格，它只是一个宽单元格。另一种完全正确的情况是第一行有两个单元格。第一个单元格有 CellMerge.First，第二个单元格有 CellMerge.Previous，在这种情况下，它就是一个合并单元格。在这两种情况下，MS Word 中的视觉效果是完全一样的。这两种情况都是有效的。

您可以使用下面的代码，它可以计算出合并单元格跨越了多少列或多少行：

def aw_read_table(self, table, tables):
    """Describe every cell of ``table`` as a "tableCell" dict with its spans.

    Args:
        table: The table to read (must expose ``rows`` and be known to
            ``tables``).
        tables: Collection containing ``table``; only used to obtain the
            table index for the cell id via ``tables.index_of``.

    Returns:
        list[dict]: A flat list with one entry per cell, in row-major order.
        Each entry has ``"attrs"`` with ``"colspan"``, ``"rowspan"`` and an
        ``"id"`` of the form ``"<table>.<row>.<cell>"``, plus one paragraph
        holding ``cell.get_text()``.

    A span is counted only for a cell flagged ``CellMerge.FIRST`` and only
    over the run of ``CellMerge.PREVIOUS`` cells that directly follows it.
    The earlier version kept scanning to the end of the row/column, so a
    second merged group further along inflated the span of the first one.
    Cells flagged ``PREVIOUS`` are still emitted, with spans of 1.
    """
    first = aw.tables.CellMerge.FIRST
    previous = aw.tables.CellMerge.PREVIOUS

    table_index = tables.index_of(table)  # same for every cell of this table
    rows = [row.as_row() for row in table.rows]

    table_data = []
    for row_index, row in enumerate(rows):
        cells = [cell.as_cell() for cell in row.cells]
        for cell_index, cell in enumerate(cells):
            col_span = 1
            row_span = 1

            if cell.cell_format.horizontal_merge == first:
                for follower in cells[cell_index + 1:]:
                    if follower.cell_format.horizontal_merge != previous:
                        break  # end of this merged run
                    col_span += 1

            if cell.cell_format.vertical_merge == first:
                # NOTE(review): rows are matched by cell index, which assumes
                # every row has the same cell layout -- confirm for tables
                # whose rows differ in cell count or width.
                for lower in rows[row_index + 1:]:
                    if cell_index >= lower.cells.count:
                        break  # shorter row: nothing below to merge with
                    below = lower.cells[cell_index]
                    if below.cell_format.vertical_merge != previous:
                        break  # end of this merged run
                    row_span += 1

            table_data.append({
                "type": "tableCell",
                "attrs": {
                    "colspan": col_span,
                    "rowspan": row_span,
                    "id": f"{table_index}.{row_index}.{cell_index}",
                },
                "content": [
                    {
                        "type": "paragraph",
                        "content": [
                            {
                                "type": "text",
                                "text": cell.get_text(),
                            }
                        ],
                    }
                ],
            })

    return table_data

doc = aw.Document("CSR.docx")
current_level = 0
data = []
stack = []
for s in doc.sections:
    sect = s.as_section()
    for node in sect.body.get_child_nodes(aw.NodeType.ANY, False):
        if node.node_type == aw.NodeType.PARAGRAPH:
            node = node.as_paragraph()
            if node.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
                level = int(node.paragraph_format.outline_level) + 1
                if level > current_level:
                    # 如果级别更深，将当前标题添加到堆栈
                    stack.append((current_level, data))
                    data = []
                    current_level = level
                elif level < current_level:
                    # 如果级别更浅，将堆栈中的项添加回数据
                    while stack and stack[-1][0] >= level:
                        old_level, old_data = stack.pop()
                        data = old_data + data
                        current_level = old_level
                data.append(
                    {
                        "Title": node.get_text(),
                        "Content": [],
                        "Level": level,
                        "Table": [],
                        "Tbale_name": [],
                    }
                )
            else:
                if data:
                    if node.get_text().startswith("表"):
                        data[-1]["Tbale_name"].append(
                            node.get_text().strip("SEQ \* ARABIC").strip("SEQ")
                        )
                    if (
                            node.get_text().startswith("表")
                            or node.get_text().startswith("来源：")
                            or node.get_text().startswith("图")
                    ):
                        pass
                    else:

                        data[-1]["Content"].append(node.get_text())
        if data:
            if node.node_type == aw.NodeType.TABLE:
                parent_node = node.as_table()
                tables = doc.get_child_nodes(aw.NodeType.TABLE, True)
                able_content = self.aw_read_table(parent_node, tables)
                data[-1]["Table"].append(able_content)

while stack:
    old_level, old_data = stack.pop()
    data = old_data + data

return data

希望能帮到你。

hhh1111 · May 9, 2024, 3:04am

vyacheslav.deryushev:

parent_node = node.as_table()
                tables = doc.get_child_nodes(aw.NodeType.TABLE, True)
                able_content = self.aw_read_table(parent_node, tables)

表格出来合并的数据不对呀

这里 rowspan不可能是1啊

def aw_extract_headings_and_contents_table_dict_id(file):
    """Group a document's paragraphs and tables by heading, tagging each with ids.

    Args:
        file: Passed straight to ``aw.Document``.

    Returns:
        list[dict]: One dict per heading paragraph (outline level 0-5) with
        ``"Title"``, ``"block_id"``, ``"Content"``, ``"Level"``, ``"Table"``
        and ``"Tbale_name"``.  ``"Content"`` receives a ``"text"`` entry for
        every non-blank paragraph and a ``"table"`` entry (built with
        ``aw_read_table_id``) for every table that follows the heading.
        Nodes before the first heading are skipped.

    Two ids are drawn from ``generate_unique_id`` for every visited node,
    whether or not they end up being used.
    """
    import aspose.words as aw
    lic = aw.License()
    lic_path = os.path.join(settings.BASE_PATH, "core/Aspose.Total.Product.Family.lic")
    lic.set_license(lic_path)
    document = aw.Document(file)
    depth = 0
    entries = []
    parked = []
    for section in document.sections:
        body = section.as_section().body
        for node in body.get_child_nodes(aw.NodeType.ANY, False):
            heading_id = generate_unique_id()
            child_id = generate_unique_id()

            if node.node_type == aw.NodeType.PARAGRAPH:
                paragraph = node.as_paragraph()
                outline = paragraph.paragraph_format.outline_level
                if outline in [0, 1, 2, 3, 4, 5]:
                    level = int(outline) + 1
                    if level > depth:
                        # Deeper heading: park what has been collected so far.
                        parked.append((depth, entries))
                        entries = []
                        depth = level
                    elif level < depth:
                        # Shallower heading: put parked entries back in front.
                        while parked and parked[-1][0] >= level:
                            depth, earlier = parked.pop()
                            entries = earlier + entries
                    entries.append(
                        {
                            "Title": paragraph.get_text(),
                            "block_id": str(heading_id),
                            "Content": [],
                            "Level": level,
                            "Table": [],
                            "Tbale_name": [],
                        }
                    )
                elif entries:
                    text = paragraph.get_text().strip()
                    if text:
                        owner_id = entries[-1]["block_id"]
                        entries[-1]["Content"].append(
                            {"type": "text", "content": text,
                             "block_id": owner_id + '&&' + str(child_id),
                             "parent_block_id": owner_id})
                continue

            if not entries or node.node_type != aw.NodeType.TABLE:
                continue

            owner_id = entries[-1]["block_id"]
            table_id = owner_id + '&&' + str(child_id)
            word_table = node.as_table()
            all_tables = document.get_child_nodes(aw.NodeType.TABLE, True)
            table_rows = aw_read_table_id(word_table, all_tables)
            table_node = {"type": "table",
                          "attrs": {"id": table_id},
                          "content": table_rows}
            entries[-1]["Content"].append(
                {"type": "table",
                 "content": table_node,
                 "block_id": table_id,
                 "parent_block_id": owner_id})

    # Re-attach whatever is still parked so the result is one flat list.
    while parked:
        _, earlier = parked.pop()
        entries = earlier + entries
    return entries
def aw_read_table_id(table, tables):
    """Convert ``table`` into a list of "tableRow" dicts with merged-cell spans.

    Args:
        table: The table to read (must expose ``rows``).
        tables: Unused; kept so existing callers do not have to change.

    Returns:
        list[dict]: One ``{"type": "tableRow", "content": [...]}`` per row.
        Every cell becomes a ``"tableCell"`` with ``"colspan"``,
        ``"rowspan"`` and ``"colwidth": None`` attributes and a single
        paragraph holding ``cell.get_text()``.

    A span is counted only for a cell flagged ``CellMerge.FIRST`` and only
    over the run of ``CellMerge.PREVIOUS`` cells that directly follows it.
    The earlier version kept scanning to the end of the row/column, so a
    second merged group further along inflated the span of the first one.
    Cells flagged ``PREVIOUS`` are still emitted, with spans of 1.
    """
    import aspose.words as aw

    lic = aw.License()
    lic_path = os.path.join(settings.BASE_PATH, "core/Aspose.Total.Product.Family.lic")
    lic.set_license(lic_path)

    first = aw.tables.CellMerge.FIRST
    previous = aw.tables.CellMerge.PREVIOUS

    rows = [row.as_row() for row in table.rows]
    table_data = []
    for row_index, row in enumerate(rows):
        cells = [cell.as_cell() for cell in row.cells]
        row_node = {
            "type": "tableRow",
            "content": []
        }
        for cell_index, cell in enumerate(cells):
            col_span = 1
            row_span = 1

            if cell.cell_format.horizontal_merge == first:
                for follower in cells[cell_index + 1:]:
                    if follower.cell_format.horizontal_merge != previous:
                        break  # end of this merged run
                    col_span += 1

            if cell.cell_format.vertical_merge == first:
                # NOTE(review): rows are matched by cell index, which assumes
                # every row has the same cell layout -- confirm for tables
                # whose rows differ in cell count or width.
                for lower in rows[row_index + 1:]:
                    if cell_index >= lower.cells.count:
                        break  # shorter row: nothing below to merge with
                    below = lower.cells[cell_index]
                    if below.cell_format.vertical_merge != previous:
                        break  # end of this merged run
                    row_span += 1

            row_node["content"].append({
                "type": "tableCell",
                "attrs": {
                    "colspan": col_span,
                    "rowspan": row_span,
                    "colwidth": None
                },
                "content": [
                    {
                        "type": "paragraph",
                        "content": [
                            {
                                "type": "text",
                                "text": cell.get_text(),
                            }
                        ]
                    }
                ]
            })
        table_data.append(row_node)
    return table_data