怎么查询docx表格内容以及合并规则

hhh1111 · May 7, 2024, 2:44am

需要这样处理数据，需要每一行每个单元格加上合并的规则

hhh1111 · May 7, 2024, 2:44am

def aw_read_table(tables):
    """Flatten a table into tab/newline-separated text.

    Args:
        tables: Table-like object exposing ``rows``; every row must provide
            ``as_row().cells`` and every cell ``as_cell().get_text()``.

    Returns:
        str: One line per row (newline-joined); within a row the stripped
        cell texts are joined with a tab.
    """
    lines = []
    for row in tables.rows:
        # Join every cell unconditionally. The previous "add a tab only when
        # the row text is non-empty" logic lost the separator after empty
        # leading cells, which shifted the remaining columns to the left.
        cell_texts = [
            cell.as_cell().get_text().strip() for cell in row.as_row().cells
        ]
        lines.append("\t".join(cell_texts))
    return "\n".join(lines)

hhh1111 · May 7, 2024, 2:46am

可以给我写一个通用的demo吗，因为每个表格合并的规则都不一样

hhh1111 · May 7, 2024, 5:45am

@alexey.noskov

hhh1111 · May 7, 2024, 5:46am

# Build one "tableCell" node: span values and block id go into "attrs", the
# cell text is wrapped in a single paragraph. Then append it to the row.
cell_attrs = {
    "colspan": colspan,
    "rowspan": rowspan,
    "id": item["block_id"],
}
paragraph_node = {
    "type": "paragraph",
    "content": [{"type": "text", "text": cell}],
}
table_row["content"].append(
    {"type": "tableCell", "attrs": cell_attrs, "content": [paragraph_node]}
)

我想处理成这样的格式

hhh1111 · May 7, 2024, 5:44am

可以看下我发布的帖子吗

hhh1111 · May 7, 2024, 6:11am

怎么查询docx表格内容以及合并规则 - #6 by hhh1111 可以看下我的嘛

hhh1111 · May 7, 2024, 6:23am

看下我的帖子？？？？？？？？？

hhh1111 · May 7, 2024, 6:24am

帮我看下我的帖子？？？？

hhh1111 · May 7, 2024, 7:34am

测试.docx (21.0 KB)

处理后的格式需要如下
{
    "type": "tableCell",
    "attrs": {
        "colspan": colspan,
        "rowspan": rowspan,
        'id': item["block_id"],
    },
    "content": [
        {
            "type": "paragraph",
            "content": [
                {
                    "type": "text",
                    "text": cell,
                }
            ],
        }
    ],
}

hhh1111 · May 7, 2024, 8:09am

我的帖子大家看不见嘛？？？？？

hhh1111 · May 7, 2024, 8:10am

你可以看见我的帖子嘛？？？？？

hhh1111 · May 7, 2024, 8:11am

请问可以看见我的帖子嘛？？？？？？？？？？？？？

vyacheslav.deryushev · May 7, 2024, 8:47am

@hhh1111 要获取行跨度和列跨度，您需要将 docx 文件转换为 HTML 文件，然后像这样收集以下信息：

def test_merged_cells(self):
    """Record the colspan/rowspan of every ``<td>`` in the HTML export of merged.docx.

    The document is saved as HTML into an in-memory stream and parsed with
    ``xml_tree``. For each ``<table>`` a list of rows (each a list of
    ``self.CellInfo``) is appended to ``self.mTables``. A missing or empty
    span attribute is recorded as 0.
    """
    document = aw.Document("merged.docx")

    with BytesIO() as buffer:
        html_options = aw.saving.HtmlSaveOptions()
        html_options.images_folder = ARTIFACTS_DIR
        document.save(buffer, html_options)
        buffer.seek(0)  # rewind so the parser reads from the start
        parsed = xml_tree.parse(buffer)

    def span_of(td, attr_name):
        # Attribute absent or empty -> 0, otherwise its integer value.
        raw = td.get(attr_name)
        return int(raw) if raw else 0

    for table_el in parsed.findall(".//table"):
        self.mTables.append(
            [
                [
                    self.CellInfo(span_of(td, "colspan"), span_of(td, "rowspan"))
                    for td in tr.findall(".//td")
                ]
                for tr in table_el.findall(".//tr")
            ]
        )

    print("test")

class CellInfo:
    """Column-span / row-span pair recorded for one table cell."""

    def __init__(self, colSpan, rowSpan):
        # Both values are stored exactly as passed in.
        self.ColSpan, self.RowSpan = colSpan, rowSpan

# Collects one entry per parsed HTML table (a list of rows, each a list of CellInfo).
# NOTE(review): test_merged_cells uses self.mTables, so this is presumably a
# class attribute of the enclosing test class - confirm.
mTables = []

hhh1111 · May 7, 2024, 8:51am

那如果我的文件既有表格信息又有段落信息呢？我只想把表格信息处理成上面的格式
def aw_read_table(tables):
    """Flatten a table into tab/newline-separated text.

    Args:
        tables: Table-like object exposing ``rows``; every row must provide
            ``as_row().cells`` and every cell ``as_cell().get_text()``.

    Returns:
        str: One line per row (newline-joined); within a row the stripped
        cell texts are joined with a tab.
    """
    lines = []
    for row in tables.rows:
        # Join every cell unconditionally. The previous "add a tab only when
        # the row text is non-empty" logic lost the separator after empty
        # leading cells, which shifted the remaining columns to the left.
        cell_texts = [
            cell.as_cell().get_text().strip() for cell in row.as_row().cells
        ]
        lines.append("\t".join(cell_texts))
    return "\n".join(lines)

def aw_extract_headings_and_contents_table_dict(file):
    """Split a Word document into heading entries with their text and tables.

    Walks the direct children of every section body. A paragraph whose
    outline level is 0-5 opens a new entry; later paragraphs and tables are
    attached to the most recently opened entry.

    Args:
        file: Passed straight to ``aw.Document``.

    Returns:
        list[dict]: Entries with the keys ``"Title"``, ``"Content"``,
        ``"Level"`` (outline level + 1), ``"Table"`` (text produced by
        ``aw_read_table``) and ``"Tbale_name"`` (paragraphs starting with
        "表"). The misspelled key is kept as-is because it is part of the
        returned data.
    """
    import aspose.words as aw

    # The license is applied on every call.
    lic = aw.License()
    lic_path = os.path.join(settings.BASE_PATH, "core/Aspose.Total.Product.Family.lic")
    lic.set_license(lic_path)

    doc = aw.Document(file)
    current_level = 0
    data = []
    stack = []
    # Paragraphs starting with one of these prefixes are not copied into "Content".
    skipped_prefixes = ("表", "来源：", "图")
    for s in doc.sections:
        sect = s.as_section()
        for node in sect.body.get_child_nodes(aw.NodeType.ANY, False):
            if node.node_type == aw.NodeType.PARAGRAPH:
                node = node.as_paragraph()
                text = node.get_text()
                if node.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
                    level = int(node.paragraph_format.outline_level) + 1
                    if level > current_level:
                        # Deeper level: push the current level together with
                        # the entries collected so far and start a new list.
                        stack.append((current_level, data))
                        data = []
                        current_level = level
                    elif level < current_level:
                        # Shallower level: pop stacked entries back in front
                        # of the current data.
                        while stack and stack[-1][0] >= level:
                            old_level, old_data = stack.pop()
                            data = old_data + data
                            current_level = old_level
                    data.append(
                        {
                            "Title": text,
                            "Content": [],
                            "Level": level,
                            "Table": [],
                            "Tbale_name": [],
                        }
                    )
                elif data:
                    if text.startswith("表"):
                        # NOTE(review): str.strip() treats its argument as a
                        # set of characters to trim from both ends, not as a
                        # substring, and the second strip is a subset of the
                        # first. If the goal is to remove the "SEQ \* ARABIC"
                        # field code, this does not do that - confirm intent.
                        data[-1]["Tbale_name"].append(
                            text.strip(r"SEQ \* ARABIC").strip("SEQ")
                        )
                    if not text.startswith(skipped_prefixes):
                        data[-1]["Content"].append(text)
            elif node.node_type == aw.NodeType.TABLE:
                # Tables seen before the first heading are ignored.
                if data:
                    parent_node = node.as_table()
                    able_content = aw_read_table(parent_node)
                    data[-1]["Table"].append(able_content)
    # Fold whatever is still on the stack back in front of the remaining entries.
    while stack:
        old_level, old_data = stack.pop()
        data = old_data + data
    return data

vyacheslav.deryushev · May 7, 2024, 9:03am

@hhh1111 这没有影响，因为您只解析表格信息。

hhh1111 · May 7, 2024, 9:05am

不是的如果我文件是这样的呢
CSR-20240410-h63D4QaGoR.docx (838.1 KB)

我需要table数据为这种格式
def aw_read_table(tables):
    """Flatten a table into tab/newline-separated text.

    Args:
        tables: Table-like object exposing ``rows``; every row must provide
            ``as_row().cells`` and every cell ``as_cell().get_text()``.

    Returns:
        str: One line per row (newline-joined); within a row the stripped
        cell texts are joined with a tab.
    """
    lines = []
    for row in tables.rows:
        # Join every cell unconditionally. The previous "add a tab only when
        # the row text is non-empty" logic lost the separator after empty
        # leading cells, which shifted the remaining columns to the left.
        cell_texts = [
            cell.as_cell().get_text().strip() for cell in row.as_row().cells
        ]
        lines.append("\t".join(cell_texts))
    return "\n".join(lines)

def aw_extract_headings_and_contents_table_dict(file):
    """Split a Word document into heading entries with their text and tables.

    Walks the direct children of every section body. A paragraph whose
    outline level is 0-5 opens a new entry; later paragraphs and tables are
    attached to the most recently opened entry.

    Args:
        file: Passed straight to ``aw.Document``.

    Returns:
        list[dict]: Entries with the keys ``"Title"``, ``"Content"``,
        ``"Level"`` (outline level + 1), ``"Table"`` (text produced by
        ``aw_read_table``) and ``"Tbale_name"`` (paragraphs starting with
        "表"). The misspelled key is kept as-is because it is part of the
        returned data.
    """
    import aspose.words as aw

    # The license is applied on every call.
    lic = aw.License()
    lic_path = os.path.join(settings.BASE_PATH, "core/Aspose.Total.Product.Family.lic")
    lic.set_license(lic_path)

    doc = aw.Document(file)
    current_level = 0
    data = []
    stack = []
    # Paragraphs starting with one of these prefixes are not copied into "Content".
    skipped_prefixes = ("表", "来源：", "图")
    for s in doc.sections:
        sect = s.as_section()
        for node in sect.body.get_child_nodes(aw.NodeType.ANY, False):
            if node.node_type == aw.NodeType.PARAGRAPH:
                node = node.as_paragraph()
                text = node.get_text()
                if node.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
                    level = int(node.paragraph_format.outline_level) + 1
                    if level > current_level:
                        # Deeper level: push the current level together with
                        # the entries collected so far and start a new list.
                        stack.append((current_level, data))
                        data = []
                        current_level = level
                    elif level < current_level:
                        # Shallower level: pop stacked entries back in front
                        # of the current data.
                        while stack and stack[-1][0] >= level:
                            old_level, old_data = stack.pop()
                            data = old_data + data
                            current_level = old_level
                    data.append(
                        {
                            "Title": text,
                            "Content": [],
                            "Level": level,
                            "Table": [],
                            "Tbale_name": [],
                        }
                    )
                elif data:
                    if text.startswith("表"):
                        # NOTE(review): str.strip() treats its argument as a
                        # set of characters to trim from both ends, not as a
                        # substring, and the second strip is a subset of the
                        # first. If the goal is to remove the "SEQ \* ARABIC"
                        # field code, this does not do that - confirm intent.
                        data[-1]["Tbale_name"].append(
                            text.strip(r"SEQ \* ARABIC").strip("SEQ")
                        )
                    if not text.startswith(skipped_prefixes):
                        data[-1]["Content"].append(text)
            elif node.node_type == aw.NodeType.TABLE:
                # Tables seen before the first heading are ignored.
                if data:
                    parent_node = node.as_table()
                    able_content = aw_read_table(parent_node)
                    data[-1]["Table"].append(able_content)
    # Fold whatever is still on the stack back in front of the remaining entries.
    while stack:
        old_level, old_data = stack.pop()
        data = old_data + data
    return data

def test_merged_cells(self):
    """Record the colspan/rowspan of every ``<td>`` in the HTML export of merged.docx.

    The document is saved as HTML into an in-memory stream and parsed with
    ``xml_tree``. For each ``<table>`` a list of rows (each a list of
    ``self.CellInfo``) is appended to ``self.mTables``. A missing or empty
    span attribute is recorded as 0.
    """
    document = aw.Document("merged.docx")

    with BytesIO() as buffer:
        html_options = aw.saving.HtmlSaveOptions()
        html_options.images_folder = ARTIFACTS_DIR
        document.save(buffer, html_options)
        buffer.seek(0)  # rewind so the parser reads from the start
        parsed = xml_tree.parse(buffer)

    def span_of(td, attr_name):
        # Attribute absent or empty -> 0, otherwise its integer value.
        raw = td.get(attr_name)
        return int(raw) if raw else 0

    for table_el in parsed.findall(".//table"):
        self.mTables.append(
            [
                [
                    self.CellInfo(span_of(td, "colspan"), span_of(td, "rowspan"))
                    for td in tr.findall(".//td")
                ]
                for tr in table_el.findall(".//tr")
            ]
        )

    print("test")

class CellInfo:
    """Column-span / row-span pair recorded for one table cell."""

    def __init__(self, colSpan, rowSpan):
        # Both values are stored exactly as passed in.
        self.ColSpan, self.RowSpan = colSpan, rowSpan

# Collects one entry per parsed HTML table (a list of rows, each a list of CellInfo).
# NOTE(review): test_merged_cells uses self.mTables, so this is presumably a
# class attribute of the enclosing test class - confirm.
mTables = []

这个为 table 数据：

if data:
    if node.node_type == aw.NodeType.TABLE:
        parent_node = node.as_table()
        able_content = aw_read_table(parent_node)
        data[-1]["Table"].append(able_content)

vyacheslav.deryushev · May 7, 2024, 10:52am

@hhh1111 使用以下代码，您可以获取文档中各个表格的信息，并按您需要的任何格式进行整理。

# Export the document to HTML in memory, read colspan/rowspan from the <td>
# elements, then print them next to the text of the Word cell found at the
# same table/row/cell index.
# NOTE(review): `self` is used below, so this fragment presumably lives inside
# a method of the class that owns TableInfo/RowInfo/CellInfo - confirm.
doc = aw.Document("input.docx")
word_tables = doc.get_child_nodes(aw.NodeType.TABLE, True)

with BytesIO() as html_stream:
    save_options = aw.saving.HtmlSaveOptions()
    save_options.images_folder = ARTIFACTS_DIR
    doc.save(html_stream, save_options)
    html_stream.seek(0)  # rewind so the parser reads from the start
    xml_doc = xml_tree.parse(html_stream)

m_tables = []
for table_el in xml_doc.findall(".//table"):
    table_inf = self.TableInfo()
    for tr in table_el.findall(".//tr"):
        row_inf = self.RowInfo()
        for td in tr.findall(".//td"):
            raw_col_span = td.get("colspan")
            raw_row_span = td.get("rowspan")
            # A missing or empty span attribute is recorded as 0.
            row_inf.Cells.append(
                self.CellInfo(
                    int(raw_col_span) if raw_col_span else 0,
                    int(raw_row_span) if raw_row_span else 0,
                )
            )
        table_inf.Rows.append(row_inf)
    m_tables.append(table_inf)

for word_table in word_tables:
    for word_row in word_table.as_table().rows:
        for word_cell in word_row.as_row().cells:
            cell = word_cell.as_cell()
            parent_row = cell.parent_row
            parent_table = parent_row.parent_table
            tab_id = word_tables.index_of(parent_table)
            row_id = parent_table.index_of(parent_row)
            cell_id = parent_row.index_of(cell)

            # Spans stay 0 when the HTML side has no entry at this position.
            col_span = row_span = 0
            if tab_id < len(m_tables):
                html_rows = m_tables[tab_id].Rows
                if row_id < len(html_rows) and cell_id < len(html_rows[row_id].Cells):
                    span_info = html_rows[row_id].Cells[cell_id]
                    col_span = span_info.ColSpan
                    row_span = span_info.RowSpan

            print(
                "{0}.{1}.{2} colspan={3}\t rowspan={4}\t text={5}".format(
                    tab_id, row_id, cell_id, col_span, row_span, cell.get_text()
                )
            )


class TableInfo:
    """Span information for one table: holds the list of its rows."""

    def __init__(self):
        self.Rows: list = []  # starts empty; RowInfo objects are appended by the caller

class RowInfo:
    """Span information for one table row: holds the list of its cells."""

    def __init__(self):
        self.Cells: list = []  # starts empty; CellInfo objects are appended by the caller

class CellInfo:
    """Column-span / row-span pair recorded for one table cell."""

    def __init__(self, col_span, row_span):
        # Both values are stored exactly as passed in.
        self.ColSpan, self.RowSpan = col_span, row_span

hhh1111 · May 8, 2024, 1:25am

不对呀，我要通过以下代码，更改表格数据格式
lic = aw.License()
lic_path = os.path.join(settings.BASE_PATH, "core/Aspose.Total.Product.Family.lic")
lic.set_license(lic_path)

doc = aw.Document(file)
current_level = 0
data = []
stack = []
for s in doc.sections:
    sect = s.as_section()
    for node in sect.body.get_child_nodes(aw.NodeType.ANY, False):
        if node.node_type == aw.NodeType.PARAGRAPH:
            node = node.as_paragraph()
            if node.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
                level = int(node.paragraph_format.outline_level) + 1
                if level > current_level:
                    # 如果级别更深，将当前标题添加到堆栈
                    stack.append((current_level, data))
                    data = []
                    current_level = level
                elif level < current_level:
                    # 如果级别更浅，将堆栈中的项添加回数据
                    while stack and stack[-1][0] >= level:
                        old_level, old_data = stack.pop()
                        data = old_data + data
                        current_level = old_level
                data.append(
                    {
                        "Title": node.get_text(),
                        "Content": [],
                        "Level": level,
                        "Table": [],
                        "Tbale_name": [],
                    }
                )
            else:
                if data:
                    if node.get_text().startswith("表"):
                        data[-1]["Tbale_name"].append(
                            node.get_text().strip("SEQ \* ARABIC").strip("SEQ")
                        )
                    if (
                            node.get_text().startswith("表")
                            or node.get_text().startswith("来源：")
                            or node.get_text().startswith("图")
                    ):
                        pass
                    else:

                        data[-1]["Content"].append(node.get_text())
        if data:
            if node.node_type == aw.NodeType.TABLE:
                parent_node = node.as_table()
                able_content = aw_read_table(parent_node)
                data[-1]["Table"].append(able_content)

while stack:
    old_level, old_data = stack.pop()
    data = old_data + data
return data

if data:
    if node.node_type == aw.NodeType.TABLE:
        parent_node = node.as_table()
        able_content = aw_read_table(parent_node)
        data[-1]["Table"].append(able_content)

hhh1111 · May 8, 2024, 1:27am

if data:
    if node.node_type == aw.NodeType.TABLE:
        parent_node = node.as_table()
        able_content = aw_read_table(parent_node)
        data[-1]["Table"].append(able_content)

这个为读取的表格信息，我需要这个 table 数据知道合并规则：

if node.node_type == aw.NodeType.TABLE:
    parent_node = node.as_table()
    able_content = aw_read_table(parent_node)
    data[-1]["Table"].append(able_content)