为什么提取docx和段落内容前面的序号和文字提取不出来呢

hhh1111 · June 18, 2024, 2:25am

hhh1111 · June 18, 2024, 2:25am

代码：

class AW_ReadFileUtils:
    """Extract headings, paragraph text and tables from a document via Aspose.Words."""

    def aw_read_table(self, tables):
        """Flatten one table into text: cells joined by tabs, rows by newlines.

        Args:
            tables: A table node exposing ``rows``. Each row is converted with
                ``as_row()`` and each of its cells with ``as_cell()``.

        Returns:
            str: One line per row, with every cell's stripped text separated
            by a tab character. An empty table yields an empty string.
        """
        # Joining per row keeps empty cells in their column position; building
        # the row string incrementally silently dropped leading empty cells.
        # NOTE(review): get_text() may include Aspose control characters that
        # str.strip() does not remove - confirm against real documents.
        return "\n".join(
            "\t".join(
                cell.as_cell().get_text().strip() for cell in row.as_row().cells
            )
            for row in tables.rows
        )

    def aw_extract_headings_and_contents_table_dict(self, file):
        """Group the document body under its headings.

        Only paragraphs whose outline level equals 0 are treated as headings.
        Paragraphs and tables that appear before the first heading are skipped.

        Args:
            file: Whatever ``aw.Document(...)`` accepts (the caller in this
                listing passes a .docx path).

        Returns:
            list[dict]: One dict per heading with the keys ``"Title"``,
            ``"Content"`` (paragraph texts), ``"Level"``, ``"Table"``
            (tables flattened by ``aw_read_table``) and ``"Table_name"``
            (caption paragraphs starting with "表").
        """
        doc = aw.Document(file)
        current_level = 0
        data = []
        stack = []
        for s in doc.sections:
            body = s.as_section().body
            # Direct children only: nested nodes are not visited here.
            for node in body.get_child_nodes(aw.NodeType.ANY, False):
                if node.node_type == aw.NodeType.PARAGRAPH:
                    para = node.as_paragraph()
                    if para.paragraph_format.outline_level in (0,):
                        level = int(para.paragraph_format.outline_level) + 1
                        if level > current_level:
                            # Deeper level: park what was collected so far on the stack.
                            stack.append((current_level, data))
                            data = []
                            current_level = level
                        elif level < current_level:
                            # Shallower level: merge the parked items back into data.
                            while stack and stack[-1][0] >= level:
                                old_level, old_data = stack.pop()
                                data = old_data + data
                                current_level = old_level
                        data.append(
                            {
                                "Title": para.get_text().strip(),
                                "Content": [],
                                "Level": level,
                                "Table": [],
                                "Table_name": [],
                            }
                        )
                    elif data:
                        text = para.get_text()
                        if text.startswith("表"):
                            # Table caption: drop the field codes left in the raw
                            # text. Raw strings keep the backslashes literal
                            # without invalid-escape warnings.
                            data[-1]["Table_name"].append(
                                text.replace(r' STYLEREF 1 \s', '').replace(
                                    r' SEQ 表 \* ARABIC \s 1 ', '-')
                            )
                        elif not text.startswith(("来源：", "图")):
                            # Source lines and figure captions are not body content.
                            data[-1]["Content"].append(text)
                elif data and node.node_type == aw.NodeType.TABLE:
                    data[-1]["Table"].append(self.aw_read_table(node.as_table()))
        # Put anything still parked on the stack back in front of the result.
        while stack:
            old_level, old_data = stack.pop()
            data = old_data + data
        return data


import json

# Shared instance used by the script entry point below.
aw_read_file_utils = AW_ReadFileUtils()

if __name__ == "__main__":
    # Extract the heading/content structure from the contract document and
    # dump it as UTF-8 JSON into the current working directory.
    source_docx = '/Users/dip/Desktop/场景1、2海外专利资源服务项目合同-3.0 - （需要修改的文件）.docx'
    target_json = "场景1、2海外专利资源服务项目合同-3.0 - （需要修改的文件）.json"
    extracted = aw_read_file_utils.aw_extract_headings_and_contents_table_dict(
        source_docx)
    with open(target_json, "w", encoding="utf-8") as out:
        json.dump(extracted, out, ensure_ascii=False, indent=4)

alexey.noskov · June 18, 2024, 5:24am

@hhh1111 它们是列表标签。您应该先调用 Document.update_list_labels()，然后通过 Paragraph.list_label.label_string 属性获取其值。

hhh1111 · June 18, 2024, 5:29am

代码示例有吗？？？？？？？

hhh1111 · June 18, 2024, 5:35am

获取不到为什么？？？？？？
image.png (53.3 KB)

hhh1111 · June 18, 2024, 6:44am

已解决。。。。。。。。。。

hhh1111 · June 18, 2024, 10:14am

为什么批注内容也属于段落内容？怎么过滤掉？
image.png (72.2 KB)

hhh1111 · June 18, 2024, 10:17am

import os

import aspose.words as aw
from core.config import settings

# Apply the Aspose license file stored under settings.BASE_PATH.
# `os` must be imported for the path join below; it was missing before.
lic = aw.License()
lic_path = os.path.join(settings.BASE_PATH, "core/Aspose.Total.Product.Family.lic")
lic.set_license(lic_path)


class AW_ReadFileUtils:
    """Extract headings, paragraph text and tables from a document via Aspose.Words."""

    def aw_read_table(self, tables):
        """Flatten one table into text: cells joined by tabs, rows by newlines.

        Args:
            tables: A table node exposing ``rows``. Each row is converted with
                ``as_row()`` and each of its cells with ``as_cell()``.

        Returns:
            str: One line per row, with every cell's stripped text separated
            by a tab character. An empty table yields an empty string.
        """
        # Joining per row keeps empty cells in their column position; building
        # the row string incrementally silently dropped leading empty cells.
        # NOTE(review): get_text() may include Aspose control characters that
        # str.strip() does not remove - confirm against real documents.
        return "\n".join(
            "\t".join(
                cell.as_cell().get_text().strip() for cell in row.as_row().cells
            )
            for row in tables.rows
        )

    def aw_extract_headings_and_contents_table_dict(self, file):
        """Group the document body under its headings, including list labels.

        List labels are refreshed once via ``doc.update_list_labels()`` and
        prefixed to the text of paragraphs that are list items. Only
        paragraphs whose outline level equals 0 are treated as headings.
        Paragraphs and tables that appear before the first heading are skipped.

        Args:
            file: Whatever ``aw.Document(...)`` accepts.

        Returns:
            list[dict]: One dict per heading with the keys ``"Title"``,
            ``"Content"`` (non-empty, stripped paragraph texts), ``"Level"``,
            ``"Table"`` (tables flattened by ``aw_read_table``) and
            ``"Table_name"`` (caption paragraphs starting with "表").
        """
        doc = aw.Document(file)
        # Labels must be updated before list_label.label_string is read.
        doc.update_list_labels()
        current_level = 0
        data = []
        stack = []
        for s in doc.sections:
            body = s.as_section().body
            # Direct children only: nested nodes are not visited here.
            for node in body.get_child_nodes(aw.NodeType.ANY, False):
                if node.node_type == aw.NodeType.PARAGRAPH:
                    para = node.as_paragraph()
                    label = ''
                    if para.paragraph_format.outline_level in (0,):
                        level = int(para.paragraph_format.outline_level) + 1
                        if level > current_level:
                            # Deeper level: park what was collected so far on the stack.
                            stack.append((current_level, data))
                            data = []
                            current_level = level
                        elif level < current_level:
                            # Shallower level: merge the parked items back into data.
                            while stack and stack[-1][0] >= level:
                                old_level, old_data = stack.pop()
                                data = old_data + data
                                current_level = old_level
                        if para.list_format.is_list_item:
                            label = para.list_label.label_string
                        heading_text = para.get_text().strip()
                        title = label + heading_text if label else heading_text
                        # Console output kept from the original listing.
                        print(title)
                        data.append(
                            {
                                "Title": title,
                                "Content": [],
                                "Level": level,
                                "Table": [],
                                "Table_name": [],
                            }
                        )
                    elif data:
                        if para.list_format.is_list_item:
                            label = para.list_label.label_string
                        raw_text = para.get_text()
                        outside_table = not para.get_ancestor(aw.NodeType.TABLE)
                        if raw_text.startswith("表") and outside_table:
                            # Table caption: drop the field codes left in the raw
                            # text. Raw strings keep the backslashes literal
                            # without invalid-escape warnings.
                            data[-1]["Table_name"].append(
                                raw_text.replace(r' STYLEREF 1 \s', '').replace(
                                    r' SEQ 表 \* ARABIC \s 1 ', '-')
                            )
                        # NOTE(review): the original listing had a no-op check for
                        # paragraphs starting with "表", "来源：" or "图" here, so
                        # they are still appended to Content below - confirm
                        # whether they were meant to be skipped.
                        body_text = raw_text.strip()
                        if (
                                outside_table
                                and body_text
                                and not para.get_ancestor(aw.NodeType.COMMENT)
                        ):
                            data[-1]["Content"].append(
                                label + body_text if label else body_text)
                elif data and node.node_type == aw.NodeType.TABLE:
                    data[-1]["Table"].append(self.aw_read_table(node.as_table()))
        # Put anything still parked on the stack back in front of the result.
        while stack:
            old_level, old_data = stack.pop()
            data = old_data + data
        return data


# Module-level instance of AW_ReadFileUtils, created when this module is loaded.
aw_read_file_utils = AW_ReadFileUtils()
import json

代码段落内容已经过滤掉批注的内容。但是标题还是没有过滤掉

vyacheslav.deryushev · June 18, 2024, 12:30pm

@hhh1111 您可以使用两种方法获取不带批注的文本：

# Remove every comment node nested in this paragraph so that get_text() no
# longer contains comment text. Clearing the collection also removes those
# nodes from the loaded document.
node.get_child_nodes(aw.NodeType.COMMENT, True).clear()
text_without_comments = node.get_text().strip()

data.append(
    {
        "Title": label + text_without_comments if label else text_without_comments,
        "block_id": str(block_id),
        "Content": [],
        "Level": level,
        "Table": [],
        # Key spelled "Table_name" to match the extraction code earlier in the thread.
        "Table_name": [],
    }
)

或者

# Node types that carry comment content or mark a commented range.
comment_node_types = (
    aw.NodeType.COMMENT,
    aw.NodeType.COMMENT_RANGE_START,
    aw.NodeType.COMMENT_RANGE_END,
)
# Walk the paragraph's direct children only: a deep traversal would also visit
# the runs nested inside each comment and add their text anyway. The membership
# test replaces `x is not A or B or C`, which was always true and filtered
# nothing.
builder = [
    child.to_string(aw.SaveFormat.TEXT)
    for child in node.get_child_nodes(aw.NodeType.ANY, False)
    if child.node_type not in comment_node_types
]
result = ''.join(builder)
text_without_comments = result

data.append(
    {
        "Title": label + text_without_comments if label else text_without_comments,
        "block_id": str(block_id),
        "Content": [],
        "Level": level,
        "Table": [],
        # Key spelled "Table_name" to match the extraction code earlier in the thread.
        "Table_name": [],
    }
)

为什么提取docx和段落内容 前面的序号和文字提取不出来呢

为什么提取docx和段落内容前面的序号和文字提取不出来呢