import aspose.words as aw
from bs4 import BeautifulSoup


def aw_read_table_id(table=None):
    # Convert the table node to an HTML string
    html = table.to_string(aw.SaveFormat.HTML)
    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(html, "html.parser")
    # Drop unwanted tags and attributes, keeping only the basic table structure
    allowed_tags = ['table', 'tr', 'td']
    for tag in soup.find_all():
        if tag.name not in allowed_tags:
            tag.unwrap()  # keep the tag's contents but remove the tag itself
        else:
            tag.attrs = {}  # strip all attributes from the kept tag
    # Return the cleaned HTML string
    clean_html = str(soup)
    return clean_html
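
For reference, the tag-stripping loop above can be exercised on plain HTML without Aspose.Words; the sample table below is hypothetical and only illustrates what the cleanup keeps:

from bs4 import BeautifulSoup

# Hypothetical styled table, similar in shape to the HTML that Aspose.Words emits
sample = ('<div><table class="grid" style="border:1px">'
          '<tbody><tr><td style="width:10px"><span>A</span></td><td>B</td></tr></tbody></table></div>')
soup = BeautifulSoup(sample, "html.parser")
for tag in soup.find_all():
    if tag.name not in ('table', 'tr', 'td'):
        tag.unwrap()      # <div>, <tbody>, <span> disappear but their children stay
    else:
        tag.attrs = {}    # class/style are removed from the kept tags
print(str(soup))          # -> <table><tr><td>A</td><td>B</td></tr></table>
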
# Extract the document's headings and body content (text and tables), in order
async def aw_extract_headings_and_contents_table_dict_id(file):
    doc = aw.Document(file)
    current_level = 0
    data = []
    doc.update_list_labels()
    stack = []
    for s in doc.sections:
        sect = s.as_section()
        for node in sect.body.get_child_nodes(aw.NodeType.ANY, True):
            if node.node_type == aw.NodeType.PARAGRAPH:
                node = node.as_paragraph()
                if node.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
                    if node.node_type == aw.NodeType.FIELD_START:
                        continue
                    level = int(node.paragraph_format.outline_level) + 1
                    if level > current_level:
                        # Entering a deeper heading level: remember the enclosing block list
                        stack.append((current_level, data))
                        data = []
                        current_level = level
                    elif level < current_level:
                        # Returning to a shallower level: merge the nested blocks back in order
                        while stack and stack[-1][0] >= level:
                            old_level, old_data = stack.pop()
                            data = old_data + data
                            current_level = old_level
                    label = ''
                    if node.list_format.is_list_item:
                        label = node.list_label.label_string
                    text_without_comments = node.get_text().strip()
                    # generate_unique_id() is assumed to be defined elsewhere in the project
                    data.append(
                        {"content": label + text_without_comments if label else text_without_comments,
                         "level": level, "type": "title", "block_id": generate_unique_id(), "content_type": 1})
                else:
                    if node.get_text().strip() and not node.get_ancestor(
                            aw.NodeType.TABLE) and not node.get_ancestor(aw.NodeType.FIELD_START) and data:
                        # Strip leftover field codes (SEQ / TOC / HYPERLINK) from the paragraph text
                        data.append({"content": node.get_text().strip().replace(r" SEQ 表 \* ARABIC ", '').replace(
                            r'TOC \h \c "表" HYPERLINK \l "_Toc14741"', '').replace(
                            "\u0013 SEQ 图 \\* ARABIC \u00141\u0015 ", ''), "type": "text",
                            "block_id": generate_unique_id(), "content_type": 2})
            if node.node_type == aw.NodeType.TABLE:
                parent_node = node.as_table()
                table_content = aw_read_table_id(parent_node)
                if data:
                    data.append(
                        {"content": table_content, "type": "table", "block_id": generate_unique_id(),
                         "content_type": 3})
    # Flush anything still on the stack so the result is one flat, ordered list
    while stack:
        old_level, old_data = stack.pop()
        data = old_data + data
    return data
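
To run the coroutine above, something like the following should work (the document path is a hypothetical placeholder):

import asyncio

blocks = asyncio.run(aw_extract_headings_and_contents_table_dict_id("sample.docx"))
for block in blocks:
    print(block["content_type"], block.get("level", "-"), block["content"][:60])
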
I want to extract the PDF's headings, with the body content split into tables and text, and everything has to come out in document order.
There is no further information; it is just an ordinary PDF file being read.
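
The code above targets Word documents through Aspose.Words. For a plain PDF, a minimal sketch with pdfplumber (an assumption on my side, not part of the code above; the helper name extract_pdf_blocks_in_order is hypothetical) could at least separate tables from running text while keeping top-to-bottom page order. Note that an ordinary PDF carries no heading markup, so heading detection would still need extra heuristics such as font size:

import pdfplumber

def extract_pdf_blocks_in_order(path):
    """Return text and table blocks per page, sorted top to bottom."""
    blocks = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            tables = page.find_tables()
            bboxes = [t.bbox for t in tables]  # (x0, top, x1, bottom)

            def outside_tables(obj):
                # Keep only objects whose centre lies outside every table box
                cx = (obj["x0"] + obj["x1"]) / 2
                cy = (obj["top"] + obj["bottom"]) / 2
                return not any(x0 <= cx <= x1 and top <= cy <= bottom
                               for (x0, top, x1, bottom) in bboxes)

            items = []
            for t in tables:
                items.append((t.bbox[1], {"type": "table", "content": t.extract()}))
            for line in page.filter(outside_tables).extract_text_lines():
                items.append((line["top"], {"type": "text", "content": line["text"]}))
            # Sort by vertical position so text and tables interleave in reading order
            items.sort(key=lambda pair: pair[0])
            blocks.extend(item for _, item in items)
    return blocks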