import aspose.words as aw
from bs4 import BeautifulSoup


def aw_read_table_id(table=None):
    # Convert the table node to an HTML string
    html = table.to_string(aw.SaveFormat.HTML)
    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(html, "html.parser")
    # Drop unwanted tags and attributes, keeping only the basic table structure
    allowed_tags = ['table', 'tr', 'td']
    for tag in soup.find_all():
        if tag.name not in allowed_tags:
            tag.unwrap()  # keep the tag's contents but remove the tag itself
        else:
            tag.attrs = {}  # strip all attributes from the kept tag
    # Return the cleaned HTML string
    clean_html = str(soup)
    return clean_html
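
For reference, the tag-stripping loop above can be exercised on plain HTML without Aspose.Words; the sample table below is hypothetical and only illustrates what the cleanup keeps:

from bs4 import BeautifulSoup

# Hypothetical styled table, similar in shape to the HTML that Aspose.Words emits
sample = ('<div><table class="grid" style="border:1px">'
          '<tbody><tr><td style="width:10px"><span>A</span></td><td>B</td></tr></tbody></table></div>')
soup = BeautifulSoup(sample, "html.parser")
for tag in soup.find_all():
    if tag.name not in ('table', 'tr', 'td'):
        tag.unwrap()      # <div>, <tbody>, <span> disappear but their children stay
    else:
        tag.attrs = {}    # class/style are removed from the kept tags
print(str(soup))          # -> <table><tr><td>A</td><td>B</td></tr></table>
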
# Extract the document's headings and body content (text and tables), in order
async def aw_extract_headings_and_contents_table_dict_id(file):
    doc = aw.Document(file)
    current_level = 0
    data = []
    doc.update_list_labels()
    stack = []
    for s in doc.sections:
        sect = s.as_section()
        for node in sect.body.get_child_nodes(aw.NodeType.ANY, True):
            if node.node_type == aw.NodeType.PARAGRAPH:
                node = node.as_paragraph()
                if node.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
                    if node.node_type == aw.NodeType.FIELD_START:
                        continue
                    level = int(node.paragraph_format.outline_level) + 1
                    if level > current_level:
                        # Entering a deeper heading level: remember the enclosing block list
                        stack.append((current_level, data))
                        data = []
                        current_level = level
                    elif level < current_level:
                        # Returning to a shallower level: merge the nested blocks back in order
                        while stack and stack[-1][0] >= level:
                            old_level, old_data = stack.pop()
                            data = old_data + data
                            current_level = old_level
                    label = ''
                    if node.list_format.is_list_item:
                        label = node.list_label.label_string
                    text_without_comments = node.get_text().strip()
                    # generate_unique_id() is assumed to be defined elsewhere in the project
                    data.append(
                        {"content": label + text_without_comments if label else text_without_comments,
                         "level": level, "type": "title", "block_id": generate_unique_id(), "content_type": 1})
                else:
                    if node.get_text().strip() and not node.get_ancestor(
                            aw.NodeType.TABLE) and not node.get_ancestor(aw.NodeType.FIELD_START) and data:
                        # Strip leftover field codes (SEQ / TOC / HYPERLINK) from the paragraph text
                        data.append({"content": node.get_text().strip().replace(r" SEQ 表 \* ARABIC ", '').replace(
                            r'TOC \h \c "表" HYPERLINK \l "_Toc14741"', '').replace(
                            "\u0013 SEQ 图 \\* ARABIC \u00141\u0015 ", ''), "type": "text",
                            "block_id": generate_unique_id(), "content_type": 2})
            if node.node_type == aw.NodeType.TABLE:
                parent_node = node.as_table()
                table_content = aw_read_table_id(parent_node)
                if data:
                    data.append(
                        {"content": table_content, "type": "table", "block_id": generate_unique_id(),
                         "content_type": 3})
    # Flush anything still on the stack so the result is one flat, ordered list
    while stack:
        old_level, old_data = stack.pop()
        data = old_data + data
    return data
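
To run the coroutine above, something like the following should work (the document path is a hypothetical placeholder):

import asyncio

blocks = asyncio.run(aw_extract_headings_and_contents_table_dict_id("sample.docx"))
for block in blocks:
    print(block["content_type"], block.get("level", "-"), block["content"][:60])
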
I want to extract the PDF's headings, with the body content split into tables and text, and everything has to come out in document order.
There is no further information; it is just an ordinary PDF file being read.
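
The code above targets Word documents through Aspose.Words. For a plain PDF, a minimal sketch with pdfplumber (an assumption on my side, not part of the code above; the helper name extract_pdf_blocks_in_order is hypothetical) could at least separate tables from running text while keeping top-to-bottom page order. Note that an ordinary PDF carries no heading markup, so heading detection would still need extra heuristics such as font size:

import pdfplumber

def extract_pdf_blocks_in_order(path):
    """Return text and table blocks per page, sorted top to bottom."""
    blocks = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            tables = page.find_tables()
            bboxes = [t.bbox for t in tables]  # (x0, top, x1, bottom)

            def outside_tables(obj):
                # Keep only objects whose centre lies outside every table box
                cx = (obj["x0"] + obj["x1"]) / 2
                cy = (obj["top"] + obj["bottom"]) / 2
                return not any(x0 <= cx <= x1 and top <= cy <= bottom
                               for (x0, top, x1, bottom) in bboxes)

            items = []
            for t in tables:
                items.append((t.bbox[1], {"type": "table", "content": t.extract()}))
            for line in page.filter(outside_tables).extract_text_lines():
                items.append((line["top"], {"type": "text", "content": line["text"]}))
            # Sort by vertical position so text and tables interleave in reading order
            items.sort(key=lambda pair: pair[0])
            blocks.extend(item for _, item in items)
    return blocks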