Tiaohh
1
def read_table(tables):
    """Return the text of one table row as a single tab-delimited string.

    Args:
        tables: A node exposing ``as_row()``; the resulting row's ``cells``
            are read in order. NOTE(review): despite the plural name, the
            argument is cast to a single row — confirm what callers pass.

    Returns:
        The stripped text of every cell, each one preceded by a tab
        character (so a non-empty row starts with ``"\\t"``). A row with
        no cells yields an empty string.
    """
    # Each cell contributes "\t" + its stripped text; joining once avoids
    # rebuilding the string on every iteration.
    return "".join(
        "\t" + cell.as_cell().get_text().strip()
        for cell in tables.as_row().cells
    )
# Walk every paragraph of `doc` and group its text under the most recent
# heading entry in `data`:
#   * a paragraph outside any table whose outline level is 0-5 starts a new
#     entry (Level = outline level + 1);
#   * other paragraphs outside tables are filed as body text or, when they
#     start with "表" and use a Bullet/Caption style, as a table caption;
#   * paragraphs inside a table are appended to the current entry's "Table".
# NOTE(review): `doc`, `aw`, `stack`, `data` and `current_level` are not
# defined in this block — presumably initialised earlier in the file; confirm.
for paragraph in doc.get_child_nodes(aw.NodeType.PARAGRAPH, True):
    try:
        node = paragraph.as_paragraph()
        in_table = node.get_ancestor(aw.NodeType.TABLE) is not None

        if not in_table:
            para_format = node.paragraph_format
            if para_format.outline_level in (0, 1, 2, 3, 4, 5):
                # Debug trace of the heading's style name.
                print(para_format.style.name, "11111")
                level = int(para_format.outline_level) + 1
                if level > current_level:
                    # Deeper level: park the current level and the entries
                    # collected so far on the stack, then start afresh.
                    stack.append((current_level, data))
                    data = []
                    current_level = level
                elif level < current_level:
                    # Shallower level: fold stacked entries back in front of
                    # the current ones until the stack top is above `level`.
                    while stack and stack[-1][0] >= level:
                        old_level, old_data = stack.pop()
                        data = old_data + data
                        current_level = old_level
                data.append(
                    {
                        "Title": node.get_text(),
                        "Content": [],
                        "Level": level,
                        "Table": [],
                        # Key spelling is kept as-is; it is part of the
                        # emitted data structure.
                        "Tbale_name": [],
                    }
                )
            elif data:
                text = node.get_text()
                if text.startswith("表"):
                    style_name = para_format.style.name
                    if "Bullet" in style_name or "Caption" in style_name:
                        # NOTE(review): str.strip() treats its argument as a
                        # set of characters, not a substring, so this trims
                        # any of those characters from both ends (including
                        # a leading "表") rather than removing the literal
                        # field code. Behaviour is kept unchanged; confirm
                        # the intended result before switching to replace().
                        data[-1]["Tbale_name"].append(
                            text.strip(r"SEQ 表 \* ARABIC").strip("SEQ")
                        )
                # Table captions, source notes and figure captions are kept
                # out of the body text.
                if not text.startswith(("表", "来源:", "图")):
                    data[-1]["Content"].append(text)
        elif data:
            # Text inside a table is stored paragraph by paragraph. Table
            # paragraphs seen before any heading have no entry to attach to
            # and are skipped instead of raising IndexError.
            data[-1]["Table"].append(node.get_text())
    except Exception as e:
        # Best effort: report the problem and continue with the next
        # paragraph.
        print(e)

# Fold whatever is still parked on the stack back in front of `data`.
while stack:
    old_level, old_data = stack.pop()
    data = old_data + data