def aw_extract_headings_and_contents_table_dict(file):
    """Collect headings with their body text, table captions and tables.

    Walks the direct children of every section body in document order.
    A paragraph whose outline level is 0-5 starts a new entry; the
    paragraphs and tables that follow are attached to the most recent
    entry. Anything that appears before the first heading is ignored.

    Args:
        file: Whatever ``aw.Document`` accepts as its source argument.

    Returns:
        A flat list of dicts in document order, each with the keys
        ``"Title"``, ``"Content"``, ``"Level"`` (outline level + 1),
        ``"Table"`` (results of ``aw_read_table``) and ``"Tbale_name"``
        (caption paragraphs starting with "表"; the key spelling is kept
        because callers read it).
    """
    import aspose.words as aw

    lic = aw.License()
    lic_path = os.path.join(settings.BASE_PATH, "core/Aspose.Total.Product.Family.lic")
    lic.set_license(lic_path)
    doc = aw.Document(file)

    # The previous level-stack only ever re-joined its lists in document
    # order, so the result was always a flat list; build it directly.
    data = []
    for s in doc.sections:
        sect = s.as_section()
        for node in sect.body.get_child_nodes(aw.NodeType.ANY, False):
            if node.node_type == aw.NodeType.PARAGRAPH:
                para = node.as_paragraph()
                text = para.get_text()
                if para.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
                    data.append(
                        {
                            "Title": text,
                            "Content": [],
                            "Level": int(para.paragraph_format.outline_level) + 1,
                            "Table": [],
                            "Tbale_name": [],
                        }
                    )
                elif data:
                    if text.startswith("表"):
                        # Table caption. NOTE(review): str.strip() treats its
                        # argument as a set of characters to trim from both
                        # ends, not as a substring - confirm this is what is
                        # wanted for removing the SEQ field code.
                        data[-1]["Tbale_name"].append(text.strip(r"SEQ \* ARABIC"))
                    elif not text.startswith(("来源:", "图")):
                        # Source notes and figure captions are not body text.
                        data[-1]["Content"].append(text)
            elif node.node_type == aw.NodeType.TABLE and data:
                data[-1]["Table"].append(aw_read_table(node.as_table()))
    return data
我这样按照下标来定位文件段落内容的位置,是不是有点不太合理呢?
def aw_extract_headings_and_contents_table_dict_id(file):
    """Collect headings with their content, printing each paragraph's index.

    Same extraction as the non-``_id`` variant, but every paragraph that is
    a direct child of a section body is printed together with its position
    among that body's child nodes (the index restarts for each section).

    Args:
        file: Whatever ``aw.Document`` accepts as its source argument.

    Returns:
        A flat list of dicts in document order, each with the keys
        ``"Title"``, ``"Content"``, ``"Level"`` (outline level + 1),
        ``"Table"`` (results of ``aw_read_table``) and ``"Tbale_name"``
        (caption paragraphs starting with "表"; the key spelling is kept
        because callers read it).
    """
    import aspose.words as aw

    lic = aw.License()
    lic_path = os.path.join(settings.BASE_PATH, "core/Aspose.Total.Product.Family.lic")
    lic.set_license(lic_path)
    doc = aw.Document(file)

    # The previous level-stack only ever re-joined its lists in document
    # order, so the result was always a flat list; build it directly.
    data = []
    for s in doc.sections:
        sect = s.as_section()
        children = sect.body.get_child_nodes(aw.NodeType.ANY, False)
        for index, node in enumerate(children):
            if node.node_type == aw.NodeType.PARAGRAPH:
                para = node.as_paragraph()
                text = para.get_text()
                # Debug trace: child index within this section body + text.
                print(index, "信息-----", text)
                if para.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
                    data.append(
                        {
                            "Title": text,
                            "Content": [],
                            "Level": int(para.paragraph_format.outline_level) + 1,
                            "Table": [],
                            "Tbale_name": [],
                        }
                    )
                elif data:
                    if text.startswith("表"):
                        # Table caption. NOTE(review): str.strip() treats its
                        # argument as a set of characters to trim from both
                        # ends, not as a substring - confirm this is what is
                        # wanted for removing the SEQ field code.
                        data[-1]["Tbale_name"].append(text.strip(r"SEQ \* ARABIC"))
                    elif not text.startswith(("来源:", "图")):
                        # Source notes and figure captions are not body text.
                        data[-1]["Content"].append(text)
            elif node.node_type == aw.NodeType.TABLE and data:
                data[-1]["Table"].append(aw_read_table(node.as_table()))
    return data
如果我根据下标拿数据,这样不对呀:paragraphs[0].as_paragraph().get_text()
def aw_extract_headings_and_contents_table_dict_id(file):
    """Collect headings with their body text, table captions and tables.

    Before the extraction, prints the text of the paragraph that
    ``doc.get_child(aw.NodeType.PARAGRAPH, 4, True)`` returns, if any.
    The extraction itself walks the direct children of every section body
    in document order: a paragraph whose outline level is 0-5 starts a new
    entry, and the paragraphs and tables that follow are attached to the
    most recent entry. Anything before the first heading is ignored.

    Args:
        file: Whatever ``aw.Document`` accepts as its source argument.

    Returns:
        A flat list of dicts in document order, each with the keys
        ``"Title"``, ``"Content"``, ``"Level"`` (outline level + 1),
        ``"Table"`` (results of ``aw_read_table``) and ``"Tbale_name"``
        (caption paragraphs starting with "表"; the key spelling is kept
        because callers read it).
    """
    import aspose.words as aw

    lic = aw.License()
    lic_path = os.path.join(settings.BASE_PATH, "core/Aspose.Total.Product.Family.lic")
    lic.set_license(lic_path)
    doc = aw.Document(file)

    # Debug trace of one paragraph looked up by index over the whole
    # document. Guarded because the lookup presumably yields None when the
    # document has fewer paragraphs - TODO confirm against the API docs.
    paragraph = doc.get_child(aw.NodeType.PARAGRAPH, 4, True)
    if paragraph is not None:
        print(paragraph.as_paragraph().get_text())

    # The previous level-stack only ever re-joined its lists in document
    # order, so the result was always a flat list; build it directly.
    data = []
    for s in doc.sections:
        sect = s.as_section()
        # Iterate the nodes themselves: wrapping the collection in
        # enumerate() without unpacking bound each item to an
        # (index, node) tuple, so node.node_type raised AttributeError.
        for node in sect.body.get_child_nodes(aw.NodeType.ANY, False):
            if node.node_type == aw.NodeType.PARAGRAPH:
                para = node.as_paragraph()
                text = para.get_text()
                if para.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
                    data.append(
                        {
                            "Title": text,
                            "Content": [],
                            "Level": int(para.paragraph_format.outline_level) + 1,
                            "Table": [],
                            "Tbale_name": [],
                        }
                    )
                elif data:
                    if text.startswith("表"):
                        # Table caption. NOTE(review): str.strip() treats its
                        # argument as a set of characters to trim from both
                        # ends, not as a substring - confirm this is what is
                        # wanted for removing the SEQ field code.
                        data[-1]["Tbale_name"].append(text.strip(r"SEQ \* ARABIC"))
                    elif not text.startswith(("来源:", "图")):
                        # Source notes and figure captions are not body text.
                        data[-1]["Content"].append(text)
            elif node.node_type == aw.NodeType.TABLE and data:
                data[-1]["Table"].append(aw_read_table(node.as_table()))
    return data
# Look up one paragraph by index (4) with the deep-search flag set to True.
# NOTE(review): the index is presumably zero-based and counted over the whole
# document tree, not over a section body's direct children - confirm against
# the Aspose.Words get_child documentation.
paragraph = doc.get_child(aw.NodeType.PARAGRAPH, 4, True)
我看这行代码可以获取到段落的内容信息呀,为什么不能用它得到上面代码中段落的位置呢?
# Read and print every table in the document.
# get_child_nodes() takes only (node_type, is_deep); the extra index argument
# belongs to get_child(). To pick a single table by position, index the
# returned collection or call doc.get_child(aw.NodeType.TABLE, index, True).
tables = doc.get_child_nodes(aw.NodeType.TABLE, True)
_table = []  # kept from the original snippet; nothing is stored in it here
for table in tables:
    table = table.as_table()
    able_content = aw_read_table(table)
    print(able_content)
为什么表格不能按指定的表格位置进行查找呢?
@Tiaohh 不清楚您需要获取什么,也不清楚您在这段代码中使用了哪些文件。也许您需要使用布局(layout)相关的方法:aspose.words.layout module | Aspose.Words for Python
怎么删除标题下的指定表格?
@Tiaohh 您可以使用这样的方法:
# Remove every table that follows a "C-Heading" paragraph, up to (but not
# including) the next paragraph whose style name contains "C-Heading".
tables_to_remove = []  # renamed from `list`, which shadowed the builtin
para = doc.first_section.body.get_child(aw.NodeType.PARAGRAPH, 1, True)
# Guard against a missing paragraph before dereferencing it.
if para is not None and "C-Heading" in para.as_paragraph().paragraph_format.style_name:
    current_node = para.next_sibling
    while current_node is not None:
        if current_node.node_type == aw.NodeType.PARAGRAPH:
            if "C-Heading" in current_node.as_paragraph().paragraph_format.style_name:
                # The next heading ends the range owned by `para`.
                break
        elif current_node.node_type == aw.NodeType.TABLE:
            tables_to_remove.append(current_node)
        current_node = current_node.next_sibling
# Remove only after the walk is finished, so the sibling traversal above
# never steps from a node that has already been detached.
for node in tables_to_remove:
    node.remove()