测试文档
测试22222.docx (14.7 KB)
代码:
class AW_ReadFileUtils:
    """Extract headings, body text and tables from a Word document via Aspose.Words."""

    def aw_read_table(self, tables):
        """Flatten a table into plain text.

        Args:
            tables: Table-like object exposing ``rows``. Each row is converted
                with ``as_row()`` and each of its ``cells`` with ``as_cell()``.

        Returns:
            A string with one line per row; cell texts are stripped and
            separated by tab characters, rows are separated by newlines.
        """
        lines = []
        for row in tables.rows:
            line = ""
            for cell in row.as_row().cells:
                text = cell.as_cell().get_text().strip()
                # NOTE(review): no tab is emitted while the line is still
                # empty, so leading empty cells are dropped from the row.
                # Kept as-is; confirm whether column alignment matters.
                line = f"{line}\t{text}" if line else text
            lines.append(line)
        return "\n".join(lines)

    def aw_extract_headings_and_contents_table_dict(self, file):
        """Split a document into heading entries with their content and tables.

        Walks the direct children of every section body. A paragraph whose
        outline level is 0 starts a new entry; later paragraphs and tables
        are attached to the most recent entry. Nodes that appear before the
        first heading are ignored.

        Args:
            file: Value passed straight to ``aw.Document``.

        Returns:
            A list of dicts with the keys ``Title``, ``Content``, ``Level``,
            ``Table`` and ``Table_name``.
        """
        doc = aw.Document(file)
        current_level = 0
        data = []
        stack = []
        for s in doc.sections:
            body = s.as_section().body
            for node in body.get_child_nodes(aw.NodeType.ANY, False):
                if node.node_type == aw.NodeType.PARAGRAPH:
                    node = node.as_paragraph()
                    if node.paragraph_format.outline_level in [0]:
                        # Only outline level 0 is accepted, so level is always 1.
                        level = int(node.paragraph_format.outline_level) + 1
                        if level > current_level:
                            # Deeper level: park the entries collected so far.
                            stack.append((current_level, data))
                            data = []
                            current_level = level
                        elif level < current_level:
                            # Shallower level: merge parked entries back in.
                            while stack and stack[-1][0] >= level:
                                old_level, old_data = stack.pop()
                                data = old_data + data
                                current_level = old_level
                        data.append(
                            {
                                "Title": node.get_text().strip(),
                                "Content": [],
                                "Level": level,
                                "Table": [],
                                "Table_name": [],
                            }
                        )
                    elif data:
                        text = node.get_text()
                        if text.startswith("表"):
                            # Table caption: remove the field-code residue.
                            # Raw strings keep the backslashes literal without
                            # relying on invalid escape sequences.
                            caption = text.replace(r' STYLEREF 1 \s', '').replace(
                                r' SEQ 表 \* ARABIC \s 1 ', '-')
                            data[-1]["Table_name"].append(caption)
                        # Table captions, source lines and figure captions are
                        # not stored as body content.
                        if not text.startswith(("表", "来源:", "图")):
                            data[-1]["Content"].append(text)
                if data:
                    if node.node_type == aw.NodeType.TABLE:
                        table_content = self.aw_read_table(node.as_table())
                        data[-1]["Table"].append(table_content)
        # Merge whatever is still parked on the stack, oldest entries first.
        while stack:
            old_level, old_data = stack.pop()
            data = old_data + data
        return data
# Shared instance created when the module is loaded.
aw_read_file_utils = AW_ReadFileUtils()
import json

if __name__ == "__main__":
    # Parse the sample contract and dump the extracted structure as JSON.
    sections = aw_read_file_utils.aw_extract_headings_and_contents_table_dict(
        '/Users/dip/Desktop/场景1、2海外专利资源服务项目合同-3.0 - (需要修改的文件).docx')
    with open("场景1、2海外专利资源服务项目合同-3.0 - (需要修改的文件).json", "w",
              encoding="utf-8") as f:
        json.dump(sections, f, ensure_ascii=False, indent=4)
有代码示例吗?
已解决。
import os

import aspose.words as aw

from core.config import settings

# Load the Aspose license file stored under the project's base path.
lic = aw.License()
lic_path = os.path.join(settings.BASE_PATH, "core/Aspose.Total.Product.Family.lic")
lic.set_license(lic_path)
class AW_ReadFileUtils:
    """Extract headings, body text and tables from a Word document via Aspose.Words."""

    def aw_read_table(self, tables):
        """Flatten a table into plain text.

        Args:
            tables: Table-like object exposing ``rows``. Each row is converted
                with ``as_row()`` and each of its ``cells`` with ``as_cell()``.

        Returns:
            A string with one line per row; cell texts are stripped and
            separated by tab characters, rows are separated by newlines.
        """
        lines = []
        for row in tables.rows:
            line = ""
            for cell in row.as_row().cells:
                text = cell.as_cell().get_text().strip()
                # NOTE(review): no tab is emitted while the line is still
                # empty, so leading empty cells are dropped from the row.
                # Kept as-is; confirm whether column alignment matters.
                line = f"{line}\t{text}" if line else text
            lines.append(line)
        return "\n".join(lines)

    def aw_extract_headings_and_contents_table_dict(self, file):
        """Split a document into heading entries with their content and tables.

        Walks the direct children of every section body. A paragraph whose
        outline level is 0 starts a new entry; later paragraphs and tables
        are attached to the most recent entry. List labels are prepended to
        the text of list-item paragraphs. Nodes that appear before the first
        heading are ignored.

        Args:
            file: Value passed straight to ``aw.Document``.

        Returns:
            A list of dicts with the keys ``Title``, ``Content``, ``Level``,
            ``Table`` and ``Table_name``.
        """
        doc = aw.Document(file)
        # Refresh list labels before ``list_label.label_string`` is read below.
        doc.update_list_labels()
        current_level = 0
        data = []
        stack = []
        for s in doc.sections:
            body = s.as_section().body
            for node in body.get_child_nodes(aw.NodeType.ANY, False):
                if node.node_type == aw.NodeType.PARAGRAPH:
                    node = node.as_paragraph()
                    label = ''
                    if node.list_format.is_list_item:
                        label = node.list_label.label_string
                    if node.paragraph_format.outline_level in [0]:
                        # Only outline level 0 is accepted, so level is always 1.
                        level = int(node.paragraph_format.outline_level) + 1
                        if level > current_level:
                            # Deeper level: park the entries collected so far.
                            stack.append((current_level, data))
                            data = []
                            current_level = level
                        elif level < current_level:
                            # Shallower level: merge parked entries back in.
                            while stack and stack[-1][0] >= level:
                                old_level, old_data = stack.pop()
                                data = old_data + data
                                current_level = old_level
                        heading = node.get_text().strip()
                        title = label + heading if label else heading
                        # Trace of every detected heading on stdout.
                        print(title)
                        data.append(
                            {
                                "Title": title,
                                "Content": [],
                                "Level": level,
                                "Table": [],
                                "Table_name": [],
                            }
                        )
                    elif data:
                        raw_text = node.get_text()
                        stripped = raw_text.strip()
                        inside_table = node.get_ancestor(aw.NodeType.TABLE)
                        if raw_text.startswith("表") and not inside_table:
                            # Table caption: remove the field-code residue.
                            # Raw strings keep the backslashes literal without
                            # relying on invalid escape sequences.
                            caption = raw_text.replace(r' STYLEREF 1 \s', '').replace(
                                r' SEQ 表 \* ARABIC \s 1 ', '-')
                            data[-1]["Table_name"].append(caption)
                        # NOTE(review): the former check for lines starting
                        # with "表", "来源:" or "图" only executed ``pass``,
                        # so such lines are still stored as content. Confirm
                        # whether they were meant to be excluded.
                        if (
                            not inside_table
                            and stripped
                            and not node.get_ancestor(aw.NodeType.COMMENT)
                        ):
                            data[-1]["Content"].append(
                                label + stripped if label else stripped)
                if data:
                    if node.node_type == aw.NodeType.TABLE:
                        table_content = self.aw_read_table(node.as_table())
                        data[-1]["Table"].append(table_content)
        # Merge whatever is still parked on the stack, oldest entries first.
        while stack:
            old_level, old_data = stack.pop()
            data = old_data + data
        return data
# Shared instance created when the module is loaded.
aw_read_file_utils = AW_ReadFileUtils()
import json
上面的代码中,段落内容里的批注已经被过滤掉了,但是标题里的批注还是没有被过滤掉。
@hhh1111 您可以使用以下两种方法获取不带批注的文本:
# Option 1: remove the comment nodes from the paragraph, then read its text.
# This modifies the loaded document, since the comments are deleted from it.
node.get_child_nodes(aw.NodeType.COMMENT, True).clear()
text_without_comments = node.get_text().strip()
# NOTE(review): block_id is not defined in the code posted earlier in the
# thread; it has to be supplied by the caller's own code.
data.append(
    {
        "Title": label + text_without_comments if label else text_without_comments,
        "block_id": str(block_id),
        "Content": [],
        "Level": level,
        "Table": [],
        # Must be spelled "Table_name": the content branch appends to
        # data[-1]["Table_name"].
        "Table_name": [],
    }
)
或者
# Option 2: rebuild the text from the paragraph's children, leaving the
# document untouched and skipping every comment-related node.
skipped_types = (
    aw.NodeType.COMMENT,
    aw.NodeType.COMMENT_RANGE_START,
    aw.NodeType.COMMENT_RANGE_END,
)
# Only direct children are visited: a deep traversal would also yield the
# paragraphs and runs inside each comment, putting the comment text back.
builder = [
    child.to_string(aw.SaveFormat.TEXT)
    for child in node.get_child_nodes(aw.NodeType.ANY, False)
    if child.node_type not in skipped_types
]
# Stripped so the title matches the whitespace handling of option 1.
text_without_comments = ''.join(builder).strip()
# NOTE(review): block_id is not defined in the code posted earlier in the
# thread; it has to be supplied by the caller's own code.
data.append(
    {
        "Title": label + text_without_comments if label else text_without_comments,
        "block_id": str(block_id),
        "Content": [],
        "Level": level,
        "Table": [],
        # Must be spelled "Table_name": the content branch appends to
        # data[-1]["Table_name"].
        "Table_name": [],
    }
)