图 14.docx (112.2 KB)
以下是代码
def get_col_span(current_cell):
    """Return how many columns ``current_cell`` spans horizontally.

    A cell whose ``cell_format.horizontal_merge`` equals ``CellMerge.FIRST``
    starts a merged range; each directly following sibling (reached through
    ``next_cell``) flagged ``CellMerge.PREVIOUS`` adds one column to the span.

    Args:
        current_cell: The cell to inspect, or ``None``.

    Returns:
        The column span; 1 for ``None`` and for cells that do not start a
        horizontally merged range.
    """
    col_span = 1
    if current_cell is None:
        return col_span

    merge = aw.tables.CellMerge
    if current_cell.cell_format.horizontal_merge != merge.FIRST:
        return col_span

    # Walk the siblings to the right until the merged range ends.
    follower = current_cell.next_cell
    while follower is not None and follower.cell_format.horizontal_merge == merge.PREVIOUS:
        col_span += 1
        follower = follower.next_cell
    return col_span
def get_row_span(current_cell, row_index, cell_index, table_row_data):
    """Return how many rows ``current_cell`` spans vertically.

    When the cell's ``cell_format.vertical_merge`` equals ``CellMerge.FIRST``,
    the rows after ``row_index`` are scanned and the span grows by one for
    every consecutive row whose cell at position ``cell_index`` is flagged
    ``CellMerge.PREVIOUS``.

    Args:
        current_cell: The cell to inspect.
        row_index: Zero-based position of the cell's row in ``table_row_data``.
        cell_index: Zero-based position of the cell inside its row.
        table_row_data: Iterable of row nodes; each must provide ``as_row()``
            returning an object with an indexable ``cells`` attribute.

    Returns:
        The row span; 1 when the cell does not start a vertical merge.
    """
    row_span = 1
    merge = aw.tables.CellMerge
    if current_cell.cell_format.vertical_merge != merge.FIRST:
        return row_span

    for row_node in list(table_row_data)[row_index + 1:]:
        row_cells = row_node.as_row().cells
        # A following row may hold fewer cells than the current one; a missing
        # cell (IndexError or a None result) ends the merged range instead of
        # crashing.
        try:
            cell_below = row_cells[cell_index]
        except IndexError:
            cell_below = None
        if cell_below is None or cell_below.cell_format.vertical_merge != merge.PREVIOUS:
            break
        row_span += 1
    return row_span
def aw_read_table_id(table=None):
    """Convert a table into a nested ``table`` / ``tableRow`` / ``tableCell`` dict.

    The table is first normalised with
    ``convert_to_horizontally_merged_cells()``.  Cells flagged
    ``CellMerge.PREVIOUS`` (horizontally or vertically) are continuations of a
    merged range and are skipped; every other cell becomes a ``tableCell``
    entry carrying its text, column span, row span, width and position.

    Args:
        table: The table object to read.  Despite the ``None`` default a real
            table is required; ``None`` fails on the first attribute access.

    Returns:
        A dict of the form
        ``{"type": "table", "contents": {"type": "table", "content": [rows]}}``.
    """
    table.convert_to_horizontally_merged_cells()
    continuation = aw.tables.CellMerge.PREVIOUS
    table_rows = table.rows
    table_contents_lis = []

    for r_index, row_node in enumerate(table_rows):
        current_cell = row_node.as_row().first_cell
        # cell_index counts every physical cell, including merged
        # continuations, so it matches the positional lookup that
        # get_row_span performs on the rows below.
        cell_index = 0
        table_row_data = []
        while current_cell is not None:
            cell_format = current_cell.cell_format
            is_continuation = (
                cell_format.horizontal_merge == continuation
                or cell_format.vertical_merge == continuation
            )
            if not is_continuation:
                # '\x07' is removed from the cell text before trimming.
                cell_text = current_cell.get_text().replace('\x07', '').strip()
                col_span = get_col_span(current_cell)
                row_span = get_row_span(current_cell, r_index, cell_index, table_rows)
                col_width = current_cell.cell_format.width
                table_row_data.append({
                    "type": "tableCell",
                    "attrs": {
                        "colspan": col_span,
                        "rowspan": row_span,
                        "colwidth": col_width,
                        "cell_index": cell_index,
                    },
                    "content": [
                        {
                            "type": "paragraph",
                            "content": [{
                                "type": "text",
                                "text": cell_text,
                            }],
                        },
                    ],
                })
            cell_index += 1
            current_cell = current_cell.next_cell
        table_contents_lis.append({
            "type": "tableRow",
            "content": table_row_data,
        })

    return_data = {
        "type": "table",
        "contents": {
            "type": "table",
            "content": table_contents_lis,
        },
    }
    # NOTE(review): debug output kept so observable behavior is unchanged;
    # consider replacing it with logging.
    print(return_data)
    return return_data
def aw_extract_headings_and_contents_table_dict_id(file):
    """Extract headings with their following text and tables from a document.

    The document is opened with Aspose.Words after applying the license file
    located under ``settings.BASE_PATH``.  Every direct child of each section
    body is visited in order:

    * a paragraph whose outline level is 0-5 starts a new heading entry;
    * any other non-empty paragraph is appended as ``text`` content to the
      most recent heading;
    * a table is converted with ``aw_read_table_id`` and appended as ``table``
      content to the most recent heading.

    Paragraphs and tables that appear before the first heading are ignored.

    Args:
        file: Passed unchanged to ``aw.Document``.

    Returns:
        A list of heading dicts with the keys ``Title``, ``Content``,
        ``Level``, ``Table`` and ``Tbale_name`` (spelling kept as-is because
        it is part of the returned data).
    """
    # NOTE(review): this import is local to the function, while the helper
    # functions in this file reference a module-level ``aw`` - confirm the
    # module imports aspose.words at top level as well.
    import aspose.words as aw

    lic = aw.License()
    lic_path = os.path.join(settings.BASE_PATH, "core/Aspose.Total.Product.Family.lic")
    lic.set_license(lic_path)
    doc = aw.Document(file)

    heading_outline_levels = (0, 1, 2, 3, 4, 5)
    current_level = 0
    data = []
    stack = []
    for s in doc.sections:
        sect = s.as_section()
        for node in sect.body.get_child_nodes(aw.NodeType.ANY, False):
            if node.node_type == aw.NodeType.PARAGRAPH:
                paragraph = node.as_paragraph()
                if paragraph.paragraph_format.outline_level in heading_outline_levels:
                    level = int(paragraph.paragraph_format.outline_level) + 1
                    if level > current_level:
                        # Deeper level: park the headings collected so far on
                        # the stack and start a fresh list.
                        stack.append((current_level, data))
                        data = []
                        current_level = level
                    elif level < current_level:
                        # Shallower level: merge parked lists back in front of
                        # the current one, preserving document order.
                        while stack and stack[-1][0] >= level:
                            old_level, old_data = stack.pop()
                            data = old_data + data
                            current_level = old_level
                    data.append(
                        {
                            "Title": paragraph.get_text(),
                            "Content": [],
                            "Level": level,
                            "Table": [],
                            "Tbale_name": [],
                        }
                    )
                elif data:
                    paragraph_text = paragraph.get_text().strip()
                    if paragraph_text:
                        data[-1]["Content"].append(
                            {"type": "text", "content": paragraph_text}
                        )
            elif data and node.node_type == aw.NodeType.TABLE:
                table_content = aw_read_table_id(node.as_table())
                data[-1]["Content"].append(
                    {"type": "table", "content": table_content}
                )

    # Flatten whatever is still parked on the stack into one ordered list.
    while stack:
        old_level, old_data = stack.pop()
        data = old_data + data
    return data