测试.docx (21.0 KB)
我需要这样处理数据:读取表格时,每一行的每个单元格都要带上它的合并规则(colspan / rowspan)。
def aw_read_table(tables):
    """Flatten a table into tab/newline separated text.

    Every row becomes one line. The stripped text of the row's cells is
    joined with tab characters, and the lines are joined with newlines.

    Args:
        tables: A table object exposing ``rows``. Each row must provide
            ``as_row().cells`` and each cell ``as_cell().get_text()``.

    Returns:
        The table contents as a single string ("" for a table without rows).
    """
    lines = []
    for row in tables.rows:
        cell_texts = [
            cell.as_cell().get_text().strip() for cell in row.as_row().cells
        ]
        # Join unconditionally. The earlier "only add a tab when the row text
        # is non-empty" logic dropped the separator after empty leading cells,
        # which shifted the remaining values into the wrong columns.
        lines.append("\t".join(cell_texts))
    return "\n".join(lines)
可以给我写一个通用的 demo 吗?因为每个表格的合并规则都不一样。
# Assemble one "tableCell" node from its parts and attach it to the row.
cell_attrs = {
    "colspan": colspan,
    "rowspan": rowspan,
    "id": item["block_id"],
}
text_node = {"type": "text", "text": cell}
paragraph_node = {"type": "paragraph", "content": [text_node]}
table_row["content"].append(
    {"type": "tableCell", "attrs": cell_attrs, "content": [paragraph_node]}
)
我想处理成这样的格式
可以看下我发布的帖子吗?
看下我的帖子?????????
帮我看下我的帖子????
测试.docx (21.0 KB)
处理后的格式需要如下
# Target shape of one table cell node. colspan, rowspan, item and cell are
# free names expected to be provided by the code that builds the node.
# (Typographic quotes from the original paste replaced with ASCII quotes so
# the literal is valid Python.)
{
    "type": "tableCell",
    "attrs": {
        "colspan": colspan,
        "rowspan": rowspan,
        "id": item["block_id"],
    },
    "content": [
        {
            "type": "paragraph",
            "content": [
                {
                    "type": "text",
                    "text": cell,
                }
            ],
        }
    ],
}
我的帖子大家看不见嘛?????
你可以看见我的帖子嘛?????
请问可以看见我的帖子嘛?????????????
@hhh1111 要获取行跨度和列跨度,您需要将 docx 文件转换为 HTML 文件,然后像这样收集以下信息:
def test_merged_cells(self):
    """Collect the colspan/rowspan of every table cell via an HTML export.

    Saves "merged.docx" as HTML into an in-memory stream, parses the markup
    and, for each ``<table>`` element, appends a list of rows (each row a
    list of ``CellInfo`` objects) to ``self.mTables``.
    """
    doc = aw.Document("merged.docx")
    with BytesIO() as html_stream:
        save_options = aw.saving.HtmlSaveOptions()
        save_options.images_folder = ARTIFACTS_DIR
        doc.save(html_stream, save_options)
        html_stream.seek(0)
        # NOTE(review): parsing with an XML parser assumes the exported HTML
        # is well-formed XML — confirm for the save options in use.
        xml_doc = xml_tree.parse(html_stream)
        for table in xml_doc.findall(".//table"):
            table_inf = []
            for row in table.findall(".//tr"):
                row_inf = []
                for cell in row.findall(".//td"):
                    col_span_attr = cell.get("colspan")
                    row_span_attr = cell.get("rowspan")
                    # A missing attribute means the cell is not merged, i.e.
                    # it spans exactly one column/row (the HTML default).
                    # Recording 0 here produced spans that are not valid
                    # colspan values for the consumer.
                    col_span = int(col_span_attr) if col_span_attr else 1
                    row_span = int(row_span_attr) if row_span_attr else 1
                    row_inf.append(self.CellInfo(col_span, row_span))
                table_inf.append(row_inf)
            self.mTables.append(table_inf)
    print("test")
class CellInfo:
    """Record of the column span and row span of a single table cell."""

    def __init__(self, colSpan, rowSpan):
        # Store both spans in one tuple assignment.
        self.ColSpan, self.RowSpan = colSpan, rowSpan
mTables = []
那如果我的文件里既有表格信息又有段落信息呢?我只想把表格信息处理成上面的格式。
def aw_read_table(tables):
    """Flatten a table into tab/newline separated text.

    Every row becomes one line. The stripped text of the row's cells is
    joined with tab characters, and the lines are joined with newlines.

    Args:
        tables: A table object exposing ``rows``. Each row must provide
            ``as_row().cells`` and each cell ``as_cell().get_text()``.

    Returns:
        The table contents as a single string ("" for a table without rows).
    """
    lines = []
    for row in tables.rows:
        cell_texts = [
            cell.as_cell().get_text().strip() for cell in row.as_row().cells
        ]
        # Join unconditionally. The earlier "only add a tab when the row text
        # is non-empty" logic dropped the separator after empty leading cells,
        # which shifted the remaining values into the wrong columns.
        lines.append("\t".join(cell_texts))
    return "\n".join(lines)
def aw_extract_headings_and_contents_table_dict(file):
    """Collect headings together with their body text, captions and tables.

    Walks the direct children of every section body. A paragraph whose
    outline level is 0-5 starts a new entry; later paragraphs and tables are
    attached to the most recent entry. Nodes that appear before the first
    heading are ignored.

    Args:
        file: Whatever ``aw.Document`` accepts as its source.

    Returns:
        A list of dicts, one per heading paragraph, with the keys ``Title``,
        ``Content``, ``Level`` (outline level + 1), ``Table`` (output of
        ``aw_read_table`` per table) and ``Tbale_name`` (caption texts; the
        key spelling is kept because callers may read it).
    """
    import aspose.words as aw

    lic = aw.License()
    lic_path = os.path.join(settings.BASE_PATH, "core/Aspose.Total.Product.Family.lic")
    lic.set_license(lic_path)
    doc = aw.Document(file)
    current_level = 0
    data = []
    stack = []
    for s in doc.sections:
        sect = s.as_section()
        for node in sect.body.get_child_nodes(aw.NodeType.ANY, False):
            if node.node_type == aw.NodeType.PARAGRAPH:
                node = node.as_paragraph()
                if node.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
                    level = int(node.paragraph_format.outline_level) + 1
                    if level > current_level:
                        # Deeper level: park the current level and the
                        # entries collected so far on the stack.
                        stack.append((current_level, data))
                        data = []
                        current_level = level
                    elif level < current_level:
                        # Shallower level: merge the parked entries back in
                        # front of the current ones.
                        while stack and stack[-1][0] >= level:
                            old_level, old_data = stack.pop()
                            data = old_data + data
                            current_level = old_level
                    data.append(
                        {
                            "Title": node.get_text(),
                            "Content": [],
                            "Level": level,
                            "Table": [],
                            "Tbale_name": [],
                        }
                    )
                elif data:
                    text = node.get_text()
                    if text.startswith("表"):
                        # "\\*" is the same backslash + asterisk the original
                        # "\*" produced, without the invalid-escape warning.
                        # NOTE(review): str.strip() treats its argument as a
                        # set of characters, not a substring, so this trims
                        # any of those letters from both ends — confirm that
                        # this is the intended caption clean-up.
                        data[-1]["Tbale_name"].append(
                            text.strip("SEQ \\* ARABIC").strip("SEQ")
                        )
                    # Captions and source lines are kept out of the body text.
                    if not text.startswith(("表", "来源:", "图")):
                        data[-1]["Content"].append(text)
            if data and node.node_type == aw.NodeType.TABLE:
                parent_node = node.as_table()
                able_content = aw_read_table(parent_node)
                data[-1]["Table"].append(able_content)
    # Fold whatever is still parked on the stack back into the result.
    while stack:
        old_level, old_data = stack.pop()
        data = old_data + data
    return data
不是的 如果我文件是这样的呢
CSR-20240410-h63D4QaGoR.docx (838.1 KB)
我需要table数据为这种格式
def aw_read_table(tables):
    """Flatten a table into tab/newline separated text.

    Every row becomes one line. The stripped text of the row's cells is
    joined with tab characters, and the lines are joined with newlines.

    Args:
        tables: A table object exposing ``rows``. Each row must provide
            ``as_row().cells`` and each cell ``as_cell().get_text()``.

    Returns:
        The table contents as a single string ("" for a table without rows).
    """
    lines = []
    for row in tables.rows:
        cell_texts = [
            cell.as_cell().get_text().strip() for cell in row.as_row().cells
        ]
        # Join unconditionally. The earlier "only add a tab when the row text
        # is non-empty" logic dropped the separator after empty leading cells,
        # which shifted the remaining values into the wrong columns.
        lines.append("\t".join(cell_texts))
    return "\n".join(lines)
def aw_extract_headings_and_contents_table_dict(file):
    """Collect headings together with their body text, captions and tables.

    Walks the direct children of every section body. A paragraph whose
    outline level is 0-5 starts a new entry; later paragraphs and tables are
    attached to the most recent entry. Nodes that appear before the first
    heading are ignored.

    Args:
        file: Whatever ``aw.Document`` accepts as its source.

    Returns:
        A list of dicts, one per heading paragraph, with the keys ``Title``,
        ``Content``, ``Level`` (outline level + 1), ``Table`` (output of
        ``aw_read_table`` per table) and ``Tbale_name`` (caption texts; the
        key spelling is kept because callers may read it).
    """
    import aspose.words as aw

    lic = aw.License()
    lic_path = os.path.join(settings.BASE_PATH, "core/Aspose.Total.Product.Family.lic")
    lic.set_license(lic_path)
    doc = aw.Document(file)
    current_level = 0
    data = []
    stack = []
    for s in doc.sections:
        sect = s.as_section()
        for node in sect.body.get_child_nodes(aw.NodeType.ANY, False):
            if node.node_type == aw.NodeType.PARAGRAPH:
                node = node.as_paragraph()
                if node.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
                    level = int(node.paragraph_format.outline_level) + 1
                    if level > current_level:
                        # Deeper level: park the current level and the
                        # entries collected so far on the stack.
                        stack.append((current_level, data))
                        data = []
                        current_level = level
                    elif level < current_level:
                        # Shallower level: merge the parked entries back in
                        # front of the current ones.
                        while stack and stack[-1][0] >= level:
                            old_level, old_data = stack.pop()
                            data = old_data + data
                            current_level = old_level
                    data.append(
                        {
                            "Title": node.get_text(),
                            "Content": [],
                            "Level": level,
                            "Table": [],
                            "Tbale_name": [],
                        }
                    )
                elif data:
                    text = node.get_text()
                    if text.startswith("表"):
                        # "\\*" is the same backslash + asterisk the original
                        # "\*" produced, without the invalid-escape warning.
                        # NOTE(review): str.strip() treats its argument as a
                        # set of characters, not a substring, so this trims
                        # any of those letters from both ends — confirm that
                        # this is the intended caption clean-up.
                        data[-1]["Tbale_name"].append(
                            text.strip("SEQ \\* ARABIC").strip("SEQ")
                        )
                    # Captions and source lines are kept out of the body text.
                    if not text.startswith(("表", "来源:", "图")):
                        data[-1]["Content"].append(text)
            if data and node.node_type == aw.NodeType.TABLE:
                parent_node = node.as_table()
                able_content = aw_read_table(parent_node)
                data[-1]["Table"].append(able_content)
    # Fold whatever is still parked on the stack back into the result.
    while stack:
        old_level, old_data = stack.pop()
        data = old_data + data
    return data
def test_merged_cells(self):
    """Collect the colspan/rowspan of every table cell via an HTML export.

    Saves "merged.docx" as HTML into an in-memory stream, parses the markup
    and, for each ``<table>`` element, appends a list of rows (each row a
    list of ``CellInfo`` objects) to ``self.mTables``.
    """
    doc = aw.Document("merged.docx")
    with BytesIO() as html_stream:
        save_options = aw.saving.HtmlSaveOptions()
        save_options.images_folder = ARTIFACTS_DIR
        doc.save(html_stream, save_options)
        html_stream.seek(0)
        # NOTE(review): parsing with an XML parser assumes the exported HTML
        # is well-formed XML — confirm for the save options in use.
        xml_doc = xml_tree.parse(html_stream)
        for table in xml_doc.findall(".//table"):
            table_inf = []
            for row in table.findall(".//tr"):
                row_inf = []
                for cell in row.findall(".//td"):
                    col_span_attr = cell.get("colspan")
                    row_span_attr = cell.get("rowspan")
                    # A missing attribute means the cell is not merged, i.e.
                    # it spans exactly one column/row (the HTML default).
                    # Recording 0 here produced spans that are not valid
                    # colspan values for the consumer.
                    col_span = int(col_span_attr) if col_span_attr else 1
                    row_span = int(row_span_attr) if row_span_attr else 1
                    row_inf.append(self.CellInfo(col_span, row_span))
                table_inf.append(row_inf)
            self.mTables.append(table_inf)
    print("test")
class CellInfo:
    """Record of the column span and row span of a single table cell."""

    def __init__(self, colSpan, rowSpan):
        # Store both spans in one tuple assignment.
        self.ColSpan, self.RowSpan = colSpan, rowSpan
mTables = []
下面这段代码读取的就是 table 数据:
if data:
    if node.node_type == aw.NodeType.TABLE:
        parent_node = node.as_table()
        able_content = aw_read_table(parent_node)
        data[-1]["Table"].append(able_content)
@hhh1111 使用下面的代码,您可以获取文档中每个表格的合并信息,并按您需要的任何格式进行整理。
# Export the document to HTML, read colspan/rowspan from the markup, then
# print them next to the text of the Word table cell at the same
# table/row/cell position.
doc = aw.Document("input.docx")
word_tables = doc.get_child_nodes(aw.NodeType.TABLE, True)
with BytesIO() as html_stream:
    save_options = aw.saving.HtmlSaveOptions()
    save_options.images_folder = ARTIFACTS_DIR
    doc.save(html_stream, save_options)
    html_stream.seek(0)
    # NOTE(review): parsing with an XML parser assumes the exported HTML is
    # well-formed XML — confirm for the save options in use.
    xml_doc = xml_tree.parse(html_stream)

m_tables = []
for table in xml_doc.findall(".//table"):
    table_inf = self.TableInfo()
    for row in table.findall(".//tr"):
        row_inf = self.RowInfo()
        for cell in row.findall(".//td"):
            col_span_attr = cell.get("colspan")
            row_span_attr = cell.get("rowspan")
            # A missing attribute means the cell is not merged, i.e. it spans
            # exactly one column/row (the HTML default), so record 1, not 0.
            col_span = int(col_span_attr) if col_span_attr else 1
            row_span = int(row_span_attr) if row_span_attr else 1
            row_inf.Cells.append(self.CellInfo(col_span, row_span))
        table_inf.Rows.append(row_inf)
    m_tables.append(table_inf)

for table in word_tables:
    for row in table.as_table().rows:
        for cell in row.as_row().cells:
            cell = cell.as_cell()
            parent_row = cell.parent_row
            parent_table = parent_row.parent_table
            tab_id = word_tables.index_of(parent_table)
            row_id = parent_table.index_of(parent_row)
            cell_id = parent_row.index_of(cell)
            # 0 is kept as the "no HTML cell at this position" marker; real
            # spans read from the markup are always >= 1.
            # NOTE(review): matching purely by position presumably drifts in
            # rows that contain merged cells — verify on a merged sample.
            col_span = 0
            row_span = 0
            if (
                tab_id < len(m_tables)
                and row_id < len(m_tables[tab_id].Rows)
                and cell_id < len(m_tables[tab_id].Rows[row_id].Cells)
            ):
                html_cell = m_tables[tab_id].Rows[row_id].Cells[cell_id]
                col_span = html_cell.ColSpan
                row_span = html_cell.RowSpan
            print(
                f"{tab_id}.{row_id}.{cell_id} colspan={col_span}\t "
                f"rowspan={row_span}\t text={cell.get_text()}"
            )
class TableInfo:
    """Container for the row records collected for one table."""

    def __init__(self):
        # Fresh list per instance; row records are appended by the caller.
        self.Rows = list()
class RowInfo:
    """Container for the cell records collected for one table row."""

    def __init__(self):
        # Fresh list per instance; cell records are appended by the caller.
        self.Cells = list()
class CellInfo:
    """Record of the column span and row span of a single table cell."""

    def __init__(self, col_span, row_span):
        # Store both spans in one tuple assignment.
        self.ColSpan, self.RowSpan = col_span, row_span
不对呀,我要通过以下代码 ,更改表格数据格式
def aw_extract_headings_and_contents_table_dict(file):
    """Collect headings together with their body text, captions and tables.

    Restored from a truncated paste: the ``def`` line, the local import and
    the first letter of ``lic`` were missing, and the string quotes had been
    turned into typographic quotes.

    Walks the direct children of every section body. A paragraph whose
    outline level is 0-5 starts a new entry; later paragraphs and tables are
    attached to the most recent entry. Nodes that appear before the first
    heading are ignored.

    Args:
        file: Whatever ``aw.Document`` accepts as its source.

    Returns:
        A list of dicts, one per heading paragraph, with the keys ``Title``,
        ``Content``, ``Level`` (outline level + 1), ``Table`` (output of
        ``aw_read_table`` per table) and ``Tbale_name`` (caption texts; the
        key spelling is kept because callers may read it).
    """
    import aspose.words as aw

    lic = aw.License()
    lic_path = os.path.join(settings.BASE_PATH, "core/Aspose.Total.Product.Family.lic")
    lic.set_license(lic_path)
    doc = aw.Document(file)
    current_level = 0
    data = []
    stack = []
    for s in doc.sections:
        sect = s.as_section()
        for node in sect.body.get_child_nodes(aw.NodeType.ANY, False):
            if node.node_type == aw.NodeType.PARAGRAPH:
                node = node.as_paragraph()
                if node.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
                    level = int(node.paragraph_format.outline_level) + 1
                    if level > current_level:
                        # Deeper level: park the current level and the
                        # entries collected so far on the stack.
                        stack.append((current_level, data))
                        data = []
                        current_level = level
                    elif level < current_level:
                        # Shallower level: merge the parked entries back in
                        # front of the current ones.
                        while stack and stack[-1][0] >= level:
                            old_level, old_data = stack.pop()
                            data = old_data + data
                            current_level = old_level
                    data.append(
                        {
                            "Title": node.get_text(),
                            "Content": [],
                            "Level": level,
                            "Table": [],
                            "Tbale_name": [],
                        }
                    )
                elif data:
                    text = node.get_text()
                    if text.startswith("表"):
                        # The backslash was lost in the paste; restored to
                        # match the other copies of this function.
                        # NOTE(review): str.strip() treats its argument as a
                        # set of characters, not a substring — confirm that
                        # this is the intended caption clean-up.
                        data[-1]["Tbale_name"].append(
                            text.strip("SEQ \\* ARABIC").strip("SEQ")
                        )
                    # Captions and source lines are kept out of the body text.
                    if not text.startswith(("表", "来源:", "图")):
                        data[-1]["Content"].append(text)
            if data and node.node_type == aw.NodeType.TABLE:
                parent_node = node.as_table()
                able_content = aw_read_table(parent_node)
                data[-1]["Table"].append(able_content)
    # Fold whatever is still parked on the stack back into the result.
    while stack:
        old_level, old_data = stack.pop()
        data = old_data + data
    return data
# Attach the flattened text of a table node to the most recent entry in data.
# (Typographic quotes around the "Table" key replaced with ASCII quotes so
# the snippet is valid Python.)
if data:
    if node.node_type == aw.NodeType.TABLE:
        parent_node = node.as_table()
        able_content = aw_read_table(parent_node)
        data[-1]["Table"].append(able_content)
if data:
    if node.node_type == aw.NodeType.TABLE:
        parent_node = node.as_table()
        able_content = aw_read_table(parent_node)
        data[-1]["Table"].append(able_content)
上面这段读取的就是表格信息,我需要这里得到的 table 数据带上合并规则:
if node.node_type == aw.NodeType.TABLE:
    parent_node = node.as_table()
    able_content = aw_read_table(parent_node)
    data[-1]["Table"].append(able_content)