可以给我写一个通用的demo吗,因为每个表格合并的规则都不一样
# Build the cell from the inside out, then attach it to the current row.
text_node = {"type": "text", "text": cell}
paragraph_node = {"type": "paragraph", "content": [text_node]}
cell_attrs = {
    "colspan": colspan,
    "rowspan": rowspan,
    "id": item["block_id"],
}
table_row["content"].append(
    {"type": "tableCell", "attrs": cell_attrs, "content": [paragraph_node]}
)
我想处理成这样的格式
可以看一下我发布的帖子吗?
请看一下我的帖子。
请帮我看一下我的帖子。
测试.docx (21.0 KB)
处理后的格式需要如下
# Target shape for a single table cell. The typographic quotes introduced by
# the forum renderer are replaced with plain ASCII quotes so this parses.
{
    "type": "tableCell",
    "attrs": {
        "colspan": colspan,
        "rowspan": rowspan,
        "id": item["block_id"],
    },
    "content": [
        {
            "type": "paragraph",
            "content": [
                {
                    "type": "text",
                    "text": cell,
                }
            ],
        }
    ],
}
大家看不见我的帖子吗?
你可以看见我的帖子吗?
请问可以看见我的帖子吗?
@hhh1111 要获取行跨度和列跨度,您需要将 docx 文件转换为 HTML 文件,然后像这样收集以下信息:
def test_merged_cells(self):
    """Record the colspan/rowspan of every table cell in ``merged.docx``.

    The document is saved as HTML into memory and the result is parsed as
    XML. For each ``<table>`` a list of rows, each a list of ``CellInfo``,
    is appended to ``self.mTables``.
    """
    doc = aw.Document("merged.docx")
    with BytesIO() as html_stream:
        save_options = aw.saving.HtmlSaveOptions()
        save_options.images_folder = ARTIFACTS_DIR
        doc.save(html_stream, save_options)
        html_stream.seek(0)
        xml_doc = xml_tree.parse(html_stream)

    for table in xml_doc.findall(".//table"):
        table_inf = []
        for row in table.findall(".//tr"):
            # A missing (or empty) span attribute means the cell is not
            # merged, which is a span of 1, not 0.
            row_inf = [
                self.CellInfo(
                    int(cell.get("colspan") or 1),
                    int(cell.get("rowspan") or 1),
                )
                for cell in row.findall(".//td")
            ]
            table_inf.append(row_inf)
        self.mTables.append(table_inf)
    print("test")
class CellInfo:
    """Column span and row span of one table cell."""

    def __init__(self, colSpan, rowSpan):
        # Public attribute names are unchanged.
        self.ColSpan = colSpan
        self.RowSpan = rowSpan
mTables = []
那如果我的文件里既有表格信息又有段落信息呢?我只想把表格信息处理成上面的格式。
def aw_read_table(tables):
    """Render one table as tab-separated text, one line per row.

    Args:
        tables: A single table object exposing ``rows``; each row must
            support ``as_row().cells`` and each cell ``as_cell().get_text()``.

    Returns:
        The stripped cell texts of each row joined with tabs, and the rows
        joined with newlines. An empty table yields an empty string.
    """
    lines = []
    for row in tables.rows:
        # Joining keeps one separator per cell boundary even when a cell is
        # empty; the previous concatenation dropped the tab after an empty
        # leading cell, which shifted the remaining columns left.
        texts = [cell.as_cell().get_text().strip() for cell in row.as_row().cells]
        lines.append("\t".join(texts))
    return "\n".join(lines)
def aw_extract_headings_and_contents_table_dict(file):
    """Collect the headings of a Word document with their text and tables.

    Walks the direct children of every section body. A paragraph whose
    outline level is 0-5 starts a new entry; later paragraphs and tables are
    attached to the most recent entry. Nodes before the first heading are
    ignored.

    Args:
        file: Passed straight to ``aw.Document``.

    Returns:
        A flat list, in document order, of dicts with the keys ``Title``,
        ``Content`` (paragraph texts), ``Level`` (outline level + 1),
        ``Table`` (results of ``aw_read_table``) and ``Tbale_name``
        (paragraphs starting with "表"; the key spelling is unchanged).
    """
    import aspose.words as aw

    lic = aw.License()
    lic_path = os.path.join(settings.BASE_PATH, "core/Aspose.Total.Product.Family.lic")
    lic.set_license(lic_path)
    doc = aw.Document(file)

    heading_levels = (0, 1, 2, 3, 4, 5)
    skipped_prefixes = ("表", "来源:", "图")

    # The earlier level stack always re-joined its pieces in document order,
    # so the result was a flat list; it is built directly here.
    data = []
    for s in doc.sections:
        sect = s.as_section()
        for node in sect.body.get_child_nodes(aw.NodeType.ANY, False):
            if node.node_type == aw.NodeType.PARAGRAPH:
                paragraph = node.as_paragraph()
                text = paragraph.get_text()
                outline_level = paragraph.paragraph_format.outline_level
                if outline_level in heading_levels:
                    data.append(
                        {
                            "Title": text,
                            "Content": [],
                            "Level": int(outline_level) + 1,
                            "Table": [],
                            "Tbale_name": [],
                        }
                    )
                elif data:
                    if text.startswith("表"):
                        # NOTE(review): str.strip() treats its argument as a
                        # set of characters, not a substring, so this does not
                        # remove an embedded "SEQ \* ARABIC" field code.
                        # Behaviour is kept; confirm the intent.
                        data[-1]["Tbale_name"].append(
                            text.strip(r"SEQ \* ARABIC").strip("SEQ")
                        )
                    # Captions and source lines are kept out of the body text.
                    if not text.startswith(skipped_prefixes):
                        data[-1]["Content"].append(text)
            elif data and node.node_type == aw.NodeType.TABLE:
                data[-1]["Table"].append(aw_read_table(node.as_table()))
    return data
不是的 如果我文件是这样的呢
CSR-20240410-h63D4QaGoR.docx (838.1 KB)
我需要table数据为这种格式
def aw_read_table(tables):
    """Render one table as tab-separated text, one line per row.

    Args:
        tables: A single table object exposing ``rows``; each row must
            support ``as_row().cells`` and each cell ``as_cell().get_text()``.

    Returns:
        The stripped cell texts of each row joined with tabs, and the rows
        joined with newlines. An empty table yields an empty string.
    """
    lines = []
    for row in tables.rows:
        # Joining keeps one separator per cell boundary even when a cell is
        # empty; the previous concatenation dropped the tab after an empty
        # leading cell, which shifted the remaining columns left.
        texts = [cell.as_cell().get_text().strip() for cell in row.as_row().cells]
        lines.append("\t".join(texts))
    return "\n".join(lines)
def aw_extract_headings_and_contents_table_dict(file):
    """Collect the headings of a Word document with their text and tables.

    Walks the direct children of every section body. A paragraph whose
    outline level is 0-5 starts a new entry; later paragraphs and tables are
    attached to the most recent entry. Nodes before the first heading are
    ignored.

    Args:
        file: Passed straight to ``aw.Document``.

    Returns:
        A flat list, in document order, of dicts with the keys ``Title``,
        ``Content`` (paragraph texts), ``Level`` (outline level + 1),
        ``Table`` (results of ``aw_read_table``) and ``Tbale_name``
        (paragraphs starting with "表"; the key spelling is unchanged).
    """
    import aspose.words as aw

    lic = aw.License()
    lic_path = os.path.join(settings.BASE_PATH, "core/Aspose.Total.Product.Family.lic")
    lic.set_license(lic_path)
    doc = aw.Document(file)

    heading_levels = (0, 1, 2, 3, 4, 5)
    skipped_prefixes = ("表", "来源:", "图")

    # The earlier level stack always re-joined its pieces in document order,
    # so the result was a flat list; it is built directly here.
    data = []
    for s in doc.sections:
        sect = s.as_section()
        for node in sect.body.get_child_nodes(aw.NodeType.ANY, False):
            if node.node_type == aw.NodeType.PARAGRAPH:
                paragraph = node.as_paragraph()
                text = paragraph.get_text()
                outline_level = paragraph.paragraph_format.outline_level
                if outline_level in heading_levels:
                    data.append(
                        {
                            "Title": text,
                            "Content": [],
                            "Level": int(outline_level) + 1,
                            "Table": [],
                            "Tbale_name": [],
                        }
                    )
                elif data:
                    if text.startswith("表"):
                        # NOTE(review): str.strip() treats its argument as a
                        # set of characters, not a substring, so this does not
                        # remove an embedded "SEQ \* ARABIC" field code.
                        # Behaviour is kept; confirm the intent.
                        data[-1]["Tbale_name"].append(
                            text.strip(r"SEQ \* ARABIC").strip("SEQ")
                        )
                    # Captions and source lines are kept out of the body text.
                    if not text.startswith(skipped_prefixes):
                        data[-1]["Content"].append(text)
            elif data and node.node_type == aw.NodeType.TABLE:
                data[-1]["Table"].append(aw_read_table(node.as_table()))
    return data
def test_merged_cells(self):
    """Record the colspan/rowspan of every table cell in ``merged.docx``.

    The document is saved as HTML into memory and the result is parsed as
    XML. For each ``<table>`` a list of rows, each a list of ``CellInfo``,
    is appended to ``self.mTables``.
    """
    doc = aw.Document("merged.docx")
    with BytesIO() as html_stream:
        save_options = aw.saving.HtmlSaveOptions()
        save_options.images_folder = ARTIFACTS_DIR
        doc.save(html_stream, save_options)
        html_stream.seek(0)
        xml_doc = xml_tree.parse(html_stream)

    for table in xml_doc.findall(".//table"):
        table_inf = []
        for row in table.findall(".//tr"):
            # A missing (or empty) span attribute means the cell is not
            # merged, which is a span of 1, not 0.
            row_inf = [
                self.CellInfo(
                    int(cell.get("colspan") or 1),
                    int(cell.get("rowspan") or 1),
                )
                for cell in row.findall(".//td")
            ]
            table_inf.append(row_inf)
        self.mTables.append(table_inf)
    print("test")
class CellInfo:
    """Column span and row span of one table cell."""

    def __init__(self, colSpan, rowSpan):
        # Public attribute names are unchanged.
        self.ColSpan = colSpan
        self.RowSpan = rowSpan
mTables = []
这个为 table 数据:
if data:
if node.node_type == aw.NodeType.TABLE:
parent_node = node.as_table()
able_content = aw_read_table(parent_node)
data[-1]["Table"].append(able_content)
@hhh1111 使用此代码,您可以获得有关所拥有的表的信息。并以您需要的任何格式收集。
# Collect colspan/rowspan for every cell through an in-memory HTML export,
# then print them next to the text of the Word cell at the same
# table/row/cell position.
doc = aw.Document("input.docx")
word_tables = doc.get_child_nodes(aw.NodeType.TABLE, True)

with BytesIO() as html_stream:
    save_options = aw.saving.HtmlSaveOptions()
    save_options.images_folder = ARTIFACTS_DIR
    doc.save(html_stream, save_options)
    html_stream.seek(0)
    xml_doc = xml_tree.parse(html_stream)

m_tables = []
for table in xml_doc.findall(".//table"):
    table_inf = self.TableInfo()
    for row in table.findall(".//tr"):
        row_inf = self.RowInfo()
        for cell in row.findall(".//td"):
            # A missing (or empty) span attribute means "not merged",
            # which is a span of 1, not 0.
            row_inf.Cells.append(
                self.CellInfo(
                    int(cell.get("colspan") or 1),
                    int(cell.get("rowspan") or 1),
                )
            )
        table_inf.Rows.append(row_inf)
    m_tables.append(table_inf)

# NOTE(review): this pairs HTML cells with Word cells purely by position;
# confirm both sides contain the same number of cells per row when cells
# are merged.
for table in word_tables:
    for row in table.as_table().rows:
        for cell in row.as_row().cells:
            cell = cell.as_cell()
            tab_id = word_tables.index_of(cell.parent_row.parent_table)
            row_id = cell.parent_row.parent_table.index_of(cell.parent_row)
            cell_id = cell.parent_row.index_of(cell)
            # Cells without an HTML counterpart are reported as unmerged.
            col_span = 1
            row_span = 1
            if (
                tab_id < len(m_tables)
                and row_id < len(m_tables[tab_id].Rows)
                and cell_id < len(m_tables[tab_id].Rows[row_id].Cells)
            ):
                info = m_tables[tab_id].Rows[row_id].Cells[cell_id]
                col_span = info.ColSpan
                row_span = info.RowSpan
            print("{0}.{1}.{2} colspan={3}\t rowspan={4}\t text={5}".format(tab_id, row_id, cell_id, col_span,
                                                                            row_span, cell.get_text()))
class TableInfo:
    """Holds the rows collected for one table."""

    def __init__(self):
        # Starts empty; rows are appended by the caller.
        self.Rows = []
class RowInfo:
    """Holds the cells collected for one table row."""

    def __init__(self):
        # Starts empty; cells are appended by the caller.
        self.Cells = []
class CellInfo:
    """Column span and row span of one table cell."""

    def __init__(self, col_span, row_span):
        # Public attribute names are unchanged.
        self.ColSpan = col_span
        self.RowSpan = row_span
不对呀,我要通过以下代码 ,更改表格数据格式
ic = aw.License()
lic_path = os.path.join(settings.BASE_PATH, “core/Aspose.Total.Product.Family.lic”)
lic.set_license(lic_path)
doc = aw.Document(file)
current_level = 0
data = []
stack = []
for s in doc.sections:
sect = s.as_section()
for node in sect.body.get_child_nodes(aw.NodeType.ANY, False):
if node.node_type == aw.NodeType.PARAGRAPH:
node = node.as_paragraph()
if node.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
level = int(node.paragraph_format.outline_level) + 1
if level > current_level:
# 如果级别更深,将当前标题添加到堆栈
stack.append((current_level, data))
data = []
current_level = level
elif level < current_level:
# 如果级别更浅,将堆栈中的项添加回数据
while stack and stack[-1][0] >= level:
old_level, old_data = stack.pop()
data = old_data + data
current_level = old_level
data.append(
{
“Title”: node.get_text(),
“Content”: [],
“Level”: level,
“Table”: [],
“Tbale_name”: [],
}
)
else:
if data:
if node.get_text().startswith(“表”):
data[-1][“Tbale_name”].append(
node.get_text().strip(“SEQ * ARABIC”).strip(“SEQ”)
)
if (
node.get_text().startswith(“表”)
or node.get_text().startswith(“来源:”)
or node.get_text().startswith(“图”)
):
pass
else:
data[-1]["Content"].append(node.get_text())
if data:
if node.node_type == aw.NodeType.TABLE:
parent_node = node.as_table()
able_content = aw_read_table(parent_node)
data[-1]["Table"].append(able_content)
while stack:
old_level, old_data = stack.pop()
data = old_data + data
return data
if data:
if node.node_type == aw.NodeType.TABLE:
parent_node = node.as_table()
able_content = aw_read_table(parent_node)
data[-1][“Table”].append(able_content)
if data:
if node.node_type == aw.NodeType.TABLE:
parent_node = node.as_table()
able_content = aw_read_table(parent_node)
data[-1]["Table"].append(able_content)
这个为读取的表格信息,我需要让这个 table 数据带上合并规则:
if node.node_type == aw.NodeType.TABLE:
parent_node = node.as_table()
able_content = aw_read_table(parent_node)
data[-1][“Table”].append(able_content)
@hhh1111 根据 Microsoft Word 的设计,Microsoft Word 文档中表格的行是完全独立的。这意味着每一行可以有任意数量、任意宽度的单元格。因此,如果您想象第一行有一个宽单元格,第二行有两个窄单元格,那么查看此文档时,第一行的单元格会出现水平合并。但这并不是合并单元格,它只是一个宽单元格。另一种完全正确的情况是第一行有两个单元格。第一个单元格有 CellMerge.First,第二个单元格有 CellMerge.Previous,在这种情况下,它就是一个合并单元格。在这两种情况下,MS Word 中的视觉效果是完全一样的。这两种情况都是有效的。
您可以使用下面的代码,它可以计算出合并单元格跨越了多少列或多少行:
def aw_read_table(self, table, tables):
    """Describe every cell of ``table`` as a ``tableCell`` dict with spans.

    Spans are derived from the ``horizontal_merge`` / ``vertical_merge``
    flags of the cell format: a cell flagged FIRST spans itself plus the
    unbroken run of PREVIOUS cells that follows it in the row (colspan) or
    in the same cell position of the rows below (rowspan).

    Args:
        table: The table to read.
        tables: Collection used only to compute the table's index for ids.

    Returns:
        A flat list with one dict per cell (continuation cells included),
        each with ``attrs`` holding ``colspan``, ``rowspan`` and an ``id``
        of the form ``"<table>.<row>.<cell>"``.
    """
    # NOTE(review): cells that merely look merged because they are wider
    # carry no merge flags and are reported with a span of 1; Aspose
    # documents Table.convert_to_horizontally_merged_cells() for that case.
    # Confirm whether it should be called before reading.
    first = aw.tables.CellMerge.FIRST
    previous = aw.tables.CellMerge.PREVIOUS
    table_index = tables.index_of(table)
    row_count = table.rows.count

    table_data = []
    for row in table.rows:
        row_index = table.index_of(row)
        for cell in row.as_row().cells:
            cell = cell.as_cell()
            row_cells = cell.parent_row.cells
            cell_index = cell.parent_row.index_of(cell)

            col_span = 1
            if cell.cell_format.horizontal_merge == first:
                # Stop at the first cell that does not continue the merge so
                # a later merged group in the same row is not counted.
                for i in range(cell_index + 1, row_cells.count):
                    if row_cells[i].cell_format.horizontal_merge != previous:
                        break
                    col_span += 1

            row_span = 1
            if cell.cell_format.vertical_merge == first:
                for i in range(row_index + 1, row_count):
                    below = table.rows[i].cells
                    # Rows may hold fewer cells; a missing cell ends the run.
                    if cell_index >= below.count:
                        break
                    if below[cell_index].cell_format.vertical_merge != previous:
                        break
                    row_span += 1

            table_data.append({
                "type": "tableCell",
                "attrs": {
                    "colspan": col_span,
                    "rowspan": row_span,
                    "id": f"{table_index}.{row_index}.{cell_index}",
                },
                "content": [
                    {
                        "type": "paragraph",
                        "content": [
                            {
                                "type": "text",
                                "text": cell.get_text(),
                            }
                        ],
                    }
                ],
            })
    return table_data
doc = aw.Document("CSR.docx")
current_level = 0
data = []
stack = []
for s in doc.sections:
sect = s.as_section()
for node in sect.body.get_child_nodes(aw.NodeType.ANY, False):
if node.node_type == aw.NodeType.PARAGRAPH:
node = node.as_paragraph()
if node.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
level = int(node.paragraph_format.outline_level) + 1
if level > current_level:
# 如果级别更深,将当前标题添加到堆栈
stack.append((current_level, data))
data = []
current_level = level
elif level < current_level:
# 如果级别更浅,将堆栈中的项添加回数据
while stack and stack[-1][0] >= level:
old_level, old_data = stack.pop()
data = old_data + data
current_level = old_level
data.append(
{
"Title": node.get_text(),
"Content": [],
"Level": level,
"Table": [],
"Tbale_name": [],
}
)
else:
if data:
if node.get_text().startswith("表"):
data[-1]["Tbale_name"].append(
node.get_text().strip("SEQ \* ARABIC").strip("SEQ")
)
if (
node.get_text().startswith("表")
or node.get_text().startswith("来源:")
or node.get_text().startswith("图")
):
pass
else:
data[-1]["Content"].append(node.get_text())
if data:
if node.node_type == aw.NodeType.TABLE:
parent_node = node.as_table()
tables = doc.get_child_nodes(aw.NodeType.TABLE, True)
able_content = self.aw_read_table(parent_node, tables)
data[-1]["Table"].append(able_content)
while stack:
old_level, old_data = stack.pop()
data = old_data + data
return data
希望能帮到你。
表格出来合并的数据不对呀
这里 rowspan不可能是1啊
def aw_extract_headings_and_contents_table_dict_id(file):
    """Collect headings with their text and table blocks, each carrying ids.

    Walks the direct children of every section body. A paragraph whose
    outline level is 0-5 starts a new entry. Later non-blank paragraphs and
    tables are appended, in order, to the ``Content`` list of the most
    recent entry. Nodes before the first heading are ignored.

    Args:
        file: Passed straight to ``aw.Document``.

    Returns:
        A flat list, in document order, of heading dicts with the keys
        ``Title``, ``block_id``, ``Content``, ``Level`` (outline level + 1),
        ``Table`` and ``Tbale_name`` (both left empty here). Every content
        item has ``type`` ("text" or "table"), ``content``, a ``block_id``
        of the form ``"<heading id>&&<own id>"`` and ``parent_block_id``.
    """
    import aspose.words as aw

    lic = aw.License()
    lic_path = os.path.join(settings.BASE_PATH, "core/Aspose.Total.Product.Family.lic")
    lic.set_license(lic_path)
    doc = aw.Document(file)

    heading_levels = (0, 1, 2, 3, 4, 5)
    # Fetched once; it was previously rebuilt for every table encountered.
    tables = doc.get_child_nodes(aw.NodeType.TABLE, True)

    # The earlier level stack always re-joined its pieces in document order,
    # so the result was a flat list; it is built directly here.
    data = []
    for s in doc.sections:
        sect = s.as_section()
        for node in sect.body.get_child_nodes(aw.NodeType.ANY, False):
            # Two ids are drawn for every node, used or not, exactly as
            # before, so the sequence of generated ids does not change.
            block_id = generate_unique_id()
            block_id1 = generate_unique_id()
            if node.node_type == aw.NodeType.PARAGRAPH:
                paragraph = node.as_paragraph()
                outline_level = paragraph.paragraph_format.outline_level
                if outline_level in heading_levels:
                    data.append(
                        {
                            "Title": paragraph.get_text(),
                            "block_id": str(block_id),
                            "Content": [],
                            "Level": int(outline_level) + 1,
                            "Table": [],
                            "Tbale_name": [],
                        }
                    )
                elif data:
                    text = paragraph.get_text().strip()
                    if text:
                        parent_id = data[-1]["block_id"]
                        data[-1]["Content"].append(
                            {
                                "type": "text",
                                "content": text,
                                "block_id": parent_id + '&&' + str(block_id1),
                                "parent_block_id": parent_id,
                            }
                        )
            elif data and node.node_type == aw.NodeType.TABLE:
                parent_id = data[-1]["block_id"]
                child_id = parent_id + '&&' + str(block_id1)
                table_rows = aw_read_table_id(node.as_table(), tables)
                data[-1]["Content"].append(
                    {
                        "type": "table",
                        "content": {
                            "type": "table",
                            "attrs": {"id": child_id},
                            "content": table_rows,
                        },
                        "block_id": child_id,
                        "parent_block_id": parent_id,
                    }
                )
    return data
def aw_read_table_id(table, tables):
    """Convert ``table`` into a list of ``tableRow`` dicts with cell spans.

    Spans are derived from the ``horizontal_merge`` / ``vertical_merge``
    flags of the cell format: a cell flagged FIRST spans itself plus the
    unbroken run of PREVIOUS cells that follows it in the row (colspan) or
    in the same cell position of the rows below (rowspan).

    Args:
        table: The table to read.
        tables: Kept for call compatibility; not used.

    Returns:
        One ``{"type": "tableRow", "content": [...]}`` dict per row. Each
        cell (continuation cells included) is a ``tableCell`` dict whose
        ``attrs`` hold ``colspan``, ``rowspan`` and ``colwidth`` (None) and
        whose content is one paragraph with the cell text.
    """
    import aspose.words as aw

    lic = aw.License()
    lic_path = os.path.join(settings.BASE_PATH, "core/Aspose.Total.Product.Family.lic")
    lic.set_license(lic_path)

    # NOTE(review): cells that merely look merged because they are wider
    # carry no merge flags and are reported with a span of 1; Aspose
    # documents Table.convert_to_horizontally_merged_cells() for that case.
    # Confirm whether it should be called before reading.
    first = aw.tables.CellMerge.FIRST
    previous = aw.tables.CellMerge.PREVIOUS
    row_count = table.rows.count

    table_data = []
    for row in table.rows:
        row_index = table.index_of(row)
        row_content = []
        for cell in row.as_row().cells:
            cell = cell.as_cell()
            row_cells = cell.parent_row.cells
            cell_index = cell.parent_row.index_of(cell)

            col_span = 1
            if cell.cell_format.horizontal_merge == first:
                # Stop at the first cell that does not continue the merge so
                # a later merged group in the same row is not counted.
                for i in range(cell_index + 1, row_cells.count):
                    if row_cells[i].cell_format.horizontal_merge != previous:
                        break
                    col_span += 1

            row_span = 1
            if cell.cell_format.vertical_merge == first:
                for i in range(row_index + 1, row_count):
                    below = table.rows[i].cells
                    # Rows may hold fewer cells; a missing cell ends the run.
                    if cell_index >= below.count:
                        break
                    if below[cell_index].cell_format.vertical_merge != previous:
                        break
                    row_span += 1

            row_content.append(
                {
                    "type": "tableCell",
                    "attrs": {
                        "colspan": col_span,
                        "rowspan": row_span,
                        "colwidth": None,
                    },
                    "content": [
                        {
                            "type": "paragraph",
                            "content": [
                                {
                                    "type": "text",
                                    "text": cell.get_text(),
                                }
                            ],
                        }
                    ],
                }
            )
        table_data.append({"type": "tableRow", "content": row_content})
    return table_data