怎么把table的node节点转换为html格式

hhh1111 · October 17, 2024, 3:26am

def aw_extract_headings_and_contents_table_dict_id(file):
    """Collect headings with their body text and tables from a Word document.

    Every node below each section body is visited in order. A paragraph whose
    outline level is 0-5 opens a new entry; every other non-empty paragraph is
    appended to the ``content`` of the latest entry, and every table is
    converted with ``aw_read_table_id`` and appended to its ``tables``.
    Paragraphs that sit inside a table are skipped, so cell text is reported
    only through ``tables`` and is never duplicated into ``content``.

    Args:
        file: Passed unchanged to ``aw.Document``.

    Returns:
        list[dict]: ``{"title": str, "content": list[str], "tables": list}``
        entries. Text and tables that precede the first heading are dropped.
    """
    # Field-code fragments removed from body text. Backslashes are escaped
    # explicitly so the literals carry no invalid escape sequences.
    field_code_noise = (
        "  SEQ 表 \\* ARABIC ",
        'TOC \\h \\c "表" HYPERLINK \\l "_Toc14741"',
        "\u0013 SEQ 图 \\* ARABIC \u00141\u0015 ",
    )

    doc = aw.Document(file)
    doc.update_list_labels()  # list_label.label_string is read below

    current_level = 0
    data = []
    stack = []

    for s in doc.sections:
        sect = s.as_section()
        for node in sect.body.get_child_nodes(aw.NodeType.ANY, True):
            if node.node_type == aw.NodeType.TABLE:
                table_content = aw_read_table_id(node.as_table())
                if data:
                    data[-1]["tables"].append(table_content)
                continue

            if node.node_type != aw.NodeType.PARAGRAPH:
                continue
            # The deep traversal also yields the paragraphs inside table
            # cells; those belong to the table, not to the body text.
            if node.get_ancestor(aw.NodeType.TABLE) is not None:
                continue

            para = node.as_paragraph()
            text = para.get_text().strip()

            if para.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
                level = int(para.paragraph_format.outline_level) + 1
                if level > current_level:
                    # Going deeper: park the entries gathered so far.
                    stack.append((current_level, data))
                    data = []
                    current_level = level
                elif level < current_level:
                    # Going back up: re-join parked entries in their original order.
                    while stack and stack[-1][0] >= level:
                        old_level, old_data = stack.pop()
                        data = old_data + data
                        current_level = old_level
                label = ''
                if para.list_format.is_list_item:
                    label = para.list_label.label_string
                data.append({
                    "title": label + text if label else text,
                    "content": [],
                    "tables": [],
                })
            elif data and text:
                for noise in field_code_noise:
                    text = text.replace(noise, '')
                data[-1]["content"].append(text + "\n")

    # Re-attach whatever is still parked on the stack.
    while stack:
        old_level, old_data = stack.pop()
        data = old_data + data

    return data
def _aw_vertical_span(table, row_index, cell_index):
    """Count the rows covered by a vertical merge that starts at the given cell."""
    span = 1
    for i in range(row_index + 1, table.rows.count):
        cells = table.rows[i].cells
        # A shorter row has no cell in this column, so the merge cannot continue.
        if cell_index >= cells.count:
            break
        # Stop at the first cell that does not continue the merge; otherwise a
        # second, unrelated merge further down the column would be added in.
        if cells[cell_index].cell_format.vertical_merge != aw.tables.CellMerge.PREVIOUS:
            break
        span += 1
    return span


def _aw_horizontal_span(first_cell):
    """Return ``(col_span, next_cell)`` for a horizontal merge starting at first_cell."""
    span = 1
    cell = first_cell.next_cell
    while cell is not None and cell.cell_format.horizontal_merge == aw.tables.CellMerge.PREVIOUS:
        span += 1
        cell = cell.next_cell
    return span, cell


def aw_read_table_id(table=None, id=1):
    """Convert a table node into a nested dict of rows, cells and paragraphs.

    Args:
        table: Table node to read. ``convert_to_horizontally_merged_cells()``
            is called on it first, so the node is modified.
        id: Value stored under ``attrs["id"]`` of the result. The name shadows
            the builtin but is kept because callers may pass it by keyword.

    Returns:
        dict: ``{"type": "table", "attrs": {"id": id}, "content": [row, ...]}``.
        Each row is ``{"type": "tableRow", "content": [cell, ...]}`` and each
        cell is a ``"tableCell"`` dict with ``colspan``/``rowspan``/``colwidth``
        attrs wrapping one ``"paragraph"`` that holds the cell's ``get_text()``
        result unmodified.

    NOTE(review): cells that continue a vertical merge (``PREVIOUS``) are still
    emitted as ordinary cells with ``rowspan`` 1 - confirm whether the consumer
    expects them to be omitted.
    """
    table.convert_to_horizontally_merged_cells()

    merge = aw.tables.CellMerge
    table_data = []
    for row in table.rows:
        row_index = table.index_of(row)
        row_cells = []
        current_cell = row.as_row().first_cell
        while current_cell is not None:
            cell_format = current_cell.cell_format
            cell_text = current_cell.get_text()
            row_span = 1
            col_span = 1

            if cell_format.vertical_merge == merge.FIRST:
                cell_index = current_cell.parent_row.index_of(current_cell)
                row_span = _aw_vertical_span(table, row_index, cell_index)

            if cell_format.horizontal_merge == merge.FIRST:
                # Also skips the merged-away cells that follow.
                col_span, current_cell = _aw_horizontal_span(current_cell)
            else:
                current_cell = current_cell.next_cell

            row_cells.append({
                "type": "tableCell",
                "attrs": {
                    "colspan": col_span,
                    "rowspan": row_span,
                    "colwidth": None
                },
                "content": [
                    {
                        "type": "paragraph",
                        "content": [
                            {
                                "type": "text",
                                "text": cell_text,
                            }
                        ]
                    }
                ]
            })

        table_data.append({
            "type": "tableRow",
            "content": row_cells
        })

    return {
        "type": "table",
        "attrs": {
            "id": id
        },
        "content": table_data
    }

上面aw_read_table_id的这个函数怎么返回一个html表格的数据

Professionalize.Discourse · October 17, 2024, 3:26am

@hhh1111

要将表格的节点转换为HTML格式，您可以使用Aspose.Words库中的功能来实现。以下是一个示例代码，展示了如何将表格转换为HTML格式：

import aspose.words as aw

def table_to_html(file_path):
    """Return the first table of a Word document rendered as HTML.

    Args:
        file_path: Document location, passed unchanged to ``aw.Document``.

    Returns:
        The result of ``to_string(aw.SaveFormat.HTML)`` on the table found at
        index 0 by a deep ``get_child`` search.
    """
    # NOTE(review): get_child presumably yields None when the document has no
    # table, in which case the call below fails - confirm.
    first_table = aw.Document(file_path).get_child(aw.NodeType.TABLE, 0, True)
    return first_table.to_string(aw.SaveFormat.HTML)

# Usage example: convert the first table of the document and print the HTML.
file_path = "path/to/your/document.docx"
html_output = table_to_html(file_path)
print(html_output)

在这个示例中，我们首先加载一个Word文档，然后获取文档中的第一个表格。接着，我们使用to_string方法将表格转换为HTML格式。您可以根据需要调整文件路径和表格索引。

如果您需要处理多个表格或特定的表格，可以在代码中进行相应的修改。希望这能帮助您实现将表格节点转换为HTML格式的需求！

hhh1111 · October 17, 2024, 3:27am

还有就是我不需要导航标题信息

hhh1111 · October 17, 2024, 3:29am

为什么我获取段落信息时，表格数据也出现在段落信息里面了？

hhh1111 · October 17, 2024, 3:31am

为什么我获取段落信息时，表格数据也出现在段落信息里面了？

hhh1111 · October 17, 2024, 3:35am

def aw_extract_headings_and_contents_table_dict_id(file):
    """Split a Word document into heading entries with body text and tables.

    Nodes below each section body are visited in order. A paragraph with
    outline level 0-5 starts a new entry; any other non-empty paragraph is
    added to the ``content`` of the most recent entry; each table is converted
    with ``aw_read_table_id`` and added to that entry's ``tables``. Paragraphs
    located inside a table are ignored here, which keeps table text out of
    ``content``.

    Args:
        file: Passed unchanged to ``aw.Document``.

    Returns:
        list[dict]: ``{"title": str, "content": list[str], "tables": list}``
        entries. Anything that appears before the first heading is dropped.
    """
    # Field-code fragments removed from body text; backslashes are written
    # out explicitly so no literal relies on an invalid escape sequence.
    field_code_noise = (
        "  SEQ 表 \\* ARABIC ",
        'TOC \\h \\c "表" HYPERLINK \\l "_Toc14741"',
        "\u0013 SEQ 图 \\* ARABIC \u00141\u0015 ",
    )

    doc = aw.Document(file)
    doc.update_list_labels()  # list_label.label_string is read below

    current_level = 0
    data = []
    stack = []

    for s in doc.sections:
        sect = s.as_section()
        for node in sect.body.get_child_nodes(aw.NodeType.ANY, True):
            if node.node_type == aw.NodeType.TABLE:
                table_content = aw_read_table_id(node.as_table())
                if data:
                    data[-1]["tables"].append(table_content)
                continue

            if node.node_type != aw.NodeType.PARAGRAPH:
                continue
            # Cell paragraphs are also returned by the deep traversal; they
            # are already covered by the table conversion above.
            if node.get_ancestor(aw.NodeType.TABLE) is not None:
                continue

            para = node.as_paragraph()
            text = para.get_text().strip()

            if para.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
                level = int(para.paragraph_format.outline_level) + 1
                if level > current_level:
                    # Deeper heading: park the entries collected so far.
                    stack.append((current_level, data))
                    data = []
                    current_level = level
                elif level < current_level:
                    # Shallower heading: merge parked entries back in order.
                    while stack and stack[-1][0] >= level:
                        old_level, old_data = stack.pop()
                        data = old_data + data
                        current_level = old_level
                label = ''
                if para.list_format.is_list_item:
                    label = para.list_label.label_string
                data.append(
                    {"title": label + text if label else text, "content": [],
                     "tables": []})
            elif data and text:
                for noise in field_code_noise:
                    text = text.replace(noise, '')
                data[-1]["content"].append(text + "\n")

    # Merge back anything still parked on the stack.
    while stack:
        old_level, old_data = stack.pop()
        data = old_data + data

    return data

不要把表格数据出现在段落内容的content里面

vyacheslav.deryushev · October 17, 2024, 7:45am

@hhh1111 这是因为您遍历了文档中的所有节点。表格单元格内部也包含段落节点，因此在获取每个段落时，也会获取到表格中的段落。在这种情况下，您可以额外检查该段落是否位于表格内部：

if node.node_type == aw.NodeType.PARAGRAPH and node.get_ancestor(aw.NodeType.TABLE) is None:

hhh1111 · October 18, 2024, 2:02am

请问怎么提取出段落中字体为绿色的内容？

hhh1111 · October 18, 2024, 2:37am

请问怎么提取出段落中字体为绿色的内容？

hhh1111 · October 18, 2024, 2:45am

def aw_extract_headings_and_contents_table_dict_id(file):
    """Map each heading of a Word document to the text that follows it.

    Nodes below each section body are visited in order. A paragraph with
    outline level 0-5 becomes a key (list label + text); the text of later
    non-heading paragraphs outside tables, and the output of
    ``aw_read_table_as_markdown`` for each table, are appended to the value of
    the most recently inserted key, each followed by a newline.

    Args:
        file: Passed unchanged to ``aw.Document``.

    Returns:
        dict[str, str]: heading title -> accumulated text. Content that
        precedes the first heading is dropped.

    NOTE(review): a heading whose title is already a key of the current dict
    does not create a new entry, so the text after it is appended to whichever
    key was inserted last - confirm this is acceptable for repeated titles.
    """
    doc = aw.Document(file)
    current_level = 0
    data = {}
    doc.update_list_labels()  # list_label.label_string is read below
    stack = []

    for s in doc.sections:
        sect = s.as_section()
        for node in sect.body.get_child_nodes(aw.NodeType.ANY, True):
            if node.node_type == aw.NodeType.PARAGRAPH:
                node = node.as_paragraph()
                text = node.get_text().strip()
                # Resolve the label per paragraph so a label from an earlier
                # list item is never reused for an unrelated paragraph.
                label = ''
                if node.list_format.is_list_item:
                    label = node.list_label.label_string
                labelled_text = label + text if label else text

                if node.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
                    level = int(node.paragraph_format.outline_level) + 1
                    if level > current_level:
                        # Deeper heading: park the entries collected so far.
                        stack.append((current_level, data))
                        data = {}
                        current_level = level
                    elif level < current_level:
                        # Shallower heading: merge parked entries back.
                        while stack and stack[-1][0] >= level:
                            old_level, old_data = stack.pop()
                            data = {**old_data, **data}
                            current_level = old_level
                    if labelled_text not in data:
                        data[labelled_text] = ""
                elif text and data and not node.get_ancestor(aw.NodeType.TABLE):
                    last_key = list(data)[-1]
                    # The newline is added for labelled and unlabelled text alike.
                    data[last_key] += labelled_text + "\n"

            if node.node_type == aw.NodeType.TABLE:
                parent_node = node.as_table()
                table_content = aw_read_table_as_markdown(parent_node)

                if data:
                    last_key = list(data)[-1]
                    data[last_key] += table_content + "\n"

    # Merge back anything still parked on the stack.
    while stack:
        old_level, old_data = stack.pop()
        data = {**old_data, **data}

    return data

hhh1111 · October 18, 2024, 3:56am

比如一段话里有绿色的内容，提取出来后在这段话的前后各加一个标签。

vyacheslav.deryushev · October 18, 2024, 8:06am

@hhh1111 字体颜色有几种情况。

如果段落包含带有字体颜色的run，则需要使用：

doc = aw.Document("input.docx")

# Visit every Run in the document and react to the ones whose font color is green.
# NOTE(review): `drawing` is not imported in this snippet; presumably
# `import aspose.pydrawing as drawing` - confirm.
for run in doc.get_child_nodes(aw.NodeType.RUN, True):
    run = run.as_run()
    if run.font.color == drawing.Color.green:
        # Marker runs placed around the paragraph that owns the green run.
        new_run_1 = aw.Run(doc, "New text 1")
        new_run_2 = aw.Run(doc, "New text 2")

        para = run.parent_paragraph
        # First marker goes before the paragraph's first child, second at its end.
        para.insert_before(new_run_1, para.first_child)
        para.append_child(new_run_2)

doc.save("output.docx")

如果你需要找到字体颜色的段落，你需要使用：

doc = aw.Document("input.docx")

# Visit every paragraph and react to those whose paragraph-break font is green.
# NOTE(review): `drawing` is not imported in this snippet; presumably
# `import aspose.pydrawing as drawing` - confirm.
for para in doc.get_child_nodes(aw.NodeType.PARAGRAPH, True):
    para = para.as_paragraph()
    if para.paragraph_break_font.color == drawing.Color.green:
        # Build one marker paragraph and clone it for the second position.
        new_run = aw.Run(doc, "New text")
        new_para_1 = aw.Paragraph(doc)
        new_para_1.append_child(new_run)
        new_para_2 = new_para_1.clone(True)

        # Insert the markers directly before and after the matching paragraph.
        para.parent_node.insert_before(new_para_1, para)
        para.parent_node.insert_after(new_para_2, para)

doc.save("output.docx")

hhh1111 · November 6, 2024, 7:36am

怎么给文本设置绿色呢？？？？？

vyacheslav.deryushev · November 6, 2024, 7:58am

@hhh1111 您应该为段落中的 Run（文本运行）节点设置颜色。例如：

// Color every run of every paragraph in the first section's body green.
// NOTE(review): `doc` is not declared in this snippet; presumably an already loaded Document - confirm.
ParagraphCollection paragraphs = doc.getFirstSection().getBody().getParagraphs();
for (Paragraph paragraph : paragraphs) {
    for (Run run : paragraph.getRuns()) {
        run.getFont().setColor(Color.GREEN);
    }
}

# Python counterpart: set the font color of every Run in the document to green.
# NOTE(review): `doc` and `drawing` are not defined in this snippet - confirm they exist in the caller.
runs = doc.get_child_nodes(aw.NodeType.RUN, True)
for run in runs:
    run = run.as_run()
    run.font.color = drawing.Color.green

hhh1111 · November 6, 2024, 8:34am

vyacheslav.deryushev:

doc = aw.Document("input.docx")

# Visit every paragraph and react to those whose paragraph-break font is green.
for para in doc.get_child_nodes(aw.NodeType.PARAGRAPH, True):
    para = para.as_paragraph()
    if para.paragraph_break_font.color == drawing.Color.green:
        # Build one marker paragraph and clone it for the second position.
        new_run = aw.Run(doc, "New text")
        new_para_1 = aw.Paragraph(doc)
        new_para_1.append_child(new_run)
        new_para_2 = new_para_1.clone(True)

        # Insert the markers directly before and after the matching paragraph.
        para.parent_node.insert_before(new_para_1, para)
        para.parent_node.insert_after(new_para_2, para)

doc.save("output.docx")

我还有一个问题：

for part in parts:
    if part.startswith('<table'):
        # 处理表格内容
        builder.insert_html(part)

怎么给 insert_html 插入的 table 重新设置样式呢？

hhh1111 · November 6, 2024, 8:40am

builder.insert_html(part) 我需要对插入的表格重新设置表格样式

vyacheslav.deryushev · November 6, 2024, 9:48am

@hhh1111 您需要找到插入的表格并清除样式：

# Take the first table of the first section's body and remove its borders and shading.
table = doc.first_section.body.tables[0]
table.clear_borders()
table.clear_shading()

之后，您可以使用例如table.style_identifier = aw.StyleIdentifier.TABLE_GRID来设置新的表格样式。

hhh1111 · November 6, 2024, 10:16am

可是不能只获取第一个 table 吧？我的代码如下，我需要在“# 处理表格内容”这个位置重置表格样式：

import aspose.words as aw
import jinja2

# Activate the Aspose.Words license
lic = aw.License()
lic_path = "../Aspose.Total.Product.Family.lic"
lic.set_license(lic_path)
import aspose.pydrawing as drawing

import re
def set_paragraph_color(builder, color):
    """Apply a named font color to the builder, or reset its font formatting.

    Args:
        builder: Object exposing a ``font`` attribute (a document builder).
        color: ``"green"`` or ``"blue"`` sets ``builder.font.color`` to the
            matching ``drawing.Color`` member; any other value, including
            ``None``, calls ``builder.font.clear_formatting()`` instead.
    """
    if color not in ("green", "blue"):
        builder.font.clear_formatting()
        return
    builder.font.color = getattr(drawing.Color, color)


# Splits content into <table>, <blue> and <green> fragments and the plain text between them.
_CONTENT_SPLIT_RE = re.compile(
    r'(<table.*?>.*?</table>|<blue>.*?</blue>|<green>.*?</green>)', flags=re.S)
# Opening tag of a colored fragment; group 1 is the color name.
_COLOR_TAG_RE = re.compile(r'<(blue|green)>')
# One or more <br> tags at the very start or very end of a fragment.
_EDGE_BR_RE = re.compile(r'\A(?:<br\s*/?>)+|(?:<br\s*/?>)+\Z', re.IGNORECASE)


def _strip_edge_br(text):
    """Remove leading/trailing ``<br>`` tags without touching ordinary letters."""
    # str.strip('<br>') treats its argument as the character set {<, b, r, >}
    # and would also eat real text such as a leading "b" or a trailing "r".
    return _EDGE_BR_RE.sub('', text)


def insert_title_content(doc_path, insertions, output_path='result.docx'):
    """Replace each ``start_insert`` placeholder paragraph with titled content.

    For every paragraph whose text contains ``start_insert``, its runs are
    cleared and, for each ``title -> content`` pair, the title is written as a
    "Heading 1" paragraph followed by the content. Content is split into
    ``<table>`` fragments (inserted as HTML), ``<blue>``/``<green>`` fragments
    (written in that color, "Normal" style) and plain text (written as
    "Normal" paragraphs).

    Args:
        doc_path: Template document, passed unchanged to ``aw.Document``.
        insertions: Mapping of heading title to content string.
        output_path: Where the result is saved; defaults to ``result.docx``.
    """
    doc = aw.Document(doc_path)
    builder = aw.DocumentBuilder(doc)

    builder.paragraph_format.clear_formatting()

    # NOTE(review): the loop writes new paragraphs while iterating this
    # collection; confirm inserted content can never contain "start_insert".
    for paragraph in doc.get_child_nodes(aw.NodeType.PARAGRAPH, True):
        paragraph = paragraph.as_paragraph()
        if "start_insert" not in paragraph.get_text():
            continue

        # Clear the placeholder's runs and write from this paragraph on.
        paragraph.get_child_nodes(aw.NodeType.RUN, True).clear()
        builder.move_to(paragraph)
        builder.paragraph_format.clear_formatting()

        for title, content in insertions.items():
            heading_level = 1  # every title is written at heading level 1
            builder.paragraph_format.style_name = f"Heading {heading_level}"
            builder.writeln(title.strip())

            if not content.strip():
                continue

            for part in _CONTENT_SPLIT_RE.split(content):
                if part.startswith('<table'):
                    # Table fragment: let the builder parse the HTML.
                    builder.insert_html(part)
                    continue

                color_match = _COLOR_TAG_RE.match(part)
                if color_match:
                    # Colored fragment: drop the tags, write it in that color,
                    # then restore the default font formatting.
                    color = color_match.group(1)
                    color_content = re.sub(r'</?%s>' % color, '', part)
                    set_paragraph_color(builder, color)
                    builder.paragraph_format.style_name = "Normal"
                    builder.write(_strip_edge_br(color_content))
                    set_paragraph_color(builder, None)
                else:
                    # Plain text between tagged fragments.
                    builder.paragraph_format.style_name = "Normal"
                    builder.writeln(part.strip())

    doc.save(output_path)

import json

if __name__ == '__main__':
    # Load the title -> content mapping from result.json and insert it into a.docx.
    with open('result.json', 'r', encoding='utf-8') as file:
        data = json.load(file)
    insert_title_content('a.docx', data)

vyacheslav.deryushev · November 6, 2024, 12:28pm

@hhh1111 您可以在插入html表后获取当前段落，并获取前一个节点，该节点将是一个表。例如：

builder.insert_html(html_content)
# After the insert, the table is expected to be the node directly before the
# builder's current paragraph; guard against there being no previous node.
table_node = builder.current_paragraph.previous_sibling
if table_node is not None and table_node.node_type == aw.NodeType.TABLE:
    table = table_node.as_table()
    # Remove the borders and shading that came with the inserted table.
    table.clear_borders()
    table.clear_shading()

hhh1111 · November 7, 2024, 2:19am

哪个属性是设置段前行间距的属性builder.paragraph_format.段前