怎么删除标题下面指定内容的表和段落还有图片信息

vyacheslav.deryushev · May 28, 2025, 12:15pm

@Tiaohh 这可能是因为其他条件不允许找到该字段。请提供有此字段的文件。

Tiaohh · May 29, 2025, 1:12am

vyacheslav.deryushev · May 29, 2025, 7:20am

@Tiaohh 在这份文件中，我只看到了 REF 字段，但没有看到“见第 HYPERLINK \l "_生命体征"\u0014 9.5.1.2.2 节”。对于 REF 字段，您可以使用下面的代码；或者删除 field_type 条件，只保留 "_Toc" not in node.get_field().result 这一条件：

for node in sect.body.get_child_nodes(aw.NodeType.ANY, True):
    if node.node_type == aw.NodeType.FIELD_START:
        node = node.as_field_start()
        if (node.field_type == aw.fields.FieldType.FIELD_HYPERLINK or node.field_type == aw.fields.FieldType.FIELD_REF) and "_Toc" not in node.get_field().result:

Tiaohh · June 3, 2025, 2:47am

# NOTE(review): this import/license preamble appears twice verbatim (the
# second copy follows directly below). The second copy re-imports the same
# modules and applies the same license file again.
import json
import os
import re

import aspose.words as aw

# Apply the Aspose license; the path is relative, so it is resolved against
# the current working directory.
lic = aw.License()
lic_path = "../Aspose.Total.Product.Family.lic"
lic.set_license(lic_path)
import jinja2

import json
import os
import re

import aspose.words as aw
lic = aw.License()
lic_path = "../Aspose.Total.Product.Family.lic"
lic.set_license(lic_path)
import jinja2

#
# def remove_illegal_characters(value):
#     if not value:
#         return None
#
#     original_value = value  # 保存原始文本以便比较
#
#     # 临时替换函数：提取HYPERLINK中的实际文本
#     def replace_hyperlink(match):
#         # 查找HYPERLINK格式中的实际文本
#         text_match = re.search(r'\u0014(.*?)\u0015', match.group(0))
#         if text_match:
#             return text_match.group(1)  # 返回实际文本
#         return ''  # 如果没找到文本，返回空字符串
#
#     # 第一轮：替换超链接格式为其文本内容
#     value = re.sub(r'\u0013\s*HYPERLINK[^\u0015]*?\u0014(.*?)\u0015',
#                    replace_hyperlink,
#                    value,
#                    flags=re.IGNORECASE | re.DOTALL)
#
#     # 移除其他格式代码
#     patterns_to_remove = [
#         r'HYPERLINK\s*\\?l\s*"[^"]*"',  # 超链接标记
#         r'\*\s*MERGEFORMAT',  # 格式标记
#         r'[\000-\010\013\014\016-\037]',  # 控制字符
#         r'\\',  # 反斜杠
#     ]
#
#     for pattern in patterns_to_remove:
#         value = re.sub(pattern, '', value, flags=re.IGNORECASE | re.DOTALL)
#
#     # 将换行符和多个空格替换为单个空格
#     value = re.sub(r'\s+', ' ', value)
#     value = value.strip()
#
#     # 检查是否丢失了引用编号
#     original_refs = re.findall(r'\d+(?:\s*,\s*\d+)*', original_value)
#     cleaned_refs = re.findall(r'\d+(?:\s*,\s*\d+)*', value)
#
#     # 检查原始文本中有但清理后文本中没有的引用
#     for ref in original_refs:
#         if ref.strip() not in ' '.join(cleaned_refs) and ref.strip().isdigit():
#             # 如果引用数字不在清理后文本中，添加回去
#             if value[-1] not in ('.', ',', ':', ';'):
#                 value += ' ' + ref.strip()
#             else:
#                 value += ref.strip()
#
#     return value

# A field block as it appears in extracted text: everything from the
# field-start marker (\u0013) up to the first field-end marker (\u0015).
# DOTALL lets a block span line breaks.
_FIELD_BLOCK_RE = re.compile(r'\u0013.*?\u0015', re.DOTALL)

# Control characters other than TAB (\011), LF (\012) and CR (\015), plus the
# backslash. One character class replaces the former separate passes (the
# control-character pattern used to be listed, and therefore applied, twice).
_JUNK_CHARS_RE = re.compile(r'[\000-\010\013\014\016-\037\\]')


def remove_illegal_characters(value):
    """Strip field blocks, control characters and backslashes from text.

    Processing happens in two steps:

    1. Every ``\\u0013 ... \\u0015`` span (field code together with its
       result) is removed. The match is non-greedy, so with nested fields it
       stops at the first ``\\u0015``; the outer field's remaining result
       text is kept and only its marker characters are removed in step 2.
    2. Remaining control characters (except tab, line feed and carriage
       return) and all backslashes are removed.

    Args:
        value: Text to clean. May be ``None`` or empty.

    Returns:
        The cleaned string, or ``None`` when ``value`` is falsy
        (``None`` or ``""``).
    """
    if not value:
        return None

    without_fields = _FIELD_BLOCK_RE.sub('', value)
    return _JUNK_CHARS_RE.sub('', without_fields)


# Activate the Aspose.Words license
lic = aw.License()
lic_path = "../Aspose.Total.Product.Family.lic"
lic.set_license(lic_path)
from bs4 import BeautifulSoup


# Convert a table node into minimal HTML
def aw_read_table_id(table=None):
    """Export a table to HTML and reduce it to its bare structure.

    The table is serialised with ``to_string(aw.SaveFormat.HTML)`` and parsed
    with BeautifulSoup. Only ``table``, ``tr`` and ``td`` tags survive; any
    other tag is unwrapped (its children stay, the tag itself is dropped).
    On surviving tags every attribute except ``rowspan`` and ``colspan`` is
    discarded, so cell-merge information is the only markup detail retained.

    Args:
        table: Table node to export. The default of ``None`` is not usable:
            the function calls ``table.to_string`` unconditionally.

    Returns:
        The simplified HTML as a string.
    """
    kept_tags = ('table', 'tr', 'td')
    kept_attrs = ('rowspan', 'colspan')

    markup = table.to_string(aw.SaveFormat.HTML)
    tree = BeautifulSoup(markup, 'html.parser')

    for element in tree.find_all():
        if element.name in kept_tags:
            # Structural tag: keep it, but strip styling and other attributes.
            element.attrs = {
                attr_name: attr_value
                for attr_name, attr_value in element.attrs.items()
                if attr_name in kept_attrs
            }
            continue
        # Any other tag: drop the wrapper and keep what it contains.
        element.unwrap()

    return str(tree)



def a(file):
    """Return the whole document body as text plus simplified table HTML.

    Walks every node under each section body. Non-empty paragraphs outside
    tables are cleaned with ``remove_illegal_characters`` and emitted one per
    line; each table is emitted as the output of ``aw_read_table_id`` with
    all spaces removed, followed by a newline.

    Changes compared with the previous version:

    * A table nested inside another table is skipped. The outer table's HTML
      export already contains it, so it used to be emitted twice.
    * The ``get_ancestor(aw.NodeType.FIELD_START)`` test was dropped: a field
      start marker is not a container node and can never be an ancestor, so
      the test never filtered anything.
    * Unused locals were removed and the output is assembled with ``join``.

    NOTE(review): ``replace(' ', '')`` on the table HTML also removes the
    space between a tag name and its attributes (``<td colspan=...>`` becomes
    ``<tdcolspan=...>``). Left unchanged because consumers may rely on the
    current output; confirm whether that is intended.

    Args:
        file: Path (or other source accepted by ``aw.Document``) of the
            document to read.

    Returns:
        ``{"document_content": <str>}``.
    """
    doc = aw.Document(file)
    doc.update_list_labels()

    pieces = []
    for s in doc.sections:
        sect = s.as_section()
        for node in sect.body.get_child_nodes(aw.NodeType.ANY, True):
            if node.node_type == aw.NodeType.PARAGRAPH:
                para = node.as_paragraph()
                # Comments are removed first so their text is not extracted.
                para.get_child_nodes(aw.NodeType.COMMENT, True).clear()
                text = para.get_text().strip()
                # Paragraphs inside tables are covered by the table export.
                if text and not para.get_ancestor(aw.NodeType.TABLE):
                    pieces.append(remove_illegal_characters(text) + "\n")

            elif node.node_type == aw.NodeType.TABLE:
                if node.get_ancestor(aw.NodeType.TABLE):
                    # Nested table: already part of its outer table's HTML.
                    continue
                table_content = aw_read_table_id(node.as_table())
                pieces.append(table_content.replace(' ', '') + "\n")

    return {"document_content": "".join(pieces)}

# Extract headings and their content from a document

# Outline levels that mark a paragraph as a heading.
_HEADING_OUTLINE_LEVELS = [0, 1, 2, 3, 4, 5]

# Field-code residue removed verbatim from body text under a heading, applied
# in this order. Backslashes are now escaped explicitly; the runtime values
# are the same as before, when they relied on invalid escape sequences
# (``\*``, ``\h``, ``\c``, ``\l``) that newer Python versions warn about.
_FIELD_CODE_RESIDUE = (
    '  SEQ 表 \\* ARABIC ',
    'TOC \\h \\c "表" HYPERLINK \\l "_Toc14741"',
    '\u0013 SEQ 图 \\* ARABIC \u00141\u0015 ',
)


def _iter_body_nodes(doc):
    """Yield the nodes of every section body via get_child_nodes(ANY, deep)."""
    for s in doc.sections:
        yield from s.as_section().body.get_child_nodes(aw.NodeType.ANY, True)


def _has_outline_paragraph(doc):
    """Return True as soon as one paragraph has a heading outline level."""
    return any(
        node.node_type == aw.NodeType.PARAGRAPH
        and node.as_paragraph().paragraph_format.outline_level
        in _HEADING_OUTLINE_LEVELS
        for node in _iter_body_nodes(doc)
    )


def _top_level_table_html(node):
    """Return space-stripped table HTML, or None when the table is nested."""
    if node.get_ancestor(aw.NodeType.TABLE):
        # The outer table's HTML export already contains this table.
        return None
    return aw_read_table_id(node.as_table()).replace(' ', '')


def _collect_plain_content(doc):
    """Return body text and table HTML of a document without headings."""
    pieces = []
    for node in _iter_body_nodes(doc):
        if node.node_type == aw.NodeType.PARAGRAPH:
            para = node.as_paragraph()
            para.get_child_nodes(aw.NodeType.COMMENT, True).clear()
            text = para.get_text().strip()
            if text and not para.get_ancestor(aw.NodeType.TABLE):
                pieces.append(remove_illegal_characters(text) + "\n")
        elif node.node_type == aw.NodeType.TABLE:
            html = _top_level_table_html(node)
            if html is not None:
                pieces.append(html + "\n")
    return "".join(pieces)


def aw_extract_headings_and_contents_table_dict_id(file):
    """Map each heading of a document to the content that follows it.

    If no paragraph has an outline level in ``_HEADING_OUTLINE_LEVELS``, the
    whole document is returned as ``{"document_content": <str>}``: cleaned
    paragraph text, one paragraph per line, plus simplified table HTML.

    Otherwise the result maps ``<list label><heading text>`` to the
    concatenation of everything up to the next heading: body paragraphs
    (list label + text + newline, with the known field-code residue removed)
    and tables (simplified HTML with spaces removed). Content that precedes
    the first heading is dropped. If two headings produce the same key, the
    later one's content replaces the earlier one's. A heading with an empty
    key is omitted together with its content.

    Changes compared with the previous version:

    * The ``FIELD_START`` checks on paragraph nodes were removed. A node
      converted with ``as_paragraph()`` can never be a field start, so those
      branches were unreachable. For the same reason
      ``get_ancestor(aw.NodeType.FIELD_START)`` never filtered anything.
    * The level/stack bookkeeping was removed. It only ever re-joined the
      heading list in document order, so a flat list gives the same result.
    * Tables nested inside another table are no longer emitted a second time.
    * The heading scan stops at the first match.

    NOTE(review): ``replace(' ', '')`` on table HTML also removes the space
    between a tag name and its attributes (``<tdcolspan=...>``); kept as is,
    confirm whether consumers depend on it.

    Args:
        file: Path (or other source accepted by ``aw.Document``) of the
            document to read.

    Returns:
        ``dict`` as described above.
    """
    doc = aw.Document(file)
    doc.update_list_labels()

    if not _has_outline_paragraph(doc):
        return {"document_content": _collect_plain_content(doc)}

    # One (heading key, content fragments) pair per heading, document order.
    sections = []
    for node in _iter_body_nodes(doc):
        if node.node_type == aw.NodeType.TABLE:
            html = _top_level_table_html(node)
            if html is not None and sections:
                sections[-1][1].append(html)
            continue

        if node.node_type != aw.NodeType.PARAGRAPH:
            continue

        para = node.as_paragraph()
        # Comments are removed first so their text is not extracted.
        para.get_child_nodes(aw.NodeType.COMMENT, True).clear()
        label = ''
        if para.list_format.is_list_item:
            label = para.list_label.label_string
        text = para.get_text().strip()

        if para.paragraph_format.outline_level in _HEADING_OUTLINE_LEVELS:
            sections.append((label + text, []))
        elif text and sections and not para.get_ancestor(aw.NodeType.TABLE):
            for residue in _FIELD_CODE_RESIDUE:
                text = text.replace(residue, '')
            sections[-1][1].append(label + text + "\n")

    merged_dict = {}
    for heading, fragments in sections:
        if heading:
            merged_dict[heading] = "".join(fragments)
    return merged_dict


# Main entry point
def process_document(input_file, output_file, force_txt=False):
    """Extract a document's content and write it to ``output_file``.

    The content comes from
    ``aw_extract_headings_and_contents_table_dict_id``. If the result is the
    single-key ``{"document_content": ...}`` form, the text is written as is;
    otherwise the heading-to-content mapping is written as indented JSON with
    non-ASCII characters preserved. Both variants are written as UTF-8.

    The output directory is created when the path has a directory part. A
    bare file name used to fail, because ``os.makedirs("")`` raises.

    Args:
        input_file: Path of the input document.
        output_file: Path of the output file.
        force_txt: Accepted for compatibility but currently has no effect;
            the output format depends only on the extracted data.
    """
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    data = aw_extract_headings_and_contents_table_dict_id(input_file)
    # data=a(input_file)

    is_plain_text = len(data) == 1 and "document_content" in data

    with open(output_file, "w", encoding="utf-8") as f:
        if is_plain_text:
            # No heading structure was found: write the raw text.
            f.write(data["document_content"])
        else:
            json.dump(data, f, ensure_ascii=False, indent=4)

    if is_plain_text:
        print(f"文档内容已保存为纯文本格式: {output_file}")
    else:
        print(f"文档内容已保存为JSON格式: {output_file}")


# Script driver: source document and result file, both given as relative paths.
input_file = "./数据处理(2)/01 AL8326-CN-011_临床试验方案 （版本号1.0 版本日期2024-07-25）_签字版.docx"
output_file = "result/01 AL8326-CN-011_临床试验方案 （版本号1.0 版本日期2024-07-25）_签字版.json"

try:
    # Normal processing
    process_document(input_file, output_file)

    # Or force plain-text output

    # process_document(input_file, output_file, force_txt=True)
except Exception as e:
    # Top-level boundary: report the failure instead of raising.
    print(f"处理文档时出错: {e}")

不对，你这个代码要放在哪里进行处理？

vyacheslav.deryushev · June 3, 2025, 6:37am

@Tiaohh 您把这段代码放在了 if node.node_type == aw.NodeType.PARAGRAPH: 分支里面，因此不可能拿到字段节点，因为此时的节点已经是段落节点了。在我看来，您根本不需要文档中的字段，因此我建议您在打开文件后尝试使用 doc.unlink_fields() 删除文档中的字段，只留下文本。

Tiaohh · June 12, 2025, 8:07am

[quote="vyacheslav.deryushev, post:25, topic:313060"]
因此没有机会获得字段节点，因为它已经是段落节点了。在我看来，您根本不需要文档中的字段，因此我建议您在打开文件后尝试使用 doc.unlink_fields() 删除文档中的字段，只留下文本。
[/quote]
4. 1102002_final_tables_20250611.docx (232.4 KB)

请问这个文件怎么切割成每个 table 单独一个文件？

vyacheslav.deryushev · June 12, 2025, 6:29pm

@Tiaohh 这项任务没有现成的解决方案。对于本文档，您可以先找到第一个表格的标题段落“表 14.1.1”，然后使用 LayoutCollector.get_start_page_index 方法（Aspose.Words for Python）获取当前表格所在的页码（直到下一个标题“表 14.1.2”为止，以此类推）。获得页码后，您就可以使用 Document.extract_pages 方法（Aspose.Words for Python）创建只包含当前表格的文件。