@hhh1111 这是因为您获取了文档中的所有节点。表格中的段落同样是段落节点,因此在遍历每个段落时,也会取到表格里的段落。在这种情况下,您可以检查该段落是否位于表格内:
if node.node_type == aw.NodeType.PARAGRAPH and node.get_ancestor(aw.NodeType.TABLE) is None:
@hhh1111 这是因为您获取了文档中的所有节点。表格中的段落同样是段落节点,因此在遍历每个段落时,也会取到表格里的段落。在这种情况下,您可以检查该段落是否位于表格内:
if node.node_type == aw.NodeType.PARAGRAPH and node.get_ancestor(aw.NodeType.TABLE) is None:
请问怎么提取出段落中绿色字体的内容?
请问怎么提取出段落中绿色字体的内容?
def aw_extract_headings_and_contents_table_dict_id(file):
    """Collect heading paragraphs and the content that follows each of them.

    Every node in the body of each section of the document loaded from
    *file* is visited. A paragraph whose outline level is 0-5 opens a new
    entry keyed by its list label (if any) plus its stripped text. Later
    non-heading paragraphs that are not inside a table, and tables rendered
    through ``aw_read_table_as_markdown``, are appended to the most recently
    added entry, each followed by a newline.

    Args:
        file: Value passed straight to ``aw.Document``.

    Returns:
        dict mapping heading text to the accumulated content string.
    """
    doc = aw.Document(file)
    current_level = 0
    data = {}
    # List labels must be refreshed before ``list_label.label_string`` is read.
    doc.update_list_labels()
    stack = []
    for s in doc.sections:
        sect = s.as_section()
        for node in sect.body.get_child_nodes(aw.NodeType.ANY, True):
            if node.node_type == aw.NodeType.TABLE:
                table_content = aw_read_table_as_markdown(node.as_table())
                if data:
                    data[list(data)[-1]] += table_content + "\n"
                continue
            if node.node_type != aw.NodeType.PARAGRAPH:
                continue

            para = node.as_paragraph()
            # Reset the label for every paragraph so that a label from an
            # earlier list item is never prefixed to unrelated text.
            label = ''
            if para.list_format.is_list_item:
                label = para.list_label.label_string
            text = para.get_text().strip()

            if para.paragraph_format.outline_level in [0, 1, 2, 3, 4, 5]:
                level = int(para.paragraph_format.outline_level) + 1
                if level > current_level:
                    # Going deeper: park the entries collected so far.
                    stack.append((current_level, data))
                    data = {}
                    current_level = level
                elif level < current_level:
                    # Going back up: fold parked entries into the current dict.
                    while stack and stack[-1][0] >= level:
                        old_level, old_data = stack.pop()
                        data = {**old_data, **data}
                        current_level = old_level
                current_key = label + text
                if current_key not in data:
                    data[current_key] = ""
            elif (text and data
                    and not para.get_ancestor(aw.NodeType.TABLE)
                    and not para.get_ancestor(aw.NodeType.FIELD_START)):
                # Always terminate the appended text with a newline, with or
                # without a list label.
                data[list(data)[-1]] += label + text + "\n"
    # Fold whatever is still parked into the final result.
    while stack:
        old_level, old_data = stack.pop()
        data = {**old_data, **data}
    return data
比如一段话里有绿色的内容,提取出来时给这段话前后各加一个标签。
@hhh1111 字体颜色有几种情况。
如果段落包含带有字体颜色的run,则需要使用:
# Add a run at the start and at the end of every paragraph that contains a
# run whose font colour is green, then save the result.
doc = aw.Document("input.docx")
# Compare ARGB values rather than Color objects: a named colour and the same
# colour read from a file are presumed not to compare equal with ``==``
# (System.Drawing semantics) - TODO confirm for aspose.pydrawing.
green_argb = drawing.Color.green.to_argb()
# Iterate over a snapshot because the loop inserts new Run nodes into the
# same document.
for node in list(doc.get_child_nodes(aw.NodeType.RUN, True)):
    run = node.as_run()
    if run.font.color.to_argb() == green_argb:
        new_run_1 = aw.Run(doc, "New text 1")
        new_run_2 = aw.Run(doc, "New text 2")
        para = run.parent_paragraph
        para.insert_before(new_run_1, para.first_child)
        para.append_child(new_run_2)
doc.save("output.docx")
如果你需要找到字体颜色的段落,你需要使用:
# Insert one new paragraph before and one after every paragraph whose
# paragraph-break font colour is green, then save the result.
doc = aw.Document("input.docx")
# Compare ARGB values rather than Color objects: a named colour and the same
# colour read from a file are presumed not to compare equal with ``==``
# (System.Drawing semantics) - TODO confirm for aspose.pydrawing.
green_argb = drawing.Color.green.to_argb()
# Iterate over a snapshot because the loop inserts new Paragraph nodes into
# the same document.
for node in list(doc.get_child_nodes(aw.NodeType.PARAGRAPH, True)):
    para = node.as_paragraph()
    if para.paragraph_break_font.color.to_argb() == green_argb:
        new_run = aw.Run(doc, "New text")
        new_para_1 = aw.Paragraph(doc)
        new_para_1.append_child(new_run)
        # Deep clone so the second paragraph carries its own copy of the run.
        new_para_2 = new_para_1.clone(True)
        para.parent_node.insert_before(new_para_1, para)
        para.parent_node.insert_after(new_para_2, para)
doc.save("output.docx")
怎么给文本设置绿色呢?????
@hhh1111 您应该为段落中的 Run(文本运行)设置颜色。例如:
// Set the font colour of every run in every paragraph of the first
// section's body to green.
for (Paragraph para : doc.getFirstSection().getBody().getParagraphs()) {
    for (Run textRun : para.getRuns()) {
        textRun.getFont().setColor(Color.GREEN);
    }
}
# Set the font colour of every run in the document to green.
for run_node in doc.get_child_nodes(aw.NodeType.RUN, True):
    run_node.as_run().font.color = drawing.Color.green
我还有一个问题 for part in parts:
if part.startswith('<table'):
# 处理表格内容
builder.insert_html(part) 怎么给插入的insert_html table重新设置样式呢
builder.insert_html(part) 我需要对插入的表格重新设置表格样式
@hhh1111 您需要找到插入的表格并清除样式:
# Take the first table in the body of the document's first section.
table = doc.first_section.body.tables[0]
# Clear the borders and shading the table currently carries so that a new
# table style can be applied afterwards.
table.clear_borders()
table.clear_shading()
之后,您可以使用例如table.style_identifier = aw.StyleIdentifier.TABLE_GRID
来设置新的表格样式。
可是不能只获取一个table吧 我的代码如下 我需要在这里重置表格样式 # 处理表格内容
import aspose.words as aw
import jinja2
# Activate the Aspose.Words license.
lic = aw.License()
lic_path = "../Aspose.Total.Product.Family.lic"
lic.set_license(lic_path)
import aspose.pydrawing as drawing
import re
def set_paragraph_color(builder, color):
    """Apply a named font colour to *builder*, or reset its font formatting.

    ``"green"`` and ``"blue"`` assign the matching ``drawing.Color`` member
    to ``builder.font.color``. Any other value calls
    ``builder.font.clear_formatting()``.
    """
    font = builder.font
    if color not in ("green", "blue"):
        font.clear_formatting()
        return
    font.color = drawing.Color.green if color == "green" else drawing.Color.blue
def _strip_br_tags(text):
    """Remove whole ``<br>``/``<br/>`` tags from the start and end of *text*.

    ``str.strip('<br>')`` removes any of the characters ``<``, ``b``, ``r``
    and ``>``, so it also eats leading/trailing letters of real text.
    """
    return re.sub(r'^(?:<br\s*/?>)+|(?:<br\s*/?>)+$', '', text)


def insert_title_content(doc_path, insertions, output_path='result.docx'):
    """Fill every ``start_insert`` placeholder paragraph with titled content.

    For each paragraph whose text contains ``start_insert``, the paragraph's
    runs are removed. Every ``title``/``content`` pair of *insertions* is
    then written at that position: the title as a ``Heading 1`` paragraph,
    followed by the content. Content is split into ``<table>`` fragments
    (inserted as HTML), ``<blue>``/``<green>`` fragments (written in that
    font colour) and plain text (written as ``Normal`` paragraphs).

    Args:
        doc_path: Value passed to ``aw.Document``.
        insertions: Mapping of title string to content string.
        output_path: Where the document is saved; defaults to the previously
            hard-coded ``result.docx``.
    """
    doc = aw.Document(doc_path)
    builder = aw.DocumentBuilder(doc)
    builder.paragraph_format.clear_formatting()
    # Iterate over a snapshot: the loop body inserts paragraphs into the same
    # document, and inserted text must not be re-scanned for the placeholder.
    for paragraph in list(doc.get_child_nodes(aw.NodeType.PARAGRAPH, True)):
        paragraph = paragraph.as_paragraph()
        if "start_insert" not in paragraph.get_text():
            continue
        # Empty the placeholder paragraph and continue writing from it.
        paragraph.get_child_nodes(aw.NodeType.RUN, True).clear()
        builder.move_to(paragraph)
        builder.paragraph_format.clear_formatting()
        for title, content in insertions.items():
            heading_level = 1  # default heading level
            builder.paragraph_format.style_name = f"Heading {heading_level}"
            builder.writeln(title.strip())
            if not content.strip():
                continue
            # The capturing group keeps the matched tables / colour tags in
            # the result list alongside the plain text between them.
            parts = re.split(r'(<table.*?>.*?</table>|<blue>.*?</blue>|<green>.*?</green>)', content, flags=re.S)
            for part in parts:
                # re.split yields empty strings around adjacent matches;
                # writing them would add blank paragraphs.
                if not part.strip():
                    continue
                if part.startswith('<table'):
                    builder.insert_html(part)
                    continue
                builder.paragraph_format.style_name = "Normal"
                for color in ("blue", "green"):
                    if part.startswith(f'<{color}>'):
                        color_content = re.sub(rf'</?{color}>', '', part)
                        set_paragraph_color(builder, color)
                        builder.write(_strip_br_tags(color_content))
                        # Back to the default font formatting.
                        set_paragraph_color(builder, None)
                        break
                else:
                    # Plain text becomes its own paragraph.
                    builder.writeln(part.strip())
    doc.save(output_path)
import json
# Script entry point: load the title -> content mapping from result.json and
# insert it into a.docx.
if __name__ == '__main__':
    with open('result.json', 'r', encoding='utf-8') as file:
        data = json.load(file)
    insert_title_content('a.docx', data)
@hhh1111 您可以在插入html表后获取当前段落,并获取前一个节点,该节点将是一个表。例如:
# Insert the HTML, then look at the node just before the builder's current
# paragraph, which is expected to be the table that was inserted.
builder.insert_html(html_content)
table_node = builder.current_paragraph.previous_sibling
# previous_sibling is None when the current paragraph has no preceding node.
if table_node is not None and table_node.node_type == aw.NodeType.TABLE:
    table = table_node.as_table()
    table.clear_borders()
    table.clear_shading()
哪个属性是设置段前行间距的属性builder.paragraph_format.段前
import aspose.words as aw
import jinja2
# Activate the Aspose.Words license.
lic = aw.License()
lic_path = "../Aspose.Total.Product.Family.lic"
lic.set_license(lic_path)
import aspose.pydrawing as drawing
import re
def set_paragraph_color(builder, color):
    """Apply a named font colour to *builder*, or reset its font formatting.

    ``"green"`` and ``"blue"`` assign the matching ``drawing.Color`` member
    to ``builder.font.color``. Any other value calls
    ``builder.font.clear_formatting()``.
    """
    font = builder.font
    if color not in ("green", "blue"):
        font.clear_formatting()
        return
    font.color = drawing.Color.green if color == "green" else drawing.Color.blue
def _strip_br_tags(text):
    """Remove whole ``<br>``/``<br/>`` tags from the start and end of *text*.

    ``str.strip('<br>')`` removes any of the characters ``<``, ``b``, ``r``
    and ``>``, so it also eats leading/trailing letters of real text.
    """
    return re.sub(r'^(?:<br\s*/?>)+|(?:<br\s*/?>)+$', '', text)


def insert_title_content(doc_path, insertions, output_path='result.docx'):
    """Fill every ``start_insert`` placeholder paragraph with titled content.

    For each paragraph whose text contains ``start_insert``, the paragraph's
    runs are removed. Every ``title``/``content`` pair of *insertions* is
    then written at that position: the title as a heading paragraph,
    followed by the content. Content is split into ``<table>`` fragments
    (inserted as HTML and restyled as a table grid), ``<blue>``/``<green>``
    fragments (written in that font colour), ``<sup>`` fragments
    (superscript), ``<small>`` fragments (8 pt) and plain text (written as
    ``Normal`` paragraphs).

    Args:
        doc_path: Value passed to ``aw.Document``.
        insertions: Mapping of title string to content string.
        output_path: Where the document is saved; defaults to the previously
            hard-coded ``result.docx``.
    """
    doc = aw.Document(doc_path)
    builder = aw.DocumentBuilder(doc)
    # Iterate over a snapshot: the loop body inserts paragraphs into the same
    # document, and inserted text must not be re-scanned for the placeholder.
    for paragraph in list(doc.get_child_nodes(aw.NodeType.PARAGRAPH, True)):
        paragraph = paragraph.as_paragraph()
        builder.paragraph_format.clear_formatting()
        if "start_insert" not in paragraph.get_text():
            continue
        # Empty the placeholder paragraph and continue writing from it.
        paragraph.get_child_nodes(aw.NodeType.RUN, True).clear()
        builder.move_to(paragraph)
        builder.paragraph_format.clear_formatting()
        for title, content in insertions.items():
            # Heading paragraph with no spacing before or after.
            builder.paragraph_format.space_before = 0
            heading_level = 1  # default heading level
            builder.paragraph_format.style_identifier = getattr(aw.StyleIdentifier, f"HEADING{heading_level}")
            builder.paragraph_format.line_unit_after = 0
            builder.paragraph_format.space_after = 0
            builder.writeln(title.strip())

            content = content.replace("\n\n", "\n")
            # Skip content that holds nothing but <html>/<br> tags. The tags
            # are removed as whole tags, not as a character set.
            if re.sub(r'</?html>|<br\s*/?>', '', content).strip():
                # The capturing group keeps the matched fragments in the
                # result list alongside the plain text between them.
                parts = re.split(
                    r'(<table.*?>.*?</table>|<blue>.*?</blue>|<green>.*?</green>|<sup>.*?</sup>|<small>.*?</small>)',
                    content,
                    flags=re.S)
                for part in parts:
                    part = part.replace("<html>", '').replace("</html>", '')
                    # re.split yields empty strings around adjacent matches;
                    # writing them would add blank paragraphs.
                    if not part.strip():
                        continue
                    builder.paragraph_format.style_identifier = aw.StyleIdentifier.NORMAL
                    if part.startswith('<table'):
                        builder.insert_html(part)
                        # The inserted table is expected right before the
                        # builder's current paragraph; previous_sibling is
                        # None when there is no preceding node.
                        table_node = builder.current_paragraph.previous_sibling
                        if table_node is not None and table_node.node_type == aw.NodeType.TABLE:
                            table = table_node.as_table()
                            table.clear_borders()
                            table.clear_shading()
                            builder.cell_format.vertical_merge = aw.tables.CellMerge.NONE
                            table.style_identifier = aw.StyleIdentifier.TABLE_GRID
                            table.auto_fit(aw.tables.AutoFitBehavior.AUTO_FIT_TO_WINDOW)
                    elif part.startswith(('<blue>', '<green>')):
                        color = "blue" if part.startswith('<blue>') else "green"
                        color_content = re.sub(rf'</?{color}>', '', part)
                        set_paragraph_color(builder, color)
                        builder.paragraph_format.style_name = "Normal"
                        builder.write(_strip_br_tags(color_content))
                        # Back to the default font formatting.
                        set_paragraph_color(builder, None)
                    elif part.startswith('<sup>'):
                        super_content = re.sub(r'</?sup>', '', part)
                        builder.font.superscript = True
                        builder.paragraph_format.style_name = "Normal"
                        builder.write(super_content.strip())
                        builder.paragraph_format.clear_formatting()
                        builder.font.superscript = False
                    elif part.startswith('<small>'):
                        small_content = re.sub(r'</?small>', '', part)
                        # Restore whatever size was active before, instead of
                        # assuming 12 pt.
                        previous_size = builder.font.size
                        builder.font.size = 8
                        builder.paragraph_format.style_name = "Normal"
                        builder.write(_strip_br_tags(small_content))
                        builder.font.size = previous_size
                    else:
                        # Plain text becomes its own paragraph.
                        builder.paragraph_format.style_name = "Normal"
                        builder.writeln(part.strip())
            # NOTE(review): the original indentation was lost; this reset is
            # placed at the end of each insertion - confirm intended position.
            builder.paragraph_format.line_unit_after = 0
            builder.paragraph_format.clear_formatting()
    doc.save(output_path)
import json
# Load the title -> content mapping from result.json and insert it into
# a.docx.
with open("result.json", "r", encoding="utf-8") as json_file:
    data = json.load(json_file)
insert_title_content('a.docx', data)
为什么写完# 处理普通内容 在写蓝色或者其他颜色数据都换行写入了
@hhh1111 这很难说,但也许是因为当你从内容中获得parts
时,你有空数据,下面的代码在此基础上创建了一个新行。
else:
# 处理普通内容
builder.paragraph_format.style_identifier = aw.StyleIdentifier.NORMAL
builder.paragraph_format.style_name = "Normal"
builder.writeln(part.strip())
def read_table_docx(rtf_path):
    """Return the document at *rtf_path* as HTML reduced to bare table markup.

    The document is converted to an HTML string and parsed with
    BeautifulSoup. Every tag other than ``table``, ``tr`` and ``td`` is
    unwrapped (its contents stay, the tag itself goes). The kept tags lose
    all attributes except ``rowspan``/``colspan`` on ``tr`` and ``td``. The
    resulting markup is printed and returned.
    """
    # Tag name -> attributes that survive on that tag.
    keep_attrs = {
        'table': [],
        'tr': ['rowspan', 'colspan'],
        'td': ['rowspan', 'colspan']
    }
    markup = aw.Document(rtf_path).to_string(aw.SaveFormat.HTML)
    tree = BeautifulSoup(markup, "html.parser")
    for element in tree.find_all(True):
        wanted = keep_attrs.get(element.name)
        if wanted is None:
            # Not an allowed tag: keep its children, drop the tag.
            element.unwrap()
            continue
        element.attrs = {attr: element.attrs[attr] for attr in wanted if attr in element.attrs}
    cleaned = str(tree)
    print(cleaned)
    return cleaned


read_table_docx("表14.1.3.2 疾病基线特征 -FAS.rtf")
怎么只读取前3页的rtf文件为html
没有空数据 还是会写入空行