你好,
我们的项目需要将 doc 文档转换成 HTML 并对文档进行分割。我们参考官方文档,使用了其中提取文档内容的方法,但提取后生成的 HTML 样式有问题:p 标签和 table 标签的 margin-bottom 都被改成了 0pt。
参考文档:Extract Content Between Document Nodes|Aspose.Words for .NET
/// <summary>
/// Converts the sample document to HTML, then extracts the content that lies between
/// two consecutive "Heading 2" paragraphs and saves that extract as a second HTML file.
/// </summary>
/// <remarks>
/// This method is best-effort: failures are reported through
/// <see cref="System.Diagnostics.Trace"/> and are not propagated to the caller.
/// </remarks>
public void Get()
{
    // Zero-based positions of the two "Heading 2" paragraphs that delimit the extract.
    const int startHeadingIndex = 4;
    const int endHeadingIndex = 5;

    try
    {
        HtmlSaveOptions options = new HtmlSaveOptions
        {
            ExportRoundtripInformation = true,
            ExportImagesAsBase64 = true,
            CssStyleSheetType = CssStyleSheetType.External
        };

        LoadOptions loadOptions = new LoadOptions
        {
            WarningCallback = new DocumentLoadingWarningCallback(_logger)
        };

        // Open the file as a plain binary stream; a text reader/encoding is not needed for a .docx.
        // The using block guarantees the handle is released even when loading fails.
        Document doc;
        using (Stream stream = File.OpenRead("C://wordtohtml//demo.docx"))
        {
            // The stream is no longer needed once the document has been loaded into memory.
            doc = new Document(stream, loadOptions);
        }

        doc.Save("C://wordtohtml//demo.html", options);

        // Collect every paragraph formatted with the built-in "Heading 2" style, in document order.
        List<Paragraph> heading2Paragraphs = doc
            .GetChildNodes(NodeType.Paragraph, true)
            .Cast<Paragraph>()
            .Where(p => p.ParagraphFormat.StyleIdentifier == StyleIdentifier.Heading2)
            .ToList();

        // Without this check a short document fails with an index error that gives no hint about the cause.
        if (heading2Paragraphs.Count <= endHeadingIndex)
        {
            System.Diagnostics.Trace.TraceError(
                "Get: expected at least " + (endHeadingIndex + 1) +
                " 'Heading 2' paragraphs but found " + heading2Paragraphs.Count + ".");
            return;
        }

        // Extract the nodes between the two headings (markers excluded) and save them on their own.
        List<Node> pprList = _asposeService.ExtractContent(
            heading2Paragraphs[startHeadingIndex],
            heading2Paragraphs[endHeadingIndex],
            false);
        Document pprDoc = _asposeService.GenerateDocument(doc, pprList);
        pprDoc.Save("C://wordtohtml//ppr.html", options);
    }
    catch (Exception e)
    {
        // Keep the original "never throw" contract, but do not hide the failure.
        System.Diagnostics.Trace.TraceError("Get: document conversion failed: " + e);
    }
}
/// <summary>
/// Collects copies of the block-level nodes that lie between two marker nodes.
/// </summary>
/// <param name="startNode">Marker node at which extraction begins.</param>
/// <param name="endNode">Marker node at which extraction stops.</param>
/// <param name="isInclusive">Passed on to the marker processing to decide whether the markers themselves are kept.</param>
/// <returns>Clones of the nodes between the markers, in document order.</returns>
/// <exception cref="System.InvalidOperationException">
/// The end of the document was reached before the end marker was found.
/// </exception>
public List<Node> ExtractContent(Node startNode, Node endNode, bool isInclusive)
{
    // First, check that the nodes passed to this method are valid for use.
    VerifyParameterNodes(startNode, endNode);

    // Create a list to store the extracted nodes.
    List<Node> nodes = new List<Node>();

    // If either marker is part of a comment, including the comment itself, we need to move the pointer
    // forward to the Comment Node found after the CommentRangeEnd node.
    if (endNode.NodeType == NodeType.CommentRangeEnd && isInclusive)
    {
        Node node = FindNextNode(NodeType.Comment, endNode.NextSibling);
        if (node != null)
        {
            endNode = node;
        }
    }

    // Keep a record of the original nodes passed to this method to split marker nodes if needed.
    Node originalStartNode = startNode;
    Node originalEndNode = endNode;

    // Extract content based on block-level nodes (paragraphs and tables). Traverse through parent nodes to find them.
    // We will split the first and last nodes' content, depending if the marker nodes are inline.
    startNode = GetAncestorInBody(startNode);
    endNode = GetAncestorInBody(endNode);

    bool isExtracting = true;
    bool isStartingNode = true;

    // The current node we are extracting from the document.
    Node currNode = startNode;

    // Begin extracting content. Process all block-level nodes and specifically split the first
    // and last nodes when needed, so paragraph formatting is retained.
    while (isExtracting)
    {
        // Clone the current node and its children to obtain a copy.
        Node cloneNode = currNode.Clone(true);
        bool isEndingNode = currNode.Equals(endNode);

        if (isStartingNode || isEndingNode)
        {
            // Each marker is handled by ProcessMarker.
            // The end marker must be processed first to keep node indexes.
            if (isEndingNode)
            {
                // !isStartingNode: don't add the node twice if the markers are the same node.
                ProcessMarker(cloneNode, nodes, originalEndNode, currNode, isInclusive,
                    false, !isStartingNode, false);
                isExtracting = false;
            }

            // Separate conditional: the block-level start and end markers may be the same node.
            if (isStartingNode)
            {
                ProcessMarker(cloneNode, nodes, originalStartNode, currNode, isInclusive,
                    true, true, false);
                isStartingNode = false;
            }
        }
        else
        {
            // Node is not a start or end marker, simply add the copy to the list.
            nodes.Add(cloneNode);
        }

        if (!isExtracting)
        {
            break;
        }

        if (currNode.NextSibling != null)
        {
            // Move to the next node in the body.
            currNode = currNode.NextSibling;
            continue;
        }

        // No more siblings: the rest of the content is in a following section.
        // Skip sections whose body has no children instead of dereferencing a null node.
        Node nextBlock = null;
        Node sectionNode = currNode.GetAncestor(NodeType.Section);
        Node candidate = sectionNode != null ? sectionNode.NextSibling : null;
        while (candidate != null && nextBlock == null)
        {
            Section section = candidate as Section;
            if (section != null && section.Body != null)
            {
                nextBlock = section.Body.FirstChild;
            }

            candidate = candidate.NextSibling;
        }

        if (nextBlock == null)
        {
            // Previously this surfaced as a bare NullReferenceException with no context.
            throw new System.InvalidOperationException(
                "Reached the end of the document before the end node was found; " +
                "the end node must follow the start node in the same document.");
        }

        currNode = nextBlock;
    }

    // For compatibility with mode with inline bookmarks, add the next paragraph (empty).
    if (isInclusive && originalEndNode == endNode && !originalEndNode.IsComposite)
    {
        IncludeNextParagraph(endNode, nodes);
    }

    // Return the nodes between the node markers.
    return nodes;
}
/// <summary>
/// Builds a new document whose body consists of the given nodes, imported from <paramref name="srcDoc"/>.
/// </summary>
/// <param name="srcDoc">Document the nodes were extracted from.</param>
/// <param name="nodes">Nodes to import, in the order they should appear in the result.</param>
/// <returns>A new document containing imported copies of <paramref name="nodes"/>.</returns>
public Document GenerateDocument(Document srcDoc, List<Node> nodes)
{
    // Start from a shallow clone of the source rather than a blank "new Document()".
    // A blank document does not share the source's styles and document-wide defaults,
    // so formatting the nodes only inherit (for example paragraph spacing, which is
    // exported to HTML as margins) can come out different in the extract.
    // NOTE(review): relies on Clone(false) carrying over document-level settings
    // without any sections - confirm against the Aspose.Words version in use.
    Document dstDoc = (Document)srcDoc.Clone(false);

    // The shallow clone has no content; create the minimal section/body structure to append into.
    dstDoc.EnsureMinimum();

    // Remove the paragraph that EnsureMinimum adds so the result contains only the imported nodes.
    dstDoc.FirstSection.Body.RemoveAllChildren();

    // Import each node from the list into the new document. Keep the original formatting of the node.
    NodeImporter importer = new NodeImporter(srcDoc, dstDoc, ImportFormatMode.KeepSourceFormatting);
    foreach (Node node in nodes)
    {
        Node importNode = importer.ImportNode(node, true);
        dstDoc.FirstSection.Body.AppendChild(importNode);
    }

    return dstDoc;
}