I’ve tried to simplify it as much as possible.
Here is the main code:
// Load the source document and gather the content-control values found in every section body.
var sourceDocument = new Aspose.Words.Document(@"E:\WordTest\26.docx");
var collectedMarkup = new StringBuilder();
foreach (Aspose.Words.Section currentSection in sourceDocument.Sections)
{
    GetHtml(currentSection.Body, collectedMarkup);
}

// Write the gathered text to an .htm file, then load that file and save it back as DOCX.
string htmlText = collectedMarkup.ToString();
File.WriteAllText(@"E:\WordTest\26.htm", htmlText);
var roundTrippedDocument = new Aspose.Words.Document(@"E:\WordTest\26.htm");
roundTrippedDocument.Save(@"E:\WordTest\26_2.docx", SaveFormat.Docx);
And here is the GetHtml code:
/// <summary>
/// Recursively walks the children of <paramref name="prentNode"/> and appends the value of every
/// structured document tag that has no structured document tag among its direct children to
/// <paramref name="sbHtml"/>, one value per line. Tags that do have such a direct child are
/// only descended into; empty or null values are not appended.
/// </summary>
/// <param name="prentNode">Node whose children are scanned. Nothing happens when it is null.</param>
/// <param name="sbHtml">Builder that receives the extracted values.</param>
private static void GetHtml(CompositeNode prentNode, StringBuilder sbHtml)
{
    if (prentNode == null)
    {
        return;
    }

    foreach (Aspose.Words.Node node in prentNode.ChildNodes)
    {
        if (node.NodeType != NodeType.StructuredDocumentTag)
        {
            // Not a content control: keep descending, tags may sit deeper in the tree.
            var composite = node as CompositeNode;
            if (composite != null
                && composite.ChildNodes != null
                && composite.ChildNodes.Count > 0)
            {
                GetHtml(composite, sbHtml);
            }

            continue;
        }

        var structuredDocumentTag = (StructuredDocumentTag)node;
        if (HasDirectChildTag(structuredDocumentTag))
        {
            // Wrapper tag: only the tags nested inside it contribute values.
            GetHtml(structuredDocumentTag, sbHtml);
            continue;
        }

        string value = ReadTagValue(structuredDocumentTag);
        if (!string.IsNullOrEmpty(value))
        {
            sbHtml.AppendLine(value);
        }
    }
}

/// <summary>
/// Tells whether at least one direct child of <paramref name="tag"/> is itself a structured document tag.
/// </summary>
private static bool HasDirectChildTag(StructuredDocumentTag tag)
{
    foreach (Aspose.Words.Node child in tag.ChildNodes)
    {
        if (child.NodeType == NodeType.StructuredDocumentTag)
        {
            return true;
        }
    }

    return false;
}

/// <summary>
/// Converts the content of a single structured document tag to its string value,
/// or returns null when the tag yields nothing.
/// </summary>
private static string ReadTagValue(StructuredDocumentTag structuredDocumentTag)
{
    // Tags showing placeholder text are skipped, except date tags: their value is
    // taken from FullDate rather than from the tag's child nodes.
    if (structuredDocumentTag.IsShowingPlaceholderText
        && structuredDocumentTag.SdtType != SdtType.Date)
    {
        return null;
    }

    switch (structuredDocumentTag.SdtType)
    {
        case SdtType.Checkbox:
            return structuredDocumentTag.Checked.ToString();

        case SdtType.Date:
            if (structuredDocumentTag.FullDate == DateTime.MinValue)
            {
                return "0";
            }

            try
            {
                // NOTE(review): ToString() without arguments formats the double with the
                // current culture - confirm that is what the consumer of this value expects.
                return Convert.ToDouble(structuredDocumentTag.FullDate.Ticks).ToString();
            }
            catch (Exception)
            {
                // Best-effort fallback kept from the original code: any failure yields "0".
                return "0";
            }

        case SdtType.DropDownList:
            // SelectedValue is null when no list entry is selected; dereferencing it
            // unconditionally used to throw a NullReferenceException.
            var selectedItem = structuredDocumentTag.ListItems.SelectedValue;
            return selectedItem != null ? selectedItem.Value : null;

        case SdtType.RichText:
            return ReadAllNodesFromField(structuredDocumentTag, SaveFormat.Html);

        case SdtType.PlainText:
            return ReadAllNodesFromField(structuredDocumentTag, SaveFormat.Text);

        default:
            // Every other tag type produces no value.
            return null;
    }
}
/// <summary>
/// Exports every direct child node of <paramref name="structuredDocumentTag"/> to a string and
/// returns the concatenation of the results in child order.
/// </summary>
/// <param name="structuredDocumentTag">Tag whose direct children are exported.</param>
/// <param name="format">
/// Target format. <see cref="SaveFormat.Html"/> exports through dedicated HTML save options;
/// any other value is passed straight to the node's ToString overload.
/// </param>
/// <returns>The concatenated export, or an empty string when the tag has no children.</returns>
private static string ReadAllNodesFromField(StructuredDocumentTag structuredDocumentTag, SaveFormat format)
{
    var exported = new StringBuilder();

    if (format != SaveFormat.Html)
    {
        foreach (Aspose.Words.Node child in structuredDocumentTag.ChildNodes)
        {
            exported.Append(child.ToString(format));
        }

        return exported.ToString();
    }

    // HTML export: HTML5 output, images as Base64, no headers/footers, list labels as inline text.
    var htmlOptions = new HtmlSaveOptions();
    htmlOptions.HtmlVersion = Aspose.Words.Saving.HtmlVersion.Html5;
    htmlOptions.ExportImagesAsBase64 = true;
    htmlOptions.ExportHeadersFootersMode = Aspose.Words.Saving.ExportHeadersFootersMode.None;
    htmlOptions.ExportListLabels = Aspose.Words.Saving.ExportListLabels.AsInlineText;

    foreach (Aspose.Words.Node child in structuredDocumentTag.ChildNodes)
    {
        exported.Append(child.ToString(htmlOptions));
    }

    return exported.ToString();
}
I have attached the original Word file (26.docx).
26.zip (23.1 KB)
As you can see, the original file (26.docx) keeps its numbering formatting, but the new one (26_2.docx) loses it after being converted to HTML and back.
Thanks!