I use the code below to extract the document contents and return an HTML string. When I did that for the attached document, there were two issues:
1 - It returned the string with stray null characters at the end ("\0\0\0\0\0\0\0\0").
2 - It didn’t keep the numbering of the numbered list. Instead, it treated each list item as a separate list ("<ol type="1"><li value="1"><span>Elevated blood pressure with a diagnosis of hypertension, rule out white coat syndrome. </span></li></ol><ol type="1"><li value="1"><span>Bilateral ankle lipomas.</span></li></ol>").
Please advise.
Thank you
Code:
// Copies the nodes lying between the start and the end of the named bookmark
// into a new, empty document, saves that document to memory in the requested
// format, and returns the saved bytes decoded as a UTF-8 string.
//
// bookmarkName   - name of the bookmark whose content is extracted.
// doc            - source document; only a clone of it is modified here.
// ThisSaveFormat - format passed to Document.Save for the extracted content.
private static string GetHtmlFromBookmark(string bookmarkName, Document doc, SaveFormat ThisSaveFormat)
{
// Work on a clone because nodes are removed from the tree below.
Document docClone = doc.Clone();
// Destination document that receives the extracted nodes.
Document doc1 = new Document();
// NOTE(review): the result is not checked for null - presumably an unknown
// bookmark name fails on the next line; confirm against the Aspose API.
Bookmark mark = docClone.Range.Bookmarks[bookmarkName];
// Iterate over the siblings starting at the parent of the bookmark start...
Node node = mark.BookmarkStart.ParentNode;
// ...and stop at the sibling following the parent of the bookmark end
// (null when that parent is the last sibling; handled by the break below).
Node endNode = mark.BookmarkEnd.ParentNode.NextSibling;
while (!node.Equals(endNode))
{
// NOTE(review): "as CompositeNode" is dereferenced without a null check;
// this assumes every node visited here is a composite node.
if ((node as CompositeNode).ChildNodes.Contains(mark.BookmarkStart))
{
// Strip every child up to and including the bookmark start, so only the
// content that follows the start marker is kept in this node.
Node child = (node as CompositeNode).FirstChild;
Node endChild = mark.BookmarkStart.NextSibling;
// NOTE(review): if the bookmark start is the last child, endChild is null and
// this loop appears to advance past the last child before failing - verify.
while (!child.Equals(endChild))
{
// Advance first, then remove the node just left behind.
child = child.NextSibling;
child.PreviousSibling.Remove();
}
}
if ((node as CompositeNode).ChildNodes.Contains(mark.BookmarkEnd))
{
// Strip the bookmark end and every child after it.
Node child = mark.BookmarkEnd;
while (!child.Equals(child.ParentNode.LastChild))
{
child = child.NextSibling;
child.PreviousSibling.Remove();
}
// The loop stops on the last child without removing it; remove it here.
child.Remove();
}
// Each node is imported into the destination document on its own.
// NOTE(review): per the reply in this thread, importing paragraphs one by one
// is why every list item ends up in a separate list.
doc1.FirstSection.Body.AppendChild(doc1.ImportNode(node, true, ImportFormatMode.KeepSourceFormatting));
node = node.NextSibling;
// Ran out of siblings before reaching endNode (endNode may itself be null).
if (node == null)
break;
}
MemoryStream stream = new MemoryStream();
doc1.Save(stream, ThisSaveFormat);
// GetBuffer() returns the stream's whole underlying buffer rather than only the
// bytes that were written, so unused trailing bytes are decoded as well.
// NOTE(review): this is the likely source of the trailing "\0" characters;
// decoding only the first stream.Length bytes would avoid it.
string html = Encoding.UTF8.GetString(stream.GetBuffer());
return html;
}
Hi
Thanks for your inquiry.
- Please try using the following code to convert document to HTML string:
/// <summary>
/// Saves the document in HTML format to an in-memory stream and returns the
/// written bytes decoded as UTF-8 text.
/// </summary>
/// <param name="doc">Document to convert.</param>
/// <returns>The saved HTML as a string.</returns>
private string ConvertDocumentToHtml(Document doc)
{
    using (MemoryStream buffer = new MemoryStream())
    {
        // Render the document as HTML into memory.
        doc.Save(buffer, SaveFormat.Html);

        // ToArray() copies only the bytes actually written, so the unused
        // tail of the stream's internal buffer never reaches the decoder.
        byte[] written = buffer.ToArray();
        return Encoding.UTF8.GetString(written);
    }
}
- The problem with lists occurs because you import each paragraph between the bookmarks separately into another document. During the import the list is lost, so each list item in the destination document becomes an item of a separate list. As a workaround, you can try removing everything except the bookmark's content from the document and then converting it to HTML.
// Load the sample document and print the HTML produced for its "Body" bookmark.
Document sourceDocument = new Document(@"Test127\Doc1.doc");
Console.WriteLine(GetHtmlFromBookmark("Body", sourceDocument));
/// <summary>
/// Returns the content of the named bookmark as an HTML string.
/// </summary>
/// <remarks>
/// Instead of importing nodes into a new document, this trims a clone of the
/// source document down to the bookmark and converts what is left.
/// </remarks>
/// <param name="bookmarkName">Name of the bookmark to extract.</param>
/// <param name="doc">Source document; it is cloned and not modified.</param>
/// <returns>HTML produced from the trimmed clone.</returns>
/// <exception cref="ArgumentNullException"><paramref name="doc"/> is null.</exception>
/// <exception cref="ArgumentException">The document has no bookmark with the given name.</exception>
private string GetHtmlFromBookmark(string bookmarkName, Document doc)
{
    if (doc == null)
    {
        throw new ArgumentNullException("doc");
    }

    // Clone the original document so the caller's copy stays untouched.
    Document docClone = doc.Clone();

    // Look the bookmark up in the clone; its nodes must belong to the tree being trimmed.
    Bookmark mark = docClone.Range.Bookmarks[bookmarkName];
    if (mark == null)
    {
        // Fail with a descriptive error instead of a NullReferenceException below.
        throw new ArgumentException("Bookmark '" + bookmarkName + "' was not found in the document.", "bookmarkName");
    }

    // Remove content before the bookmark.
    RemoveContentBeforeNode(docClone, mark.BookmarkStart);

    // Remove content after the bookmark.
    RemoveContentAfterNode(docClone, mark.BookmarkEnd);

    // Convert the trimmed document to HTML.
    string html = ConvertDocumentToHtml(docClone);

    // NOTE(review): this writes the trimmed document to a hard-coded relative path,
    // which looks like a debugging aid; kept to preserve existing behaviour.
    docClone.Save(@"Test127\out.doc");

    return html;
}
/// <summary>
/// Removes all content before specified node
/// </summary>
/// <remarks>
/// The node itself and all of its ancestors are kept; everything that precedes
/// the node in document order is removed.
/// </remarks>
/// <param name="doc">Root of the tree; the upward walk stops when it reaches this node.</param>
/// <param name="endNode">Node before which all content is removed.</param>
/// <exception cref="ArgumentNullException"><paramref name="endNode"/> is null.</exception>
private void RemoveContentBeforeNode(Document doc, Node endNode)
{
    if (endNode == null)
    {
        throw new ArgumentNullException("endNode");
    }

    // Walk from the boundary node up towards the root. At every level, each
    // earlier sibling (with its whole subtree) precedes the boundary node in
    // document order, so it is removed. Visiting every ancestor level ensures
    // the walk does not stop early when a removed node has no previous sibling
    // but content remains higher up (e.g. an earlier table row or section).
    for (Node level = endNode; level != null && level != doc; level = level.ParentNode)
    {
        Node sibling = level.PreviousSibling;
        while (sibling != null)
        {
            // Remember the next candidate before detaching the current one.
            Node previous = sibling.PreviousSibling;
            sibling.Remove();
            sibling = previous;
        }
    }
}
/// <summary>
/// Removes all content after specified node
/// </summary>
/// <remarks>
/// The node itself and all of its ancestors are kept; everything that follows
/// the node in document order is removed.
/// </remarks>
/// <param name="doc">Root of the tree; the upward walk stops when it reaches this node.</param>
/// <param name="startNode">Node after which all content is removed.</param>
/// <exception cref="ArgumentNullException"><paramref name="startNode"/> is null.</exception>
private void RemoveContentAfterNode(Document doc, Node startNode)
{
    if (startNode == null)
    {
        throw new ArgumentNullException("startNode");
    }

    // Walk from the boundary node up towards the root. At every level, each
    // later sibling (with its whole subtree) follows the boundary node in
    // document order, so it is removed. Visiting every ancestor level ensures
    // the walk does not stop at the last sibling of the current level while
    // later content remains higher up (e.g. a following table row or section).
    for (Node level = startNode; level != null && level != doc; level = level.ParentNode)
    {
        Node sibling = level.NextSibling;
        while (sibling != null)
        {
            // Remember the next candidate before detaching the current one.
            Node following = sibling.NextSibling;
            sibling.Remove();
            sibling = following;
        }
    }
}
Best regards.
The issues you have found earlier (filed as WORDSNET-5430) have been fixed in this .NET update and this Java update.
This message was posted using Notification2Forum from Downloads module by aspose.notifier.
(5)