We're sorry, Aspose doesn't work properly without JavaScript enabled.

Free Support Forum - aspose.com

How to read a TOC (table of contents) from a Word document as outline-formatted text

Hi

Please post sample VB.NET code to read the TOC content from a Word document. I have attached an image of the content and need to read it in the same way so that I can display it in a web page.

Note: Please find the attached image of sample TOC content.

Thanks

Anil K.

Hi Anil,

Thanks for your inquiry. Please check the following sample code snippet for reading the TOC contents. It should help you accomplish the task.

' Demo driver: loads the sample document, extracts the entries of its first
' TOC (index 0) via TableOfContentsToDataTable and prints one line per entry
' with the entry text, the style identifier of the entry and its page text.
Public Sub ReadTOC()
    Dim sourceDocument As New Document("D:/Downloads/Aspose_updated (1).docx")
    Dim tocEntries As DataTable = TableOfContentsToDataTable(sourceDocument, 0)

    For Each entry As DataRow In tocEntries.Rows
        ' Values are read in the same order in which they are printed.
        Dim entryName As Object = entry("EntryName")
        Dim headingLevel As Object = DirectCast(entry("EntryStyle"), Style).StyleIdentifier
        Dim pageNumber As Object = entry("Page")

        Dim line As String = String.Format("Entry name: {0}, Heading Level: {1}, Page number: {2}", entryName, headingLevel, pageNumber)
        Console.WriteLine(line)
    Next
End Sub

' Walks the nodes of one TOC field in pre-order and returns its entries as a DataTable.
'
' Parameters:
'   doc      - the document to scan; also passed to NextPreOrder/PreviousPreOrder as the traversal root.
'   tocIndex - index of the TOC to read, as resolved by FindTocStartFromIndex.
'
' Returns a table named "Toc <tocIndex>" with one row per entry and these columns:
'   EntryRef        - trimmed text collected before the entry's field separator (only filled when isHyperlinked).
'   EntryName       - concatenated, trimmed text of the nodes visited up to the next FieldStart.
'   ResultStartNode - the node that precedes the entry's first Run in pre-order.
'   ResultRuns      - clones (Clone(False)) of the Run nodes that make up the entry.
'   EntryStyle      - paragraph style of the parent paragraph of the last Run seen for the entry.
'   PageRef         - text of the node that follows the FieldStart reached after the entry text.
'   Page            - text of the node that follows the subsequent FieldSeparator.
' If the text "No table of contents entries found." is encountered, the table is returned
' with its columns cleared.
'
' Any exception thrown by FindTocStartFromIndex (e.g. for an out-of-range tocIndex) propagates.
'
' NOTE(review): most of the scanning loops below dereference currentNode without checking for
' Nothing, while the scan at the bottom of the main loop does check - confirm whether
' NextPreOrder can return Nothing on unexpected document structure.
Public Function TableOfContentsToDataTable(ByVal doc As Document, ByVal tocIndex As Integer) As DataTable
Dim table As New DataTable()
table.TableName = "Toc " + tocIndex.ToString
'******* Needed for Aspose's code
table.Columns.Add("EntryRef")
'****** end
table.Columns.Add("EntryName")
table.Columns.Add("ResultStartNode", GetType(Node))
table.Columns.Add("ResultRuns", GetType(List(Of Run)))
table.Columns.Add("EntryStyle", GetType(Style))
table.Columns.Add("PageRef")
table.Columns.Add("Page")
' Get the FieldStart of the specified TOC.
Dim currentNode As Node = DirectCast(FindTocStartFromIndex(doc, tocIndex), Node)
' Skip forward to the first field separator (after the TOC field code).
While currentNode.NodeType <> NodeType.FieldSeparator
currentNode = currentNode.NextPreOrder(doc)
End While
' First node after the TOC field separator.
currentNode = currentNode.NextPreOrder(doc)
Dim isCollecting As Boolean = True
' NOTE(review): countOfFieldItems is incremented and reset below but never read.
Dim countOfFieldItems As Integer = 0
Dim isAfterFirstTocEntry As Boolean = False
' Decided once, from the node right after the TOC separator, and reused for every entry.
Dim isHyperlinked As Boolean = currentNode.NodeType = NodeType.FieldStart
' Each iteration of this loop reads one entry and adds one row.
While isCollecting
Dim entryRefCode As New StringBuilder()
Dim entryText As New StringBuilder()
Dim pageRefCode As New StringBuilder()
Dim pageText As New StringBuilder()
' Ensures that first entry is gotten from TOC
If Not isAfterFirstTocEntry Then
' Skip nodes until encounters a run
While currentNode.NodeType <> NodeType.Run
currentNode = currentNode.NextPreOrder(doc)
End While
isAfterFirstTocEntry = True
End If
If isHyperlinked Then
' Collect the trimmed text of every node until we encounter the field separator
While currentNode.NodeType <> NodeType.FieldSeparator
entryRefCode.Append(currentNode.Range.Text.Trim())
currentNode = currentNode.NextPreOrder(doc)
End While
' Skip past field separator
currentNode = currentNode.NextPreOrder(doc)
End If
' The TOC holds only the "no entries" placeholder text: clear the columns and return.
If currentNode.Range.Text.Contains("No table of contents entries found.") Then
table.Columns.Clear()
Return table
End If
Dim entryPositionNode As Node = Nothing
Dim fieldResultRuns As New List(Of Run)()
Dim entryStyle As Style = Nothing
' Collect the entry text up to the next FieldStart (the original comments refer to it as PAGEREF).
While currentNode.NodeType <> NodeType.FieldStart
countOfFieldItems += 1
If currentNode.NodeType = NodeType.Run Then
' Remember the node that precedes the first run of this entry.
If entryPositionNode Is Nothing Then
entryPositionNode = currentNode.PreviousPreOrder(doc)
End If
fieldResultRuns.Add(DirectCast(currentNode.Clone(False), Run))
' Overwritten for every run, so the style of the last run's paragraph is kept.
entryStyle = DirectCast(currentNode, Run).ParentParagraph.ParagraphFormat.Style
End If
' Text is appended for every visited node, not only for runs.
entryText.Append(currentNode.Range.Text.Trim())
currentNode = currentNode.NextPreOrder(doc)
End While
countOfFieldItems = 0
' Skip nodes until FieldStart (of PAGEREF)
' NOTE(review): the loop above only exits on a FieldStart, so this loop body cannot run as written.
While currentNode.NodeType <> NodeType.FieldStart
currentNode = currentNode.NextPreOrder(doc)
End While
' The node right after the FieldStart supplies the PageRef text.
currentNode = currentNode.NextPreOrder(doc)
pageRefCode.Append(currentNode.Range.Text)
' Skip nodes until FieldSeparator (of PAGEREF)
While currentNode.NodeType <> NodeType.FieldSeparator
currentNode = currentNode.NextPreOrder(doc)
End While
' Take the text of the node after the separator, which should be the page number
currentNode = currentNode.NextPreOrder(doc)
pageText.Append(currentNode.Range.Text)
' Add to datatable (values in the same order as the columns were declared)
table.Rows.Add(New Object() {entryRefCode.ToString(), entryText.ToString(), entryPositionNode, fieldResultRuns, entryStyle, pageRefCode.ToString(), _
pageText.ToString()})
' Step past the page node. The scan below advances once more before its first check,
' so the node reached here is itself never inspected.
currentNode = currentNode.NextPreOrder(doc)
' Skip to the first run of the next paragraph (should be next entry). Check if a TOC field end is found at the same time
Dim isNextPara As Boolean = False
Dim isChecking As Boolean = True
While isChecking
currentNode = currentNode.NextPreOrder(doc)
' No node found: stop collecting entries altogether.
If currentNode Is Nothing Then
isCollecting = False
Exit While
End If
' Passed a new paragraph
If currentNode.NodeType = NodeType.Paragraph Then
isNextPara = True
End If
' Found first run of a new paragraph: leave the scan with currentNode on that run.
If isNextPara AndAlso currentNode.NodeType = NodeType.Run Then
isChecking = False
End If
' Once we encounter a FieldEnd node of type FieldTOC then we know we are at the end
' of the current TOC and we can stop here.
If currentNode.NodeType = NodeType.FieldEnd Then
Dim fieldEnd As Aspose.Words.Fields.FieldEnd = DirectCast(currentNode, Aspose.Words.Fields.FieldEnd)
If fieldEnd.FieldType = Aspose.Words.Fields.FieldType.FieldTOC Then
isCollecting = False
Exit While
End If
End If
End While
End While
Return table
End Function

' Returns the FieldStart node of the TOC field at position tocIndex (zero-based),
' counting only FieldStart nodes whose FieldType is FieldTOC, in the order in which
' doc.GetChildNodes(NodeType.FieldStart, True) yields them.
'
' Parameters:
'   doc      - the document to search.
'   tocIndex - zero-based index of the wanted TOC field.
'
' Throws ArgumentOutOfRangeException when tocIndex is negative or the document
' contains fewer than tocIndex + 1 TOC fields (including the case of no TOC field at all).
Public Function FindTocStartFromIndex(ByVal doc As Document, ByVal tocIndex As Integer) As FieldStart
    ' Collect the FieldStart nodes of all TOC fields in the document.
    Dim tocStarts As New List(Of FieldStart)()
    For Each start As FieldStart In doc.GetChildNodes(NodeType.FieldStart, True)
        If start.FieldType = FieldType.FieldTOC Then
            tocStarts.Add(start)
        End If
    Next

    ' Ensure the TOC specified by the passed index exists. The two-argument-plus-value
    ' constructor is used so that the text below really is the exception message; the
    ' single-string constructor would treat it as the parameter name.
    If tocIndex < 0 OrElse tocIndex >= tocStarts.Count Then
        Throw New ArgumentOutOfRangeException("tocIndex", tocIndex, String.Format("TOC index is out of range: the document contains {0} TOC field(s).", tocStarts.Count))
    End If

    Return tocStarts(tocIndex)
End Function

Please feel free to contact us for any further assistance.

Best Regards,

Hi Ahmad,

Thanks for your reply. While reading my document I'm getting an error like "TOC index is out of range". This error is thrown in the method "FindTocStartFromIndex". In the code below, the condition "If start.FieldType = FieldType.FieldTOC Then" is never satisfied.

For Each start As FieldStart In doc.GetChildNodes(NodeType.FieldStart, True)

If start.FieldType = FieldType.FieldTOC Then

’ Add all FieldStarts which are of type FieldTOC.

fieldStarts.Add(start)

End If

Next

Please find the attached source document.

Hi there,

Thanks for your feedback. This appears to be the expected result, as your document does not contain any TOC field. You can double-check this in MS Word or in the Aspose.Words Document Explorer.

Best Regards,

Hi

Open the attached Word document, go to the "View" tab in MS Word, and click "Outline"; you will then be able to see the TOC content in the document. In the normal view it appears as ordinary text, but in the "Outline" view you will find the TOC.

Note: I want to read this kind of data. Is it possible to read it with Aspose?

Thanks

Anil k.

Hi Anil,

Thanks for sharing the additional information. Please check the following documentation link; extracting content between bookmarks will help you extract the outline text.

Best Regards,