Free Support Forum - aspose.com

How to extract html from a word document

I’m evaluating Aspose.Words version 3.5.2.0 for my company.

I need to extract images and text from the word document. It is possible for me to extract all images from the document, but I have difficulties extracting text from the document.

I have tried the following:
' Open Test.doc from the application's startup folder.
Dim Doc As New Document(Application.StartupPath & "\Test.doc")
' Call GetText on the document; note that the returned value is not stored or used here.
Doc.GetText

But I am not interested in all the special characters returned.

I have tried to build the ExampleVisitor (shown below) from your documentation, but I cannot figure out how to use it.

Can you provide me with an example of how to use the ExampleVisitor to extract text from the header, footer and body of the Word document?

Regards.

Imports System
Imports System.IO
Imports System.Text
Imports Aspose.Words

Namespace IDocumentVisitorExample
''' <summary>
''' Collects the text of each document story (headers, footers, main text)
''' into separate buffers, saves every image it is handed to disk, and can
''' dump the collected text to "plain.txt".
''' </summary>
''' <remarks>
''' NOTE(review): none of the callback methods below is declared Overrides,
''' so whether Document.Accept ever invokes them depends on the members that
''' DocumentVisitor declares in the referenced Aspose.Words version - confirm
''' against the API reference before relying on this class.
''' </remarks>
Public Class ExampleVisitor
    Inherits DocumentVisitor

    ' One text buffer per story type that is extracted.
    Private mPrimaryHeader As StringBuilder = New StringBuilder
    Private mPrimaryFooter As StringBuilder = New StringBuilder
    Private mFirstPageHeader As StringBuilder = New StringBuilder
    Private mFirstPageFooter As StringBuilder = New StringBuilder
    Private mEvenPagesHeader As StringBuilder = New StringBuilder
    Private mEvenPagesFooter As StringBuilder = New StringBuilder
    Private mMainText As StringBuilder = New StringBuilder

    ' True while inside a story whose text is being collected.
    Private mIsExtracting As Boolean
    ' The story the current runs of text belong to (valid while mIsExtracting is True).
    Private mExtractingType As StoryType
    ' Sequence number used to name the next saved image file.
    Private mImageIndex As Integer = 0

    ''' <summary>Logs that enumeration of the document has started.</summary>
    Public Sub DocumentStart(ByVal doc As Document)
        Console.WriteLine("Document enumeration started.")
    End Sub

    ''' <summary>Logs that enumeration of the document has finished.</summary>
    Public Sub DocumentEnd()
        Console.WriteLine("Document enumeration finished.")
    End Sub

    ''' <summary>Logs the start of a section.</summary>
    Public Sub SectionStart(ByVal pageSetup As PageSetup)
        Console.WriteLine("Section started.")
    End Sub

    ''' <summary>Logs the end of a section.</summary>
    Public Sub SectionEnd()
        Console.WriteLine("Section finished.")
    End Sub

    ''' <summary>
    ''' Switches text extraction on when a header, footer or main-text story
    ''' begins and remembers which story it is.
    ''' </summary>
    ''' <param name="storyType">The type of the story that is starting.</param>
    Public Sub StoryStart(ByVal storyType As StoryType)
        ' VB's Select Case has no fall-through: an empty Case does nothing.
        ' All extracted story types therefore have to share ONE Case clause.
        ' (VB is case-insensitive, so "storyType.X" binds to the parameter and
        ' reaches the enum member through it, exactly as the original did.)
        Select Case storyType
            Case storyType.PrimaryHeader, _
                 storyType.PrimaryFooter, _
                 storyType.MainText, _
                 storyType.FirstPageHeader, _
                 storyType.FirstPageFooter, _
                 storyType.EvenPagesHeader, _
                 storyType.EvenPagesFooter
                mIsExtracting = True
                mExtractingType = storyType
        End Select
    End Sub

    ''' <summary>Switches text extraction off at the end of a story.</summary>
    Public Sub StoryEnd()
        mIsExtracting = False
    End Sub

    ''' <summary>Logs the start of a paragraph.</summary>
    Public Sub ParagraphStart(ByVal paragraphFormat As ParagraphFormat)
        Console.WriteLine("Paragraph started.")
    End Sub

    ''' <summary>Logs the end of a paragraph.</summary>
    Public Sub ParagraphEnd()
        Console.WriteLine("Paragraph finished.")
    End Sub

    ''' <summary>
    ''' Appends a run of text to the buffer of the story currently being
    ''' extracted. Does nothing while extraction is switched off.
    ''' </summary>
    ''' <param name="font">Formatting of the run; not used.</param>
    ''' <param name="text">The text of the run.</param>
    Public Sub RunOfText(ByVal font As Aspose.Words.Font, ByVal text As String)
        If Not mIsExtracting Then
            Return
        End If

        Select Case mExtractingType
            Case StoryType.PrimaryHeader
                mPrimaryHeader.Append(text)
            Case StoryType.PrimaryFooter
                mPrimaryFooter.Append(text)
            Case StoryType.FirstPageHeader
                mFirstPageHeader.Append(text)
            Case StoryType.FirstPageFooter
                mFirstPageFooter.Append(text)
            Case StoryType.EvenPagesHeader
                mEvenPagesHeader.Append(text)
            Case StoryType.EvenPagesFooter
                ' Was a second "FirstPageFooter" case, which could never match,
                ' so the even-pages footer buffer stayed empty.
                mEvenPagesFooter.Append(text)
            Case StoryType.MainText
                mMainText.Append(text)
        End Select
    End Sub

    ''' <summary>
    ''' Saves the supplied image bytes to "&lt;index&gt;.jpg" in the current
    ''' directory, using a running index so successive images get distinct names.
    ''' </summary>
    ''' <param name="imageBytes">The raw bytes of the image.</param>
    Public Sub Image(ByVal imageBytes As Byte())
        Dim stream As MemoryStream = New MemoryStream(imageBytes)
        Try
            Dim picture As System.Drawing.Image = System.Drawing.Image.FromStream(stream)
            Try
                picture.Save(String.Format("{0}.jpg", mImageIndex))
            Finally
                ' Release the image's resources once it has been written.
                picture.Dispose()
            End Try
        Finally
            stream.Close()
        End Try

        ' Advance only after a successful save so that no file name is skipped
        ' and the next image does not overwrite this one.
        mImageIndex += 1
    End Sub

    ''' <summary>Logs the start of a table.</summary>
    Public Sub TableStart()
        Console.WriteLine("Table started.")
    End Sub

    ''' <summary>Logs the end of a table.</summary>
    Public Sub TableEnd()
        Console.WriteLine("Table finished.")
    End Sub

    ''' <summary>Logs the start of a table row.</summary>
    Public Sub RowStart(ByVal rowFormat As RowFormat)
        Console.WriteLine("Table row started.")
    End Sub

    ''' <summary>Logs the end of a table row.</summary>
    Public Sub RowEnd()
        Console.WriteLine("Table row finished.")
    End Sub

    ''' <summary>Logs the start of a table cell.</summary>
    Public Sub CellStart(ByVal cellFormat As CellFormat)
        Console.WriteLine("Table cell started.")
    End Sub

    ''' <summary>Logs the end of a table cell.</summary>
    Public Sub CellEnd()
        Console.WriteLine("Table cell finished.")
    End Sub

    ''' <summary>Logs the start of a field.</summary>
    Public Sub FieldStart(ByVal fieldType As FieldType)
        Console.WriteLine("Field started.")
    End Sub

    ''' <summary>Logs a field separator.</summary>
    Public Sub FieldSeparator()
        Console.WriteLine("Field separator found.")
    End Sub

    ''' <summary>Logs the end of a field.</summary>
    Public Sub FieldEnd()
        Console.WriteLine("Field finished.")
    End Sub

    ''' <summary>Logs that a form field was encountered.</summary>
    Public Sub FormField(ByVal formField As FormField)
        Console.WriteLine("Form field found.")
    End Sub

    ''' <summary>
    ''' Writes every non-empty story buffer to "plain.txt" in the current
    ''' directory, each preceded by a heading line and followed by a blank line.
    ''' Any exception is printed to the console rather than propagated.
    ''' </summary>
    Public Sub WritePlainText()
        ' Initialised to Nothing so the Finally block is well-defined even when
        ' the StreamWriter constructor itself throws.
        Dim writer As StreamWriter = Nothing

        Try
            writer = New StreamWriter("plain.txt")

            WriteStory(writer, "--- Primary header ---", mPrimaryHeader)
            WriteStory(writer, "--- Primary footer ---", mPrimaryFooter)
            WriteStory(writer, "--- First page header ---", mFirstPageHeader)
            WriteStory(writer, "--- First page footer ---", mFirstPageFooter)
            WriteStory(writer, "--- Even pages header ---", mEvenPagesHeader)
            WriteStory(writer, "--- Even pages footer ---", mEvenPagesFooter)
            WriteStory(writer, "--- Main text ---", mMainText)
        Catch ex As Exception
            Console.Write(ex)
        Finally
            If Not writer Is Nothing Then writer.Close()
        End Try
    End Sub

    ' Writes one story (heading, text, blank line) unless its buffer is empty.
    Private Sub WriteStory(ByVal writer As StreamWriter, ByVal heading As String, ByVal story As StringBuilder)
        If story.Length > 0 Then
            writer.WriteLine(heading)
            writer.WriteLine(story)
            writer.WriteLine()
        End If
    End Sub
End Class

''' <summary>
''' Entry-point helper: constructing it with a document file name runs the
''' extraction for that document.
''' </summary>
Public Class MainClass
    ''' <summary>
    ''' Runs the extraction for the given file, or prints a usage message when
    ''' no file name was supplied.
    ''' </summary>
    ''' <param name="args">The document file name.</param>
    Public Sub New(ByVal args As String)
        ' Guard against Nothing as well as "": reading Length on a null
        ' reference would throw instead of printing the usage message.
        If args Is Nothing OrElse args.Length = 0 Then
            Console.WriteLine("Please specify the document file name.")
            Return
        End If

        DoExtraction(args)
    End Sub

    ''' <summary>
    ''' Opens the document, passes an ExampleVisitor to its Accept method, and
    ''' then has the visitor write the text it collected.
    ''' </summary>
    ''' <param name="fileName">Path of the document to open.</param>
    Private Sub DoExtraction(ByVal fileName As String)
        Dim doc As New Document(fileName)
        Dim visitor As New ExampleVisitor

        doc.Accept(visitor)
        visitor.WritePlainText()
    End Sub
End Class
End Namespace

Have you tried using

Document.Save(filename, SaveFormat.FormatText)