I’m evaluating Aspose.Words version 3.5.2.0 for my company.
I need to extract images and text from a Word document. I am able to extract all the images from the document, but I am having difficulty extracting the text.
I have tried the following:
' Load Test.doc from the application's startup path.
Dim Doc As New Document(Application.StartupPath & "\Test.doc")
' NOTE(review): whatever GetText returns is not captured here; assign it to a variable to use it.
Doc.GetText
But I am not interested in all the special characters returned.
I have tried to build the ExampleVisitor (shown below) from your documentation, but I cannot work out how to use it.
Can you provide me with an example of how to use the ExampleVisitor to extract text from the header, footer and body of the Word document?
Regards.
Imports System
Imports System.IO
Imports System.Text
Imports Aspose.Words
Namespace IDocumentVisitorExample
''' <summary>
''' Collects the text of each document story (main text, headers and footers)
''' into separate buffers, saves every image it is handed to a numbered file,
''' and can write the collected text to "plain.txt".
''' </summary>
''' <remarks>
''' NOTE(review): none of the callbacks below is declared with Overrides, so they
''' do not override any member of DocumentVisitor. Confirm against the
''' Aspose.Words API in use that Document.Accept actually invokes them.
''' </remarks>
Public Class ExampleVisitor
    Inherits DocumentVisitor

    ' One buffer per story type whose text is collected.
    Private mPrimaryHeader As StringBuilder = New StringBuilder
    Private mPrimaryFooter As StringBuilder = New StringBuilder
    Private mFirstPageHeader As StringBuilder = New StringBuilder
    Private mFirstPageFooter As StringBuilder = New StringBuilder
    Private mEvenPagesHeader As StringBuilder = New StringBuilder
    Private mEvenPagesFooter As StringBuilder = New StringBuilder
    Private mMainText As StringBuilder = New StringBuilder

    ' True while inside a story whose text is being collected.
    Private mIsExtracting As Boolean
    ' The story the current runs of text belong to (valid while mIsExtracting is True).
    Private mExtractingType As StoryType
    ' Number used for the next image file name; incremented after every saved image.
    Private mImageIndex As Integer = 0

    ''' <summary>Logs that enumeration of the document has started.</summary>
    Public Sub DocumentStart(ByVal doc As Document)
        Console.WriteLine("Document enumeration started.")
    End Sub

    ''' <summary>Logs that enumeration of the document has finished.</summary>
    Public Sub DocumentEnd()
        Console.WriteLine("Document enumeration finished.")
    End Sub

    ''' <summary>Logs the start of a section.</summary>
    Public Sub SectionStart(ByVal pageSetup As PageSetup)
        Console.WriteLine("Section started.")
    End Sub

    ''' <summary>Logs the end of a section.</summary>
    Public Sub SectionEnd()
        Console.WriteLine("Section finished.")
    End Sub

    ''' <summary>
    ''' Turns text extraction on for the story types this visitor collects and
    ''' remembers which story is current.
    ''' </summary>
    ''' <param name="storyType">Type of the story that is starting.</param>
    Public Sub StoryStart(ByVal storyType As StoryType)
        ' VB Select Case has no fall-through, so all collected story types must be
        ' listed in a single Case. The enum is fully qualified because the
        ' parameter name hides the type name (VB identifiers are case-insensitive).
        Select Case storyType
            Case Aspose.Words.StoryType.PrimaryHeader, _
                 Aspose.Words.StoryType.PrimaryFooter, _
                 Aspose.Words.StoryType.MainText, _
                 Aspose.Words.StoryType.FirstPageHeader, _
                 Aspose.Words.StoryType.FirstPageFooter, _
                 Aspose.Words.StoryType.EvenPagesHeader, _
                 Aspose.Words.StoryType.EvenPagesFooter
                mIsExtracting = True
                mExtractingType = storyType
        End Select
    End Sub

    ''' <summary>Turns text extraction off when a story ends.</summary>
    Public Sub StoryEnd()
        mIsExtracting = False
    End Sub

    ''' <summary>Logs the start of a paragraph.</summary>
    Public Sub ParagraphStart(ByVal paragraphFormat As ParagraphFormat)
        Console.WriteLine("Paragraph started.")
    End Sub

    ''' <summary>Logs the end of a paragraph.</summary>
    Public Sub ParagraphEnd()
        Console.WriteLine("Paragraph finished.")
    End Sub

    ''' <summary>
    ''' Appends a run of text to the buffer of the story currently being extracted.
    ''' Does nothing when extraction is off.
    ''' </summary>
    ''' <param name="font">Formatting of the run (not used).</param>
    ''' <param name="text">Text of the run.</param>
    Public Sub RunOfText(ByVal font As Aspose.Words.Font, ByVal text As String)
        If Not mIsExtracting Then
            Return
        End If

        Select Case mExtractingType
            Case StoryType.PrimaryHeader
                mPrimaryHeader.Append(text)
            Case StoryType.PrimaryFooter
                mPrimaryFooter.Append(text)
            Case StoryType.FirstPageHeader
                mFirstPageHeader.Append(text)
            Case StoryType.FirstPageFooter
                mFirstPageFooter.Append(text)
            Case StoryType.EvenPagesHeader
                mEvenPagesHeader.Append(text)
            Case StoryType.EvenPagesFooter
                ' Was a second "FirstPageFooter" case, which left this buffer always empty.
                mEvenPagesFooter.Append(text)
            Case StoryType.MainText
                mMainText.Append(text)
        End Select
    End Sub

    ''' <summary>
    ''' Saves the image to "&lt;index&gt;.jpg" in the current directory and advances
    ''' the index so that consecutive images do not overwrite each other.
    ''' </summary>
    ''' <param name="imageBytes">Raw bytes of the image.</param>
    Public Sub Image(ByVal imageBytes As Byte())
        Dim stream As MemoryStream = New MemoryStream(imageBytes)
        Try
            ' The stream has to stay open for as long as the image object is in use.
            Dim picture As System.Drawing.Image = System.Drawing.Image.FromStream(stream)
            Try
                picture.Save(String.Format("{0}.jpg", mImageIndex))
            Finally
                picture.Dispose()
            End Try
        Finally
            stream.Close()
        End Try
        mImageIndex += 1
    End Sub

    ''' <summary>Logs the start of a table.</summary>
    Public Sub TableStart()
        Console.WriteLine("Table started.")
    End Sub

    ''' <summary>Logs the end of a table.</summary>
    Public Sub TableEnd()
        Console.WriteLine("Table finished.")
    End Sub

    ''' <summary>Logs the start of a table row.</summary>
    Public Sub RowStart(ByVal rowFormat As RowFormat)
        Console.WriteLine("Table row started.")
    End Sub

    ''' <summary>Logs the end of a table row.</summary>
    Public Sub RowEnd()
        Console.WriteLine("Table row finished.")
    End Sub

    ''' <summary>Logs the start of a table cell.</summary>
    Public Sub CellStart(ByVal cellFormat As CellFormat)
        Console.WriteLine("Table cell started.")
    End Sub

    ''' <summary>Logs the end of a table cell.</summary>
    Public Sub CellEnd()
        Console.WriteLine("Table cell finished.")
    End Sub

    ''' <summary>Logs the start of a field.</summary>
    Public Sub FieldStart(ByVal fieldType As FieldType)
        Console.WriteLine("Field started.")
    End Sub

    ''' <summary>Logs a field separator.</summary>
    Public Sub FieldSeparator()
        Console.WriteLine("Field separator found.")
    End Sub

    ''' <summary>Logs the end of a field.</summary>
    Public Sub FieldEnd()
        Console.WriteLine("Field finished.")
    End Sub

    ''' <summary>Logs that a form field was encountered.</summary>
    Public Sub FormField(ByVal formField As FormField)
        Console.WriteLine("Form field found.")
    End Sub

    ''' <summary>
    ''' Writes every non-empty story buffer to "plain.txt" in the current
    ''' directory, each under its own heading. Any exception is written to the
    ''' console instead of being propagated.
    ''' </summary>
    Public Sub WritePlainText()
        ' Initialised explicitly so the Finally block can safely test it.
        Dim writer As StreamWriter = Nothing
        Try
            writer = New StreamWriter("plain.txt")
            WriteStory(writer, "--- Primary header ---", mPrimaryHeader)
            WriteStory(writer, "--- Primary footer ---", mPrimaryFooter)
            WriteStory(writer, "--- First page header ---", mFirstPageHeader)
            WriteStory(writer, "--- First page footer ---", mFirstPageFooter)
            WriteStory(writer, "--- Even pages header ---", mEvenPagesHeader)
            WriteStory(writer, "--- Even pages footer ---", mEvenPagesFooter)
            WriteStory(writer, "--- Main text ---", mMainText)
        Catch ex As Exception
            Console.Write(ex)
        Finally
            If Not writer Is Nothing Then writer.Close()
        End Try
    End Sub

    ' Writes one story as heading, text and a blank line; skips empty stories.
    Private Shared Sub WriteStory(ByVal writer As StreamWriter, ByVal heading As String, ByVal story As StringBuilder)
        If story.Length > 0 Then
            writer.WriteLine(heading)
            writer.WriteLine(story.ToString())
            writer.WriteLine()
        End If
    End Sub
End Class
''' <summary>
''' Entry-point helper: constructing an instance runs the extraction for the
''' given document file.
''' </summary>
Public Class MainClass
    ''' <summary>
    ''' Runs the extraction for the given file, or prints a usage message when
    ''' no file name is supplied.
    ''' </summary>
    ''' <param name="args">Path of the document to process.</param>
    Public Sub New(ByVal args As String)
        ' Treat a missing (Nothing) file name like an empty one instead of
        ' failing with a NullReferenceException on args.Length.
        If args Is Nothing OrElse args.Length = 0 Then
            Console.WriteLine("Please specify the document file name.")
            Return
        End If
        DoExtraction(args)
    End Sub

    ''' <summary>
    ''' Opens the document, passes an ExampleVisitor to Document.Accept and then
    ''' asks the visitor to write out the text it collected.
    ''' </summary>
    ''' <param name="fileName">Path of the document to open.</param>
    Private Sub DoExtraction(ByVal fileName As String)
        Dim doc As New Document(fileName)
        Dim visitor As New ExampleVisitor
        doc.Accept(visitor)
        visitor.WritePlainText()
    End Sub
End Class
End Namespace