Extracting headers, footers, and content separately


#1

Hello,

I am evaluating your product while doing some research for a project my company may be doing. We are going to need to independently determine the headers, footers, and the body content for a large batch of Word documents, and extract the content as plain text with no formatting. I have found how to extract the entire content from the Word document by using doc.Range, but I cannot see how to extract just the headers (of different types) and the content separately. Is there a way to do this in Aspose.Word?

Clay S.


#2

Hi,
Sure. Create a class that implements the IDocumentVisitor interface. Use its methods to handle start and end of the document stories and to extract the stories text. Pass this class to the Document.Accept method.
Here’s an example of how to extract the primary header, the primary footer, and the main content separately.

 

/// <summary>
/// Loads "new.doc", walks it with a <see cref="DocumentStoriesExtractingVisitor"/>,
/// and reads back the primary header, primary footer and main text as
/// three separate plain-text strings.
/// </summary>
public void ExtractDocumentStories()
{
    // Plain ASCII double quotes are required here; the typographic quotes
    // produced by the forum editor are not valid C# string delimiters.
    Document doc = new Document("new.doc");

    DocumentStoriesExtractingVisitor visitor = new DocumentStoriesExtractingVisitor();

    // Accept drives the visitor callbacks, which collect the text per story.
    doc.Accept(visitor);

    // The results are only read into locals in this sample; use them as needed.
    string header = visitor.GetExtractedHeader();
    string footer = visitor.GetExtractedFooter();
    string mainText = visitor.GetMainText();
}


/// <summary>
/// Document visitor that collects the text of the primary header, the primary
/// footer and the main text story into three separate buffers.
/// Pass an instance to <c>Document.Accept</c> and then read the results with
/// <see cref="GetExtractedHeader"/>, <see cref="GetExtractedFooter"/> and
/// <see cref="GetMainText"/>.
/// </summary>
/// <remarks>
/// Only <c>RunOfText</c> contributes to the output; paragraph, table, field and
/// image callbacks are ignored, so no separators are inserted by this class.
/// NOTE(review): if the document reports the same story type more than once
/// (for example, one primary header per section), the texts are appended to the
/// same buffer in visiting order - confirm this is the desired result.
/// </remarks>
public class DocumentStoriesExtractingVisitor : IDocumentVisitor
{
    // True while the visitor is inside one of the three stories being captured.
    private bool isExtracting;

    // The story currently being captured; only meaningful while isExtracting is true.
    private StoryType extractingType;

    // Buffers are created up front so the getters return an empty string
    // (instead of throwing NullReferenceException) when no document has been
    // visited yet. DocumentStart replaces them with fresh instances.
    private System.Text.StringBuilder primaryHeader = new System.Text.StringBuilder();
    private System.Text.StringBuilder primaryFooter = new System.Text.StringBuilder();
    private System.Text.StringBuilder mainText = new System.Text.StringBuilder();

    /// <summary>
    /// Resets all three buffers so the same visitor instance can be reused
    /// for another document without carrying over earlier text.
    /// </summary>
    /// <param name="doc">The document being visited; not used.</param>
    public void DocumentStart(Document doc)
    {
        primaryHeader = new System.Text.StringBuilder();
        primaryFooter = new System.Text.StringBuilder();
        mainText = new System.Text.StringBuilder();
    }

    /// <summary>Not used by this visitor.</summary>
    public void DocumentEnd()
    {
        // Do nothing
    }

    /// <summary>Not used by this visitor.</summary>
    public void SectionStart(PageSetup pageSetup)
    {
        // Do nothing
    }

    /// <summary>Not used by this visitor.</summary>
    public void SectionEnd()
    {
        // Do nothing
    }

    /// <summary>
    /// Starts capturing when the story is the primary header, the primary
    /// footer or the main text. Any other story type leaves the state unchanged.
    /// </summary>
    /// <param name="storyType">The type of the story that is starting.</param>
    public void StoryStart(StoryType storyType)
    {
        switch (storyType)
        {
            case StoryType.PrimaryHeaderStory:
            case StoryType.PrimaryFooterStory:
            case StoryType.MainTextStory:
                isExtracting = true;
                extractingType = storyType;
                break;
        }
    }

    /// <summary>Stops capturing text until the next supported story starts.</summary>
    public void StoryEnd()
    {
        isExtracting = false;
    }

    /// <summary>Not used by this visitor.</summary>
    public void ParagraphStart(ParagraphFormat paragraphFormat)
    {
        // Do nothing
    }

    /// <summary>Not used by this visitor.</summary>
    public void ParagraphEnd()
    {
        // Do nothing
    }

    /// <summary>
    /// Appends the run's text to the buffer of the story currently being
    /// captured. Text reported outside a captured story is dropped.
    /// </summary>
    /// <param name="font">Formatting of the run; ignored because only plain text is collected.</param>
    /// <param name="text">The text of the run.</param>
    public void RunOfText(Font font, string text)
    {
        if (!isExtracting)
        {
            return;
        }

        switch (extractingType)
        {
            case StoryType.PrimaryHeaderStory:
                primaryHeader.Append(text);
                break;

            case StoryType.PrimaryFooterStory:
                primaryFooter.Append(text);
                break;

            case StoryType.MainTextStory:
                mainText.Append(text);
                break;
        }
    }

    /// <summary>Not used by this visitor; images are not part of the text output.</summary>
    public void Image(byte[] imageBytes)
    {
        // Do nothing
    }

    /// <summary>Not used by this visitor.</summary>
    public void TableStart()
    {
        // Do nothing
    }

    /// <summary>Not used by this visitor.</summary>
    public void TableEnd()
    {
        // Do nothing
    }

    /// <summary>Not used by this visitor.</summary>
    public void RowStart(RowFormat rowFormat)
    {
        // Do nothing
    }

    /// <summary>Not used by this visitor.</summary>
    public void RowEnd()
    {
        // Do nothing
    }

    /// <summary>Not used by this visitor.</summary>
    public void CellStart(CellFormat cellFormat)
    {
        // Do nothing
    }

    /// <summary>Not used by this visitor.</summary>
    public void CellEnd()
    {
        // Do nothing
    }

    /// <summary>Not used by this visitor.</summary>
    public void FieldStart(FieldType fieldType)
    {
        // Do nothing
    }

    /// <summary>Not used by this visitor.</summary>
    public void FieldSeparator()
    {
        // Do nothing
    }

    /// <summary>Not used by this visitor.</summary>
    public void FieldEnd()
    {
        // Do nothing
    }

    /// <summary>Returns the text collected from primary header stories.</summary>
    /// <returns>The collected text, or an empty string if none was captured.</returns>
    public string GetExtractedHeader()
    {
        return primaryHeader.ToString();
    }

    /// <summary>Returns the text collected from primary footer stories.</summary>
    /// <returns>The collected text, or an empty string if none was captured.</returns>
    public string GetExtractedFooter()
    {
        return primaryFooter.ToString();
    }

    /// <summary>Returns the text collected from main text stories.</summary>
    /// <returns>The collected text, or an empty string if none was captured.</returns>
    public string GetMainText()
    {
        return mainText.ToString();
    }
}