CharactersWithSpaces always returns 0


#1

I’m trying to get an accurate count of the number of characters in a document, including all white space. Document.BuiltInDocumentProperties.Characters returns the number of characters (not including spaces), as expected. However, when I try to use Document.BuiltInDocumentProperties.CharactersWithSpaces, it returns 0.

// Opens the document at the given path and returns the CharactersWithSpaces
// value exposed by its built-in document properties.
public static int GetCharacterCountFromFile(string path)
{
    return new Aspose.Word.Document(path).BuiltInDocumentProperties.CharactersWithSpaces;
}

How can I fix this? Or, is there a better way to get the character count for a file? I really need to be able to determine whether to include tabs, spaces, and newlines (independently), and whether to include headers and footers in my counts.

Thanks in advance.


#2

Accessing a document property does not actually evaluate anything; it just returns the value stored in the document. Your code only opens a document and retrieves the property, so the value you get is whatever was saved in the file.

The Document class provides the UpdateWordCount method that recalculates Characters, Words and Paragraphs properties. You can call this to make sure the counts are up to date with the document contents. Unfortunately it does not yet update CharactersWithSpaces because this property was only added recently, but we will implement this soon.

If you want a custom count of words (or of anything else), just enumerate through the document yourself, for example by using a DocumentVisitor.

For the sake of example I include WordCounter.cs from Aspose.Word source code that you can use as a baseline for your custom counter:

/// <summary>
/// Updates word count properties of the document.
/// </summary>
/// <remarks>
/// <para>
/// UpdateWordCount recalculates and updates Characters, Words and Paragraphs
/// properties in the BuiltInDocumentProperties collection of the Document.
/// </para>
/// <para>
/// Note that UpdateWordCount does not update number of lines and pages properties.
/// </para>
/// </remarks>
public void UpdateWordCount()
{
    // The counter is a DocumentVisitor; it writes the totals back into the
    // document's built-in properties when the traversal reaches the document end.
    WordCounter counter = new WordCounter();
    this.Accept(counter);
}

------------------------------------

using System;

namespace Aspose.Word
{
    /// <summary>
    /// Calculates total number of characters, words and paragraphs in the document and
    /// updates the appropriate properties.
    /// </summary>
    /// <remarks>
    /// Counting is suspended ("locked") while the visitor is inside a header/footer,
    /// a footnote, a shape, or the part of a field between its start and its separator.
    /// Locks nest: text is counted only while the lock count is zero.
    /// </remarks>
    internal class WordCounter : DocumentVisitor
    {
        /// <summary>
        /// Creates the list number generator used to measure list labels.
        /// </summary>
        public override VisitorAction VisitDocumentStart(Document doc)
        {
            mListNumberGenerator = new ListNumberGenerator(doc);
            return VisitorAction.Continue;
        }

        /// <summary>
        /// Closes any word still in progress and stores the Characters, Words and
        /// Paragraphs totals in the document's built-in properties.
        /// </summary>
        public override VisitorAction VisitDocumentEnd(Document doc)
        {
            AddWord();

            doc.BuiltInDocumentProperties.Characters = mCharCount;
            doc.BuiltInDocumentProperties.Words = mWordCount;
            doc.BuiltInDocumentProperties.Paragraphs = mParaCount;

            return VisitorAction.Stop;
        }

        /// <summary>
        /// Suspends counting for the content of a header or footer.
        /// </summary>
        public override VisitorAction VisitHeaderFooterStart(HeaderFooter headerFooter)
        {
            Lock();
            return VisitorAction.Continue;
        }

        /// <summary>
        /// Releases the lock taken when the header or footer started.
        /// </summary>
        public override VisitorAction VisitHeaderFooterEnd(HeaderFooter headerFooter)
        {
            Unlock();
            return VisitorAction.Continue;
        }

        /// <summary>
        /// Suspends counting for the content of a footnote.
        /// </summary>
        public override VisitorAction VisitFootnoteStart(Footnote footnote)
        {
            Lock();
            return VisitorAction.Continue;
        }

        /// <summary>
        /// Releases the lock taken when the footnote started.
        /// </summary>
        public override VisitorAction VisitFootnoteEnd(Footnote footnote)
        {
            Unlock();
            return VisitorAction.Continue;
        }

        /// <summary>
        /// Suspends counting for the content of a shape.
        /// </summary>
        public override VisitorAction VisitShapeStart(Shape shape)
        {
            Lock();
            return VisitorAction.Continue;
        }

        /// <summary>
        /// Releases the lock taken when the shape started.
        /// </summary>
        public override VisitorAction VisitShapeEnd(Shape shape)
        {
            Unlock();
            return VisitorAction.Continue;
        }

        /// <summary>
        /// Suspends counting at the start of a field, so that whatever lies between
        /// the field start and the field separator is not counted.
        /// </summary>
        public override VisitorAction VisitFieldStart(FieldStart fieldStart)
        {
            Lock();
            return VisitorAction.Continue;
        }

        /// <summary>
        /// Resumes counting at the field separator, so that whatever follows the
        /// separator is counted again.
        /// </summary>
        public override VisitorAction VisitFieldSeparator(FieldSeparator fieldSeparator)
        {
            Unlock();
            return VisitorAction.Continue;
        }

        /// <summary>
        /// Releases the field lock if the field had no separator to release it earlier.
        /// </summary>
        public override VisitorAction VisitFieldEnd(FieldEnd fieldEnd)
        {
            if (!fieldEnd.HasSeparator)
            {
                Unlock();
            }

            return VisitorAction.Continue;
        }

        /// <summary>
        /// Accounts for the list label of the paragraph, unless counting is suspended.
        /// </summary>
        public override VisitorAction VisitParagraphStart(Paragraph paragraph)
        {
            if (!IsLocked)
            {
                AddListLabel(paragraph);
            }

            return VisitorAction.Continue;
        }

        /// <summary>
        /// Closes the current word and the current paragraph, unless counting is suspended.
        /// </summary>
        public override VisitorAction VisitParagraphEnd(Paragraph paragraph)
        {
            if (!IsLocked)
            {
                //End of paragraph is always end of a word.
                AddWord();
                AddPara();
            }

            return VisitorAction.Continue;
        }

        /// <summary>
        /// Closes the current word at the end of a section, unless counting is suspended.
        /// </summary>
        public override VisitorAction VisitSectionEnd(Section section)
        {
            //End of section is always end of a word, but I don't think this should be done
            //here because a paragraph must have just ended before end of section.
            //Maybe should call end of paragraphs as well in this case?
            if (!IsLocked)
            {
                AddWord();
            }

            return VisitorAction.Continue;
        }

        /// <summary>
        /// Counts a default text input character as one character that belongs to a word.
        /// </summary>
        /// <remarks>
        /// Unlike <see cref="VisitRun"/>, this method does not check <see cref="IsLocked"/>.
        /// </remarks>
        public override VisitorAction VisitSpecialChar(SpecialChar specialChar)
        {
            // Add default text input chars
            if (specialChar.GetText() == ControlChar.DefaultTextInputChar.ToString())
            {
                AddChar();
                mIsInsideWord = true;
            }

            return VisitorAction.Continue;
        }

        /// <summary>
        /// Counts the characters and words of a run of text, unless counting is suspended.
        /// White space ends the current word and is not counted as a character.
        /// </summary>
        public override VisitorAction VisitRun(Run run)
        {
            if (!IsLocked)
            {
                for (int i = 0; i < run.Text.Length; i++)
                {
                    char c = run.Text[i];

                    if (Char.IsWhiteSpace(c))
                    {
                        AddWord();
                    }
                    else if (IsSpecialSymbol(c))
                    {
                        // The symbol is counted as a character and also ends the
                        // word in progress, but never starts a word of its own.
                        AddChar();
                        AddWord();
                    }
                    else
                    {
                        AddChar();
                        mIsInsideWord = true;
                        mIsNonEmptyPara = true;
                    }
                }
            }

            return VisitorAction.Continue;
        }

        /// <summary>
        /// Increments the character total.
        /// </summary>
        private void AddChar()
        {
            mCharCount++;
        }

        /// <summary>
        /// Increments the word total if a word is in progress, then marks the word as finished.
        /// </summary>
        private void AddWord()
        {
            if (mIsInsideWord)
            {
                mWordCount++;
                mIsInsideWord = false;
            }
        }

        /// <summary>
        /// Increments the paragraph total if the current paragraph is non-empty,
        /// then resets the non-empty flag for the next paragraph.
        /// </summary>
        private void AddPara()
        {
            if (mIsNonEmptyPara)
            {
                mParaCount++;
                mIsNonEmptyPara = false;
            }
        }

        /// <summary>
        /// Adds one level of lock; counting is suspended while any lock is held.
        /// </summary>
        private void Lock()
        {
            mLockCount++;
        }

        /// <summary>
        /// Removes one level of lock.
        /// </summary>
        private void Unlock()
        {
            mLockCount--;
        }

        /// <summary>
        /// Adds length of the list label if the current paragraph is a list item.
        /// </summary>
        /// <param name="paragraph">The paragraph whose list label is measured.</param>
        private void AddListLabel(Paragraph paragraph)
        {
            int labelLength = mListNumberGenerator.UpdateAndCreateLabel(paragraph).Length;

            if (labelLength > 0)
            {
                // A non-empty label contributes its characters plus one word.
                mCharCount += labelLength;
                mWordCount++;
            }
        }

        /// <summary>
        /// I've noticed that Word doesn't consider some chars as word separators but however
        /// counts them. Currently these are em dash and en dash.
        /// </summary>
        /// <param name="c">The character to test.</param>
        /// <returns>True if the character is one of the special symbols.</returns>
        private static bool IsSpecialSymbol(char c)
        {
            for (int i = 0; i < gSpecialSymbols.Length; i++)
            {
                if (c == gSpecialSymbols[i])
                {
                    return true;
                }
            }

            return false;
        }

        /// <summary>
        /// True while at least one lock is held, i.e. while counting is suspended.
        /// </summary>
        private bool IsLocked
        {
            get { return (mLockCount > 0); }
        }

        private int mCharCount;

        private int mWordCount;

        private int mParaCount;

        private int mLockCount;

        /// <summary>
        /// True if current char belongs to a word.
        /// </summary>
        private bool mIsInsideWord;

        /// <summary>
        /// True if current para contains some chars other than white space.
        /// </summary>
        private bool mIsNonEmptyPara;

        private ListNumberGenerator mListNumberGenerator;

        private static readonly char[] gSpecialSymbols = { '–', '—' };
    }
}


#3

How do I gain access to ListNumberGenerator?


#4

Sorry, that is a bit of code I don’t want to publish. Just cut it out from the example.