Hi,
In the attached PDF, I am unable to extract all of the text correctly, in particular the top-right block. When I use a TextAbsorber to get all text, the top-right block is extracted as shown below, while all other text seems correct. In Adobe Reader, text selection behaves a bit strangely in this area, but it still returns the correct text.
2>;D8@A;B8;B )>BE8@?8>
38@5 ,CF9:8658@B
-;78<:8;=<55> !
" " ).14+0/+. "
The code used:
Factuur 815.pdf (22.8 KB)
/// <summary>
/// Writes the text of a PDF to the console in two passes: first the text
/// collected by a <c>TextAbsorber</c> over all pages, then, page by page,
/// every fragment matched by the pattern <c>[\S]+</c>.
/// </summary>
/// <param name="fileName">Path of the PDF file to open.</param>
public void ExtractWords(string fileName)
{
    // Open the file once and dispose it when done. The original opened it
    // twice and never disposed the first Document instance.
    using (Document pdfDocument = new Document(fileName))
    {
        // Pass 1: full text of the document.
        TextAbsorber textAbsorber = new TextAbsorber();
        pdfDocument.Pages.Accept(textAbsorber);
        Console.WriteLine(textAbsorber.Text);

        // Pass 2: individual fragments, one page at a time.
        // Pages are addressed 1..Count (the original indexed with i + 1).
        int pageCount = pdfDocument.Pages.Count;
        for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++)
        {
            // NOTE(review): the 'true' argument presumably enables regular-expression
            // matching for the search phrase; confirm against the TextSearchOptions docs.
            var textFragmentAbsorber = new TextFragmentAbsorber(@"[\S]+", new TextSearchOptions(true));

            // Run the absorber against this single page only.
            pdfDocument.Pages[pageNumber].Accept(textFragmentAbsorber);

            // Print each matched fragment on its own line.
            foreach (TextFragment textFragment in textFragmentAbsorber.TextFragments)
            {
                Console.WriteLine(textFragment.Text);
            }
        }
    }
}