Hi,
We are using an Aspose.PDF page-level extraction method, but the method shown below extracts the text as gibberish. Please look into this as a priority issue.
/// <summary>
/// Builds a <see cref="PageContent"/> for a single PDF page: records the page
/// number, rectangle size, lower-left position and rotation, then collects the
/// page's paragraphs and the cell areas of every table that has text.
/// </summary>
/// <param name="page">The Aspose page to read. Its <c>Rotate</c> property is
/// set to <c>Rotation.None</c> by this method and is not restored.</param>
/// <returns>The populated page content.</returns>
public PageContent Extract(Aspose.Pdf.Page page)
{
    // NOTE(review): the 'false' argument presumably means "ignore rotation" - confirm against the Aspose API.
    var bounds = page.GetPageRect(false);
    var pageSize = new Size(bounds.Width, bounds.Height);
    var origin = new Position(bounds.LLX, bounds.LLY);

    // Capture the original rotation before it is cleared below.
    var originalRotation = (int)page.Rotate;
    var content = new PageContent(page.Number, pageSize, origin, originalRotation);

    // Rotation is cleared before any absorber runs; presumably so extraction
    // works in unrotated coordinates - TODO confirm.
    page.Rotate = Rotation.None;

    // Tables are materialized once because they are used twice below.
    var extractedTables = _tableCellExtractor.ExtractTables(page).ToList();

    foreach (var textArea in _paragraphExtractor.ExtractParagraphs(page, extractedTables))
    {
        content.AddParagraph(textArea);
    }

    foreach (var table in extractedTables)
    {
        // Tables without text contribute no cells.
        if (!table.HasText)
        {
            continue;
        }

        foreach (var row in table.Rows)
        {
            foreach (var cell in row.Cells)
            {
                content.AddCell(cell.CellArea);
            }
        }
    }

    return content;
}
/// <summary>
/// Runs an Aspose <c>TableAbsorber</c> over the page and maps each absorbed
/// table to a <see cref="Table"/>.
/// </summary>
/// <param name="page">The page to scan for tables.</param>
/// <returns>
/// A lazily evaluated sequence: the absorber visits the page immediately, but
/// each table is converted only when the result is enumerated.
/// </returns>
public IEnumerable<Table> ExtractTables(Page page)
{
    var tableAbsorber = new Aspose.Pdf.Text.TableAbsorber();
    tableAbsorber.Visit(page);

    return from absorbedTable in tableAbsorber.TableList
           select ExtractTable(absorbedTable);
}
/// <summary>
/// Converts one absorbed Aspose table into a <see cref="Table"/>, carrying over
/// its rectangle and its rows.
/// </summary>
/// <param name="table">The absorbed table to convert.</param>
/// <returns>A new table with its rectangle and fully materialized row list.</returns>
private Table ExtractTable(AbsorbedTable table)
{
    var bounds = Rectangle.FromAsposeRectangle(table.Rectangle);

    // Rows are converted eagerly so the returned table does not hold a deferred query.
    var convertedRows = table.RowList
        .Select(absorbedRow => ExtractRow(absorbedRow))
        .ToList();

    return new Table
    {
        TableRectangle = bounds,
        Rows = convertedRows,
    };
}
/// <summary>
/// Runs an Aspose <c>ParagraphAbsorber</c> over the page and turns every
/// paragraph of every section in the first page markup into text areas.
/// </summary>
/// <param name="page">The page to scan for paragraphs.</param>
/// <param name="tables">Tables already extracted from the page; passed through
/// to the per-paragraph preparation step.</param>
/// <returns>
/// A lazily evaluated sequence of text areas; paragraphs are prepared only when
/// the result is enumerated.
/// </returns>
public IEnumerable<TextArea> ExtractParagraphs(Aspose.Pdf.Page page, List<Table> tables)
{
    var paragraphAbsorber = new ParagraphAbsorber();
    paragraphAbsorber.Visit(page);

    // Only the first markup entry is read.
    // NOTE(review): presumably exactly one entry exists after visiting a single page - confirm.
    return paragraphAbsorber.PageMarkups[0].Sections
        .SelectMany(section => section.Paragraphs)
        .SelectMany(paragraph => PrepareParagraph(paragraph, tables));
}
/// <summary>
/// Extracts the text areas of one markup paragraph and then applies
/// white-space handling to them.
/// </summary>
/// <param name="markupParagraph">The Aspose markup paragraph to process.</param>
/// <param name="tables">Tables extracted from the same page; forwarded to the
/// text-area extraction step.</param>
/// <returns>The materialized list of text areas after white-space handling.</returns>
private IEnumerable<TextArea> PrepareParagraph(MarkupParagraph markupParagraph, List<Table> tables)
{
    // Materialize first so ApplyWhiteSpaces and the caller see the same list instance.
    var areas = ExtractTextAreas(markupParagraph, tables).ToList();

    ApplyWhiteSpaces(markupParagraph, areas);

    return areas;
}
We have attached the sample document below.
SampleDocs_AsposeIssue.pdf (25.6 KB)