We're sorry, Aspose doesn't work properly without JavaScript enabled.

Free Support Forum - aspose.com

Extract table from PDF is disturbed

I am using the following code to extract a table, but the output does not match the table shown in the PDF.

report.pdf (49.0 KB)

    /// <summary>
    /// Loads the report PDF, runs a <c>TableAbsorber</c> over every page and writes
    /// each absorbed table to the console: a "Table" header line, then one console
    /// line per row with a "|" after the text of every fragment.
    /// </summary>
    /// <remarks>
    /// The method blocks on <c>Console.ReadLine()</c> after each table so the output
    /// can be inspected before the next table is printed.
    /// </remarks>
    public static void Extract_Table()
    {
        // Load source PDF document
        var filePath = @"c:\users\raja_c\downloads\report.pdf";

        // Document is disposable; the using block releases it once extraction is
        // finished, even if an exception is thrown while absorbing tables.
        using (Aspose.Pdf.Document pdfDocument = new Aspose.Pdf.Document(filePath))
        {
            foreach (var page in pdfDocument.Pages)
            {
                // A fresh absorber per page, so TableList only holds this page's tables.
                Aspose.Pdf.Text.TableAbsorber absorber = new Aspose.Pdf.Text.TableAbsorber();
                absorber.Visit(page);

                foreach (AbsorbedTable table in absorber.TableList)
                {
                    Console.WriteLine("Table");
                    foreach (AbsorbedRow row in table.RowList)
                    {
                        foreach (AbsorbedCell cell in row.CellList)
                        {
                            foreach (TextFragment fragment in cell.TextFragments)
                            {
                                // Concatenate the fragment's segments into one string.
                                var sb = new StringBuilder();
                                foreach (TextSegment seg in fragment.Segments)
                                {
                                    sb.Append(seg.Text);
                                }

                                Console.Write($"{sb}|");
                            }
                        }

                        // End of row.
                        Console.WriteLine();
                    }

                    // Pause after each table until the user presses Enter.
                    Console.ReadLine();
                }
            }
        }
    }

@crshekharam

Can you please explain it a little more by comparing the current output and the expected output with snapshots?

@mudassir.fayyaz
I changed the code to build the table in HTML format in the variable sb.

The screenshots below show the generated HTML and the expected result.
Resulted Output
image.png (59.4 KB)

Expected Output
image.png (54.0 KB)

Resulted HTML files
table html.zip (2.9 KB)

    /// <summary>
    /// Loads the report PDF, runs a <c>TableAbsorber</c> over every page and renders
    /// each absorbed table as an HTML <c>&lt;table&gt;</c> (one <c>&lt;tr&gt;</c> per
    /// row, one <c>&lt;td&gt;</c> per cell). The accumulated HTML for all tables is
    /// written to the console, after which the method waits for Enter.
    /// </summary>
    public static void Extract_Table()
    {
        // Load source PDF document
        var filePath = @"c:\users\raja_c\downloads\report.pdf";
        var sb = new StringBuilder();

        // Document is disposable; the using block releases it once extraction is
        // finished, even if an exception is thrown while absorbing tables.
        using (Aspose.Pdf.Document pdfDocument = new Aspose.Pdf.Document(filePath))
        {
            foreach (var page in pdfDocument.Pages)
            {
                // A fresh absorber per page, so TableList only holds this page's tables.
                Aspose.Pdf.Text.TableAbsorber absorber = new Aspose.Pdf.Text.TableAbsorber();
                absorber.Visit(page);

                foreach (AbsorbedTable table in absorber.TableList)
                {
                    sb.Append("<Table border=1>");
                    foreach (AbsorbedRow row in table.RowList)
                    {
                        sb.Append("<tr>");
                        foreach (AbsorbedCell cell in row.CellList)
                        {
                            sb.Append("<td>");
                            foreach (TextFragment fragment in cell.TextFragments)
                            {
                                foreach (TextSegment seg in fragment.Segments)
                                {
                                    // Cell text comes straight from the PDF; encode it so
                                    // characters such as '<', '>' and '&' cannot break the markup.
                                    sb.Append(System.Net.WebUtility.HtmlEncode(seg.Text));
                                }
                            }
                            sb.Append("</td>");
                        }
                        sb.Append("</tr>");
                    }
                    sb.Append("</table>");
                }
            }
        }

        // Emit the generated HTML; previously it was built and then discarded.
        Console.WriteLine(sb.ToString());
        Console.ReadLine();
    }

Please find the file used as input for this program: report.pdf (49.0 KB)

@crshekharam

A ticket with ID PDFNET-49964 has been created in our issue tracking system to further investigate the issue on our end. This thread has been linked with the issue so that you may be notified once the issue is fixed.