We're sorry, Aspose doesn't work properly without JavaScript enabled.

Free Support Forum - aspose.com

I want to create a searchable PDF from a PDF file that contains only images, using Tesseract OCR to recognize the text. However, I currently cannot place the recognized text in the correct position.

Here is my test code in c# and test pdf file.

    /// <summary>
    /// Works on a copy of the test PDF: exports every image placement on every page
    /// as a PNG, runs OCR on it, passes the recognized text to
    /// <see cref="BuildSearchablePdf"/>, and finally saves the modified document as
    /// both PDF and DOC.
    /// </summary>
    public static void GetPDFElement()
    {
        string num = "12";
        string datadir = @"D:\project\OCRBaseDev\trunk\OCR_source\test-data\";
        string tempPath = datadir + num + @"_pdf\";
        string workingCopy = datadir + num + @"_bak.pdf";

        // CreateDirectory does nothing when the directory already exists.
        Directory.CreateDirectory(tempPath);

        // Overwrite any working copy left behind by a previous run.
        File.Copy(datadir + num + ".pdf", workingCopy, true);

        // Dispose the document even if OCR or saving throws part-way through.
        using (Document document = new Document(workingCopy))
        {
            // extract all pages
            foreach (Page page in document.Pages)
            {
                Console.WriteLine("--------------------------------");
                ImagePlacementAbsorber abs = new ImagePlacementAbsorber();
                page.Accept(abs);
                // Get the count of images over specific page
                Console.WriteLine("Total Images = {0} over page number {1}", abs.ImagePlacements.Count, page.Number);

                foreach (ImagePlacement ia in abs.ImagePlacements)
                {
                    Aspose.Pdf.Rectangle rc = ia.Rectangle;
                    Aspose.Pdf.Devices.Resolution res = ia.Resolution;

                    Console.WriteLine("Matrix:" + ia.Matrix);
                    Console.WriteLine("Rotation:" + ia.Rotation);
                    Console.WriteLine("Resolution:" + ia.Resolution);
                    Console.WriteLine("Rectangle:" + ia.Rectangle);

                    // The file name encodes page number, colour type, placement rectangle and resolution.
                    string name = num + "_" + page.Number + "_" + ia.Image.GetColorType() + "(" + rc.LLX + "," + rc.LLY + "," + rc.Width + "," + rc.Height + ")_(" + res.X + "," + res.Y + ").png";
                    string imagePath = tempPath + name;

                    SavePlacementAsPng(ia, imagePath);

                    // ocr
                    string text = GetOCRText(imagePath, tempPath);
                    BuildSearchablePdf(page, rc, text);
                }
            }

            Console.WriteLine("Save Pdf Start");
            document.Save(datadir + num + "_bak_test.pdf");
            Console.WriteLine("Save Pdf End");

            Aspose.Pdf.DocSaveOptions docSaveOption = new DocSaveOptions();
            docSaveOption.CustomProgressHandler = new DocSaveOptions.ConversionProgressEventHandler(ShowProgressOnConsole);
            //docSaveOption.Mode = DocSaveOptions.RecognitionMode.Flow;
            Console.WriteLine("Save Doc Start");
            document.Save(datadir + num + "_bak_test.doc", docSaveOption);
            Console.WriteLine("Save Doc End");
        }
    }

    /// <summary>
    /// Writes the image of <paramref name="placement"/> to <paramref name="pngPath"/> as PNG,
    /// rotating the bitmap by 90, 180 or 270 degrees when the placement reports that rotation.
    /// </summary>
    /// <param name="placement">Image placement whose image is exported.</param>
    /// <param name="pngPath">Destination file; created or truncated.</param>
    private static void SavePlacementAsPng(ImagePlacement placement, string pngPath)
    {
        // 'using' guarantees the file handle is released before tesseract opens the file,
        // even when saving throws.
        using (FileStream outputImage = new FileStream(pngPath, FileMode.Create))
        {
            int rot = (int)placement.Rotation;
            if (rot == 0)
            {
                placement.Image.Save(outputImage, System.Drawing.Imaging.ImageFormat.Png);
                return;
            }

            // The backing stream has to stay open for as long as the decoded image is in use,
            // so the bitmap is disposed before the stream.
            using (MemoryStream ms = new MemoryStream())
            {
                placement.Image.Save(ms);
                // Rewind so decoding starts at the first byte rather than at the end of the written data.
                ms.Position = 0;

                using (System.Drawing.Image img = System.Drawing.Image.FromStream(ms))
                {
                    switch (rot)
                    {
                        case 90:
                            img.RotateFlip(RotateFlipType.Rotate90FlipNone);
                            break;
                        case 180:
                            img.RotateFlip(RotateFlipType.Rotate180FlipNone);
                            break;
                        case 270:
                            img.RotateFlip(RotateFlipType.Rotate270FlipNone);
                            break;
                        default:
                            // Any other reported rotation is exported unrotated.
                            break;
                    }

                    img.Save(outputImage, System.Drawing.Imaging.ImageFormat.Png);
                }
            }
        }
    }

    /// <summary>
    /// Runs tesseract.exe on an image and returns the recognized text.
    /// </summary>
    /// <param name="imgPath">Path of the image file to recognize.</param>
    /// <param name="tempPath">
    /// Directory prefix (including the trailing separator) under which tesseract
    /// writes its output as <c>res.txt</c>.
    /// </param>
    /// <returns>The full contents of the generated <c>res.txt</c>.</returns>
    /// <exception cref="FileNotFoundException">
    /// Thrown when no <c>res.txt</c> exists after the process has exited.
    /// </exception>
    public static string GetOCRText(string imgPath, string tempPath)
    {
        string outputBase = tempPath + "res";
        string outputFile = outputBase + ".txt";

        // Remove the result of any previous run so that a failed recognition cannot
        // silently return the text of an earlier image.
        if (File.Exists(outputFile))
        {
            File.Delete(outputFile);
        }

        ProcessStartInfo info = new ProcessStartInfo(@"D:\project\OCRBaseDev\trunk\OCR_source\tools\tesseract\tesseract.exe");
        info.WindowStyle = ProcessWindowStyle.Hidden;
        // Quote both paths so that directories or file names containing spaces are
        // passed as single arguments. Neither value ends in a backslash, so the
        // closing quote is not escaped.
        info.Arguments = "\"" + imgPath + "\" \"" + outputBase + "\" -l chi_sim";

        using (Process p = new Process())
        {
            p.StartInfo = info;
            p.Start();
            p.WaitForExit();
        }

        return File.ReadAllText(outputFile);
    }

    /// <summary>
    /// Replaces the region <paramref name="imgRect"/> on <paramref name="page"/> with a
    /// redaction that carries the OCR text, then adds the same text to the page inside
    /// a floating box.
    /// </summary>
    /// <param name="page">Page that receives the redaction and the text box.</param>
    /// <param name="imgRect">Rectangle of the image the text was recognized from.</param>
    /// <param name="ocrText">Text recognized for that image.</param>
    public static void BuildSearchablePdf(Aspose.Pdf.Page page, Aspose.Pdf.Rectangle imgRect, string ocrText)
    {
        // Echo the recognized text for debugging.
        Console.WriteLine(ocrText);

        // Redaction over the image region, showing the OCR text as its overlay.
        var redaction = new Aspose.Pdf.Annotations.RedactionAnnotation(page, imgRect)
        {
            FillColor = Aspose.Pdf.Color.White,
            BorderColor = Aspose.Pdf.Color.Yellow,
            Color = Aspose.Pdf.Color.Black,
            OverlayText = ocrText
        };

        // Register the annotation on the given page, then apply it: this flattens the
        // annotation and removes the text and image underneath it.
        page.Annotations.Add(redaction);
        redaction.Redact();

        // Left-aligned text fragment wrapped in a floating box.
        var textBox = new FloatingBox();
        var textFragment = new Aspose.Pdf.Text.TextFragment(ocrText);
        textFragment.TextState.HorizontalAlignment = HorizontalAlignment.Left;
        textBox.Paragraphs.Add(textFragment);

        // Vertical offset: distance from the top edge of the page to the top edge of the
        // image rectangle, less the page's top margin.
        textBox.Top = page.Rect.URY - imgRect.URY - page.PageInfo.Margin.Top;

        // NOTE(review): this changes the left margin of the page itself, not just of this
        // box, so every call overwrites the value set by the previous one.
        page.PageInfo.Margin.Left = imgRect.LLX;
        page.Paragraphs.Add(textBox);
    }

    /// <summary>
    /// Conversion progress callback: prints one time-stamped console line for each
    /// recognized progress event type and ignores every other event type.
    /// </summary>
    /// <param name="eventInfo">Progress event supplied by the converter.</param>
    public static void ShowProgressOnConsole(MobiXmlSaveOptions.ProgressEventHandlerInfo eventInfo)
    {
        string timestamp = DateTime.Now.ToLongTimeString();
        string message = null;

        switch (eventInfo.EventType)
        {
            case DocSaveOptions.ProgressEventType.TotalProgress:
                message = String.Format("{0}  - Conversion progress : {1}% .", timestamp, eventInfo.Value.ToString());
                break;
            case DocSaveOptions.ProgressEventType.SourcePageAnalized:
                message = String.Format("{0}  - Source page {1} of {2} analyzed.", timestamp, eventInfo.Value.ToString(), eventInfo.MaxValue.ToString());
                break;
            case DocSaveOptions.ProgressEventType.ResultPageCreated:
                message = String.Format("{0}  - Result page's {1} of {2} layout created.", timestamp, eventInfo.Value.ToString(), eventInfo.MaxValue.ToString());
                break;
            case DocSaveOptions.ProgressEventType.ResultPageSaved:
                message = String.Format("{0}  - Result page {1} of {2} exported.", timestamp, eventInfo.Value.ToString(), eventInfo.MaxValue.ToString());
                break;
        }

        // Unrecognized event types leave the message unset and print nothing.
        if (message != null)
        {
            Console.WriteLine(message);
        }
    }

12.pdf (1.5 MB)

hope to get help!

@chuckwilson

Thank you for contacting support.

Would you please elaborate a little more on what you mean by putting the text in the right place? Does the text in your generated PDF have alignment or position-related issues? Moreover, the shared code snippet accesses a res.txt file, for which a FileNotFoundException is thrown. Kindly share the generated file with us so that we may proceed to help you out.

Res.txt is the file in which tesseract.exe saves the recognized text. I do not know how to map the recognized text from image space to the correct position in user space. For the image on each page of the PDF, ImagePlacement.Rotation=270 and Page.Rotation=90. I suspect that these rotations cause the recognized text to be placed at the wrong coordinates.
Here is the target PDF file. The coordinate origin of the text does not match the coordinate origin of the picture.
12_bak_test.pdf (2.4 MB)

@chuckwilson

Thank you for elaborating it.

We have been able to notice the issue. A ticket with ID PDFNET-45818 has been logged in our issue management system for further investigation and resolution. The ticket ID has been linked with this thread so that you will receive notification as soon as the ticket is resolved.

We are sorry for the inconvenience.