Hi Team,
I am working on replacing a third-party component that was used in an application to OCR images and PDF files containing images. I used Aspose.OCR and Aspose.PDF in my POC application as the replacement for that third-party component. I could achieve the required output, but the problem is performance.
For just 162 PDF files, it takes 1 hr 15 mins.
Please find the code snippet:
/// <summary>
/// Renders every page of the PDF at <paramref name="pdfPath"/> to an in-memory JPEG,
/// runs Aspose.OCR on each rendered page, and writes the recognized text to the console.
/// </summary>
/// <param name="pdfPath">Path of the PDF file to load.</param>
public void ExtractImagesAndPerformOCR(string pdfPath)
{
    // Apply the Aspose.PDF license from the AsposePdfLicense string.
    // NOTE(review): this runs on every call; when processing many files in a batch,
    // consider applying the license once at application start-up instead.
    Aspose.Pdf.License license = new Aspose.Pdf.License();
    using (MemoryStream licenseStream = new MemoryStream(System.Text.Encoding.UTF8.GetBytes(AsposePdfLicense)))
    {
        license.SetLicense(licenseStream);
    }

    // Loop-invariant objects are created once instead of once per page.
    AsposeOcr ocrEngine = new AsposeOcr();
    RecognitionSettings recognitionSettings = new RecognitionSettings();
    Resolution resolution = new Resolution(150);
    JpegDevice jpegDevice = new JpegDevice(resolution, 75);

    // Dispose the document when done so its resources are released before the
    // next file in a batch is opened.
    using (Document pdfDocument = new Document(pdfPath))
    {
        // Read the page count once via the collection's Count property.
        int pageCount = pdfDocument.Pages.Count;
        Console.WriteLine($"Page: {pageCount} ");

        // Aspose.Pdf page indices are 1-based.
        for (int pageIndex = 1; pageIndex <= pageCount; pageIndex++)
        {
            Page page = pdfDocument.Pages[pageIndex];

            using (MemoryStream imageStream = new MemoryStream())
            {
                // Render the page to a JPEG held in memory.
                jpegDevice.Process(page, imageStream);

                // Rewind so the OCR engine reads the image from its first byte.
                imageStream.Position = 0;

                OcrInput input = new OcrInput(InputType.SingleImage, null);
                input.Add(imageStream);
                OcrOutput result = ocrEngine.Recognize(input, recognitionSettings);

                // Output the recognized text, if any was produced.
                if (result != null && result.Count > 0)
                {
                    Console.WriteLine($"Page {pageIndex} OCR Result:");
                    Console.WriteLine(result[0].RecognitionText);
                }
            }
        }
    }
}
Please suggest a way to improve the performance of the OCR.
Regards,
Ramya.B