Here is my test code in C# and the test PDF file.
/// <summary>
/// Copies the test PDF to a working file, exports every image placement on every
/// page as a PNG (rotating the bitmap when the placement reports a non-zero
/// rotation), passes each PNG to <c>GetOCRText</c>, hands the recognized text to
/// <c>BuildSearchablePdf</c>, and finally saves the modified document as both
/// PDF and DOC.
/// </summary>
public static void GetPDFElement()
{
    string num = "12";
    string datadir = @"D:\project\OCRBaseDev\trunk\OCR_source\test-data\";
    string tempPath = datadir + num + @"_pdf\";
    string backupPdf = datadir + num + "_bak.pdf";

    // CreateDirectory does nothing when the directory already exists.
    Directory.CreateDirectory(tempPath);

    // Work on a copy so the original PDF is never modified; overwrite any
    // working copy left behind by a previous run.
    File.Copy(datadir + num + ".pdf", backupPdf, true);

    // Dispose the document even when extraction, OCR or saving throws.
    using (Document document = new Document(backupPdf))
    {
        // extract all pages
        foreach (Page page in document.Pages)
        {
            Console.WriteLine("--------------------------------");
            ImagePlacementAbsorber abs = new ImagePlacementAbsorber();
            page.Accept(abs);
            // Get the count of images over specific page
            Console.WriteLine("Total Images = {0} over page number {1}", abs.ImagePlacements.Count, page.Number);

            foreach (ImagePlacement ia in abs.ImagePlacements)
            {
                Aspose.Pdf.Rectangle rc = ia.Rectangle;
                Aspose.Pdf.Devices.Resolution res = ia.Resolution;
                Console.WriteLine("Matrix:" + ia.Matrix);
                Console.WriteLine("Rotation:" + ia.Rotation);
                Console.WriteLine("Resolution:" + ia.Resolution);
                Console.WriteLine("Rectangle:" + ia.Rectangle);

                // The file name encodes page number, colour type, placement rectangle and resolution.
                string name = num + "_" + page.Number + "_" + ia.Image.GetColorType() + "(" + rc.LLX + "," + rc.LLY + "," + rc.Width + "," + rc.Height + ")_(" + res.X + "," + res.Y + ").png";
                string imagePath = tempPath + name;

                // The using block guarantees the PNG is flushed and closed before
                // the OCR step opens it, including when saving throws.
                using (FileStream outputImage = new FileStream(imagePath, FileMode.Create))
                {
                    int rot = (int)ia.Rotation;
                    if (rot != 0)
                    {
                        // The backing stream must stay open for as long as the
                        // System.Drawing.Image created from it is in use.
                        using (MemoryStream ms = new MemoryStream())
                        {
                            ia.Image.Save(ms);
                            // Rewind so decoding starts at the first byte of the saved image.
                            ms.Position = 0;
                            using (System.Drawing.Image img = System.Drawing.Image.FromStream(ms))
                            {
                                // NOTE(review): only the bitmap is rotated; rc is passed on
                                // unrotated - confirm that is what BuildSearchablePdf expects.
                                switch (rot)
                                {
                                    case 90:
                                        img.RotateFlip(RotateFlipType.Rotate90FlipNone);
                                        break;
                                    case 180:
                                        img.RotateFlip(RotateFlipType.Rotate180FlipNone);
                                        break;
                                    case 270:
                                        img.RotateFlip(RotateFlipType.Rotate270FlipNone);
                                        break;
                                    default:
                                        // Any other angle is saved without rotation.
                                        break;
                                }
                                img.Save(outputImage, System.Drawing.Imaging.ImageFormat.Png);
                            }
                        }
                    }
                    else
                    {
                        ia.Image.Save(outputImage, System.Drawing.Imaging.ImageFormat.Png);
                    }
                }

                // ocr
                string text = GetOCRText(imagePath, tempPath);
                BuildSearchablePdf(page, rc, text);
            }
        }

        Console.WriteLine("Save Pdf Start");
        document.Save(datadir + num + "_bak_test.pdf");
        Console.WriteLine("Save Pdf End");

        Aspose.Pdf.DocSaveOptions docSaveOption = new DocSaveOptions();
        docSaveOption.CustomProgressHandler = new DocSaveOptions.ConversionProgressEventHandler(ShowProgressOnConsole);
        //docSaveOption.Mode = DocSaveOptions.RecognitionMode.Flow;
        Console.WriteLine("Save Doc Start");
        document.Save(datadir + num + "_bak_test.doc", docSaveOption);
        Console.WriteLine("Save Doc End");
    }
}
/// <summary>
/// Runs the Tesseract command-line tool on an image and returns the text it
/// wrote to <c>res.txt</c> inside <paramref name="tempPath"/>.
/// </summary>
/// <param name="imgPath">Full path of the image file passed to Tesseract.</param>
/// <param name="tempPath">
/// Directory prefix (with trailing separator) used for the output base name "res".
/// </param>
/// <returns>The full contents of the generated <c>res.txt</c> file.</returns>
public static string GetOCRText(string imgPath, string tempPath)
{
    string resultPath = tempPath + "res.txt";

    // Remove any result left by a previous call; otherwise a failed run would
    // silently return the text recognized for an earlier image.
    if (File.Exists(resultPath))
    {
        File.Delete(resultPath);
    }

    ProcessStartInfo info = new ProcessStartInfo(@"D:\project\OCRBaseDev\trunk\OCR_source\tools\tesseract\tesseract.exe");
    info.WindowStyle = ProcessWindowStyle.Hidden;
    // Quote both paths so directories or file names containing spaces are
    // passed to Tesseract as single arguments.
    info.Arguments = "\"" + imgPath + "\" \"" + tempPath + "res\" -l chi_sim";

    // Dispose the process handle once the tool has exited.
    using (Process p = new Process())
    {
        p.StartInfo = info;
        p.Start();
        p.WaitForExit();
    }

    // Throws if Tesseract produced no output file, as the original reader did.
    return File.ReadAllText(resultPath);
}
/// <summary>
/// Prints the OCR text, redacts the image region on the page with a white-filled
/// redaction annotation carrying the text as overlay, and then adds a floating
/// box containing the same text to the page.
/// </summary>
/// <param name="page">Page that receives the annotation and the floating box.</param>
/// <param name="imgRect">Rectangle of the image region to redact.</param>
/// <param name="ocrText">Recognized text used as overlay text and as box content.</param>
public static void BuildSearchablePdf(Aspose.Pdf.Page page, Aspose.Pdf.Rectangle imgRect, string ocrText)
{
    // Echo the recognized text for inspection.
    Console.WriteLine(ocrText);

    // Redaction annotation covering the image rectangle, with the OCR text as overlay.
    Aspose.Pdf.Annotations.RedactionAnnotation redaction =
        new Aspose.Pdf.Annotations.RedactionAnnotation(page, imgRect)
        {
            FillColor = Aspose.Pdf.Color.White,
            BorderColor = Aspose.Pdf.Color.Yellow,
            Color = Aspose.Pdf.Color.Black,
            OverlayText = ocrText
        };

    // Register the annotation on the page, then apply the redaction.
    page.Annotations.Add(redaction);
    redaction.Redact();

    // Floating box holding the same text as a left-aligned fragment.
    FloatingBox textBox = new FloatingBox();
    Aspose.Pdf.Text.TextFragment ocrFragment = new Aspose.Pdf.Text.TextFragment(ocrText);
    ocrFragment.TextState.HorizontalAlignment = HorizontalAlignment.Left;
    textBox.Paragraphs.Add(ocrFragment);

    // Box top = page upper edge minus rectangle upper edge minus the page's top margin.
    textBox.Top = page.Rect.URY - imgRect.URY - page.PageInfo.Margin.Top;

    // The page's left margin is set to the rectangle's left edge.
    page.PageInfo.Margin.Left = imgRect.LLX;
    page.Paragraphs.Add(textBox);
}
/// <summary>
/// Conversion progress callback: writes one timestamped console line for each
/// recognized progress event type and ignores every other event type.
/// </summary>
/// <param name="eventInfo">Progress event supplying the event type, current value and maximum value.</param>
public static void ShowProgressOnConsole(MobiXmlSaveOptions.ProgressEventHandlerInfo eventInfo)
{
    // Every message starts with the current local time in long-time format.
    string timestamp = DateTime.Now.ToLongTimeString();
    string current = eventInfo.Value.ToString();

    switch (eventInfo.EventType)
    {
        case DocSaveOptions.ProgressEventType.TotalProgress:
            Console.WriteLine("{0} - Conversion progress : {1}% .", timestamp, current);
            break;
        case DocSaveOptions.ProgressEventType.SourcePageAnalized:
            Console.WriteLine("{0} - Source page {1} of {2} analyzed.", timestamp, current, eventInfo.MaxValue.ToString());
            break;
        case DocSaveOptions.ProgressEventType.ResultPageCreated:
            Console.WriteLine("{0} - Result page's {1} of {2} layout created.", timestamp, current, eventInfo.MaxValue.ToString());
            break;
        case DocSaveOptions.ProgressEventType.ResultPageSaved:
            Console.WriteLine("{0} - Result page {1} of {2} exported.", timestamp, current, eventInfo.MaxValue.ToString());
            break;
        default:
            // Other event types produce no output.
            break;
    }
}
12.pdf (1.5 MB)
I hope to get some help!