I have found some issues in how the code was implemented by
our offshore team that resulted in duplicated highlighted text. I have solved
that issue but I am running into another issue where I am not able to extract
just the highlighted text within a given line. Please see code below
to see implementation... I was not able to upload the console app
using Aspose.Pdf;
using Aspose.Pdf.Annotations;
using Aspose.Pdf.Text;
using System;
using System.Collections.Generic;
using System.Configuration;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace PDFParser
{
class Program
{
static void Main(string[] args)
{
Aspose.Pdf.License license = new Aspose.Pdf.License();
license.SetLicense("Aspose.Pdf.lic");
// Open document
string pdfPath = "20 pages 20 tags perf testing.pdf";
Document pdfDocument = new Document(pdfPath);
Console.WriteLine("Reading " + pdfPath);
foreach(Page page in pdfDocument.Pages)
{
foreach (Annotation annotation in page.Annotations)
{
if (annotation.AnnotationType == AnnotationType.Highlight)
{
HighlightAnnotation linkAnno = (HighlightAnnotation)annotation;
Aspose.Pdf.Rectangle rect = linkAnno.Rect;
TextAbsorber absorber = new TextAbsorber();
absorber.TextSearchOptions.LimitToPageBounds = true;
absorber.TextSearchOptions.Rectangle = rect;
page.Accept(absorber);
//this is not limiting to only highlighted text
//It is there a way to ignore words that are not highlighted?
//Please look at the first highlighted text in the sample pdf used here
//It should stop at "setting."
string extractedText = absorber.Text;
}
}
Console.WriteLine(".....");
}
}
}
}