Hi Asad,
Sorry to hear that there was a problem with the download; I will add the code here:
// Caller-side sample: opens one PDF through the Aspose_* wrapper functions, walks every
// page, and splits the text segments inside a page rectangle into alphanumeric words.
// NOTE(review): this snippet mixes C# syntax (class Program, string, static void Main)
// with C++ constructs (CString, static_cast, &out-arguments, stream <<, NULL). It looks
// like a condensed paste of native caller code and is not compilable C# as written.
public class Program
{
static void Main(string[] args)
{
// NOTE(review): dotNetobj is not referenced again anywhere in this snippet.
DotNetClass dotNetobj = new DotNetClass();
// Example file path and page index
string filePath = "C:\\PDF files\\PDF_SinglePage.pdf";
//int pageIndex = 0;
// No password is supplied (NULL) when opening the document.
ASPOSE_DOCUMENT documentHandle = Aspose_LoadDocument(filePath, NULL);
int pageCount = Aspose_GetPageCount(documentHandle);
//Console.WriteLine($"Page Count: {pageCount}");
// Pages are addressed with a 0-based index here.
for (int i = 0; i < pageCount; i++)
{
int localPageID = Aspose_LoadPage(documentHandle, i);
//// Example usage of the loaded page
// NOTE(review): pageWidth/pageHeight are read but not used later in this snippet.
double pageWidth = Aspose_GetPageWidth(localPageID);
double pageHeight = Aspose_GetPageHeight(localPageID);
// Load the text page using the page handle
bool resultLoadpage = AsposeText_LoadPage(localPageID);
if (!resultLoadpage)
{
RLLOG_ERROR("Failed to load text page");
// NOTE(review): Main is declared void, yet this returns a value — presumably left over
// from a bool-returning native function; confirm against the real caller.
return false;
}
double left, right, bottom, top;
left = right = bottom = top = 0;
// Rectangle selector passed to Aspose_GetRectangle; value 1 is what the caller requests.
int iRect = 1;
bool resultRect = Aspose_GetRectangle(localPageID, iRect, &left, &right, &bottom, &top);
if (resultRect)
{
int nSegCharStart, nSegCharsNum;
// Number of text segments lying inside the rectangle obtained above.
int nPageSegsNum = Aspose_CountBoundedSegments(localPageID, left, right, bottom, top);
for (int s = 0; s < nPageSegsNum; ++s)
{
// Note the argument order differs from the count call: (left, top, right, bottom).
bool resultSeg = Aspose_GetBoundedSegment(s, localPageID, left, top, right, bottom, &nSegCharStart, &nSegCharsNum);
if (resultSeg)
{
// Accumulates the current run of alphanumeric characters.
CString czWord;
int nWordStart = 0;
for (int j = 0; j < nSegCharsNum; j++)
{
unsigned int unicode = Aspose_GetUnicode(localPageID, nSegCharStart + j);
unsigned char c = 0;
// Code points above 255 do not fit in one byte and are replaced by a space,
// which also acts as a word separator below.
if (unicode <= 255)
{
c = static_cast<char>(unicode);
}
else
{
c = ' ';
}
// Extend the current word only with letters/digits.
// NOTE(review): the bound is `c <= 128`, which admits 128 itself (outside 7-bit ASCII)
// — confirm whether `< 128` was intended.
if (!Aspose_IsTextGenerated(c) && (c <= 128 && (isalpha(c) || isdigit(c))))
{
if (czWord.IsEmpty())
{
// NOTE(review): this records the segment's start, not nSegCharStart + j, so a word
// that begins mid-segment gets the segment's first index — confirm this is intended.
nWordStart = nSegCharStart;
}
czWord += c;
}
else
{
//RLLOG_DEBUG("Word: " << czWord);
// A separator ends the current word: emit it (if any) and start over.
if (!czWord.IsEmpty())
{
// NOTE(review): pOCRData and outFile are not declared anywhere in this snippet.
AsposeAddWord(pOCRData, czWord, nWordStart, localPageID);
outFile << "Aspose Word :" << (LPCTSTR)czWord << std::endl;
czWord.Empty();
}
//AddWord(pOCRData, czWord, nWordStart, localPageID);
}
}
// Flush whatever is left once the segment ends.
// NOTE(review): unlike the in-loop flush, this is not guarded by IsEmpty(), so an empty
// word is logged and added when the segment ends on a separator.
RLLOG_DEBUG("Aspose Word: " << czWord);
//AddWord(pOCRData, czWord, nWordStart, localPageID);
AsposeAddWord(pOCRData, czWord, nWordStart, localPageID);
}
}
}
}
}
}
/// <summary>
/// Singleton wrapper that exposes document, page and text queries over Aspose.PDF
/// through opaque handles: an <see cref="IntPtr"/> per loaded document and an
/// <see cref="int"/> ID per loaded page.
/// </summary>
public class AsposePDFApi
{
    private static readonly Lazy<AsposePDFApi> instance = new Lazy<AsposePDFApi>(() => new AsposePDFApi());

    // Page ID -> (stream backing the page's document, the page itself).
    public Dictionary<int, (MemoryStream, Aspose.Pdf.Page)> pageDictionary = new Dictionary<int, (MemoryStream, Aspose.Pdf.Page)>();

    // Not used by any method of this class; kept because it is part of the public surface.
    public Dictionary<IntPtr, Aspose.Pdf.Page> pagePointerDictionary = new Dictionary<IntPtr, Aspose.Pdf.Page>();

    // Page ID -> absorber holding the text extracted from that page.
    public Dictionary<int, TextFragmentAbsorber> textAbsorberDictionary = new Dictionary<int, TextFragmentAbsorber>();

    // Document handle -> raw file bytes plus the password supplied at load time, so every
    // later open of the same document can reuse the password.
    private readonly Dictionary<IntPtr, (byte[] Data, string Password)> loadedDocuments = new Dictionary<IntPtr, (byte[] Data, string Password)>();

    // Page ID -> the Document instance the page was taken from. Holding the reference keeps
    // the document alive for as long as the page is registered and lets it be disposed later.
    private readonly Dictionary<int, Document> pageDocuments = new Dictionary<int, Document>();

    // Next page ID to hand out. A counter guarantees unique IDs; object hash codes do not.
    private int nextPageId = 1;

    // Private constructor to prevent instantiation outside the singleton accessor.
    private AsposePDFApi()
    {
    }

    /// <summary>Gets the single shared instance of the class.</summary>
    public static AsposePDFApi Instance
    {
        get { return instance.Value; }
    }

    /// <summary>Reads a PDF file into memory and returns an opaque handle for it.</summary>
    /// <param name="filePath">Path of the PDF file to read.</param>
    /// <param name="password">Password used when the document is opened; may be null or empty.</param>
    /// <returns>Handle to pass to the other document-level methods.</returns>
    /// <remarks>
    /// The file is only read here; it is parsed when pages are counted or loaded.
    /// Release the handle with <see cref="Aspose_CloseDocument"/>.
    /// </remarks>
    public IntPtr Aspose_LoadDocument(string filePath, string password)
    {
        byte[] fileData = System.IO.File.ReadAllBytes(filePath);
        GCHandle handle = GCHandle.Alloc(fileData, GCHandleType.Pinned);
        IntPtr documentHandle = (IntPtr)handle;
        loadedDocuments[documentHandle] = (fileData, password);
        return documentHandle;
    }

    /// <summary>Releases a handle returned by <see cref="Aspose_LoadDocument"/>.</summary>
    /// <param name="document">Handle to release.</param>
    /// <returns>True when the handle was known and has been freed; otherwise false.</returns>
    /// <remarks>Pages already loaded from the document keep their own stream and stay usable.</remarks>
    public bool Aspose_CloseDocument(IntPtr document)
    {
        if (!loadedDocuments.Remove(document))
        {
            return false;
        }

        ((GCHandle)document).Free();
        return true;
    }

    /// <summary>Gets the total number of pages of a loaded document.</summary>
    /// <param name="document">Handle returned by <see cref="Aspose_LoadDocument"/>.</param>
    /// <returns>The page count, or -1 when the handle is unknown or the document cannot be opened.</returns>
    public int Aspose_GetPageCount(IntPtr document)
    {
        try
        {
            if (!loadedDocuments.TryGetValue(document, out var entry))
            {
                Console.WriteLine("Error in Aspose_GetPageCount: unknown document handle.");
                return -1;
            }

            using (var stream = new MemoryStream(entry.Data))
            using (Document pdfDocument = OpenDocument(stream, entry.Password))
            {
                return pdfDocument.Pages.Count;
            }
        }
        catch (Exception ex)
        {
            Console.WriteLine($"Error in Aspose_GetPageCount: {ex.Message}");
            return -1;
        }
    }

    /// <summary>Loads one page of a document and registers it under a new page ID.</summary>
    /// <param name="document">Handle returned by <see cref="Aspose_LoadDocument"/>.</param>
    /// <param name="pageIndex">0-based page index.</param>
    /// <returns>The new page ID, or -1 on failure.</returns>
    public int Aspose_LoadPage(IntPtr document, int pageIndex)
    {
        MemoryStream stream = null;
        Document pdfDocument = null;
        bool registered = false;
        try
        {
            if (!loadedDocuments.TryGetValue(document, out var entry))
            {
                throw new InvalidOperationException("Unknown document handle.");
            }

            stream = new MemoryStream(entry.Data);
            pdfDocument = OpenDocument(stream, entry.Password);

            Aspose.Pdf.Page pdfPage = pdfDocument.Pages[pageIndex + 1]; // Aspose.PDF pages are 1-based
            if (pdfPage == null)
            {
                throw new ArgumentOutOfRangeException(nameof(pageIndex));
            }

            int pageId = nextPageId++;
            pageDictionary[pageId] = (stream, pdfPage);
            pageDocuments[pageId] = pdfDocument;
            registered = true;
            return pageId;
        }
        catch (ArgumentOutOfRangeException ex)
        {
            MessageBox.Show($"Error: Page index {pageIndex} is out of range. {ex.Message}");
            return -1;
        }
        catch (Exception ex)
        {
            MessageBox.Show($"An error occurred while loading the page: {ex.Message}");
            return -1;
        }
        finally
        {
            // On failure nothing was registered, so nothing else will ever release these.
            if (!registered)
            {
                pdfDocument?.Dispose();
                stream?.Dispose();
            }
        }
    }

    /// <summary>Unregisters a page and releases the stream and document held for it.</summary>
    /// <param name="pageID">ID returned by <see cref="Aspose_LoadPage"/>.</param>
    /// <returns>True when the page was registered; otherwise false.</returns>
    public bool Aspose_ClosePage(int pageID)
    {
        if (!pageDictionary.TryGetValue(pageID, out var pageTuple))
        {
            return false;
        }

        pageDictionary.Remove(pageID);
        textAbsorberDictionary.Remove(pageID);

        if (pageDocuments.TryGetValue(pageID, out var pdfDocument))
        {
            pageDocuments.Remove(pageID);
            pdfDocument.Dispose();
        }

        pageTuple.Item1?.Dispose();
        return true;
    }

    /// <summary>Extracts the text of a loaded page and caches the absorber for later queries.</summary>
    /// <param name="pageID">ID returned by <see cref="Aspose_LoadPage"/>.</param>
    /// <returns>True when the text was extracted; false when the page is unknown or extraction failed.</returns>
    public bool AsposeText_LoadPage(int pageID)
    {
        try
        {
            if (!pageDictionary.TryGetValue(pageID, out var pageTuple))
            {
                Console.WriteLine("Error in AsposeText_LoadPage: Page not found.");
                return false;
            }

            MemoryStream stream = pageTuple.Item1;
            Aspose.Pdf.Page page = pageTuple.Item2;

            // The page reads from this stream, so it must still be open.
            if (stream == null || !stream.CanRead)
            {
                Console.WriteLine("MemoryStream is not available or has been disposed.");
                return false;
            }

            if (page == null || page.PageInfo == null)
            {
                Console.WriteLine("Page object is not fully initialized or is null.");
                return false;
            }

            // The registered page is used directly; re-parsing the document here would
            // drop the password and create another Document that is never disposed.
            TextFragmentAbsorber textAbsorber = new TextFragmentAbsorber();
            page.Accept(textAbsorber);
            textAbsorberDictionary[pageID] = textAbsorber;
            return true;
        }
        catch (Exception ex)
        {
            Console.WriteLine($"Error in AsposeText_LoadPage: {ex.Message}");
            return false;
        }
    }

    /// <summary>Gets the width of a loaded page.</summary>
    /// <param name="pageHandle">ID returned by <see cref="Aspose_LoadPage"/>.</param>
    /// <returns>Width of the page's <c>Rect</c>.</returns>
    /// <exception cref="KeyNotFoundException">The page ID is not registered.</exception>
    public double Aspose_GetPageWidth(int pageHandle)
    {
        return GetRegisteredPage(pageHandle).Rect.Width;
    }

    /// <summary>Gets the height of a loaded page.</summary>
    /// <param name="pageHandle">ID returned by <see cref="Aspose_LoadPage"/>.</param>
    /// <returns>Height of the page's <c>Rect</c>.</returns>
    /// <exception cref="KeyNotFoundException">The page ID is not registered.</exception>
    public double Aspose_GetPageHeight(int pageHandle)
    {
        return GetRegisteredPage(pageHandle).Rect.Height;
    }

    /// <summary>Reads one of the page's boxes as left/right/bottom/top coordinates.</summary>
    /// <param name="pageID">ID returned by <see cref="Aspose_LoadPage"/>.</param>
    /// <param name="iRect">Box selector: 0 Rect, 1 CropBox, 2 MediaBox, 3 CropBox, 4 TrimBox, 5 ArtBox, 6 BleedBox.</param>
    /// <param name="left">Lower-left X of the box; 0 on failure.</param>
    /// <param name="right">Upper-right X of the box; 0 on failure.</param>
    /// <param name="bottom">Lower-left Y of the box; 0 on failure.</param>
    /// <param name="top">Upper-right Y of the box; 0 on failure.</param>
    /// <returns>True when the box was read; false for an unknown page, unknown selector, missing box or error.</returns>
    public bool Aspose_GetRectangle(int pageID, int iRect, out double left, out double right, out double bottom, out double top)
    {
        left = right = bottom = top = 0;
        try
        {
            if (!pageDictionary.TryGetValue(pageID, out var pageTuple))
            {
                Console.WriteLine("Error in Aspose_GetRectangle: Page not found for the given page ID.");
                return false;
            }

            Aspose.Pdf.Rectangle rect = SelectBox(pageTuple.Item2, iRect);
            if (rect == null)
            {
                return false;
            }

            left = rect.LLX;
            right = rect.URX;
            bottom = rect.LLY;
            top = rect.URY;
            return true;
        }
        catch (Exception ex)
        {
            Console.WriteLine($"Error in Aspose_GetRectangle: {ex.Message}");
            left = right = bottom = top = 0;
            return false;
        }
    }

    /// <summary>Counts the text segments of a page that lie completely inside a rectangle.</summary>
    /// <param name="pageID">ID of a page whose text was loaded with <see cref="AsposeText_LoadPage"/>.</param>
    /// <param name="left">Left edge of the rectangle.</param>
    /// <param name="right">Right edge of the rectangle.</param>
    /// <param name="bottom">Bottom edge of the rectangle.</param>
    /// <param name="top">Top edge of the rectangle.</param>
    /// <returns>The number of enclosed segments; 0 when the page or its text is unavailable or on error.</returns>
    public int Aspose_CountBoundedSegments(int pageID, double left, double right, double bottom, double top)
    {
        try
        {
            if (!pageDictionary.ContainsKey(pageID))
            {
                Console.WriteLine("Error in CountBoundedSegments: Page not found for the given page ID.");
                return 0;
            }

            if (!textAbsorberDictionary.TryGetValue(pageID, out var textAbsorber))
            {
                Console.WriteLine("Error in CountBoundedSegments: TextFragmentAbsorber not found for the given page ID.");
                return 0;
            }

            int segmentCount = 0;
            foreach (TextFragment fragment in textAbsorber.TextFragments)
            {
                foreach (TextSegment segment in fragment.Segments)
                {
                    if (IsWithinBounds(segment.Rectangle, left, top, right, bottom))
                    {
                        segmentCount++;
                    }
                }
            }

            return segmentCount;
        }
        catch (Exception ex)
        {
            Console.WriteLine($"Error in CountBoundedSegments: {ex.Message}");
            return 0;
        }
    }

    /// <summary>Locates the n-th text segment inside a rectangle and reports its character range.</summary>
    /// <param name="segmentIndex">0-based index among the segments that lie inside the rectangle.</param>
    /// <param name="pageID">ID of a page whose text was loaded with <see cref="AsposeText_LoadPage"/>.</param>
    /// <param name="left">Left edge of the rectangle.</param>
    /// <param name="top">Top edge of the rectangle.</param>
    /// <param name="right">Right edge of the rectangle.</param>
    /// <param name="bottom">Bottom edge of the rectangle.</param>
    /// <param name="nSegCharStart">
    /// Index of the segment's first character, counted over all segments of the page
    /// (the same indexing <see cref="Aspose_GetUnicode"/> uses); -1 when no such segment exists.
    /// </param>
    /// <param name="nSegCharsNum">Number of characters in the segment; 0 when not found.</param>
    /// <returns>True when the segment was found; otherwise false.</returns>
    public bool Aspose_GetBoundedSegment(int segmentIndex, int pageID, double left, double top, double right, double bottom, out int nSegCharStart, out int nSegCharsNum)
    {
        nSegCharStart = 0;
        nSegCharsNum = 0;

        if (!pageDictionary.ContainsKey(pageID))
        {
            Console.WriteLine("Page not found for the given page ID.");
            return false;
        }

        if (!textAbsorberDictionary.TryGetValue(pageID, out var textAbsorber))
        {
            Console.WriteLine("TextFragmentAbsorber not found for the given page ID.");
            return false;
        }

        try
        {
            int matchedSegments = 0;
            int charIndex = 0;
            foreach (TextFragment fragment in textAbsorber.TextFragments)
            {
                foreach (TextSegment segment in fragment.Segments)
                {
                    int length = TextLength(segment);
                    if (IsWithinBounds(segment.Rectangle, left, top, right, bottom))
                    {
                        if (matchedSegments == segmentIndex)
                        {
                            nSegCharStart = charIndex;
                            nSegCharsNum = length;
                            return true;
                        }

                        matchedSegments++;
                    }

                    // Segments outside the rectangle still advance the page-wide character index.
                    charIndex += length;
                }
            }
        }
        catch (Exception ex)
        {
            Console.WriteLine($"Error in Aspose_GetBoundedSegment: {ex.Message}");
        }

        nSegCharStart = -1;
        nSegCharsNum = 0;
        return false;
    }

    /// <summary>Returns the UTF-16 code unit at a page-wide character index.</summary>
    /// <param name="pageID">ID returned by <see cref="Aspose_LoadPage"/>.</param>
    /// <param name="index">0-based index counted over the text of all segments of the page, in absorber order.</param>
    /// <returns>The character value, or 0 when the page is unknown or the index is out of range.</returns>
    public uint Aspose_GetUnicode(int pageID, int index)
    {
        if (index < 0)
        {
            return 0;
        }

        if (!pageDictionary.TryGetValue(pageID, out var pageTuple))
        {
            return 0;
        }

        // Reuse the text extracted for this page; extracting it again for every single
        // character is very expensive. Extract (and cache) only when nothing is cached yet.
        if (!textAbsorberDictionary.TryGetValue(pageID, out var textAbsorber))
        {
            textAbsorber = new TextFragmentAbsorber();
            pageTuple.Item2.Accept(textAbsorber);
            textAbsorberDictionary[pageID] = textAbsorber;
        }

        int currentIndex = 0;
        foreach (TextFragment fragment in textAbsorber.TextFragments)
        {
            foreach (TextSegment segment in fragment.Segments)
            {
                string text = segment.Text ?? string.Empty;
                if (index < currentIndex + text.Length)
                {
                    return text[index - currentIndex];
                }

                currentIndex += text.Length;
            }
        }

        // Index is past the end of the page's text.
        return 0;
    }

    /// <summary>Reports whether a character is white space (which includes '\n' and '\r').</summary>
    /// <param name="character">Character to classify.</param>
    /// <returns>True for any white-space character; otherwise false.</returns>
    public bool Aspose_IsTextGenerated(char character)
    {
        return char.IsWhiteSpace(character);
    }

    // Opens a document over the stream, supplying the password only when one was given.
    private static Document OpenDocument(MemoryStream stream, string password)
    {
        return string.IsNullOrEmpty(password)
            ? new Document(stream)
            : new Document(stream, password);
    }

    // Returns the page registered under pageHandle or throws KeyNotFoundException.
    private Aspose.Pdf.Page GetRegisteredPage(int pageHandle)
    {
        if (pageDictionary.TryGetValue(pageHandle, out var pageTuple))
        {
            return pageTuple.Item2;
        }

        throw new KeyNotFoundException("Page not found.");
    }

    // Maps a box selector to the corresponding page box; null for an unknown selector.
    // NOTE(review): selectors 1 and 3 both map to CropBox, as in the original code —
    // confirm with the native caller whether one of them should be a different box.
    private static Aspose.Pdf.Rectangle SelectBox(Aspose.Pdf.Page page, int iRect)
    {
        switch (iRect)
        {
            case 0:
                return page.Rect;
            case 1:
            case 3:
                return page.CropBox;
            case 2:
                return page.MediaBox;
            case 4:
                return page.TrimBox;
            case 5:
                return page.ArtBox;
            case 6:
                return page.BleedBox;
            default:
                return null;
        }
    }

    // True when rect lies completely inside the given bounds; a missing rectangle never matches.
    private static bool IsWithinBounds(Aspose.Pdf.Rectangle rect, double left, double top, double right, double bottom)
    {
        return rect != null
            && rect.LLX >= left
            && rect.URY <= top
            && rect.URX <= right
            && rect.LLY >= bottom;
    }

    // Length of a segment's text, treating missing text as empty.
    private static int TextLength(TextSegment segment)
    {
        return segment.Text?.Length ?? 0;
    }
}
}
I hope this code snippet is sufficient.
Regards,
Ramya.B