I need to compress the pdf documents that I am generating via Aspose.Words and Aspose.Pdf.Kit. The need for compression is driven mainly by the size of the images within the pdf documents. I do not have access to the images before the pdf documents are generated. My plan is to extract all of the images from the pdf document, compress each image and then replace each original image in the pdf with a compressed version.
The image extraction process appears to be very slow. Extracting the image descriptions alone takes over 2.5 minutes. Extracting the images themselves takes over 20 minutes to go through 253 pages and extract 36 images, and then it fails before completing with the following exception:
Wrong image extracting, please check your pdf.: v4.8.0.0
at x30a678191d9b1780.x542b077cf1ddeb66.xd88348c4c144fb81(ExtractImageMode xa4aa8b4150b11435)
at Aspose.Pdf.Kit.PdfExtractor.xd88348c4c144fb81(ExtractImageMode xa4aa8b4150b11435)
at Aspose.Pdf.Kit.PdfExtractor.ExtractImage(ExtractImageMode mode)
at EBI.RPM.DocBuilder.PdfBuilder.CompressPdf(MemoryStream pdfInput, String buildFolderPath) in C:\Vss\RPM\main\DocAutomation\DocBuilder\PdfBuilder.cs:line 286
at EBI.RPM.BuilderApp.Form1.btnCompressPdf_Click(Object sender, EventArgs e) in C:\Vss\RPM\main\DocAutomation\BuilderApp\Form1.cs:line 377
My code looks like this:
/// <summary>
/// Extracts the non-inline images of a PDF page by page, saves each distinct image
/// into an "images" folder below <paramref name="buildFolderPath"/>, then replaces
/// the images in the document with the saved files and returns the rewritten PDF.
/// </summary>
/// <param name="pdfInput">Stream holding the PDF document to process.</param>
/// <param name="buildFolderPath">Folder under which the "images" working directory is created.</param>
/// <returns>A new stream containing the PDF after image replacement.</returns>
/// <exception cref="Exception">
/// Thrown when an image listed in the image descriptions cannot be found or extracted.
/// </exception>
public static MemoryStream CompressPdf(MemoryStream pdfInput, string buildFolderPath)
{
    ILog log = LogManager.GetLogger("EBI.RPM.DocBuilder.PdfBuilder");
    log.InfoFormat("CompressPdf started, input size: {0}", pdfInput.Length.ToString());

    //note that when replacing images, the image source must be the file system
    String tempPath = buildFolderPath + @"\images";
    if (!Directory.Exists(tempPath))
    {
        Directory.CreateDirectory(tempPath);
    }

    //get all image descriptions
    ExtractImageMode imageMode = ExtractImageMode.ExcludeInlineImages;
    PdfExtractor extractor = new PdfExtractor();
    // Rewind before every bind so the document is always read from its first byte,
    // regardless of where a previous reader left the stream position.
    pdfInput.Position = 0;
    extractor.BindPdf(pdfInput);
    ImageDescription[] imageList = extractor.GetImageDescriptions(imageMode);
    log.Debug("CompressPdf image description extraction completed");

    int imageCount = 0;
    int currentPage = 0;
    // NOTE(review): assumes ImageDescription.Index is an int (it is passed to
    // ReplaceImage as the image index below) - confirm against the Aspose API.
    HashSet<int> processedIndexes = new HashSet<int>();
    List<ImageDescription> processedImages = new List<ImageDescription>();

    foreach (ImageDescription img in imageList)
    {
        if (currentPage != img.Page)
        {
            log.DebugFormat("CompressPdf extracting images on page {0}", img.Page.ToString());
            imageCount = 1;
            currentPage = img.Page;

            pdfInput.Position = 0;
            extractor = new PdfExtractor();
            extractor.BindPdf(pdfInput);

            // The page range has to be set BEFORE ExtractImage is called. The original
            // code assigned StartPage/EndPage afterwards, so every per-page pass
            // extracted the images of the whole document (this ExtractImage call is
            // the one reported at PdfBuilder.cs line 286 in the stack trace).
            extractor.StartPage = currentPage;
            extractor.EndPage = currentPage;
            extractor.ExtractImage(imageMode);
        }
        else
        {
            imageCount++;
        }

        if (!extractor.HasNextImage())
        {
            throw new Exception(String.Format("Image {0} on page {1} was not found.",
                imageCount.ToString(), currentPage.ToString()));
        }

        using (MemoryStream imageStream = new MemoryStream())
        {
            // The image is always pulled from the extractor, even when it is skipped
            // below, so the extractor stays in step with the description list.
            if (!extractor.GetNextImage(imageStream))
            {
                throw new Exception(String.Format("Unable to extract image {0} on page {1}.",
                    imageCount.ToString(), currentPage.ToString()));
            }

            //skip image if it has already been processed
            if (!processedIndexes.Contains(img.Index))
            {
                processedIndexes.Add(img.Index);
                processedImages.Add(img);

                //compress image (not implemented yet)

                //save compressed image
                string imgpath = ImagePath(tempPath, img);
                log.DebugFormat("Saving image to {0}", imgpath);
                Utils.SaveMemoryStream(imageStream, imgpath);
            }
        }
    }

    log.Debug("CompressPdf image extraction/compression completed; starting image replacement.");

    PdfContentEditor editor = new PdfContentEditor();
    pdfInput.Position = 0;
    editor.BindPdf(pdfInput);
    foreach (ImageDescription img in processedImages)
    {
        string imgpath2 = ImagePath(tempPath, img);
        log.DebugFormat("Replacing image {0}-{1} with {2}", img.Page, img.Index, imgpath2);
        editor.ReplaceImage(img.Page, img.Index, imgpath2);
    }
    log.Debug("CompressPdf image replacement completed");

    MemoryStream pdfOutput = new MemoryStream();
    editor.Save(pdfOutput);
    log.InfoFormat("CompressPdf completed, output size: {0}", pdfOutput.Length.ToString());
    return pdfOutput;
}
/// <summary>
/// Builds the file path used to store an extracted image: the image name and index
/// joined by a dash, followed by an extension chosen from the image type.
/// </summary>
/// <param name="rootPath">Directory the file name is combined with.</param>
/// <param name="image">Description supplying the name, index and type of the image.</param>
/// <returns>The combined path; the extension is empty when the type is not one of the listed formats.</returns>
private static string ImagePath(string rootPath, ImageDescription image)
{
    // Formats are tested in a fixed order; anything unrecognised gets no extension.
    string extension =
        image.Type == ImageFormat.Jpeg ? "jpg" :
        image.Type == ImageFormat.Gif ? "gif" :
        image.Type == ImageFormat.Bmp ? "bmp" :
        image.Type == ImageFormat.Png ? "png" :
        image.Type == ImageFormat.Tiff ? "tif" :
        "";

    string fileName = string.Concat(image.Name, "-", image.Index.ToString(), ".", extension);
    return Path.Combine(rootPath, fileName);
}
The test file I am using is approximately 18MB. My development machine has 2GB of ram.
What am I doing wrong to cause this error? Is there a better approach that would improve performance?
Thank you,
Ken