Pdf to html regression

ahong · May 20, 2015, 1:52am

There’s a problem with the output when converting the attached PDF to HTML using the straightforward code below. This used to work correctly, so it appears to be a regression in aspose-pdf-10.3.0.jar.

/**

* Convert the PDF stored in bytes to html. Split the html into chunks and save them as CloudStorage blobs

* @return chunkCount, with resourceCount[0] containing the count of html resources

*/

public static int processPDFForHtmlChunks(final byte[] bytes, final CloudStorage cloudStorage, final String bucketName, final String csKey, final int[] resourceCount,

final int[] cssCount) throws Exception {

try {

long startTime = System.currentTimeMillis();

logger.info(String.format(“processPDFForHtmlChunks started for %s, length=%d”, csKey, bytes.length));

ByteArrayInputStream ins = new ByteArrayInputStream(bytes);

com.aspose.pdf.Document doc = new com.aspose.pdf.Document(ins);

HtmlSaveOptions saveOptions = new HtmlSaveOptions();

saveOptions.FontSavingMode = HtmlSaveOptions.FontSavingModes.SaveInAllFormats;

final String[] htmlString = new String[1];

resourceCount[0] = 0;

saveOptions.CustomHtmlSavingStrategy = new HtmlSaveOptions.HtmlPageMarkupSavingStrategy() {

@Override public void invoke(HtmlSaveOptions.HtmlPageMarkupSavingInfo si) {

try {

if (logger.isDebugEnabled())

logger.debug(String.format(“HTMLPageMarkeupSavingStrategy.invoke(), SupposedName=%s, Content len=%d”, si.SupposedFileName, si.ContentStream.getLength()));

byte[] bytes = new byte[(int)si.ContentStream.getLength()];

InputStream stream = si.ContentStream.toInputStream();

int count = stream.read(bytes);

htmlString[0] = new String(bytes, “UTF-8”);

} catch (Throwable t) {

Utils.logStackTrace(t);

throw Utils.makeRuntimeException(t);

}

};

saveOptions.CustomResourceSavingStrategy = new HtmlSaveOptions.ResourceSavingStrategy() {

@Override public String invoke(SaveOptions.ResourceSavingInfo si) {

try {

++resourceCount[0];

if (si instanceof HtmlSaveOptions.HtmlImageSavingInfo) {

HtmlSaveOptions.HtmlImageSavingInfo imageSi = (HtmlSaveOptions.HtmlImageSavingInfo) si;

}

byte[] bytes = new byte[(int)si.ContentStream.getLength()];

InputStream stream = si.ContentStream.toInputStream();

int count = stream.read(bytes);

int extIndex = si.SupposedFileName.lastIndexOf(’.’);

String ext = null;

if (extIndex > 0)

ext = si.SupposedFileName.substring(extIndex+1);

String mimeType = MimeTypes.getMimeType(ext);

String blobKey = String.format("%s.resourcePart%d", csKey, resourceCount[0]);

cloudWriters.putBinary(cloudStorage, bytes, mimeType, bucketName, blobKey);

return String.format("/resources/item/%s/blob", blobKey);

} catch (Throwable t) {

Utils.logStackTrace(t);

throw Utils.makeRuntimeException(t);

}

};

cssCount[0] = 0;

saveOptions.setSplitCssIntoPages(false);

saveOptions.CustomStrategyOfCssUrlCreation = new HtmlSaveOptions.CssUrlMakingStrategy() {

@Override public String invoke(HtmlSaveOptions.CssUrlRequestInfo ri) {

++cssCount[0];

if (logger.isDebugEnabled())

logger.debug(String.format(“CssUrlMakingStrategy.invoke(), cssCount=%d”, cssCount[0]));

return cssCount[0] == 1?

String.format("/resources/item/%s/css", csKey):

String.format("/resources/item/%s/css/{0}", csKey);

}

};

saveOptions.CustomCssSavingStrategy = new HtmlSaveOptions.CssSavingStrategy() {

@Override public void invoke(HtmlSaveOptions.CssSavingInfo si) {

try {

if (logger.isDebugEnabled())

logger.debug(String.format(“CssSavingStrategy.invoke(), SupposedURL=%s ContentStream len=%d cssNumber=%d”, si.SupposedURL, si.ContentStream.getLength(), si.CssNumber));

byte[] bytes = new byte[(int)si.ContentStream.getLength()];

InputStream stream = si.ContentStream.toInputStream();

int count = stream.read(bytes);

String key = si.CssNumber == 1?

String.format("%s.css", csKey):

String.format("%s.css.%d", csKey, si.CssNumber);

cloudStorage.putBinary(bytes, MimeTypes.MIME_TEXT_CSS, bucketName, key);

} catch (Throwable t) {

Utils.logStackTrace(t);

throw Utils.makeRuntimeException(t);

}

};

try {

String dummyName = String.format(“dummy-%d.html”, Thread.currentThread().getId());

logger.debug("Saving to " + dummyName);

doc.save(dummyName, saveOptions); // directory with empty content will be written to disk, delete it afterward

logger.debug("Deleting " + dummyName);

FileUtils.deleteDirectory(new File(dummyName));

} catch (Throwable t) {

logger.error("Unable to save html for " + csKey + “, throwable=” + t.toString());

}

//

// Now split the html into chunks

//

int rv = splitHtmlToChunks(htmlString[0], cloudStorage, bucketName, csKey, false);

logger.info(String.format(“Finished processPDFForHtmlChunks chunkCount=%d, resourceCount=%d, msec=%d”, rv, resourceCount[0], System.currentTimeMillis()-startTime));

return rv;

} catch (IOException e) {

Utils.logStackTrace(e);

throw new IOException(e);

}

tilal.ahmad · May 20, 2015, 11:53pm

Hi there,

Thanks for your inquiry. We have noticed the regression in PDF to HTML conversion and have logged ticket PDFNEWJAVA-34860 in our issue tracking system for further investigation and rectification. We will notify you as soon as it is resolved.

We are sorry for the inconvenience caused.

Best Regards,

aspose.notifier · June 8, 2017, 8:08am

The issues you have found earlier (filed as PDFJAVA-34860) have been fixed in Aspose.Pdf for Java 17.5.

This message was posted using Notification2Forum from Downloads module by Aspose Notifier.