There’s a problem with the output when converting the attached PDF to HTML using the straightforward code below. This appears to be a regression in aspose-pdf-10.3.0.jar; the same code used to work correctly.
/**
* Convert the PDF stored in bytes to html. Split the html into chunks and save them as CloudStorage blobs
* @return chunkCount, with resourceCount[0] containing the count of html resources
*/
public static int processPDFForHtmlChunks(final byte[] bytes, final CloudStorage cloudStorage, final String bucketName, final String csKey, final int[] resourceCount,
final int[] cssCount) throws Exception {
try {
long startTime = System.currentTimeMillis();
logger.info(String.format(“processPDFForHtmlChunks started for %s, length=%d”, csKey, bytes.length));
ByteArrayInputStream ins = new ByteArrayInputStream(bytes);
com.aspose.pdf.Document doc = new com.aspose.pdf.Document(ins);
HtmlSaveOptions saveOptions = new HtmlSaveOptions();
saveOptions.FontSavingMode = HtmlSaveOptions.FontSavingModes.SaveInAllFormats;
final String[] htmlString = new String[1];
resourceCount[0] = 0;
saveOptions.CustomHtmlSavingStrategy = new HtmlSaveOptions.HtmlPageMarkupSavingStrategy() {
@Override public void invoke(HtmlSaveOptions.HtmlPageMarkupSavingInfo si) {
try {
if (logger.isDebugEnabled())
logger.debug(String.format(“HTMLPageMarkeupSavingStrategy.invoke(), SupposedName=%s, Content len=%d”, si.SupposedFileName, si.ContentStream.getLength()));
byte[] bytes = new byte[(int)si.ContentStream.getLength()];
InputStream stream = si.ContentStream.toInputStream();
int count = stream.read(bytes);
htmlString[0] = new String(bytes, “UTF-8”);
} catch (Throwable t) {
Utils.logStackTrace(t);
throw Utils.makeRuntimeException(t);
}
}
};
saveOptions.CustomResourceSavingStrategy = new HtmlSaveOptions.ResourceSavingStrategy() {
@Override public String invoke(SaveOptions.ResourceSavingInfo si) {
try {
++resourceCount[0];
if (si instanceof HtmlSaveOptions.HtmlImageSavingInfo) {
HtmlSaveOptions.HtmlImageSavingInfo imageSi = (HtmlSaveOptions.HtmlImageSavingInfo) si;
}
byte[] bytes = new byte[(int)si.ContentStream.getLength()];
InputStream stream = si.ContentStream.toInputStream();
int count = stream.read(bytes);
int extIndex = si.SupposedFileName.lastIndexOf(’.’);
String ext = null;
if (extIndex > 0)
ext = si.SupposedFileName.substring(extIndex+1);
String mimeType = MimeTypes.getMimeType(ext);
String blobKey = String.format("%s.resourcePart%d", csKey, resourceCount[0]);
cloudWriters.putBinary(cloudStorage, bytes, mimeType, bucketName, blobKey);
return String.format("/resources/item/%s/blob", blobKey);
} catch (Throwable t) {
Utils.logStackTrace(t);
throw Utils.makeRuntimeException(t);
}
}
};
cssCount[0] = 0;
saveOptions.setSplitCssIntoPages(false);
saveOptions.CustomStrategyOfCssUrlCreation = new HtmlSaveOptions.CssUrlMakingStrategy() {
@Override public String invoke(HtmlSaveOptions.CssUrlRequestInfo ri) {
++cssCount[0];
if (logger.isDebugEnabled())
logger.debug(String.format(“CssUrlMakingStrategy.invoke(), cssCount=%d”, cssCount[0]));
return cssCount[0] == 1?
String.format("/resources/item/%s/css", csKey):
String.format("/resources/item/%s/css/{0}", csKey);
}
};
saveOptions.CustomCssSavingStrategy = new HtmlSaveOptions.CssSavingStrategy() {
@Override public void invoke(HtmlSaveOptions.CssSavingInfo si) {
try {
if (logger.isDebugEnabled())
logger.debug(String.format(“CssSavingStrategy.invoke(), SupposedURL=%s ContentStream len=%d cssNumber=%d”, si.SupposedURL, si.ContentStream.getLength(), si.CssNumber));
byte[] bytes = new byte[(int)si.ContentStream.getLength()];
InputStream stream = si.ContentStream.toInputStream();
int count = stream.read(bytes);
String key = si.CssNumber == 1?
String.format("%s.css", csKey):
String.format("%s.css.%d", csKey, si.CssNumber);
cloudStorage.putBinary(bytes, MimeTypes.MIME_TEXT_CSS, bucketName, key);
} catch (Throwable t) {
Utils.logStackTrace(t);
throw Utils.makeRuntimeException(t);
}
}
};
try {
String dummyName = String.format(“dummy-%d.html”, Thread.currentThread().getId());
logger.debug("Saving to " + dummyName);
doc.save(dummyName, saveOptions); // directory with empty content will be written to disk, delete it afterward
logger.debug("Deleting " + dummyName);
FileUtils.deleteDirectory(new File(dummyName));
} catch (Throwable t) {
logger.error("Unable to save html for " + csKey + “, throwable=” + t.toString());
}
//
// Now split the html into chunks
//
int rv = splitHtmlToChunks(htmlString[0], cloudStorage, bucketName, csKey, false);
logger.info(String.format(“Finished processPDFForHtmlChunks chunkCount=%d, resourceCount=%d, msec=%d”, rv, resourceCount[0], System.currentTimeMillis()-startTime));
return rv;
} catch (IOException e) {
Utils.logStackTrace(e);
throw new IOException(e);
}
}