Dear team,
We are extracting images using Aspose for Java, but the images in the document mentioned below are not being extracted.
Source code:
// Decides whether `paragraph` should be treated as a figure caption.
// The condition is an AND of several groups. `matches` (a regex string) and
// `AIE.docName` are defined elsewhere and are not visible in this snippet.
//
// Group 1: the paragraph text starts with one of the caption keywords.
// NOTE(review): only the "fig" test is lower-cased and trimmed; the remaining tests are
// case-sensitive and untrimmed, and "Abbildung" is already covered by "Abb".
if ((paragraph.toString(SaveFormat.TEXT).toLowerCase().trim().startsWith("fig")
|| paragraph.toString(SaveFormat.TEXT).startsWith("Scheme")
|| paragraph.toString(SaveFormat.TEXT).startsWith("Plate")
|| paragraph.toString(SaveFormat.TEXT).startsWith("Abb")
|| paragraph.toString(SaveFormat.TEXT).startsWith("Abbildung"))
// Group 2: guard against duplicate figure captions (it-15).
// Because && binds tighter than ||, this group is true when ANY of these holds:
//   (a) a next sibling exists and its text does not match `matches`;
//   (b) a next sibling exists, is not a table, matches `matches`, and the nested
//       shape checks below pass;
//   (c) the text of the last paragraph of the section body matches `matches`.
&& (paragraph.getNextSibling() != null
&& !paragraph.getNextSibling().toString(SaveFormat.TEXT).trim().matches(matches)
|| (paragraph.getNextSibling() != null
&& paragraph.getNextSibling().getNodeType() != NodeType.TABLE
&& paragraph.getNextSibling().toString(SaveFormat.TEXT).trim().matches(matches)
// NOTE(review): the casts to Paragraph below are guarded only by a "not TABLE" check;
// if a sibling can be some other non-paragraph node they would throw ClassCastException.
// Either the next sibling contains at least one shape ...
&& (((Paragraph)paragraph.getNextSibling()).getChildNodes(NodeType.SHAPE, true)
.getCount() > 0
// ... or the sibling after it exists and is not a table.
|| (paragraph.getNextSibling().getNextSibling()) != null
&& paragraph.getNextSibling().getNextSibling()
.getNodeType() != NodeType.TABLE
// NOTE(review): "count == 0 || count > 0" looks always true for a non-negative
// count, so this pair does not appear to filter anything — confirm the intent.
&& ((((Paragraph)paragraph.getNextSibling().getNextSibling())
.getChildNodes(NodeType.SHAPE, true).getCount() == 0)
// This condition was added by pavi (14-12-2021) for duplicate captions.
|| (((Paragraph)paragraph.getNextSibling().getNextSibling())
.getChildNodes(NodeType.SHAPE, true).getCount() > 0))))
|| paragraph.getParentSection().getBody().getLastParagraph().getText().trim()
.matches(matches))
// Group 3: duplicate figure caption guard on the preceding side — either a previous
// sibling exists and is not a table, or the first paragraph of the body matches `matches`.
&& ((paragraph.getPreviousSibling() != null
&& paragraph.getPreviousSibling().getNodeType() != NodeType.TABLE)
|| paragraph.getParentSection().getBody().getFirstParagraph().getText().trim()
.matches(matches))
// Group 4: the paragraph is not a table node, is not a direct child of a table cell,
// and its text does not contain AIE.docName.
&& paragraph.getNodeType() != NodeType.TABLE
&& paragraph.getParentNode().getNodeType() != NodeType.CELL
&& !paragraph.toString(SaveFormat.TEXT).contains(AIE.docName)
// Group 5: condition added by pavi (14-12-2021).
// NOTE(review): this test is always true. A text starting with "Figure Captions" cannot
// also start with "Figures", so at least one of the two negations always holds. If the
// goal is to skip both headings, the operator between them should probably be &&.
&& (!(paragraph.toString(SaveFormat.TEXT).trim().startsWith("Figure Captions")) ||
!(paragraph.toString(SaveFormat.TEXT).trim().startsWith("Figures"))))
//|| ((paragraph.getNextSibling() == null) && (builder.getCurrentParagraph().isEndOfDocument()))
Input document : Paper 2.docx (2.8 MB)
Please do the needful.
@e503824 In this case the problem occurs because the shapes in your document are floating and can actually be children of any paragraph on the page. I am afraid it will not be possible to achieve what you need using your approach because, as I already mentioned, the conditions will become too complicated and impossible to handle at some point.
I would suggest using another approach. For example, you can try using DocumentVisitor to achieve what you need. I have created a simple code example that works correctly with your document and extracts the images from it as PDF documents:
// Load the input document.
Document doc = new Document("C:\\Temp\\in.docx");
// Walk the document with the visitor; extracted shapes are saved into the given folder.
doc.accept(new ImageExtractor("C:\\Temp\\"));
/**
 * Document visitor that pairs caption paragraphs with top-level shapes and saves
 * each paired shape as a separate PDF document.
 *
 * While the document is traversed, caption texts and top-level shapes are pushed
 * onto two stacks. After every visited group shape, shape and paragraph the visitor
 * checks both stacks; as soon as each holds at least one entry, the most recent
 * shape is saved and both stacks are emptied.
 */
private static class ImageExtractor extends DocumentVisitor {

    /**
     * Creates a visitor that writes its output into the given folder.
     *
     * @param targetFolder folder that receives the generated PDF files; a trailing
     *                     path separator is optional. An empty string leaves the
     *                     file names without any folder prefix.
     * @throws IllegalArgumentException if {@code targetFolder} is {@code null}
     */
    public ImageExtractor(String targetFolder) {
        if (targetFolder == null) {
            throw new IllegalArgumentException("targetFolder must not be null");
        }
        mTargetFolder = withTrailingSeparator(targetFolder);
    }

    /**
     * Collects top-level group shapes, then tries to save a pending shape.
     */
    @Override
    public int visitGroupShapeStart(GroupShape groupShape) throws Exception {
        if (groupShape.isTopLevel()) {
            mTopShapes.push(groupShape);
        }
        saveShapeAsPdf();
        return super.visitGroupShapeStart(groupShape);
    }

    /**
     * Collects top-level shapes that contain no paragraphs, then tries to save
     * a pending shape.
     */
    @Override
    public int visitShapeStart(Shape shape) throws Exception {
        // Shapes that contain paragraphs are skipped (presumably text boxes, whose
        // paragraphs may themselves be captions).
        if (shape.isTopLevel() &&
                shape.getChildNodes(NodeType.PARAGRAPH, true).getCount() == 0) {
            mTopShapes.push(shape);
        }
        saveShapeAsPdf();
        return super.visitShapeStart(shape);
    }

    /**
     * Records the trimmed text of caption paragraphs, then tries to save
     * a pending shape.
     */
    @Override
    public int visitParagraphStart(Paragraph paragraph) throws Exception {
        if (isCaptionParagraph(paragraph)) {
            mCaptions.push(paragraph.toString(SaveFormat.TEXT).trim());
        }
        saveShapeAsPdf();
        return super.visitParagraphStart(paragraph);
    }

    /**
     * Checks whether paragraph is likely to be an image caption.
     *
     * @param paragraph the paragraph to inspect
     * @return {@code true} if the concatenated text of the paragraph's runs starts
     *         with "Fig", "Scheme" or "Plate"
     */
    private static boolean isCaptionParagraph(Paragraph paragraph) throws Exception {
        // Take only Run text into account because if the caption is in a textbox,
        // paragraph.toString will return the same value for both the paragraph
        // inside the textbox shape and the paragraph that contains the textbox shape.
        StringBuilder runText = new StringBuilder();
        for (Run r : paragraph.getRuns()) {
            runText.append(r.getText());
        }
        String paraText = runText.toString();

        // "Fig" also covers "Figure", so no separate check is needed for it.
        return paraText.startsWith("Fig") ||
                paraText.startsWith("Scheme") ||
                paraText.startsWith("Plate");
    }

    /**
     * Save the last shape as a separate PDF document.
     *
     * Does nothing unless at least one caption and one shape have been collected.
     * Otherwise the most recently pushed shape is saved and both stacks are emptied.
     */
    private void saveShapeAsPdf() throws Exception {
        if (!mTopShapes.empty() && !mCaptions.empty()) {
            String caption = mCaptions.pop();
            System.out.println(mShapeCounter);
            System.out.println(caption);

            ShapeBase shape = mTopShapes.pop();

            // Create a temporary document which will be exported to PDF.
            // The document and the shape's section are cloned/imported without
            // their children, so the temporary document starts out empty.
            Document tmp = (Document) shape.getDocument().deepClone(false);
            Node tmpSection = tmp.importNode(shape.getAncestor(NodeType.SECTION), false, ImportFormatMode.USE_DESTINATION_STYLES);
            tmp.appendChild(tmpSection);
            tmp.ensureMinimum();

            // Import the shape together with its children and place it into the
            // first paragraph of the temporary document.
            Node resultShape = tmp.importNode(shape, true, ImportFormatMode.USE_DESTINATION_STYLES);
            tmp.getFirstSection().getBody().getFirstParagraph().appendChild(resultShape);

            // Format the output file path.
            String outFilePath = mTargetFolder + "image_" + mShapeCounter + ".pdf";
            tmp.save(outFilePath);
            mShapeCounter++;

            // Empty stacks.
            mTopShapes.clear();
            mCaptions.clear();
        }
    }

    /**
     * Returns the folder with a trailing path separator appended when it is missing,
     * so that a file name can be concatenated to it directly.
     */
    private static String withTrailingSeparator(String folder) {
        // An empty folder stays empty: appending a separator would point at the root.
        if (folder.isEmpty() || folder.endsWith("/") || folder.endsWith("\\")) {
            return folder;
        }
        return folder + java.io.File.separator;
    }

    // While visiting, captions and shapes are pushed onto these stacks;
    // only top-level shapes are collected.
    private final Stack<ShapeBase> mTopShapes = new Stack<ShapeBase>();
    private final Stack<String> mCaptions = new Stack<String>();
    // Number of shapes saved so far; used to build the output file name.
    private int mShapeCounter = 0;
    // Output folder, normalised in the constructor to end with a path separator
    // (unless it is empty).
    private final String mTargetFolder;
}