Dear team,
We are extracting images from DOCX documents, but in the case below we are not able to extract them. Please find the source code below.
Source code :
// Decides whether `paragraph` is handled as a figure caption. The condition is a conjunction of:
//   1. the paragraph text starts with one of the caption keywords,
//   2. a check on the following sibling(s), or on the last paragraph of the section body,
//   3. a check on the preceding sibling, or on the first paragraph of the section body,
//   4. exclusions: tables, table cells, the document name and caption-list headings.
// NOTE(review): only the first keyword test is lower-cased and trimmed; the other four are
// case-sensitive and run on the untrimmed text. "Abbildung" is already covered by "Abb".
if ((paragraph.toString(SaveFormat.TEXT).toLowerCase().trim().startsWith("fig")
|| paragraph.toString(SaveFormat.TEXT).startsWith("Scheme")
|| paragraph.toString(SaveFormat.TEXT).startsWith("Plate")
|| paragraph.toString(SaveFormat.TEXT).startsWith("Abb")
|| paragraph.toString(SaveFormat.TEXT).startsWith("Abbildung"))
// Duplicate figure caption handling (it-15). Accepts the paragraph when either
//   a) a next sibling exists and its text does not match `matches`, or
//   b) a non-table next sibling matches `matches` and the shape checks below pass, or
//   c) the last paragraph of the section body matches `matches`.
&& (paragraph.getNextSibling() != null
&& !paragraph.getNextSibling().toString(SaveFormat.TEXT).trim().matches(matches)
|| (paragraph.getNextSibling() != null
&& paragraph.getNextSibling().getNodeType() != NodeType.TABLE
&& paragraph.getNextSibling().toString(SaveFormat.TEXT).trim().matches(matches)
// NOTE(review): the sibling is cast to Paragraph after only ruling out TABLE;
// any other non-paragraph sibling would fail the cast - confirm this cannot occur.
&& (((Paragraph) paragraph.getNextSibling()).getChildNodes(NodeType.SHAPE, true)
.getCount() > 0
|| (paragraph.getNextSibling().getNextSibling()) != null
&& paragraph.getNextSibling().getNextSibling()
.getNodeType() != NodeType.TABLE
// NOTE(review): the two shape-count tests below (== 0 or > 0) together accept any
// count, so once this point is reached they no longer filter anything.
&& ((((Paragraph) paragraph.getNextSibling().getNextSibling())
.getChildNodes(NodeType.SHAPE, true).getCount() == 0)
// This condition was added by pavi (14-12-2021) for duplicate captions.
||(((Paragraph) paragraph.getNextSibling().getNextSibling())
.getChildNodes(NodeType.SHAPE, true).getCount() > 0))))
|| paragraph.getParentSection().getBody().getLastParagraph().getText().trim()
.matches(matches))
// Duplicate figure caption handling: a previous sibling must exist and must not be a table,
// unless the first paragraph of the section body matches `matches`.
&& ((paragraph.getPreviousSibling() != null
&& paragraph.getPreviousSibling().getNodeType() != NodeType.TABLE)
|| paragraph.getParentSection().getBody().getFirstParagraph().getText().trim()
.matches(matches))
// Exclusions: the node itself is not a table, is not directly inside a table cell,
// and its text does not contain the document name.
&& paragraph.getNodeType() != NodeType.TABLE
&& paragraph.getParentNode().getNodeType() != NodeType.CELL
&& !paragraph.toString(SaveFormat.TEXT).contains(AIE.docName)
// Condition added by pavi (14-12-2021).
// NOTE(review): this disjunction is true for every paragraph, because no text can start
// with both "Figure Captions" and "Figures"; `&&` was probably intended - confirm.
&& (!(paragraph.toString(SaveFormat.TEXT).trim().startsWith("Figure Captions"))||
!(paragraph.toString(SaveFormat.TEXT).trim().startsWith("Figures"))))
//|| ((paragraph.getNextSibling() == null) && (builder.getCurrentParagraph().isEndOfDocument()))
{
input file : clear-Manuscript_2022.5.2-3.57 (1).docx (7.4 MB)
@e503824 This is a complicated one, because several separate shapes, which are not grouped, fall under one caption. I am afraid it will not be possible to handle such a case using your approach with a complicated if
statement. However, I managed to achieve this using the DocumentVisitor
approach I had suggested earlier:
/**
 * Document visitor that pairs top-level shapes with caption paragraphs and saves each
 * caption's shapes as a separate PDF file named {@code image_<n>.pdf} in the target folder.
 *
 * <p>Shapes, table rows containing shapes, and caption texts are collected on stacks while the
 * document is visited. As soon as at least one shape and one caption are pending, the pending
 * shapes are exported and the stacks are emptied. The exported source nodes are remembered so
 * that {@link #RemoveImagesFromSourceDocument()} can later replace them with bookmarks.
 */
private static class ImageExtractor extends DocumentVisitor {

    /** Prog ID of the embedded equation OLE objects that are not treated as images. */
    private static final String OLE_EQUATION_PROG_ID = "Equation.DSMT4";

    // While visiting, captions and shapes are pushed onto these stacks;
    // only top-level shapes are collected.
    private final Stack<ShapeBase> mTopShapes = new Stack<ShapeBase>();
    private final Stack<Row> mRows = new Stack<Row>();
    private final Stack<String> mCaptions = new Stack<String>();
    private int mShapeCounter = 0;
    private final String mTargetFolder;
    // Source-document nodes that should be deleted and replaced with a bookmark,
    // keyed by the bookmark name ("image_<n>").
    HashMap<String, ArrayList<Node>> mNodesToRemove = new HashMap<String, ArrayList<Node>>();

    /**
     * @param targetFolder prefix prepended verbatim to every output file name; no path
     *                     separator is added, so it should end with one
     */
    public ImageExtractor(String targetFolder) {
        mTargetFolder = targetFolder;
    }

    /** Remembers every table row that contains at least one shape at any depth. */
    @Override
    public int visitRowStart(Row row) throws Exception {
        if (row.getChildNodes(NodeType.SHAPE, true).getCount() > 0) {
            mRows.push(row);
        }
        return super.visitRowStart(row);
    }

    /** Collects top-level group shapes, then exports if a caption is already pending. */
    @Override
    public int visitGroupShapeStart(GroupShape groupShape) throws Exception {
        if (groupShape.isTopLevel()) {
            mTopShapes.push(groupShape);
        }
        saveShapeAsPdf();
        return super.visitGroupShapeStart(groupShape);
    }

    /**
     * Collects top-level shapes that contain no paragraphs (i.e. are not text containers) and
     * are not embedded equations, then exports if a caption is already pending.
     */
    @Override
    public int visitShapeStart(Shape shape) throws Exception {
        boolean hasNoText = shape.getChildNodes(NodeType.PARAGRAPH, true).getCount() == 0;
        if (shape.isTopLevel() && hasNoText && !isOleEquation(shape)) {
            mTopShapes.push(shape);
        }
        saveShapeAsPdf();
        return super.visitShapeStart(shape);
    }

    /**
     * Records caption paragraphs and exports the pending shapes, if any. The children of a
     * caption paragraph are not visited.
     */
    @Override
    public int visitParagraphStart(Paragraph paragraph) throws Exception {
        if (isCaptionParagraph(paragraph)) {
            mCaptions.push(paragraph.toString(SaveFormat.TEXT).trim());
            saveShapeAsPdf();
            return VisitorAction.SKIP_THIS_NODE;
        }
        return super.visitParagraphStart(paragraph);
    }

    /**
     * Removes images from the source document and inserts bookmark at the image position.
     * The bookmark name is the key under which the image nodes were recorded.
     */
    public void RemoveImagesFromSourceDocument()
    {
        for (java.util.Map.Entry<String, ArrayList<Node>> entry : mNodesToRemove.entrySet()) {
            String key = entry.getKey();
            ArrayList<Node> nodesToRemove = entry.getValue();
            if (nodesToRemove.isEmpty()) {
                continue;
            }

            Node firstNode = nodesToRemove.get(0);
            DocumentBuilder builder = new DocumentBuilder((Document) firstNode.getDocument());
            // In case of table move cursor to the next paragraph.
            if (firstNode.getNodeType() == NodeType.TABLE) {
                builder.moveTo(firstNode.getNextSibling());
            } else {
                builder.moveTo(firstNode);
            }

            // Insert bookmark.
            builder.startBookmark(key);
            builder.endBookmark(key);

            // Remove all image nodes.
            for (Node n : nodesToRemove) {
                n.remove();
            }
        }
    }

    /**
     * Checks whether paragraph is likely to be an image caption: it either contains a SEQ
     * field or the text of its runs starts with "Fig", "Scheme" or "Plate".
     */
    private static boolean isCaptionParagraph(Paragraph paragraph) throws Exception {
        // Caption often contain SEQ fields.
        for (Field f : paragraph.getRange().getFields()) {
            if (f.getType() == FieldType.FIELD_SEQUENCE) {
                return true;
            }
        }

        // More conditions might be added here to better distinguish captions.

        // Take only Run text into account, because if the caption is in a textbox,
        // paragraph.toString returns the same value for both the paragraph inside the textbox
        // shape and the paragraph that contains the textbox shape.
        StringBuilder runText = new StringBuilder();
        for (Run r : paragraph.getRuns()) {
            runText.append(r.getText());
        }
        String paraText = runText.toString();

        // "Fig" also covers "Figure".
        return paraText.startsWith("Fig")
                || paraText.startsWith("Scheme")
                || paraText.startsWith("Plate");
    }

    /**
     * Check whether shape is an embedded Equation.DSMT4 OLE object.
     */
    private static boolean isOleEquation(Shape shape)
    {
        // Constant-first equals: an OLE object without a prog ID is simply not an equation.
        return (shape.getOleFormat() != null)
                && OLE_EQUATION_PROG_ID.equals(shape.getOleFormat().getProgId());
    }

    /**
     * Saves all pending shapes as one separate PDF document, provided that at least one shape
     * and one caption are pending; otherwise does nothing.
     *
     * <p>If more than one shape is pending and rows with shapes were seen, the rows are exported
     * as a table; otherwise the shapes themselves are exported. Afterwards all stacks are
     * emptied and the shape counter is advanced.
     */
    private void saveShapeAsPdf() throws Exception {
        if (mTopShapes.empty() || mCaptions.empty()) {
            return;
        }

        String caption = mCaptions.pop();
        System.out.println(mShapeCounter);
        System.out.println(caption);

        // Create a temporary document which will be exported to PDF. It gets a shallow copy of
        // the section that holds the most recently collected shape.
        ShapeBase lastShape = mTopShapes.peek();
        Document tmp = (Document) lastShape.getDocument().deepClone(false);
        Node tmpSection = tmp.importNode(lastShape.getAncestor(NodeType.SECTION), false,
                ImportFormatMode.USE_DESTINATION_STYLES);
        tmp.appendChild(tmpSection);
        tmp.ensureMinimum();

        // There might be several shape to import under one caption.
        ArrayList<Node> nodesToImport = new ArrayList<Node>();
        ArrayList<Node> nodesToRemove = new ArrayList<Node>();
        if (mTopShapes.size() > 1 && !mRows.isEmpty()) {
            ArrayList<Row> sourceRows = new ArrayList<Row>(mRows);
            Table imagesTable = (Table) mRows.peek().getParentTable().deepClone(false);
            while (!mRows.isEmpty()) {
                Row r = mRows.pop();
                // Rows are popped last-first, so prepending restores the original order.
                imagesTable.prependChild(r.deepClone(true));
            }
            nodesToImport.add(imagesTable);

            // The exported table is a detached clone: it has no siblings and removing it would
            // not touch the source document. Register the original shapes located in the
            // exported rows instead, so they can be found and replaced with a bookmark later.
            for (ShapeBase shape : mTopShapes) {
                if (sourceRows.contains(shape.getAncestor(NodeType.ROW))) {
                    nodesToRemove.add(shape);
                }
            }
        } else {
            while (!mTopShapes.isEmpty()) {
                nodesToImport.add(mTopShapes.pop());
            }
            nodesToRemove.addAll(nodesToImport);
        }

        String key = "image_" + mShapeCounter;
        mNodesToRemove.put(key, nodesToRemove);

        for (Node imageNode : nodesToImport) {
            Node resultImage = tmp.importNode(imageNode, true, ImportFormatMode.USE_DESTINATION_STYLES);
            if (resultImage.isComposite()) {
                // Keep only the graphics: replace fields with their results, then drop all runs.
                resultImage.getRange().unlinkFields();
                ((CompositeNode) resultImage).getChildNodes(NodeType.RUN, true).clear();
            }
            if (resultImage.getNodeType() == NodeType.TABLE) {
                tmp.getFirstSection().getBody().prependChild(resultImage);
            } else {
                tmp.getFirstSection().getBody().getFirstParagraph().prependChild(resultImage);
            }
        }

        // Format the output file path.
        String outFilePath = mTargetFolder + key + ".pdf";
        tmp.save(outFilePath);
        mShapeCounter++;

        // Empty stacks.
        mTopShapes.clear();
        mRows.clear();
        mCaptions.clear();
    }
}