Dear team,
We are extracting images from docx but below case its extracting formulas also please find source code and input, output file. please do needful
Source Code :
class ImageExtractor extends DocumentVisitor {
public ImageExtractor(String targetFolder, Document doc) {
mTargetFolder = targetFolder;
sourceDoc=doc;
}
/**
* Removes images from the source document and inserts bookmark at the image position.
*/
public void RemoveImagesFromSourceDocument()
{
try {
for (String key : mNodesToRemove.keySet())
{
Node nodeToRemove = mNodesToRemove.get(key);
DocumentBuilder builder = new DocumentBuilder((Document)nodeToRemove.getDocument());
// In case of table move cursor to the next paragraph.
if (nodeToRemove.getNodeType() == NodeType.TABLE)
builder.moveTo(nodeToRemove.getNextSibling());
else
builder.moveTo(nodeToRemove);
// Insert bookmark.
/*builder.startBookmark(key);
builder.endBookmark(key);*/
// Remove image node.
nodeToRemove.remove();
}}
catch(Exception e){logger.info(e.getMessage());}
}
@Override
public int visitRowStart(Row row) throws Exception {
if(row.getChildNodes(NodeType.SHAPE, true).getCount()>0)
mRows.push(row);
return super.visitRowStart(row);
}
@Override
public int visitGroupShapeStart(GroupShape groupShape) throws Exception {
if (groupShape.isTopLevel())
mTopShapes.push(groupShape);
saveShapeAsPdf();
return super.visitGroupShapeStart(groupShape);
}
@Override
public int visitShapeStart(Shape shape) throws Exception {
if (shape.isTopLevel() &&
shape.getChildNodes(NodeType.PARAGRAPH, true).getCount() == 0) {
mTopShapes.push(shape);
}
saveShapeAsPdf();
return super.visitShapeStart(shape);
}
@Override
public int visitParagraphStart(Paragraph paragraph) throws Exception {
if (isCaptionParagraph(paragraph))
mCaptions.push(paragraph.toString(SaveFormat.TEXT).trim());
saveShapeAsPdf();
return super.visitParagraphStart(paragraph);
}
/**
* Checks whether paragraph is likely to be an image caption.
*/
private static boolean isCaptionParagraph(Paragraph paragraph) throws Exception {
// Get only Run text in account because if caption is in a textbox
// paragraph.toString will return the same value for both
// paragraph inside textbox shape and for paragraph that contains textbox shape.
// Some captions are in textboxes.
boolean isInshape = paragraph.getAncestor(NodeType.SHAPE)!=null;
// Some caption are marked as bold
boolean isBold = false;
// More conditions might be added here to better distinguish captions.
// .........
String paraText = "";
for (Run r : paragraph.getRuns()) {
paraText += r.getText();
isBold |= r.getFont().getBold();
}
return (isInshape || isBold) && (paraText.startsWith("Fig") ||
paraText.startsWith("Scheme") ||
paraText.startsWith("Plate") ||
paraText.startsWith("Figure")) && !paraText.contains(AIE.docName);
}
/**
* Save the last shape as a separate PDF document.
*/
private void saveShapeAsPdf() throws Exception {
try {
if (!mTopShapes.empty() && !mCaptions.empty()) {
String caption = mCaptions.pop();
Node imageNode = mTopShapes.peek();
if(imageNode.getParentNode().getChildNodes(NodeType.TABLE, true) != null) {
// Create e temporary document which will be exported to PDF.
Document tmp = (Document) imageNode.getDocument().deepClone(false);
Node tmpSection = tmp.importNode(imageNode.getAncestor(NodeType.SECTION), false, ImportFormatMode.USE_DESTINATION_STYLES);
tmp.appendChild(tmpSection);
tmp.ensureMinimum();
if(mTopShapes.size() > 1 && !mRows.isEmpty())
{
Table imagesTable = (Table)mRows.peek().getParentTable().deepClone(false);
while (!mRows.isEmpty())
imagesTable.prependChild(mRows.pop().deepClone(true));
imageNode = imagesTable;
}
Node resultImage = tmp.importNode(imageNode, true, ImportFormatMode.USE_DESTINATION_STYLES);
if(resultImage.getNodeType() == NodeType.TABLE)
tmp.getFirstSection().getBody().prependChild(resultImage);
else
tmp.getFirstSection().getBody().getFirstParagraph().appendChild(resultImage);
if(resultImage.isComposite()) {
resultImage.getRange().unlinkFields();
//((CompositeNode) resultImage).getChildNodes(NodeType.RUN, true).clear();
Iterable<Paragraph> paragraphs = ((CompositeNode) resultImage).getChildNodes(NodeType.PARAGRAPH, true);
for (Paragraph p : paragraphs)
{
if(isCaptionParagraph(p))
p.remove();
}
}
//System.out.println("caption: "+caption);
String bookmarkname = AIE.formatImgcaption(caption, AIE.fileName);
String newBookmarkName=bookmarkname.substring(bookmarkname.lastIndexOf('_') + 1);
// Format the output file path.
String outFilePath = mTargetFolder +newBookmarkName + ".pdf";
tmp.save(outFilePath);
Paragraph pa=(Paragraph) imageNode.getParentNode();
AIE.insertBookmark(sourceDoc, pa, bookmarkname);
mNodesToRemove.put(newBookmarkName, imageNode);
//imageNode.remove();
AIE.configurationWork(bookmarkname, tmp, outFilePath);
mShapeCounter++;
// Empty stacks.
mTopShapes.clear();
mRows.clear();
mCaptions.clear();
}
}
}
catch(Exception e) {
logger.info(e.getMessage());
}
}
// Upon visiting the shapes captions and shapes are pushed in stack.
// only top level shapes will be collected.
private Stack<ShapeBase> mTopShapes = new Stack<ShapeBase>();
private Stack<Row> mRows = new Stack<Row>();
private Stack<String> mCaptions = new Stack<String>();
private int mShapeCounter = 0;
private String mTargetFolder;
private Document sourceDoc;
HashMap<String, Node> mNodesToRemove = new HashMap<String, Node>();
private static org.apache.logging.log4j.Logger logger = LogManager.getLogger(TextFrameImage.class);
}
Input and output : Test.zip (2.3 MB)
please do needful