Extraction Image naming issue

e503824 · August 16, 2022, 12:06pm

Dear team,

We are extracting images from a DOCM file. In the case below the images are extracted, but the image names are mismatched. Please find the input and output files:

Input and Output : 6 Need to check.zip (3.4 MB)

We are using the source code below for extraction:

/**
 * Document visitor that pairs top-level shapes with the caption paragraph that follows them,
 * exports every pair as a separate PDF and remembers the exported shape so that it can be
 * removed from the source document afterwards.
 *
 * Shapes, table rows that contain images and caption texts are collected on stacks while the
 * document is traversed; as soon as at least one shape and one caption are available the
 * figure is exported (see {@link #saveShapeAsPdf()}).
 */
class ImageExtractor extends DocumentVisitor {

    /**
     * @param targetFolder output folder; it is concatenated directly with the file name,
     *                     so it has to end with a path separator
     * @param doc          source document, handed to {@code AIE.insertBookmark} for every exported figure
     */
    public ImageExtractor(String targetFolder, Document doc) {
        mTargetFolder = targetFolder;
        sourceDoc = doc;
    }

    /**
     * Removes the exported images from the source document.
     *
     * The bookmarks are not created here; they are inserted during extraction through
     * {@code AIE.insertBookmark}. Every node is removed independently, so a failure on one
     * node is logged and does not prevent the remaining nodes from being removed.
     */
    public void RemoveImagesFromSourceDocument() {
        for (Node nodeToRemove : mNodesToRemove.values()) {
            if (nodeToRemove == null) {
                continue;
            }
            try {
                nodeToRemove.remove();
            } catch (Exception e) {
                // Best effort: keep going, but keep the stack trace for diagnosis.
                logger.error("Failed to remove an extracted image node", e);
            }
        }
    }

    /** Remembers rows that contain at least one non-equation shape. */
    @Override
    public int visitRowStart(Row row) throws Exception {
        if (rowHasImage(row)) {
            mRows.push(row);
        }
        return super.visitRowStart(row);
    }

    /** Collects top-level group shapes and tries to export a pending figure. */
    @Override
    public int visitGroupShapeStart(GroupShape groupShape) throws Exception {
        if (groupShape.isTopLevel()) {
            mTopShapes.push(groupShape);
        }
        saveShapeAsPdf();
        return super.visitGroupShapeStart(groupShape);
    }

    /**
     * Collects top-level shapes that contain no paragraphs (i.e. are not text boxes) and are
     * not OLE equations, then tries to export a pending figure.
     */
    @Override
    public int visitShapeStart(Shape shape) throws Exception {
        boolean hasNoText = shape.getChildNodes(NodeType.PARAGRAPH, true).getCount() == 0;
        if (shape.isTopLevel() && hasNoText && !FixedGraphic.isOleEquation(shape)) {
            mTopShapes.push(shape);
        }
        saveShapeAsPdf();
        return super.visitShapeStart(shape);
    }

    /** Collects the text of caption paragraphs and tries to export a pending figure. */
    @Override
    public int visitParagraphStart(Paragraph paragraph) throws Exception {
        if (isCaptionParagraph(paragraph)) {
            mCaptions.push(paragraph.toString(SaveFormat.TEXT).trim());
        }
        saveShapeAsPdf();
        return super.visitParagraphStart(paragraph);
    }

    /**
     * Checks whether paragraph is likely to be an image caption.
     *
     * A caption is a paragraph that is bold or located inside a shape, starts with one of the
     * known caption prefixes, does not contain {@code AIE.docName} and is not a plural
     * reference such as "Figs ..." or "Figures ...".
     */
    private static boolean isCaptionParagraph(Paragraph paragraph) throws Exception {
        // Only Run text is taken into account: if the caption is in a textbox,
        // paragraph.toString would return the same value for the paragraph inside the
        // textbox shape and for the paragraph that contains the textbox shape.

        // Some captions are in textboxes.
        boolean isInShape = paragraph.getAncestor(NodeType.SHAPE) != null;

        // Some captions are marked as bold.
        boolean isBold = false;
        StringBuilder runText = new StringBuilder();
        for (Run r : paragraph.getRuns()) {
            runText.append(r.getText());
            isBold |= r.getFont().getBold();
        }
        String paraText = runText.toString();

        // "Fig" also covers "Figure", so no separate test for the longer prefix is needed.
        boolean hasCaptionPrefix = paraText.startsWith("Fig")
                || paraText.startsWith("Scheme")
                || paraText.startsWith("Plate");

        // Locale.ROOT keeps the comparison independent of the default locale
        // (e.g. the Turkish dotless i would otherwise break the "figs" test).
        String lowerText = paraText.toLowerCase(java.util.Locale.ROOT);
        boolean isPluralReference = lowerText.startsWith("figs") || lowerText.startsWith("figures");

        return (isInShape || isBold)
                && hasCaptionPrefix
                && !paraText.contains(AIE.docName)
                && !isPluralReference;
    }

    /** Checks whether the row contains at least one shape that is not an OLE equation. */
    private static boolean rowHasImage(Row row) {
        NodeCollection shapes = row.getChildNodes(NodeType.SHAPE, true);
        for (Shape s : (Iterable<Shape>) shapes) {
            if (!isOleEquation(s)) {
                return true;
            }
        }
        return false;
    }

    /** Checks whether the shape is an embedded Equation.DSMT4 OLE object. */
    private static boolean isOleEquation(Shape shape) {
        // Constant-first comparison: a missing ProgId must not cause a NullPointerException.
        return (shape.getOleFormat() != null) && "Equation.DSMT4".equals(shape.getOleFormat().getProgId());
    }

    /**
     * Save the last shape as a separate PDF document.
     *
     * Does nothing until both a shape and a caption have been collected. Failures are logged
     * and not propagated, so one broken figure does not stop the traversal.
     */
    private void saveShapeAsPdf() throws Exception {
        if (mTopShapes.empty() || mCaptions.empty()) {
            return;
        }

        try {
            String caption = mCaptions.pop();

            // The shape that still lives in the source document. It anchors the bookmark and is
            // the node removed later, even when a cloned table is what gets exported.
            ShapeBase anchorShape = mTopShapes.peek();
            Node imageNode = anchorShape;

            // Create a temporary document which will be exported to PDF.
            Document tmp = (Document) anchorShape.getDocument().deepClone(false);
            Node tmpSection = tmp.importNode(anchorShape.getAncestor(NodeType.SECTION), false, ImportFormatMode.USE_DESTINATION_STYLES);
            tmp.appendChild(tmpSection);
            tmp.ensureMinimum();

            // Several shapes collected from table rows are exported together as one table.
            if (mTopShapes.size() > 1 && !mRows.isEmpty()) {
                Table imagesTable = (Table) mRows.peek().getParentTable().deepClone(false);
                while (!mRows.isEmpty()) {
                    imagesTable.prependChild(mRows.pop().deepClone(true));
                }
                imageNode = imagesTable;
            }

            Node resultImage = tmp.importNode(imageNode, true, ImportFormatMode.USE_DESTINATION_STYLES);
            if (resultImage.getNodeType() == NodeType.TABLE) {
                tmp.getFirstSection().getBody().prependChild(resultImage);
            } else {
                tmp.getFirstSection().getBody().getFirstParagraph().appendChild(resultImage);
            }

            if (resultImage.isComposite()) {
                resultImage.getRange().unlinkFields();
                // Iterate over a snapshot: removing nodes while walking the live collection
                // can skip paragraphs.
                for (Node node : ((CompositeNode) resultImage).getChildNodes(NodeType.PARAGRAPH, true).toArray()) {
                    Paragraph p = (Paragraph) node;
                    if (isCaptionParagraph(p)) {
                        p.remove();
                    }
                }
            }

            String bookmarkname = AIE.formatImgcaption(caption, AIE.fileName);
            String newBookmarkName = bookmarkname.substring(bookmarkname.lastIndexOf('_') + 1);

            // Format the output file path.
            String outFilePath = mTargetFolder + newBookmarkName + ".pdf";
            File f = new File(outFilePath);
            if (!f.exists()) {
                PdfSaveOptions opts = new PdfSaveOptions();
                opts.getDownsampleOptions().setDownsampleImages(false);
                opts.setCompliance(PdfCompliance.PDF_A_1_B);

                // The options have to be passed explicitly, otherwise they are ignored.
                tmp.save(outFilePath, opts);

                // The exported node may be a detached clone (table case) without a parent,
                // therefore the bookmark position is taken from the original shape.
                Paragraph pa = (Paragraph) anchorShape.getParentNode();
                AIE.insertBookmark(sourceDoc, pa, bookmarkname);

                // NOTE(review): only the anchor shape is registered; when several shapes were
                // exported as one table the other shapes stay in the source document.
                mNodesToRemove.put(newBookmarkName, anchorShape);

                AIE.configurationWork(bookmarkname, tmp, outFilePath);
            }
            mShapeCounter++;

            // Empty stacks.
            mTopShapes.clear();
            mRows.clear();
            mCaptions.clear();
        } catch (Exception e) {
            // Best effort: log with the stack trace and continue with the next figure.
            logger.error("Failed to export figure to PDF", e);
        }
    }

    // Upon visiting the shapes captions and shapes are pushed in stack.
    // only top level shapes will be collected.
    private final Stack<ShapeBase> mTopShapes = new Stack<ShapeBase>();
    private final Stack<Row> mRows = new Stack<Row>();
    private final Stack<String> mCaptions = new Stack<String>();
    private int mShapeCounter = 0;
    private final String mTargetFolder;
    private final Document sourceDoc;

    // Exported nodes keyed by the short bookmark name; removed by RemoveImagesFromSourceDocument().
    HashMap<String, Node> mNodesToRemove = new HashMap<String, Node>();

    // Keyed on this class so that the messages are attributed to the extractor.
    private static final org.apache.logging.log4j.Logger logger = LogManager.getLogger(ImageExtractor.class);

}

alexey.noskov · August 16, 2022, 7:34pm

@e503824 The second and third images are in a table. ImageExtractor considers all shapes in the table as a single figure (see the visitRowStart method). This was done because in your other documents there are figures represented by a table with the caption under the table. You can add an additional method to process such tables. For example, see the following code:

// Load the source document.
Document doc = new Document("C:\\Temp\\in.docm");
// Refresh list labels before visiting: isCaptionParagraph reads
// getListLabel().getLabelString() for paragraphs that are list items.
doc.updateListLabels();
// The target folder is concatenated directly with the output file name,
// hence the trailing path separator.
ImageExtractor extractor = new ImageExtractor("C:\\Temp\\out\\");
// Traverse the document; figures are saved as PDF files while the nodes are visited.
doc.accept(extractor);

/**
 * Document visitor that pairs top-level shapes with their caption paragraph and saves every
 * figure as a separate PDF file named after the caption.
 *
 * Shapes, image rows and caption texts are collected on stacks during traversal; a figure is
 * exported as soon as at least one shape and one caption are available. The original shapes
 * are remembered in {@code mNodesToRemove} so that {@link #RemoveImagesFromSourceDocument()}
 * can replace them with bookmarks.
 */
public class ImageExtractor extends DocumentVisitor {

    /**
     * @param targetFolder output folder; it is concatenated directly with the file name,
     *                     so it has to end with a path separator
     */
    public ImageExtractor(String targetFolder) {
        mTargetFolder = targetFolder;
    }

    /**
     * Special processing for "figure tables": a table with exactly two rows, the same number
     * of cells in both rows and a caption paragraph in the first cell of the last row.
     * Every cell of the first row is paired with the caption in the cell below it and the
     * table is not traversed any further.
     */
    @Override
    public int visitTableStart(Table table) throws Exception {
        if (isCaptionedImageTable(table)) {
            int cellCount = table.getFirstRow().getCells().getCount();
            for (int i = 0; i < cellCount; i++) {
                Cell firstRowCell = table.getFirstRow().getCells().get(i);
                Cell lastRowCell = table.getLastRow().getCells().get(i);

                Shape shape = (Shape) firstRowCell.getChild(NodeType.SHAPE, 0, true);
                if (shape != null && shape.isTopLevel()) {
                    mTopShapes.push(shape);
                }

                Paragraph captionCandidate = lastRowCell.getFirstParagraph();
                if (isCaptionParagraph(captionCandidate)) {
                    mCaptions.push(captionCandidate.toString(SaveFormat.TEXT).trim());
                }

                saveShapeAsPdf();
            }

            return VisitorAction.SKIP_THIS_NODE;
        }

        return VisitorAction.CONTINUE;
    }

    /**
     * Checks the "two rows, equal cell count, caption in the last row" layout.
     * An empty last row or an empty first cell simply means "not a figure table".
     */
    private static boolean isCaptionedImageTable(Table table) throws Exception {
        if (table.getRows().getCount() != 2) {
            return false;
        }
        if (table.getFirstRow().getCells().getCount() != table.getLastRow().getCells().getCount()) {
            return false;
        }
        Cell firstCaptionCell = table.getLastRow().getFirstCell();
        return firstCaptionCell != null && isCaptionParagraph(firstCaptionCell.getFirstParagraph());
    }

    /** Remembers rows that contain at least one non-equation shape. */
    @Override
    public int visitRowStart(Row row) throws Exception {
        if (rowHasImage(row)) {
            mRows.push(row);
        }
        return super.visitRowStart(row);
    }

    /** Collects top-level group shapes and tries to export a pending figure. */
    @Override
    public int visitGroupShapeStart(GroupShape groupShape) throws Exception {
        if (groupShape.isTopLevel()) {
            mTopShapes.push(groupShape);
        }
        saveShapeAsPdf();
        return super.visitGroupShapeStart(groupShape);
    }

    /**
     * Collects top-level shapes that contain no paragraphs (i.e. are not text boxes) and are
     * not OLE equations, then tries to export a pending figure.
     */
    @Override
    public int visitShapeStart(Shape shape) throws Exception {
        boolean hasNoText = shape.getChildNodes(NodeType.PARAGRAPH, true).getCount() == 0;
        if (shape.isTopLevel() && hasNoText && !isOleEquation(shape)) {
            mTopShapes.push(shape);
        }
        saveShapeAsPdf();
        return super.visitShapeStart(shape);
    }

    /**
     * Collects caption texts and tries to export a pending figure. Caption paragraphs are
     * not traversed any further.
     */
    @Override
    public int visitParagraphStart(Paragraph paragraph) throws Exception {
        if (isCaptionParagraph(paragraph)) {
            mCaptions.push(paragraph.toString(SaveFormat.TEXT).trim());
            saveShapeAsPdf();
            return VisitorAction.SKIP_THIS_NODE;
        }
        return super.visitParagraphStart(paragraph);
    }

    /**
     * Removes images from the source document and inserts bookmark at the image position.
     *
     * The bookmark is placed at the first registered node of every figure. Entries without a
     * usable position (empty list, or a table without a following sibling) are handled
     * without throwing: the bookmark is skipped and the nodes are still removed.
     */
    public void RemoveImagesFromSourceDocument() {
        for (String key : mNodesToRemove.keySet()) {

            ArrayList<Node> nodesToRemove = mNodesToRemove.get(key);
            if (nodesToRemove == null || nodesToRemove.size() == 0) {
                continue;
            }

            Node firstNode = nodesToRemove.get(0);

            // In case of table the bookmark goes to the node that follows the table.
            Node bookmarkPosition = (firstNode.getNodeType() == NodeType.TABLE)
                    ? firstNode.getNextSibling()
                    : firstNode;

            // A detached node or a table at the very end has no position to move to;
            // moving the builder to null would throw.
            if (bookmarkPosition != null) {
                DocumentBuilder builder = new DocumentBuilder((Document) firstNode.getDocument());
                builder.moveTo(bookmarkPosition);

                // Insert bookmark.
                builder.startBookmark(key);
                builder.endBookmark(key);
            }

            // Remove all image nodes.
            for (Node n : nodesToRemove) {
                n.remove();
            }
        }
    }

    /**
     * Checks whether row has shapes except formulas.
     */
    private static boolean rowHasImage(Row row) {
        NodeCollection shapes = row.getChildNodes(NodeType.SHAPE, true);
        for (Shape s : (Iterable<Shape>) shapes) {
            if (!isOleEquation(s)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Checks whether paragraph is likely to be an image caption.
     *
     * A caption is a paragraph shorter than 200 characters that either contains a SEQ field
     * or starts with one of the known caption prefixes. For list items the list label is
     * taken into account, which requires {@code Document.updateListLabels()} to be called
     * before the document is visited.
     *
     * @param paragraph paragraph to check; {@code null} is treated as "not a caption"
     */
    private static boolean isCaptionParagraph(Paragraph paragraph) throws Exception {
        if (paragraph == null) {
            return false;
        }

        // Only Run text is taken into account: if the caption is in a textbox,
        // paragraph.toString would return the same value for the paragraph inside the
        // textbox shape and for the paragraph that contains the textbox shape.

        // Caption often contain SEQ fields.
        boolean hasSeqFields = false;
        for (Field f : paragraph.getRange().getFields()) {
            hasSeqFields |= (f.getType() == FieldType.FIELD_SEQUENCE);
        }
        // More conditions might be added here to better distinguish captions.

        StringBuilder text = new StringBuilder();
        if (paragraph.isListItem()) {
            text.append(paragraph.getListLabel().getLabelString());
        }
        for (Run r : paragraph.getRuns()) {
            text.append(r.getText());
        }
        String paraText = text.toString();

        // "Fig" also covers "Figure", so no separate test for the longer prefix is needed.
        boolean hasCaptionLikeContent = paraText.startsWith("Fig")
                || paraText.startsWith("Scheme")
                || paraText.startsWith("Plate")
                || paraText.startsWith("Flowchart");

        return (hasSeqFields || hasCaptionLikeContent) && (paraText.length() < 200);
    }

    /**
     * Check whether shape is an embedded Equation.DSMT4 OLE object
     */
    private static boolean isOleEquation(Shape shape) {
        // Constant-first comparison: a missing ProgId must not cause a NullPointerException.
        return (shape.getOleFormat() != null) && "Equation.DSMT4".equals(shape.getOleFormat().getProgId());
    }

    /**
     * Turns a caption into a string that can be used as a file name: path separators,
     * characters reserved on common file systems and control characters (captions often
     * contain tabs) are replaced by an underscore.
     *
     * @param caption  caption text
     * @param fallback name used when nothing usable is left
     */
    private static String toSafeFileName(String caption, String fallback) {
        String safeName = caption.replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]+", "_").trim();
        return safeName.isEmpty() ? fallback : safeName;
    }

    /**
     * Save the last shape as a separate PDF document.
     *
     * Does nothing until both a shape and a caption have been collected. All collected
     * shapes are exported under the most recent caption; when several shapes were collected
     * from table rows, the rows are exported together as one table.
     */
    private void saveShapeAsPdf() throws Exception {
        if (mTopShapes.empty() || mCaptions.empty()) {
            return;
        }

        String caption = mCaptions.pop();
        System.out.println(mShapeCounter);
        System.out.println(caption);

        // Create a temporary document which will be exported to PDF.
        ShapeBase topShape = mTopShapes.peek();
        Document tmp = (Document) topShape.getDocument().deepClone(false);
        Node tmpSection = tmp.importNode(topShape.getAncestor(NodeType.SECTION), false, ImportFormatMode.USE_DESTINATION_STYLES);
        tmp.appendChild(tmpSection);
        tmp.ensureMinimum();

        boolean exportAsTable = mTopShapes.size() > 1 && !mRows.isEmpty();

        // The shapes that live in the source document, most recently visited first.
        // These are the nodes to remove later; a cloned table is detached from the source
        // document and can neither be removed from it nor be used as a bookmark position.
        ArrayList<Node> sourceShapes = new ArrayList<Node>();
        while (!mTopShapes.isEmpty()) {
            sourceShapes.add(mTopShapes.pop());
        }

        // There might be several shape to import under one caption.
        ArrayList<Node> nodesToImport;
        if (exportAsTable) {
            Table imagesTable = (Table) mRows.peek().getParentTable().deepClone(false);
            while (!mRows.isEmpty()) {
                Row r = mRows.pop();
                imagesTable.prependChild(r.deepClone(true));
            }
            nodesToImport = new ArrayList<Node>();
            nodesToImport.add(imagesTable);
        } else {
            nodesToImport = sourceShapes;
        }

        String key = "image_" + mShapeCounter;
        mNodesToRemove.put(key, sourceShapes);

        for (Node imageNode : nodesToImport) {
            Node resultImage = tmp.importNode(imageNode, true, ImportFormatMode.USE_DESTINATION_STYLES);

            if (resultImage.isComposite()) {
                resultImage.getRange().unlinkFields();
                // Iterate over a snapshot: removing nodes while walking the live collection
                // can skip paragraphs.
                for (Node node : ((CompositeNode) resultImage).getChildNodes(NodeType.PARAGRAPH, true).toArray()) {
                    Paragraph p = (Paragraph) node;
                    if (isCaptionParagraph(p)) {
                        p.remove();
                    }
                }
            }

            if (resultImage.getNodeType() == NodeType.TABLE) {
                tmp.getFirstSection().getBody().prependChild(resultImage);
            } else {
                // Shapes were popped last-first, so prepending restores the document order.
                tmp.getFirstSection().getBody().getFirstParagraph().prependChild(resultImage);
            }
        }

        // Format the output file path.
        String outFilePath = mTargetFolder + toSafeFileName(caption, key) + ".pdf";
        tmp.save(outFilePath);

        mShapeCounter++;

        // Empty stacks.
        mTopShapes.clear();
        mRows.clear();
        mCaptions.clear();
    }

    // Upon visiting the shapes captions and shapes are pushed in stack.
    // only top level shapes will be collected.
    private final Stack<ShapeBase> mTopShapes = new Stack<ShapeBase>();
    private final Stack<Row> mRows = new Stack<Row>();
    private final Stack<String> mCaptions = new Stack<String>();
    private int mShapeCounter = 0;
    private final String mTargetFolder;

    // Dictionary with nodes that should be deleted and replaced with bookmark.
    HashMap<String, ArrayList<Node>> mNodesToRemove = new HashMap<String, ArrayList<Node>>();
}

e503824 · August 17, 2022, 9:32am

Dear team,

I’m getting the error below when using the public void RemoveImagesFromSourceDocument() method:

Error : Error.jpg (22.3 KB)

alexey.noskov · August 17, 2022, 7:18pm

@e503824 Looks like you are using some other implementation of ImageExtractor. I cannot reproduce the problem with my implementation. Please use the code I have provided in my previous answer or update your code accordingly.