Extraction issue 6

e503824 · June 6, 2022, 8:55am

Dear team,

We are facing an image extraction issue with the document below. Please find the source code and the input and output files attached.

Source code :

class ImageExtractor extends DocumentVisitor {

    public ImageExtractor(String targetFolder, Document doc) {
        mTargetFolder = targetFolder;
        sourceDoc=doc;
    }

	    /**
	     * Removes images from the source document and inserts bookmark at the image position.
	     */
	 public void RemoveImagesFromSourceDocument()
	 {
	     try {
		 for (String key : mNodesToRemove.keySet())
	     {
	         Node nodeToRemove = mNodesToRemove.get(key);
	
	         DocumentBuilder builder = new DocumentBuilder((Document)nodeToRemove.getDocument());
	         // In case of table move cursor to the next paragraph.
	         if (nodeToRemove.getNodeType() == NodeType.TABLE)
	             builder.moveTo(nodeToRemove.getNextSibling());
	         else
	             builder.moveTo(nodeToRemove);
	         
	         // Insert bookmark.
	         /*builder.startBookmark(key);
	         builder.endBookmark(key);*/
	
	         // Remove image node.
	         nodeToRemove.remove();
	     }}
	     catch(Exception e){logger.info(e.getMessage());}
	 }

	@Override
    public int visitRowStart(Row row) throws Exception {

        if(row.getChildNodes(NodeType.SHAPE, true).getCount()>0)
            mRows.push(row);
        return super.visitRowStart(row);
    }

    @Override
    public int visitGroupShapeStart(GroupShape groupShape) throws Exception {

        if (groupShape.isTopLevel())
            mTopShapes.push(groupShape);

        saveShapeAsPdf();
        
        return super.visitGroupShapeStart(groupShape);
    }

    @Override
    public int visitShapeStart(Shape shape) throws Exception {

        if (shape.isTopLevel() &&
                shape.getChildNodes(NodeType.PARAGRAPH, true).getCount() == 0) {
            mTopShapes.push(shape);
        }
        
        saveShapeAsPdf();
        return super.visitShapeStart(shape);
    }

    @Override
    public int visitParagraphStart(Paragraph paragraph) throws Exception {

        if (isCaptionParagraph(paragraph))
            mCaptions.push(paragraph.toString(SaveFormat.TEXT).trim());

        saveShapeAsPdf();

        return super.visitParagraphStart(paragraph);
    }

    /**
     * Checks whether paragraph is likely to be an image caption.
     */
    private static boolean isCaptionParagraph(Paragraph paragraph) throws Exception {
        // Get only Run text in account because if caption is in a textbox
        // paragraph.toString will return the same value for both
        // paragraph inside textbox shape and for paragraph that contains textbox shape.

        // Some captions are in textboxes.
        boolean isInshape = paragraph.getAncestor(NodeType.SHAPE)!=null;
        
        
        // Some caption are marked as bold
        boolean isBold = false;
        // More conditions might be added here to better distinguish captions.
        // .........
        
        
        
        String paraText = "";
        for (Run r : paragraph.getRuns()) {
            paraText += r.getText();
            isBold |= r.getFont().getBold();
        }

        return (isInshape || isBold) && (paraText.startsWith("Fig") ||
                paraText.startsWith("Scheme") ||
                paraText.startsWith("Plate") ||
                paraText.startsWith("Figure"));
    }

    /**
     * Save the last shape as a separate PDF document.
     */
    private void saveShapeAsPdf() throws Exception {
        try {
        	
    	if (!mTopShapes.empty() && !mCaptions.empty()) {
    		
            String caption = mCaptions.pop();
           
            Node imageNode = mTopShapes.peek();
            
            if(imageNode.getParentNode().getChildNodes(NodeType.TABLE, true) != null) {
            
            // Create e temporary document which will be exported to PDF.
            Document tmp = (Document) imageNode.getDocument().deepClone(false);
            Node tmpSection = tmp.importNode(imageNode.getAncestor(NodeType.SECTION), false, ImportFormatMode.USE_DESTINATION_STYLES);
            tmp.appendChild(tmpSection);
            tmp.ensureMinimum();

            if(mTopShapes.size() > 1 && !mRows.isEmpty())
            {
                Table imagesTable = (Table)mRows.peek().getParentTable().deepClone(false);
                while (!mRows.isEmpty())
                    imagesTable.prependChild(mRows.pop().deepClone(true));

                imageNode = imagesTable;
            }

            Node resultImage = tmp.importNode(imageNode, true, ImportFormatMode.USE_DESTINATION_STYLES);
            if(resultImage.getNodeType() == NodeType.TABLE)
                tmp.getFirstSection().getBody().prependChild(resultImage);
            else
                tmp.getFirstSection().getBody().getFirstParagraph().appendChild(resultImage);
            
            if(resultImage.isComposite()) {
                resultImage.getRange().unlinkFields();
                ((CompositeNode) resultImage).getChildNodes(NodeType.RUN, true).clear();
            }
            
            //System.out.println("caption: "+caption);
            String bookmarkname = AIE.formatImgcaption(caption, AIE.fileName);
            String newBookmarkName=bookmarkname.substring(bookmarkname.lastIndexOf('_') + 1);

            // Format the output file path.
            String outFilePath = mTargetFolder +newBookmarkName  + ".pdf";
            tmp.save(outFilePath);
            

            
            Paragraph pa=(Paragraph) imageNode.getParentNode();
            AIE.insertBookmark(sourceDoc, pa, bookmarkname);
            
           mNodesToRemove.put(newBookmarkName, imageNode);
            
            //imageNode.remove();
            
            AIE.configurationWork(bookmarkname, tmp, outFilePath);
            
            mShapeCounter++;

            // Empty stacks.
            mTopShapes.clear();
            mRows.clear();
            mCaptions.clear();
            
            }
            
        }
        }
        catch(Exception e) {
        	logger.info(e.getMessage());
        	}
        }
    

    // Upon visiting the shapes captions and shapes are pushed in stack.
    // only top level shapes will be collected.
    private Stack<ShapeBase> mTopShapes = new Stack<ShapeBase>();
    private Stack<Row> mRows = new Stack<Row>();
    private Stack<String> mCaptions = new Stack<String>();
    private int mShapeCounter = 0;
    private String mTargetFolder;
    private Document sourceDoc;
    HashMap<String, Node> mNodesToRemove = new HashMap<String, Node>();
    private static org.apache.logging.log4j.Logger logger = LogManager.getLogger(TextFrameImage.class);
    
}

Input and output : 3.zip (663.4 KB)

alexey.noskov · June 6, 2022, 4:37pm

@e503824 You should replace the following line of code in the saveShapeAsPdf method:

((CompositeNode) resultImage).getChildNodes(NodeType.RUN, true).clear();

with this:

// Collect every paragraph nested anywhere inside the extracted node.
Iterable<Paragraph> paragraphs  = ((CompositeNode) resultImage).getChildNodes(NodeType.PARAGRAPH, true);
for (Paragraph p : paragraphs)
{
    // Remove only the paragraphs recognized as captions; all other text
    // (e.g. text inside textboxes of a group shape) is kept.
    if(isCaptionParagraph(p))
        p.remove();
}

The old code removes all text from the extracted shape. In your case the extracted shapes are group shapes with textboxes. Since the old code removes all text content, you see only frames in the output.

e503824 · June 7, 2022, 6:13am

Dear team,

Thanks for your support; it's working fine now.