Image extraction issue 14

e503824 · August 29, 2022, 4:24am

Dear team,

We are extracting images from a document, but in the case below the document contains landscape images. Some images are merged together and some are missed; please refer to the input and output files below.

Source File : 13.zip (6.6 MB)

Source Code :

/**
 * Document visitor that pairs top-level shapes with caption paragraphs and saves each
 * captioned group as a separate PDF file in the target folder.
 *
 * <p>While the document is visited, top-level shapes, table rows that contain images and
 * caption texts are pushed onto stacks. As soon as at least one shape and one caption have
 * been collected, the group is exported and the stacks are cleared.
 *
 * <p>The nodes exported for every group are remembered in {@code mNodesToRemove} so that
 * {@link #RemoveImagesFromSourceDocument()} can later replace them with bookmarks.
 */
private static class ImageExtractor extends DocumentVisitor {

    /** ProgId of the OLE equation objects that must not be treated as images. */
    private static final String EQUATION_PROG_ID = "Equation.DSMT4";

    /** Captions at least this long are not considered captions. */
    private static final int MAX_CAPTION_LENGTH = 200;

    // Upon visiting the document, captions and shapes are pushed onto these stacks.
    // Only top-level shapes are collected.
    private final Stack<ShapeBase> mTopShapes = new Stack<ShapeBase>();
    private final Stack<Row> mRows = new Stack<Row>();
    private final Stack<String> mCaptions = new Stack<String>();
    private int mShapeCounter = 0;
    private final String mTargetFolder;

    // Dictionary with nodes that should be deleted and replaced with a bookmark.
    // Key is the bookmark name ("image_<counter>").
    final HashMap<String, ArrayList<Node>> mNodesToRemove = new HashMap<String, ArrayList<Node>>();

    /**
     * @param targetFolder prefix of every output path; the file name is appended to it
     *                     verbatim, so it must end with a path separator.
     */
    public ImageExtractor(String targetFolder) {
        mTargetFolder = targetFolder;
    }

    /** Remembers rows that contain at least one non-equation shape. */
    @Override
    public int visitRowStart(Row row) throws Exception {
        if (rowHasImage(row)) {
            mRows.push(row);
        }

        return super.visitRowStart(row);
    }

    /** Collects top-level group shapes and exports the pending group, if any. */
    @Override
    public int visitGroupShapeStart(GroupShape groupShape) throws Exception {
        if (groupShape.isTopLevel()) {
            mTopShapes.push(groupShape);
        }

        saveShapeAsPdf();

        return super.visitGroupShapeStart(groupShape);
    }

    /**
     * Collects top-level shapes that contain no paragraphs and are not OLE equations,
     * then exports the pending group, if any.
     */
    @Override
    public int visitShapeStart(Shape shape) throws Exception {
        boolean hasNoParagraphs = shape.getChildNodes(NodeType.PARAGRAPH, true).getCount() == 0;
        if (shape.isTopLevel() && hasNoParagraphs && !isOleEquation(shape)) {
            mTopShapes.push(shape);
        }

        saveShapeAsPdf();

        return super.visitShapeStart(shape);
    }

    /**
     * Records caption paragraphs and exports the pending group. Caption paragraphs are not
     * descended into ({@code SKIP_THIS_NODE}).
     */
    @Override
    public int visitParagraphStart(Paragraph paragraph) throws Exception {
        if (isCaptionParagraph(paragraph)) {
            mCaptions.push(paragraph.toString(SaveFormat.TEXT).trim());
            saveShapeAsPdf();
            return VisitorAction.SKIP_THIS_NODE;
        }

        return super.visitParagraphStart(paragraph);
    }

    /**
     * Removes images from the source document and inserts a bookmark at the image position.
     *
     * <p>NOTE(review): for groups exported through the table branch of
     * {@code saveShapeAsPdf()}, the registered node is a detached clone of the table rather
     * than a node of the source document, so {@code getNextSibling()} presumably returns
     * {@code null} here and nothing is removed from the source - confirm and rework if
     * table-based figures must be replaced as well.
     */
    public void RemoveImagesFromSourceDocument()
    {
        for (String key : mNodesToRemove.keySet()) {

            ArrayList<Node> nodesToRemove = mNodesToRemove.get(key);
            if (nodesToRemove.isEmpty()) {
                continue;
            }

            Node firstNode = nodesToRemove.get(0);

            DocumentBuilder builder = new DocumentBuilder((Document) firstNode.getDocument());
            // In case of a table move the cursor to the node that follows the table.
            if (firstNode.getNodeType() == NodeType.TABLE) {
                builder.moveTo(firstNode.getNextSibling());
            } else {
                builder.moveTo(firstNode);
            }

            // Insert an empty bookmark that marks the former image position.
            builder.startBookmark(key);
            builder.endBookmark(key);

            // Remove all image nodes.
            for (Node n : nodesToRemove) {
                n.remove();
            }
        }
    }

    /**
     * Checks whether the row has shapes other than OLE equations.
     */
    private static boolean rowHasImage(Row row)
    {
        NodeCollection shapes = row.getChildNodes(NodeType.SHAPE, true);
        for (Shape s : (Iterable<Shape>) shapes) {
            if (!isOleEquation(s)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Checks whether the paragraph is likely to be an image caption.
     *
     * <p>A paragraph qualifies when it contains a SEQ field or its text (list label plus
     * the text of its runs) starts with a caption keyword, and that text is shorter than
     * {@code MAX_CAPTION_LENGTH} characters.
     */
    private static boolean isCaptionParagraph(Paragraph paragraph) throws Exception {
        // Captions often contain SEQ fields.
        boolean hasSeqFields = false;
        for (Field f : paragraph.getRange().getFields()) {
            if (f.getType() == FieldType.FIELD_SEQUENCE) {
                hasSeqFields = true;
                break;
            }
        }
        // More conditions might be added here to better distinguish captions.

        // Only Run text is taken into account: if the caption is in a textbox,
        // paragraph.toString would return the same value for both the paragraph inside
        // the textbox shape and the paragraph that contains the textbox shape.
        StringBuilder text = new StringBuilder();
        if (paragraph.isListItem()) {
            text.append(paragraph.getListLabel().getLabelString());
        }
        for (Run r : paragraph.getRuns()) {
            text.append(r.getText());
        }
        String paraText = text.toString();

        // "Fig" also covers "Figure", so no separate check is needed for the latter.
        boolean hasCaptionLikeContent = paraText.startsWith("Fig")
                || paraText.startsWith("Scheme")
                || paraText.startsWith("Plate")
                || paraText.startsWith("Flowchart");

        return (hasSeqFields || hasCaptionLikeContent) && (paraText.length() < MAX_CAPTION_LENGTH);
    }

    /**
     * Checks whether the shape is an embedded Equation.DSMT4 OLE object.
     */
    private static boolean isOleEquation(Shape shape)
    {
        // Constant-first comparison stays safe even if the ProgId is not set.
        return (shape.getOleFormat() != null)
                && EQUATION_PROG_ID.equals(shape.getOleFormat().getProgId());
    }

    /**
     * Saves the collected shapes as a separate PDF document, provided that at least one
     * shape and one caption have been collected; otherwise does nothing.
     *
     * <p>After a successful export the shape counter is incremented and all stacks are
     * cleared.
     */
    private void saveShapeAsPdf() throws Exception {
        if (mTopShapes.empty() || mCaptions.empty()) {
            return;
        }

        String caption = mCaptions.pop();
        System.out.println(mShapeCounter);
        System.out.println(caption);

        // Create a temporary document which will be exported to PDF. It gets a shallow
        // copy of the section that holds the most recent shape.
        Document tmp = (Document) mTopShapes.peek().getDocument().deepClone(false);
        Node tmpSection = tmp.importNode(mTopShapes.peek().getAncestor(NodeType.SECTION), false, ImportFormatMode.USE_DESTINATION_STYLES);
        tmp.appendChild(tmpSection);
        tmp.ensureMinimum();

        // There might be several shapes to import under one caption.
        ArrayList<Node> nodesToImport = new ArrayList<Node>();
        if (mTopShapes.size() > 1 && !mRows.isEmpty())
        {
            // Several shapes laid out in table rows: rebuild a table from the collected rows.
            // Rows are popped in reverse order and prepended, which restores document order.
            Table imagesTable = (Table) mRows.peek().getParentTable().deepClone(false);
            while (!mRows.isEmpty()) {
                Row r = mRows.pop();
                imagesTable.prependChild(r.deepClone(true));
            }

            nodesToImport.add(imagesTable);
        }
        else
        {
            while (!mTopShapes.isEmpty()) {
                ShapeBase s = mTopShapes.pop();
                nodesToImport.add(s);
            }
        }

        String key = "image_" + mShapeCounter;
        mNodesToRemove.put(key, nodesToImport);

        for (Node imageNode : nodesToImport) {
            Node resultImage = tmp.importNode(imageNode, true, ImportFormatMode.USE_DESTINATION_STYLES);

            if (resultImage.isComposite()) {
                resultImage.getRange().unlinkFields();
                NodeCollection paragraphs = ((CompositeNode) resultImage).getChildNodes(NodeType.PARAGRAPH, true);
                // Iterate over a snapshot: the collection returned by getChildNodes is live,
                // so removing nodes while iterating it directly may skip paragraphs.
                for (Node p : paragraphs.toArray())
                {
                    if (isCaptionParagraph((Paragraph) p)) {
                        p.remove();
                    }
                }
            }

            if (resultImage.getNodeType() == NodeType.TABLE) {
                tmp.getFirstSection().getBody().prependChild(resultImage);
            } else {
                tmp.getFirstSection().getBody().getFirstParagraph().prependChild(resultImage);
            }
        }

        // Format the output file path.
        String outFilePath = mTargetFolder + key + ".pdf";
        tmp.save(outFilePath);

        mShapeCounter++;

        // Empty stacks.
        mTopShapes.clear();
        mRows.clear();
        mCaptions.clear();
    }
}

alexey.noskov · August 29, 2022, 4:18pm

@e503824 Aspose.Words reads all shapes in your document properly. The problem is in the logic used for shape extraction. If you inspect the document structure, you will note that figure 5 (the problematic one) consists of two shapes - one comes before the caption and the other after it. The algorithm used for figure extraction detects the first shape and the caption after it and saves the detected shape; the shape that comes after the caption is extracted with the next caption, which is expected according to the algorithm used for extraction:

You have to adjust your algorithm to handle such a situation or, if you have control over document creation, define the rules by which figures are inserted into the document. There are plenty of ways figures can be inserted into an MS Word document, and if the figures are floating, as in this case, it might be difficult or even impossible to determine their visual order. For example, a floating figure might be anchored (inserted) in the first paragraph on the page but visually placed at the end of the page.