Table Image extraction issue1

e503824 · June 27, 2022, 12:49pm

Dear team,

We are extracting images from DOCX files using Aspose.Words for Java, but in the case below the images are extracted incorrectly. Please find the source code and the input and output files below.

Source Code :

// Obtain the target document for this table; docSetup applies the page setup to it.
Document tableDoc = docSetup(interimdoc, table);
// Importer that copies nodes from interimdoc into tableDoc, keeping the source formatting.
NodeImporter importers = new NodeImporter(interimdoc, tableDoc,
    ImportFormatMode.KEEP_SOURCE_FORMATTING);
// Import the table together with its children and append the copy to the body
// of the first section of the target document.
tableDoc.getFirstSection().getBody().appendChild(importers.importNode(table, true));
// Save the result; the output path is AIE.pdfFolder concatenated with saveAS.
tableDoc.save(AIE.pdfFolder + saveAS);

/**
 * Prepares the document that will receive a copy of {@code table}: takes the document
 * returned by {@code generateDocument(interimDoc)} and gives its first section the
 * orientation, paper size and left/right margins of the section that contains the table.
 *
 * @param interimDoc source document that contains {@code table}
 * @param table table whose containing section supplies the page setup
 * @return the prepared target document
 * @throws IllegalStateException if the target document cannot be prepared; the source
 *         document is never returned as a fallback, because the caller appends the table
 *         to the returned document and saves it
 */
private static Document docSetup(Document interimDoc, Table table) {
    try {
        Document tableDoc = generateDocument(interimDoc);

        // Resolve the section that holds the table by moving a builder onto it.
        DocumentBuilder builder = new DocumentBuilder(interimDoc);
        builder.moveTo(table);
        PageSetup sourcePageSetup = builder.getCurrentSection().getPageSetup();
        PageSetup targetPageSetup = tableDoc.getFirstSection().getPageSetup();

        // Read the orientation from the table's own section. Casting the table's next
        // sibling to Paragraph fails when the table is the last node of its container
        // (null sibling) or is followed by another table.
        targetPageSetup.setOrientation(sourcePageSetup.getOrientation());
        targetPageSetup.setPaperSize(sourcePageSetup.getPaperSize());
        targetPageSetup.setLeftMargin(sourcePageSetup.getLeftMargin());
        targetPageSetup.setRightMargin(sourcePageSetup.getRightMargin());
        return tableDoc;
    } catch (Exception e) {
        // Surface the failure instead of silently handing back the source document.
        throw new IllegalStateException("Unable to prepare the target document for the table", e);
    }
}

Input and output files : test.zip (171.2 KB)

alexey.noskov · June 27, 2022, 7:02pm

@e503824 The images are extracted properly using the ImageExtractor class, which I suggested earlier. Just in case, here is the source again:

// Load the source document.
Document doc = new Document("C:\\Temp\\in.docx");
// The extractor appends the output file name directly to this folder string,
// so the path has to end with a separator.
ImageExtractor extractor = new ImageExtractor("C:\\Temp\\");
// Walk the document with the visitor; the extractor saves the PDF files while visiting.
doc.accept(extractor);

/**
 * Document visitor that collects top-level shapes and caption paragraphs while the
 * document is traversed and, as soon as both a shape and a caption are available, saves
 * the collected shapes as a separate PDF file named {@code image_<counter>.pdf}.
 *
 * <p>The nodes exported under each key are remembered so that
 * {@link #RemoveImagesFromSourceDocument()} can later replace them with a bookmark.
 */
private static class ImageExtractor extends DocumentVisitor {

    /** ProgId of the embedded equation OLE objects that are not treated as images. */
    private static final String EQUATION_PROG_ID = "Equation.DSMT4";

    /** Text prefixes that mark a paragraph as a caption ("Fig" also covers "Figure"). */
    private static final String[] CAPTION_PREFIXES = {"Fig", "Scheme", "Plate", "Flowchart"};

    // While visiting, captions and shapes are pushed onto these stacks.
    // Only top-level shapes are collected.
    private final Stack<ShapeBase> mTopShapes = new Stack<ShapeBase>();
    private final Stack<Row> mRows = new Stack<Row>();
    private final Stack<String> mCaptions = new Stack<String>();
    private int mShapeCounter = 0;
    private final String mTargetFolder;

    // Nodes that should be deleted and replaced with a bookmark, keyed by bookmark name.
    final HashMap<String, ArrayList<Node>> mNodesToRemove = new HashMap<String, ArrayList<Node>>();

    /**
     * @param targetFolder folder prefix for the output files; the file name is appended
     *                     to it as is, so it has to end with a path separator
     */
    public ImageExtractor(String targetFolder) {
        mTargetFolder = targetFolder;
    }

    /** Remembers every table row that contains at least one non-equation shape. */
    @Override
    public int visitRowStart(Row row) throws Exception {
        if (rowHasImage(row)) {
            mRows.push(row);
        }

        return super.visitRowStart(row);
    }

    /** Collects top-level group shapes and exports them once a caption is available. */
    @Override
    public int visitGroupShapeStart(GroupShape groupShape) throws Exception {
        if (groupShape.isTopLevel()) {
            mTopShapes.push(groupShape);
        }

        saveShapeAsPdf();

        return super.visitGroupShapeStart(groupShape);
    }

    /**
     * Collects top-level shapes that contain no paragraphs and are not equations,
     * and exports them once a caption is available.
     */
    @Override
    public int visitShapeStart(Shape shape) throws Exception {
        boolean hasNoParagraphs = shape.getChildNodes(NodeType.PARAGRAPH, true).getCount() == 0;
        if (shape.isTopLevel() && hasNoParagraphs && !isOleEquation(shape)) {
            mTopShapes.push(shape);
        }

        saveShapeAsPdf();

        return super.visitShapeStart(shape);
    }

    /**
     * Records the text of caption paragraphs and skips their content; other paragraphs
     * are visited normally.
     */
    @Override
    public int visitParagraphStart(Paragraph paragraph) throws Exception {
        if (isCaptionParagraph(paragraph)) {
            mCaptions.push(paragraph.toString(SaveFormat.TEXT).trim());
            saveShapeAsPdf();
            return VisitorAction.SKIP_THIS_NODE;
        }

        return super.visitParagraphStart(paragraph);
    }

    /**
     * Removes images from the source document and inserts a bookmark at the image position.
     * The bookmark is named after the key the nodes were exported under.
     */
    public void RemoveImagesFromSourceDocument()
    {
        for (String key : mNodesToRemove.keySet()) {
            ArrayList<Node> nodesToRemove = mNodesToRemove.get(key);
            if (nodesToRemove.isEmpty()) {
                continue;
            }

            Node firstNode = nodesToRemove.get(0);

            // In case of a table the bookmark goes into the node that follows the table.
            Node anchor = (firstNode.getNodeType() == NodeType.TABLE)
                    ? firstNode.getNextSibling()
                    : firstNode;

            // NOTE(review): the table queued by saveShapeAsPdf is a detached clone, so it
            // has no next sibling and removing it does not touch the source document.
            // Skip the bookmark instead of failing on a missing or detached anchor; the
            // original rows would have to be tracked to really remove them - confirm intent.
            if (anchor != null && anchor.getParentNode() != null) {
                DocumentBuilder builder = new DocumentBuilder((Document) firstNode.getDocument());
                builder.moveTo(anchor);

                // Insert an empty bookmark at the image position.
                builder.startBookmark(key);
                builder.endBookmark(key);
            }

            // Remove all image nodes.
            for (Node n : nodesToRemove) {
                n.remove();
            }
        }
    }

    /**
     * Checks whether the row has shapes other than formulas.
     */
    private static boolean rowHasImage(Row row)
    {
        NodeCollection shapes = row.getChildNodes(NodeType.SHAPE, true);
        for (Shape s : (Iterable<Shape>) shapes) {
            if (!isOleEquation(s)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Checks whether the paragraph is likely to be an image caption: it either contains
     * a SEQ field or its run text starts with one of the known caption prefixes.
     */
    private static boolean isCaptionParagraph(Paragraph paragraph) throws Exception {
        // Captions often contain SEQ fields.
        boolean hasSeqFields = false;
        for (Field f : paragraph.getRange().getFields()) {
            hasSeqFields |= (f.getType() == FieldType.FIELD_SEQUENCE);
        }
        // More conditions might be added here to better distinguish captions.

        // Only the text of the paragraph's own runs is taken into account: if the caption
        // is in a textbox, paragraph.toString returns the same value for the paragraph
        // inside the textbox shape and for the paragraph that contains the textbox shape.
        StringBuilder paraText = new StringBuilder();
        for (Run r : paragraph.getRuns()) {
            paraText.append(r.getText());
        }

        String text = paraText.toString();
        boolean hasCaptionLikeContent = false;
        for (String prefix : CAPTION_PREFIXES) {
            if (text.startsWith(prefix)) {
                hasCaptionLikeContent = true;
                break;
            }
        }

        return hasSeqFields || hasCaptionLikeContent;
    }

    /**
     * Checks whether the shape is an embedded Equation.DSMT4 OLE object.
     */
    private static boolean isOleEquation(Shape shape)
    {
        // Constant-first comparison keeps the check safe when the ProgId is null.
        return (shape.getOleFormat() != null)
                && EQUATION_PROG_ID.equals(shape.getOleFormat().getProgId());
    }

    /**
     * Saves the collected shapes as a separate PDF document. Does nothing until at least
     * one shape and one caption have been collected; afterwards all stacks are emptied.
     */
    private void saveShapeAsPdf() throws Exception {
        if (mTopShapes.isEmpty() || mCaptions.isEmpty()) {
            return;
        }

        String caption = mCaptions.pop();
        System.out.println(mShapeCounter);
        System.out.println(caption);

        // Create a temporary document which will be exported to PDF. It is a shallow copy
        // of the source document that receives a shallow copy of the shape's section.
        Document tmp = (Document) mTopShapes.peek().getDocument().deepClone(false);
        Node tmpSection = tmp.importNode(mTopShapes.peek().getAncestor(NodeType.SECTION), false, ImportFormatMode.USE_DESTINATION_STYLES);
        tmp.appendChild(tmpSection);
        tmp.ensureMinimum();

        // There might be several shapes to import under one caption.
        ArrayList<Node> nodesToImport = new ArrayList<Node>();
        if (mTopShapes.size() > 1 && !mRows.isEmpty())
        {
            // Rebuild a table from the collected image rows. Rows are popped last-first,
            // so prepending restores their original order.
            Table imagesTable = (Table) mRows.peek().getParentTable().deepClone(false);
            while (!mRows.isEmpty()) {
                Row r = mRows.pop();
                imagesTable.prependChild(r.deepClone(true));
            }

            nodesToImport.add(imagesTable);
        }
        else
        {
            while (!mTopShapes.isEmpty()) {
                ShapeBase s = mTopShapes.pop();
                nodesToImport.add(s);
            }
        }

        String key = "image_" + mShapeCounter;
        mNodesToRemove.put(key, nodesToImport);

        for (Node imageNode : nodesToImport) {
            Node resultImage = tmp.importNode(imageNode, true, ImportFormatMode.USE_DESTINATION_STYLES);

            if (resultImage.isComposite()) {
                resultImage.getRange().unlinkFields();

                // Iterate over a snapshot: the node collection is live, and removing
                // paragraphs while iterating it directly can skip nodes.
                Node[] paragraphs = ((CompositeNode) resultImage).getChildNodes(NodeType.PARAGRAPH, true).toArray();
                for (Node candidate : paragraphs)
                {
                    Paragraph p = (Paragraph) candidate;
                    if (isCaptionParagraph(p)) {
                        p.remove();
                    }
                }
            }

            if (resultImage.getNodeType() == NodeType.TABLE) {
                tmp.getFirstSection().getBody().prependChild(resultImage);
            } else {
                tmp.getFirstSection().getBody().getFirstParagraph().prependChild(resultImage);
            }
        }

        // Format the output file path; the folder string is used as a plain prefix.
        String outFilePath = mTargetFolder + key + ".pdf";
        tmp.save(outFilePath);

        mShapeCounter++;

        // Empty stacks.
        mTopShapes.clear();
        mRows.clear();
        mCaptions.clear();
    }
}