Extract images from this document

MikeLak · September 20, 2018, 8:22am

I want to extract all the images from this document. A sample input is attached here: doc3.zip (283.1 KB). Expected output: doc3 _Output.zip (282.8 KB)
Thanks in advance.

tahir.manzoor · September 20, 2018, 4:16pm

@MikeLak

Thanks for your inquiry. Please use the following code example to get the desired output. You can get the code of extractContent and generateDocument methods from the following article.
Extract Selected Content Between Nodes

// Load the source document and wrap every figure (the shape paragraphs that
// precede a "Fig..." caption) in a bookmark named "Bookmark<N>".
Document doc = new Document(MyDir + "doc3.docx");

DocumentBuilder builder = new DocumentBuilder(doc);
// Tables that directly precede a caption and contain shapes. They are only
// collected here; no bookmark is created for them.
ArrayList<Table> tables = new ArrayList<Table>();
int bookmark = 1;
// Output-file counter; it is consumed by the export loop that follows this block.
int i = 1;
NodeCollection paragraphs = doc.getChildNodes(NodeType.PARAGRAPH, true);
for (Paragraph paragraph : (Iterable<Paragraph>) paragraphs)
{
    if (paragraph.toString(SaveFormat.TEXT).trim().startsWith("Fig"))
    {
        // Walk backwards from the caption over the paragraphs that belong to the
        // figure: they hold at least one shape and their text is either empty or
        // carries a sub-figure label "(a)".."(d)".
        // The node-type guard stops the walk at a table (or any other
        // non-paragraph sibling) instead of failing on the Paragraph cast.
        Node previousNode = paragraph.getPreviousSibling();
        while (previousNode != null && previousNode.getNodeType() == NodeType.PARAGRAPH)
        {
            String previousText = previousNode.toString(SaveFormat.TEXT).trim();
            boolean emptyOrSubFigureLabel = previousText.length() == 0
                    || previousText.contains("(a)")
                    || previousText.contains("(b)")
                    || previousText.contains("(c)")
                    || previousText.contains("(d)");
            boolean hasShape = ((Paragraph) previousNode).getChildNodes(NodeType.SHAPE, true).getCount() > 0;

            if (!(emptyOrSubFigureLabel && hasShape))
            {
                break;
            }
            previousNode = previousNode.getPreviousSibling();
        }

        if (previousNode == null)
        {
            // The figure reaches the first sibling: open the bookmark at the
            // very start of the document and close it at the caption.
            builder.moveToDocumentStart();
            builder.insertParagraph();
            builder.startBookmark("Bookmark" + bookmark);
            builder.moveToParagraph(paragraphs.indexOf(paragraph), 0);
            builder.endBookmark("Bookmark" + bookmark);
            bookmark++;
        }
        else if (previousNode.getNodeType() == NodeType.PARAGRAPH)
        {
            // Insert an empty paragraph right after the last non-figure paragraph
            // and start the bookmark there; end it at the caption paragraph.
            Node node = previousNode.getParentNode().insertAfter(new Paragraph(doc), previousNode);
            builder.moveTo(node);
            builder.startBookmark("Bookmark" + bookmark);
            builder.moveTo(paragraph);
            builder.endBookmark("Bookmark" + bookmark);
            bookmark++;
        }
        else if (previousNode.getNodeType() == NodeType.TABLE)
        {
            // The figure is laid out in a table: remember it if it holds shapes.
            if (((Table) previousNode).getChildNodes(NodeType.SHAPE, true).getCount() > 0)
            {
                tables.add((Table) previousNode);
            }
        }

    }
}

// Save the content of each "Bookmark*" bookmark as a separate output<i>.docx file.
for (Bookmark figureBookmark : doc.getRange().getBookmarks())
{
    if (!figureBookmark.getName().startsWith("Bookmark"))
    {
        continue;
    }

    // Copy the nodes between the bookmark start and end into a new document.
    ArrayList extractedNodes = ExtractContents.extractContent(
            figureBookmark.getBookmarkStart(), figureBookmark.getBookmarkEnd(), true);
    Document figureDoc = ExtractContents.generateDocument(doc, extractedNodes);

    // Carry the paper size and horizontal margins over from the source section.
    Paragraph startParagraph = (Paragraph) figureBookmark.getBookmarkStart().getParentNode();
    PageSetup sourceSetup = startParagraph.getParentSection().getPageSetup();
    PageSetup targetSetup = figureDoc.getFirstSection().getPageSetup();
    targetSetup.setPaperSize(sourceSetup.getPaperSize());
    targetSetup.setLeftMargin(sourceSetup.getLeftMargin());
    targetSetup.setRightMargin(sourceSetup.getRightMargin());

    // Remove the trailing "Fig..." caption paragraph, if it is the last one.
    Paragraph lastParagraph = figureDoc.getLastSection().getBody().getLastParagraph();
    if (lastParagraph.toString(SaveFormat.TEXT).trim().startsWith("Fig"))
    {
        lastParagraph.remove();
    }

    // Remove the leading paragraph when it contains no shape.
    Paragraph firstParagraph = figureDoc.getFirstSection().getBody().getFirstParagraph();
    if (firstParagraph.getChildNodes(NodeType.SHAPE, true).getCount() == 0)
    {
        firstParagraph.remove();
    }

    figureDoc.save(MyDir + "output" + i + ".docx");
    i++;
}