Issue on display image

priyanga · April 18, 2018, 1:15pm

Hi Team,

My requirement is to extract the display images and save them into a new document.

In the source document, Figure 1 is also extracted. Please help me ignore those images.

Source document: manuscriptrevised.zip (2.5 MB)

Actual Output: manuscriptrevised (2).zip (120.3 KB)

Thanks & regards,
Priyanga G

tahir.manzoor · April 18, 2018, 4:56pm

@priyanga,

Thanks for your inquiry. Please use the following code example to get the desired output. Hope this helps you.

// Load the source document; the builder is used below to place bookmark markers.
Document doc = new Document(MyDir + "manuscriptrevised.docx");
DocumentBuilder builder = new DocumentBuilder(doc);
// Tables found directly above a caption that contain at least one shape.
ArrayList<Table> tables = new ArrayList<Table>();
int bookmark = 1; // numeric suffix of the next "Bookmark<n>" to create
int i = 1;        // numeric suffix of the next output file name
NodeCollection paragraphs = doc.getChildNodes(NodeType.PARAGRAPH, true);

// For every paragraph whose text starts with "Fig" (a figure caption), walk
// backwards over the preceding siblings to find where the figure content begins,
// then wrap that range in a bookmark. Iterate over an array snapshot because the
// loop body inserts new paragraphs into the document while it runs.
for (Node paragraphNode : paragraphs.toArray())
{
    Paragraph paragraph = (Paragraph) paragraphNode;
    if (!paragraph.toString(SaveFormat.TEXT).trim().startsWith("Fig"))
    {
        continue;
    }

    Node previous = paragraph.getPreviousSibling();

    // Step 1: skip blank paragraphs that hold no shape. The node-type check comes
    // first so that a non-paragraph sibling (e.g. a table) is never cast to Paragraph.
    while (previous != null
            && previous.getNodeType() == NodeType.PARAGRAPH
            && previous.toString(SaveFormat.TEXT).trim().length() == 0
            && ((Paragraph) previous).getChildNodes(NodeType.SHAPE, true).getCount() == 0)
    {
        previous = previous.getPreviousSibling();
    }

    // Step 2: keep walking back while the sibling is either a paragraph that holds
    // a shape / group shape, or any node whose text contains a panel label
    // "(a)".."(d)". The null test guards both alternatives, so reaching the first
    // sibling ends the loop instead of dereferencing null.
    while (previous != null)
    {
        boolean holdsImage = previous.getNodeType() == NodeType.PARAGRAPH
                && (((Paragraph) previous).getChildNodes(NodeType.SHAPE, true).getCount() > 0
                    || ((Paragraph) previous).getChildNodes(NodeType.GROUP_SHAPE, true).getCount() > 0);
        if (!holdsImage)
        {
            String text = previous.toString(SaveFormat.TEXT).trim();
            boolean isPanelLabel = text.contains("(a)")
                    || text.contains("(b)")
                    || text.contains("(c)")
                    || text.contains("(d)");
            if (!isPanelLabel)
            {
                break;
            }
        }
        previous = previous.getPreviousSibling();
    }

    if (previous == null)
    {
        // Nothing precedes the figure content: open the bookmark in a new
        // paragraph inserted at the very start of the document.
        builder.moveToDocumentStart();
        builder.insertParagraph();
        builder.startBookmark("Bookmark" + bookmark);
        // NOTE(review): the index comes from the document-wide paragraph collection,
        // while moveToParagraph is documented as section-relative; confirm this
        // holds for multi-section documents.
        builder.moveToParagraph(paragraphs.indexOf(paragraph), 0);
        builder.endBookmark("Bookmark" + bookmark);
        bookmark++;
    }
    else if (previous.getNodeType() == NodeType.PARAGRAPH)
    {
        // Open the bookmark in a fresh empty paragraph inserted right after the
        // last non-figure paragraph, and close it at the caption paragraph.
        Node node = ((Paragraph) previous).getParentNode().insertAfter(new Paragraph(doc), previous);
        builder.moveTo(node);
        builder.startBookmark("Bookmark" + bookmark);
        builder.moveTo(paragraph);

        builder.endBookmark("Bookmark" + bookmark);
        bookmark++;
    }
    else if (previous.getNodeType() == NodeType.TABLE)
    {
        // The walk stopped on a table: remember it only if it contains a shape.
        if (((Table) previous).getChildNodes(NodeType.SHAPE, true).getCount() > 0)
        {
            tables.add((Table) previous);
        }
    }
}

// Export the content of every bookmark whose name starts with "Bookmark" into
// its own document, saving only the extracts that actually contain an image.
for (Bookmark bm : doc.getRange().getBookmarks())
{
    if (!bm.getName().startsWith("Bookmark"))
    {
        continue;
    }

    ArrayList nodes = ExtractContents.extractContent(bm.getBookmarkStart(), bm.getBookmarkEnd(), true);
    Document dstDoc = ExtractContents.generateDocument(doc, nodes);

    // Copy paper size and horizontal margins from the section that holds the
    // bookmark start to the first section of the extracted document.
    PageSetup sourcePageSetup = ((Paragraph) bm.getBookmarkStart().getParentNode()).getParentSection().getPageSetup();
    PageSetup targetPageSetup = dstDoc.getFirstSection().getPageSetup();
    targetPageSetup.setPaperSize(sourcePageSetup.getPaperSize());
    targetPageSetup.setLeftMargin(sourcePageSetup.getLeftMargin());
    targetPageSetup.setRightMargin(sourcePageSetup.getRightMargin());

    // Drop the trailing caption ("Fig...") paragraph. The body may end up with
    // no paragraph at all, so guard against a null last paragraph.
    Paragraph lastParagraph = dstDoc.getLastSection().getBody().getLastParagraph();
    if (lastParagraph != null && lastParagraph.toString(SaveFormat.TEXT).trim().startsWith("Fig"))
    {
        lastParagraph.remove();
    }

    // Save only extracts that contain a shape or group shape. The file counter
    // advances together with the save so that output numbering has no gaps.
    if (dstDoc.getChildNodes(NodeType.SHAPE, true).getCount() > 0
            || dstDoc.getChildNodes(NodeType.GROUP_SHAPE, true).getCount() > 0)
    {
        dstDoc.save(MyDir + "out\\output" + i + ".docx");
        i++;
    }
}

// Write each collected table into its own new document, continuing the
// "output<i>.docx" numbering.
for (Object entry : tables)
{
    Table sourceTable = (Table) entry;
    Document tableDoc = new Document();

    // Nodes belong to their owning document, so the table is imported into the
    // new document (keeping source formatting) before being appended to its body.
    NodeImporter tableImporter = new NodeImporter(doc, tableDoc, ImportFormatMode.KEEP_SOURCE_FORMATTING);
    Node importedTable = tableImporter.importNode(sourceTable, true);
    tableDoc.getFirstSection().getBody().appendChild(importedTable);

    String outputPath = MyDir + "out\\output" + i + ".docx";
    tableDoc.save(outputPath);
    i++;
}