Hi Team,
My requirement is to extract the display images and save each of them into a new document.
In the source document, Figure 1 is also extracted. Please help me ignore such images.
Source document:manuscriptrevised.zip (2.5 MB)
Actual Output: manuscriptrevised (2).zip (120.3 KB)
Thanks & regards,
Priyanga G
@priyanga,
Thanks for your inquiry. Please use the following code example to get the desired output. Hope this helps you.
// Load the source manuscript. MyDir is not defined in this snippet; it is
// expected to hold the working folder path (with a trailing separator).
Document doc = new Document(MyDir + "manuscriptrevised.docx");
DocumentBuilder builder = new DocumentBuilder(doc);
// Tables that contain figure shapes; only Table nodes are ever added, so the
// list is typed instead of raw.
ArrayList<Table> tables = new ArrayList<Table>();
// Numeric suffix of the next "Bookmark<n>" name.
int bookmark = 1;
// Numeric suffix of the next "output<n>.docx" file name.
int i = 1;
// Every paragraph in the document, collected recursively (isDeep = true).
NodeCollection paragraphs = doc.getChildNodes(NodeType.PARAGRAPH, true);
for (Paragraph paragraph : (Iterable<Paragraph>) paragraphs)
{
if(paragraph.toString(SaveFormat.TEXT).trim().startsWith("Fig"))
{
Node PreviousPara = paragraph.getPreviousSibling();
while (PreviousPara != null && PreviousPara.toString(SaveFormat.TEXT).trim().length() == 0
&& ((Paragraph)PreviousPara).getChildNodes(NodeType.SHAPE, true).getCount() == 0)
PreviousPara = PreviousPara.getPreviousSibling();
while (PreviousPara != null
&& PreviousPara.getNodeType() == NodeType.PARAGRAPH
&& (((Paragraph)PreviousPara).getChildNodes(NodeType.SHAPE, true).getCount() > 0 || ((Paragraph)PreviousPara).getChildNodes(NodeType.GROUP_SHAPE, true).getCount() > 0)
|| (PreviousPara.toString(SaveFormat.TEXT).trim().length() > 0 &&
(PreviousPara.toString(SaveFormat.TEXT).trim().contains("(a)") ||
PreviousPara.toString(SaveFormat.TEXT).trim().contains("(b)") ||
PreviousPara.toString(SaveFormat.TEXT).trim().contains("(b)") ||
PreviousPara.toString(SaveFormat.TEXT).trim().contains("(d)")
)
)
)
{
PreviousPara = PreviousPara.getPreviousSibling();
}
if(PreviousPara == null)
{
builder.moveToDocumentStart();
builder.insertParagraph();
builder.startBookmark("Bookmark" + bookmark);
builder.moveToParagraph(paragraphs.indexOf(paragraph), 0);
builder.endBookmark("Bookmark" + bookmark);
bookmark++;
}
else if(PreviousPara.getNodeType() == NodeType.PARAGRAPH)
{
Node node = ((Paragraph)PreviousPara).getParentNode().insertAfter(new Paragraph(doc), PreviousPara);
builder.moveTo(node);
builder.startBookmark("Bookmark" + bookmark);
builder.moveTo(paragraph);
builder.endBookmark("Bookmark" + bookmark);
bookmark++;
}
else if(PreviousPara.getNodeType() == NodeType.TABLE)
{
if(((Table)PreviousPara).getChildNodes(NodeType.SHAPE, true).getCount() > 0)
tables.add(((Table)PreviousPara));
}
}
}
// Export the content enclosed by every "Bookmark*" bookmark into a document
// of its own, keeping the paper size and horizontal margins of the source.
for (Bookmark figureMark : doc.getRange().getBookmarks())
{
    if (!figureMark.getName().startsWith("Bookmark"))
    {
        continue;
    }

    // ExtractContents is a helper defined outside this snippet.
    ArrayList extracted = ExtractContents.extractContent(
            figureMark.getBookmarkStart(), figureMark.getBookmarkEnd(), true);
    Document figureDoc = ExtractContents.generateDocument(doc, extracted);

    // Page settings are read from the section that holds the bookmark start.
    Paragraph startParagraph = (Paragraph) figureMark.getBookmarkStart().getParentNode();
    PageSetup sourceSetup = startParagraph.getParentSection().getPageSetup();
    PageSetup targetSetup = figureDoc.getFirstSection().getPageSetup();
    targetSetup.setPaperSize(sourceSetup.getPaperSize());
    targetSetup.setLeftMargin(sourceSetup.getLeftMargin());
    targetSetup.setRightMargin(sourceSetup.getRightMargin());

    // Drop the trailing caption paragraph so only the figure itself remains.
    Paragraph trailing = figureDoc.getLastSection().getBody().getLastParagraph();
    if (trailing.toString(SaveFormat.TEXT).trim().startsWith("Fig"))
    {
        trailing.remove();
    }

    // Save only documents that actually contain a shape or a group shape.
    boolean containsImage = figureDoc.getChildNodes(NodeType.SHAPE, true).getCount() > 0
            || figureDoc.getChildNodes(NodeType.GROUP_SHAPE, true).getCount() > 0;
    if (containsImage)
    {
        figureDoc.save(MyDir + "out\\output" + i + ".docx");
    }

    // The counter advances for every matching bookmark, saved or not.
    i++;
}
// Export each collected figure table into a fresh document, continuing the
// same output file numbering.
for (Object entry : tables)
{
    Table figureTable = (Table) entry;
    Document tableDoc = new Document();

    // Nodes must be imported into the target document before they can be attached.
    NodeImporter importer = new NodeImporter(doc, tableDoc, ImportFormatMode.KEEP_SOURCE_FORMATTING);
    Node importedTable = importer.importNode(figureTable, true);
    tableDoc.getFirstSection().getBody().appendChild(importedTable);

    tableDoc.save(MyDir + "out\\output" + i + ".docx");
    i++;
}