My requirement is to extract images using paragraph nodes and figure captions. However, some documents contain images without a figure caption and use legends such as (a), (b) instead. Please let me know how to extract the images using these legends.
I have enclosed the source code isNewAip.zip (58.5 KB)
Thanks for your inquiry. Please ZIP and attach your input Word document here for testing. We will investigate the issue on our side and provide you more information.
Thanks for sharing the document. The “legends like (a),(b)” are list items in your document. Please use the following code example to get the desired output. Hope this helps you.
// Finds every list-item paragraph whose list label starts with "(a)", "(b)" or "(c)",
// collects the run of image-only paragraphs immediately before it, and saves each
// run as its own document: out\output1.docx, out\output2.docx, ...
Document doc = new Document(MyDir + "test (8).docx");
// Refresh the list labels before reading getListLabel().getLabelString() below.
doc.updateListLabels();
int i = 1;
// Label prefixes that identify a legend paragraph; add entries here to match more legends.
String[] legendPrefixes = {"(a)", "(b)", "(c)"};
// Image-only paragraphs gathered for the legend currently being processed.
ArrayList<Paragraph> nodes = new ArrayList<Paragraph>();
for (Paragraph paragraph : (Iterable<Paragraph>) doc.getChildNodes(NodeType.PARAGRAPH, true))
{
    // Only list items carry a list label.
    if (!paragraph.getListFormat().isListItem())
    {
        continue;
    }

    // Read the label once instead of re-evaluating it for every prefix.
    String label = paragraph.getListLabel().getLabelString().trim();
    boolean isLegend = false;
    for (String prefix : legendPrefixes)
    {
        if (label.startsWith(prefix))
        {
            isLegend = true;
            break;
        }
    }
    if (!isLegend)
    {
        continue;
    }

    // Walk backwards over the preceding siblings while they are paragraphs that
    // have no text but contain at least one shape (i.e. image-only paragraphs).
    Node previousPara = paragraph.getPreviousSibling();
    while (previousPara != null
            && previousPara.getNodeType() == NodeType.PARAGRAPH
            && previousPara.toString(SaveFormat.TEXT).trim().length() == 0
            && ((Paragraph) previousPara).getChildNodes(NodeType.SHAPE, true).getCount() > 0)
    {
        nodes.add((Paragraph) previousPara);
        previousPara = previousPara.getPreviousSibling();
    }
    if (nodes.isEmpty())
    {
        continue;
    }

    // The paragraphs were collected bottom-up; restore document order.
    Collections.reverse(nodes);

    // Export the consecutive image paragraphs into a new document.
    Document dstDoc = new Document();
    // One importer per destination document: it is invariant across the paragraphs
    // of a run, so there is no need to rebuild it for every node.
    NodeImporter importer = new NodeImporter(doc, dstDoc, ImportFormatMode.KEEP_SOURCE_FORMATTING);
    for (Paragraph para : nodes)
    {
        Node newNode = importer.importNode(para, true);
        dstDoc.getFirstSection().getBody().appendChild(newNode);
    }

    // Remove the first paragraph of the destination document when it has no text.
    Paragraph firstPara = dstDoc.getFirstSection().getBody().getFirstParagraph();
    if (firstPara.toString(SaveFormat.TEXT).trim().length() == 0)
    {
        firstPara.remove();
    }

    dstDoc.save(MyDir + "out\\output" + i + ".docx");
    i++;
    // Reset the buffer so the next legend starts with an empty run.
    nodes.clear();
}