Dear team,
We are using Aspose for image extraction in Java, and we are facing an issue in our image extraction tool: captions placed above images are not being extracted for some DOCX files. Please find the source code below.
public class CaptionAbove {
// Name prefix for the bookmarks this class creates; a running counter is appended to it.
private static final String BK = "Bookmark";
// Regular expression used with String.matches() to recognise caption text
// (lines beginning with "Fig", "Scheme", "Plate" or "Abbildung").
static String matches = "Fig.*(?:[ \\r\\n\\t].*)+|Scheme.*|Plate.*|Abbildung.*";
//private static Logger logger = Logger.getLogger(CaptionAbove.class.getName());
// Log4j 2 logger for this class.
private static org.apache.logging.log4j.Logger logger = LogManager.getLogger(CaptionAbove.class);
/**
 * Marks "caption above figure" regions with bookmarks and then runs image extraction.
 *
 * <p>Steps performed on the given document:
 * <ol>
 *   <li>All tracked revisions are accepted.</li>
 *   <li>Every existing bookmark whose name does not start with {@code "bookmark"} is removed.</li>
 *   <li>Each paragraph whose trimmed text starts with "Fig", "Scheme", "Plate" or "Abb"
 *       (and is not a "Figures" / "Figure Captions" heading) is treated as a caption
 *       candidate. The siblings that follow it are scanned for shape-bearing paragraphs,
 *       and the caption plus those paragraphs are enclosed in a bookmark named
 *       {@code BK + n}.</li>
 *   <li>{@code AIE.extractImage(doc)} is called.</li>
 * </ol>
 *
 * <p>A candidate that cannot be processed (no following sibling, unexpected node type,
 * missing parent section, ...) is skipped; it never aborts the whole run.
 *
 * @param interimdoc the document to annotate; it is modified in place
 * @throws Exception propagated from the Aspose / AIE calls that are not handled locally
 */
@SuppressWarnings("unchecked")
public static void captionAbove(Document interimdoc) throws Exception {
    Document doc = interimdoc;
    doc.acceptAllRevisions();
    DocumentBuilder builder = AIE.getBuilderObject(doc);
    logger.debug("builder :{}", builder);
    int bookmark = 1;
    NodeCollection<Paragraph> paragraphs = doc.getChildNodes(NodeType.PARAGRAPH, true);
    logger.debug("paragraphs :{}", paragraphs);

    // Drop bookmarks that do not carry the lower-case "bookmark" name prefix (sample: "Figures").
    // NOTE(review): this removes entries while iterating the bookmark collection; confirm
    // that Aspose's iterator tolerates removal, otherwise iterate over a snapshot.
    for (Bookmark bkmark : doc.getRange().getBookmarks()) {
        if (!bkmark.getName().startsWith("bookmark")) {
            bkmark.remove();
        }
    }

    // NOTE(review): paragraphs are inserted/removed below while this collection is being
    // iterated; confirm the collection's behaviour under modification.
    for (Paragraph paragraph : (Iterable<Paragraph>) paragraphs) {
        logger.debug("paragraph 6:{}", paragraph.getText());
        try {
            // Convert once; the original re-rendered the paragraph for every test.
            String text = paragraph.toString(SaveFormat.TEXT);
            String trimmed = text.trim();

            // Caption keywords are tested on the trimmed text so that leading whitespace
            // does not hide a caption. "Abb" also covers "Abbildung".
            boolean looksLikeCaption = trimmed.startsWith("Fig")
                    || trimmed.startsWith("Scheme")
                    || trimmed.startsWith("Plate")
                    || trimmed.startsWith("Abb");
            // Section headings that merely start with "Fig" are not captions. The original
            // joined these two tests with "||", which made the exclusion always pass.
            boolean isFigureHeading = trimmed.startsWith("Figure Captions")
                    || trimmed.startsWith("Figures");
            if (!looksLikeCaption || isFigureHeading) {
                continue;
            }

            Node nextPara = paragraph.getNextSibling();
            if (nextPara == null) {
                // Nothing follows the caption, so there is no figure to bind it to.
                continue;
            }
            // A caption directly followed by a table is only accepted when the first
            // paragraph of the section body is itself caption-like (duplicate-caption
            // documents, sample: Revised-MANUSCRIPT).
            if (nextPara.getNodeType() == NodeType.TABLE
                    && !paragraph.getParentSection().getBody().getFirstParagraph().getText().trim()
                            .matches(matches)) {
                continue;
            }
            if (paragraph.getChildNodes(NodeType.SHAPE, true).getCount() != 0
                    || text.contains(AIE.docName)
                    // duplicate caption: the next node is a caption too
                    || nextPara.toString(SaveFormat.TEXT).trim().matches(matches)) {
                continue;
            }

            // Supplementary check, sample: JCIS_SRE_2020_1_2nd_revision.docx
            if (AIE.supplymentryCheck(trimmed)) {
                AIE.insertBookmark(interimdoc, paragraph, AIE.fileName);
                continue;
            }

            boolean isLabel;
            try {
                isLabel = AIE.findLabel(nextPara, doc);
            } catch (Exception e) {
                isLabel = false;
                // Raw NodeType codes; presumably BOOKMARK_START / BOOKMARK_END - TODO confirm.
                if (nextPara.getNodeType() == 9 || nextPara.getNodeType() == 10) {
                    nextPara.remove();
                    nextPara = paragraph.getNextSibling();
                }
            }
            if (isLabel) {
                nextPara = nextPara.getNextSibling();
            }

            // Skip blank paragraphs (no text, no shapes) between the caption and the figure.
            while (nextPara != null && nextPara.getNodeType() == NodeType.PARAGRAPH
                    && nextPara.toString(SaveFormat.TEXT).trim().length() == 0
                    && ((Paragraph) nextPara).getChildNodes(NodeType.SHAPE, true).getCount() == 0) {
                nextPara = nextPara.getNextSibling();
            }
            if (nextPara == null || nextPara.getNodeType() != NodeType.PARAGRAPH) {
                // Ran off the end or hit a non-paragraph node: no figure paragraph to bookmark.
                continue;
            }

            // Walk over the run of shape-bearing paragraphs that belong to this caption.
            while (nextPara != null && nextPara.getNodeType() == NodeType.PARAGRAPH
                    && !AIE.removeEquationShapes((Paragraph) nextPara)
                    && !((Paragraph) nextPara).toString(SaveFormat.TEXT).trim().matches(matches)
                    && ((Paragraph) nextPara).getChildNodes(NodeType.SHAPE, true).getCount() > 0) {
                try {
                    if (nextPara == doc.getLastSection().getBody().getLastParagraph()) {
                        nextPara = nextPara.getNextSibling();
                        break;
                    } else {
                        nextPara = nextPara.getNextSibling();
                        isLabel = AIE.findLabel(nextPara, doc);
                    }
                } catch (NullPointerException | ClassCastException e) {
                    isLabel = false;
                }
                nextPara = isLabel ? nextPara.getNextSibling() : nextPara;
                while (nextPara != null && nextPara.getNodeType() == NodeType.PARAGRAPH
                        && nextPara.toString(SaveFormat.TEXT).trim().length() == 0
                        && ((Paragraph) nextPara).getChildNodes(NodeType.SHAPE, true).getCount() == 0) {
                    nextPara = nextPara.getNextSibling();
                }
            }

            try {
                if (nextPara == null) {
                    // The figures run to the end: bookmark from just before the caption
                    // to the last paragraph of the section body.
                    Paragraph paratoappend = new Paragraph(doc);
                    Run run = new Run(doc);
                    run.setText(" ");
                    paratoappend.appendChild(run);
                    Node startnode = (paragraph.getPreviousSibling() == null)
                            ? paragraph.getParentNode().insertBefore(paratoappend, paragraph)
                            : paragraph.getPreviousSibling();
                    Node endnode = paragraph.getParentSection().getBody().getLastParagraph();
                    builder.moveTo(startnode);
                    builder.startBookmark(BK + bookmark);
                    builder.moveTo(endnode);
                    builder.endBookmark(BK + bookmark);
                    bookmark++;
                } else if (nextPara.getNodeType() == NodeType.PARAGRAPH
                        && (!((Paragraph) nextPara).toString(SaveFormat.TEXT).trim().matches(matches)
                                || abovecheck(paragraph, doc))) {
                    // Step back over blank paragraphs so the bookmark ends right after the figure.
                    while (nextPara.getPreviousSibling().toString(SaveFormat.TEXT).trim().length() == 0
                            && ((Paragraph) nextPara.getPreviousSibling())
                                    .getChildNodes(NodeType.SHAPE, true).getCount() == 0) {
                        nextPara = nextPara.getPreviousSibling();
                    }
                    if (!AIE.removeEquationShapes((Paragraph) nextPara)
                            && ((Paragraph) nextPara).getChildNodes(NodeType.SHAPE, true).getCount() == 0
                            && ((Paragraph) nextPara.getPreviousSibling())
                                    .getChildNodes(NodeType.SHAPE, true).getCount() > 0) {
                        Paragraph paras = new Paragraph(doc);
                        Run run = new Run(doc);
                        run.setText(" ");
                        paras.appendChild(run);
                        Node nodeEnd = ((Paragraph) nextPara).getParentNode().insertBefore(paras, nextPara);
                        Node nodeStart = paragraph.getParentNode().insertBefore(new Paragraph(doc), paragraph);
                        builder.moveTo(nodeStart);
                        builder.startBookmark(BK + bookmark);
                        builder.moveTo(nodeEnd);
                        builder.endBookmark(BK + bookmark);
                        bookmark++;
                    }
                }
            } catch (NullPointerException e) {
                // Log4j 2 uses "{}" placeholders; "{0}" was never substituted.
                logger.info("Exception occur, {}", e.getMessage(), e);
            }
        } catch (NullPointerException | ClassCastException e) {
            // Best effort: an unexpected document shape skips this paragraph only.
            logger.debug("Skipping paragraph while looking for a caption-above figure", e);
            continue;
        }
    }
    AIE.extractImage(doc);
}
Input file : Boydetal_AzraqRevisedManuscriptEdited.docx (3.6 MB)
Output file : output.pdf (48.6 KB)
In this case only one image is extracted, and that image is the table. Please do the needful.