Image extraction Issue in java

e503824 · April 1, 2022, 3:48am

Dear team,

We are using Aspose with Java for image extraction, and we are facing an issue in our image extraction tool: for some DOCX files, images whose caption is placed above the image are not extracted. Please find the source code below.

// Holds the caption-above bookmarking pass (see captionAbove) and its shared constants.
public class CaptionAbove {
	// Prefix of the bookmark names created by captionAbove; a running counter is appended to it.
	private static final String BK = "Bookmark";
	// Regex handed to String.matches() to recognise caption text: either "Fig..." followed by at
	// least one whitespace-led segment (space, CR, LF or tab), or text starting with "Scheme",
	// "Plate" or "Abbildung".
	// NOTE(review): String.matches() recompiles this pattern on every call — consider a
	// precompiled static final Pattern.
	static String matches = "Fig.*(?:[ \\r\\n\\t].*)+|Scheme.*|Plate.*|Abbildung.*";
	//private static Logger logger = Logger.getLogger(CaptionAbove.class.getName());
	// Log4j 2 logger for this class.
	private static org.apache.logging.log4j.Logger logger = LogManager.getLogger(CaptionAbove.class);
	/**
	 * Bookmarks every "caption above image" region of the document and then runs image extraction.
	 *
	 * <p>Steps, in order:
	 * <ol>
	 * <li>accept all tracked revisions;</li>
	 * <li>remove every bookmark whose name does not start with {@code "bookmark"};</li>
	 * <li>for each paragraph that looks like a caption (starts with Fig/Scheme/Plate/Abb), walk
	 * forward over the following shape paragraphs and wrap the region in a bookmark named
	 * {@code BK + counter};</li>
	 * <li>call {@code AIE.extractImage(doc)}.</li>
	 * </ol>
	 *
	 * <p>A {@link NullPointerException} or {@link ClassCastException} raised while handling one
	 * paragraph is logged and that paragraph is skipped; the scan continues with the next one.
	 *
	 * @param interimdoc document to process; it is modified in place
	 * @throws Exception anything else thrown by the document API or the {@code AIE} helpers
	 */
	@SuppressWarnings("unchecked")
	public static void captionAbove(Document interimdoc) throws Exception {
		Document doc = interimdoc;
		doc.acceptAllRevisions();
		DocumentBuilder builder = AIE.getBuilderObject(doc);
		System.out.println("builder  :" + builder);
		// Running suffix for the bookmark names created below.
		int bookmark = 1;
		NodeCollection<Paragraph> paragraphs = doc.getChildNodes(NodeType.PARAGRAPH, true);
		System.out.println("paragraphs :"+ paragraphs);
		// Drop pre-existing bookmarks (sample: "Figures"); only names starting with "bookmark" survive.
		for (Bookmark bkmark : doc.getRange().getBookmarks()) {
			if (!bkmark.getName().startsWith("bookmark")) {
				bkmark.remove();
			}
		}
		for (Paragraph paragraph : (Iterable<Paragraph>) paragraphs) {
			System.out.println("paragraph  6:" + paragraph.getText().toString());
			try {
				// Caption test. NOTE(review): '&&' binds tighter than '||', so the
				// "!= NodeType.TABLE" check below only qualifies the "Abbildung" alternative.
				if ((paragraph.toString(SaveFormat.TEXT).trim().startsWith("Fig")
						|| paragraph.toString(SaveFormat.TEXT).startsWith("Scheme")
						|| paragraph.toString(SaveFormat.TEXT).startsWith("Plate")
						|| paragraph.toString(SaveFormat.TEXT).startsWith("Abb")
						|| paragraph.toString(SaveFormat.TEXT).startsWith("Abbildung")
								&& paragraph.getNodeType() != NodeType.TABLE)
						// Duplicate-caption handling: the next sibling must exist and not be a
						// table, unless the section's first paragraph is itself a caption.
						&& ((paragraph.getNextSibling() != null
						&& paragraph.getNextSibling().getNodeType() != NodeType.TABLE)
						|| paragraph.getParentSection().getBody().getFirstParagraph().getText().trim()
								.matches(matches))
						// The caption paragraph itself must not contain a shape.
						&& paragraph.getChildNodes(NodeType.SHAPE, true).getCount() == 0
						&& !paragraph.toString(SaveFormat.TEXT).contains(AIE.docName)
						// Duplicate caption: skip when the next sibling is also a caption.
						// A null next sibling raises an NPE here, handled by the outer catch.
						&& !paragraph.getNextSibling().toString(SaveFormat.TEXT).trim().matches(matches)
						// Skip list headings. Both prefixes must be absent: the previous '||'
						// form was always true, so headings were never excluded.
						&& !paragraph.toString(SaveFormat.TEXT).trim().startsWith("Figure Captions")
						&& !paragraph.toString(SaveFormat.TEXT).trim().startsWith("Figures")) {
					// Supplementary check (sample: JCIS_SRE_2020_1_2nd_revision.docx).
					if (AIE.supplymentryCheck(paragraph.toString(SaveFormat.TEXT).trim())) {
						AIE.insertBookmark(interimdoc, paragraph, AIE.fileName);
						continue;
					}
					Node nextPara = paragraph.getNextSibling();
					boolean isLabel;
					try {
						isLabel = AIE.findLabel(nextPara, doc);
					} catch (Exception e) {
						isLabel = false;
						// NOTE(review): 9 and 10 are raw NodeType values — confirm which node
						// kinds they denote and replace them with the named constants.
						if (nextPara.getNodeType() == 9 || nextPara.getNodeType() == 10) {
							nextPara.remove();
							nextPara = paragraph.getNextSibling();
						}
					}
					nextPara = isLabel ? nextPara.getNextSibling() : nextPara;
					// Skip empty paragraphs (no text, no shapes) right after the caption.
					// NOTE(review): no null/type guard here, unlike the loop further down;
					// running off the end or hitting a non-paragraph lands in the outer catch.
					while (((Paragraph) nextPara).toString(SaveFormat.TEXT).trim().length() == 0
							&& ((Paragraph) nextPara).getChildNodes(NodeType.SHAPE, true).getCount() == 0) {
						nextPara = nextPara.getNextSibling();
					}

					// Advance over consecutive shape-bearing paragraphs that are not captions.
					while (nextPara != null && nextPara.getNodeType() == NodeType.PARAGRAPH
							&& !AIE.removeEquationShapes((Paragraph) nextPara)
							&& !((Paragraph) nextPara).toString(SaveFormat.TEXT).trim().matches(matches)
							&& ((Paragraph) nextPara).getChildNodes(NodeType.SHAPE, true).getCount() > 0) {
						try {

							if (nextPara == doc.getLastSection().getBody().getLastParagraph()) {
								nextPara = nextPara.getNextSibling();
								break;
							} else {
								nextPara = nextPara.getNextSibling();
								isLabel = AIE.findLabel(nextPara, doc);
							}

						} catch (NullPointerException | ClassCastException e) {

							isLabel = false;
						}
						nextPara = isLabel ? nextPara.getNextSibling() : nextPara;
						while (nextPara != null && ((Paragraph) nextPara).toString(SaveFormat.TEXT).trim().length() == 0
								&& ((Paragraph) nextPara).getChildNodes(NodeType.SHAPE, true).getCount() == 0) {
						nextPara = nextPara.getNextSibling();
						}

					}
					try {
						if (nextPara == null) {
							// Ran past the last sibling: bookmark from just before the caption
							// to the last paragraph of the caption's section body.
							Paragraph paratoappend = new Paragraph(doc);
							Run run = new Run(doc);
							run.setText(" ");
							paratoappend.appendChild(run);
							Node startnode = (paragraph.getPreviousSibling() == null)
									? paragraph.getParentNode().insertBefore(paratoappend, paragraph)
									: paragraph.getPreviousSibling();
							Node endnode = paragraph.getParentSection().getBody().getLastParagraph();
							builder.moveTo(startnode);
							builder.startBookmark(BK + bookmark);
							builder.moveTo(endnode);
							builder.endBookmark(BK + bookmark);
							bookmark++;
						} else if (nextPara.getNodeType() == NodeType.PARAGRAPH
								&& (!((Paragraph) nextPara).toString(SaveFormat.TEXT).trim().matches(matches)
										|| abovecheck(paragraph, doc))) {
							// Step back over trailing empty paragraphs so the region ends
							// directly after the last shape paragraph.
							while (nextPara.getPreviousSibling().toString(SaveFormat.TEXT).trim().length() == 0
									&& ((Paragraph) nextPara.getPreviousSibling()).getChildNodes(NodeType.SHAPE, true)
											.getCount() == 0) {
								nextPara = nextPara.getPreviousSibling();

							}

							// Bookmark only when the paragraph before the end marker holds a shape.
							if (!AIE.removeEquationShapes((Paragraph) nextPara)
									&& ((Paragraph) nextPara).getChildNodes(NodeType.SHAPE, true).getCount() == 0
									&& ((Paragraph) nextPara.getPreviousSibling()).getChildNodes(NodeType.SHAPE, true)
											.getCount() > 0) {
								Paragraph paras = new Paragraph(doc);
								Run run = new Run(doc);
								run.setText(" ");
								paras.appendChild(run);
								Node nodeEnd = ((Paragraph) nextPara).getParentNode().insertBefore(paras, nextPara);
								Node nodeStart = paragraph.getParentNode().insertBefore(new Paragraph(doc), paragraph);

								builder.moveTo(nodeStart);
								builder.startBookmark(BK + bookmark);
								builder.moveTo(nodeEnd);
								builder.endBookmark(BK + bookmark);
								bookmark++;
							}

						}
					} catch (NullPointerException e) {
						// Log4j 2 substitutes "{}" placeholders; "{0}" was printed literally.
						logger.info( "Exception occur, {}", e.getMessage());
					}
				}
			} catch (NullPointerException|ClassCastException  e) {
				// Best effort: this paragraph could not be analysed, so leave a trace
				// instead of dropping the failure silently, then move on.
				logger.debug("Skipping paragraph during caption-above scan: {}", e.toString());
				continue;
			}

		}
		AIE.extractImage(doc);
	}

Input file : Boydetal_AzraqRevisedManuscriptEdited.docx (3.6 MB)

Output file : output.pdf (48.6 KB)

In this case only one image is extracted, and that image is the table. Please do the needful.

alexey.noskov · April 1, 2022, 6:56am

@e503824 Your code is too complicated. It would be easier to use IReplacingCallback to match the figure captions and then extract the figures from your document. For example, see the following simple code that extracts figures together with their captions:

// Load the source document and attach a callback that collects figures into a second document.
Document sourceDoc = new Document("C:\\Temp\\in.docx");
FiguresExtractor figureCollector = new FiguresExtractor(sourceDoc);

FindReplaceOptions replaceOptions = new FindReplaceOptions();
replaceOptions.setReplacingCallback(figureCollector);

// The pattern matches captions such as "Fig 1:" or "Figure 12:"; the callback is invoked per match.
Pattern captionPattern = Pattern.compile("(Fig(ure)?\\s*\\d+):");
sourceDoc.getRange().replace(captionPattern, "", replaceOptions);

// Save the collected figure/caption pairs.
figureCollector.getDocument().save("C:\\Temp\\out.pdf");

/**
 * Replacing callback that copies each matched caption paragraph, together with the shape
 * paragraph directly above it, into a separate destination document.
 */
private static class FiguresExtractor implements IReplacingCallback {

    /** Destination document that accumulates the extracted figure/caption pairs. */
    private Document mDstDocument;

    /**
     * Creates the destination document from {@code srcDocument.deepClone(false)} and calls
     * {@code ensureMinimum()} on it so that a first section and body are available.
     *
     * @param srcDocument document the figures will be extracted from
     */
    public FiguresExtractor(Document srcDocument) {
        Document target = (Document) srcDocument.deepClone(false);
        target.ensureMinimum();
        mDstDocument = target;
    }

    /**
     * @return the document holding everything collected so far
     */
    public Document getDocument() {
        return mDstDocument;
    }

    /**
     * Called for every caption match. When the paragraph immediately before the caption
     * contains at least one shape, that paragraph and the caption are imported (in this order)
     * at the end of the destination body. The source text is never replaced.
     *
     * @param args details of the current match
     * @return always {@code ReplaceAction.SKIP}
     */
    @Override
    public int replacing(ReplacingArgs args) throws Exception {
        // The caption is expected below the shape paragraph, so look one sibling back.
        Paragraph captionPara = (Paragraph) args.getMatchNode().getAncestor(NodeType.PARAGRAPH);
        CompositeNode figurePara = (CompositeNode) captionPara.getPreviousSibling();

        boolean hasFigureAbove = figurePara != null
                && figurePara.getNodeType() == NodeType.PARAGRAPH
                && figurePara.getChildNodes(NodeType.SHAPE, true).getCount() > 0;
        if (!hasFigureAbove) {
            return ReplaceAction.SKIP;
        }

        // Figure first, caption second.
        for (CompositeNode piece : new CompositeNode[] {figurePara, captionPara}) {
            mDstDocument.getFirstSection().getBody().appendChild(
                    mDstDocument.importNode(piece, true, ImportFormatMode.USE_DESTINATION_STYLES));
        }
        return ReplaceAction.SKIP;
    }
}