Extract unnumbered images

jan.kathir · March 9, 2020, 11:40am

Dear Team
I am having some difficulty extracting unnumbered images. Please suggest a solution for my requirement. I have attached the source code and a sample input document.
Sample::unnumbered.zip (280.6 KB)
Source code::
/**
 * Scans every paragraph of the given document for unnumbered images and exports them.
 *
 * <p>For each paragraph:
 * <ol>
 *   <li>if it contains at least one shape whose image extension is not {@code wmf}, the
 *       paragraph is imported into a scratch document;</li>
 *   <li>if its text starts with {@code "Fig"}, the scratch document is emptied and the
 *       paragraph is no longer treated as an image candidate;</li>
 *   <li>if it contains no ASCII letter or digit (see {@code letterCheck}), passes the
 *       whitespace check and is still an image candidate, the scratch document is saved
 *       under the name {@code fileName + "_UNFIG" + 4-digit counter}, its pixel size is
 *       logged through {@code createEL_XMLLog}, a bookmark is inserted and the paragraph
 *       is passed to {@code deleteNode}.</li>
 * </ol>
 *
 * <p>Relies on state declared outside this method: {@code isImage}, {@code isImg},
 * {@code fileName}, {@code arg2}, {@code elFolder}, {@code pdfFolder},
 * {@code extractedimage} and {@code allimages}.
 *
 * @param interimdoc document whose paragraphs are inspected and modified in place
 * @throws Exception if saving, reading or post-processing a rendered image fails
 */
private static void unNumberedImageExtraction(Document interimdoc) throws Exception {
	int imageCount = 0;
	// Scratch document: receives one candidate paragraph at a time and is saved as the image.
	Document doc1 = new Document();
	Document doc = interimdoc;
	NodeImporter importer = new NodeImporter(doc, doc1, ImportFormatMode.KEEP_SOURCE_FORMATTING);
	// NOTE(review): paragraphs are handed to deleteNode(...) further down while this collection
	// is being iterated; if the collection is live this may skip nodes — confirm against the
	// Aspose NodeCollection documentation.
	NodeCollection paragraphs = doc.getChildNodes(NodeType.PARAGRAPH, true);
	for (Paragraph paragraph : (Iterable<Paragraph>) paragraphs) {
		NodeCollection shapes = paragraph.getChildNodes(NodeType.SHAPE, true);

		for (Shape shape : (Iterable<Shape>) shapes) {
			String imageExtension;
			try {
				imageExtension = FilenameUtils.getExtension(
						FileFormatUtil.imageTypeToExtension(shape.getImageData().getImageType()));
			} catch (Exception ignored) {
				// Best-effort fallback kept from the original: an unresolvable type counts as "tif".
				imageExtension = "tif";
			}
			if (!imageExtension.equals("wmf")) {
				isImage = true;
			}
		}

		if (isImage) {
			Node imageNode = importer.importNode(paragraph, true);
			doc1.getFirstSection().getBody().appendChild(imageNode);
			doc1.updatePageLayout();
		}
		System.out.println(paragraph.getText());

		// This clears the formatting of the paragraph inside the source document itself
		// (the original code did the same through an alias variable).
		paragraph.getParagraphFormat().clearFormatting();
		if (paragraph.getText().startsWith("Fig")) {
			// Paragraphs starting with "Fig" are not exported: discard what was staged.
			doc1.getFirstSection().getBody().removeAllChildren();
			doc1.updatePageLayout();
			isImage = false;
		}

		if (!letterCheck(paragraph)) {
			String text = paragraph.getText();
			// NOTE(review): if getText() includes the trailing paragraph-break character ('\r'),
			// this check flags nearly every paragraph and nothing is ever exported — confirm the
			// intended condition.
			for (int i = 0; i < text.length(); i++) {
				char ch = text.charAt(i);
				if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n'
						|| paragraph.getCount() == 0) {
					isImg = true;
					break;
				}
			}
			if (!isImg && isImage) {
				imageCount++;
				String bookmarkName = fileName + "_UNFIG" + String.format("%04d", imageCount);

				if (arg2.equals("EL")) {
					String jpegName = elFolder + bookmarkName + ".jpg";
					doc1.save(jpegName);
					BufferedImage dest = trimWhiteSpaces(jpegName); // Trim White Space in Images.
					if (dest != null) {
						ImageIO.write(dest, "jpg", new File(jpegName));
					}
					BufferedImage img1 = ImageIO.read(new File(jpegName));
					if (img1 == null) {
						// ImageIO.read returns null when no registered reader can decode the file.
						throw new java.io.IOException("Cannot read rendered image: " + jpegName);
					}
					createEL_XMLLog(bookmarkName, img1.getWidth(), img1.getHeight());
					extractedimage.add(bookmarkName);
					allimages.add(bookmarkName);
					insertBookmark(doc, paragraph, bookmarkName);
					deleteNode(doc, paragraph);
				}
				if (arg2.equals("AIE")) {
					String pdf = pdfFolder + bookmarkName + ".pdf";
					doc1.save(pdf);
					// The .jpeg is a temporary rendering used only to measure the image; it is
					// deleted again below, the .pdf is what remains.
					String pdfFilename = pdfFolder + bookmarkName + ".jpeg";
					doc1.save(pdfFilename);
					BufferedImage dest = trimWhiteSpaces(pdfFilename); // Trim White Space in Images.
					if (dest != null) {
						ImageIO.write(dest, "jpeg", new File(pdfFilename));
					}
					BufferedImage img = ImageIO.read(new File(pdfFilename));
					if (img == null) {
						throw new java.io.IOException("Cannot read rendered image: " + pdfFilename);
					}
					createEL_XMLLog(bookmarkName, img.getWidth(), img.getHeight());
					extractedimage.add(bookmarkName);
					allimages.add(bookmarkName);
					Files.deleteIfExists(new File(pdfFilename).toPath());
					insertBookmark(doc, paragraph, bookmarkName);
					deleteNode(doc, paragraph);
				}
				// Empty the scratch document so the next candidate starts from a clean body.
				doc1.getFirstSection().getBody().removeAllChildren();
				doc1.updatePageLayout();
			}
		}
		// Reset the per-paragraph flags before the next iteration.
		isImage = false;
		isImg = false;
	}
}

/**
 * Reports whether the paragraph's text contains at least one ASCII letter or decimal digit.
 *
 * @param paragraph paragraph whose {@code getText()} result is inspected
 * @return {@code true} if any character lies in {@code A-Z}, {@code a-z} or {@code 0-9};
 *         {@code false} otherwise (including for empty text)
 */
public static boolean letterCheck(Paragraph paragraph) {
	final String content = paragraph.getText();
	// One pass over the characters instead of one substring search per candidate symbol.
	for (int idx = 0; idx < content.length(); idx++) {
		final char current = content.charAt(idx);
		final boolean upperCase = current >= 'A' && current <= 'Z';
		final boolean lowerCase = current >= 'a' && current <= 'z';
		final boolean digit = current >= '0' && current <= '9';
		if (upperCase || lowerCase || digit) {
			return true;
		}
	}
	return false;
}

tahir.manzoor · March 9, 2020, 4:14pm

@jan.kathir

Please check the code example in the following article to extract the images from the document.
How to Extract Images from a Document

Moreover, please read the following article to extract the content from the document. Hope this helps you.
Extract Selected Content Between Nodes