Dear Team
I am having difficulty extracting unnumbered images from a document. Please suggest a solution for my requirement; I have attached the source code and a sample input document.
Sample::unnumbered.zip (280.6 KB)
Source code::private static void unNumberedImageExtraction(Document interimdoc) throws Exception {
int imageCount = 0;
Document doc1 = new Document();
Document doc = interimdoc;
NodeImporter importer = new NodeImporter(doc, doc1, ImportFormatMode.KEEP_SOURCE_FORMATTING);
NodeCollection paragraphs = doc.getChildNodes(NodeType.PARAGRAPH, true);
for (Paragraph paragraph : (Iterable) paragraphs) {
NodeCollection shapes = paragraph.getChildNodes(NodeType.SHAPE, true);
for (Shape shape : (Iterable<Shape>) shapes) {
String imageExtension = "";
try {
imageExtension = FilenameUtils.getExtension(FileFormatUtil.imageTypeToExtension(shape.getImageData().getImageType()));
} catch (Exception exception) {
imageExtension = "tif";
}
if (!imageExtension.equals("wmf")) {
isImage = true;
}
}
if (isImage) {
Node imageNode = importer.importNode(paragraph, true);
doc1.getFirstSection().getBody().appendChild(imageNode);
doc1.updatePageLayout();
}
System.out.println(paragraph.getText());
Paragraph testPara=paragraph;
testPara.getParagraphFormat().clearFormatting();
if (testPara.getText().startsWith(“Fig”)) {
doc1.getFirstSection().getBody().removeAllChildren();
doc1.updatePageLayout();
isImage = false;
}
if (!letterCheck(paragraph)) {
String text = paragraph.getText();
for (int i = 0; i < text.length(); i++) {
char ch = text.charAt(i);
if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n' || text.equals("")
|| paragraph.getCount() == 0) {
isImg = true;
}
}
if (!isImg && isImage) {
imageCount++;
doc1.getDocument();
String bookmarkName = fileName +"_UNFIG"+String.format("%04d", imageCount);
String jpegName = "";
if (arg2.equals("EL")) {
jpegName = elFolder + bookmarkName + ".jpg";
doc1.save(jpegName);
BufferedImage dest = trimWhiteSpaces(jpegName); // Trim White Space in Images.
if (dest != null) {
ImageIO.write(dest, "jpg", new File(elFolder + bookmarkName + ".jpg"));
}
File imgFile1 = new File(jpegName);
BufferedImage img1 = ImageIO.read(imgFile1);
int height1 = img1.getHeight();
int width1 = img1.getWidth();
createEL_XMLLog(bookmarkName, width1, height1);
extractedimage.add(bookmarkName);
allimages.add(bookmarkName);
insertBookmark(doc, paragraph, bookmarkName);
deleteNode(doc, paragraph);
}
if (arg2.equals("AIE")) {
String pdf = "";
pdf = pdfFolder + bookmarkName + ".pdf";
doc1.save(pdf);
String pdfFilename = pdfFolder + bookmarkName + ".jpeg";
doc1.save(pdfFilename);
BufferedImage dest = trimWhiteSpaces(pdfFilename); // Trim White Space in Images.
if (dest != null) {
ImageIO.write(dest, "jpeg", new File(pdfFilename));
}
File imgFile = new File(pdfFilename);
BufferedImage img = ImageIO.read(imgFile);
int height = img.getHeight();
int width = img.getWidth();
createEL_XMLLog(bookmarkName, width, height);
extractedimage.add(bookmarkName);
allimages.add(bookmarkName);
File filName = new File(pdfFilename);
if (filName.exists()) {
Files.delete(filName.toPath());
}
insertBookmark(doc, paragraph, bookmarkName);
deleteNode(doc, paragraph);
}
doc1.getFirstSection().getBody().removeAllChildren();
doc1.updatePageLayout();
}
}
isImage = false;
isImg = false;
}
}
/**
 * Reports whether the paragraph's text contains at least one ASCII letter
 * ({@code A-Z}, {@code a-z}) or ASCII digit ({@code 0-9}).
 *
 * @param paragraph the paragraph whose text is inspected
 * @return {@code true} if any such character occurs in the text, otherwise {@code false}
 */
public static boolean letterCheck(Paragraph paragraph) {
    String content = paragraph.getText();
    for (int pos = 0; pos < content.length(); pos++) {
        char current = content.charAt(pos);
        boolean upper = current >= 'A' && current <= 'Z';
        boolean lower = current >= 'a' && current <= 'z';
        boolean digit = current >= '0' && current <= '9';
        if (upper || lower || digit) {
            return true;
        }
    }
    return false;
}