Dear team,
We are extracting images from DOCX files, but the attached document contains a flowchart that we are not able to extract. Please find the source code and the input file below.
Source code :
public class FixedGraphic {
// Regex alternation over caption-like text beginning with "Fig", "Scheme", "Plate" or "Abbildung".
// Not referenced by the code visible in this snippet — presumably used elsewhere; verify.
static String matches = "Fig.*(?:[ \\r\\n\\t].*)+|Scheme.*|Plate.*|Abbildung.*|Fig.*(?:[ \\r\\n\\t]*)+";
// Log4j2 logger shared by the static methods of this class.
private static org.apache.logging.log4j.Logger logger = LogManager.getLogger(FixedGraphic.class);
// Not referenced by the code visible in this snippet (fixedImage keeps its own local counter).
static int count = 1;
// Receives the extracted-image details via setExtracteddetails(...) in fixedImage.
// NOTE(review): never assigned in the visible code — confirm it is initialised before fixedImage runs.
static Resultjson rs;
/**
 * Exports every qualifying picture shape of {@code interimdoc} to its own PDF file,
 * records it in the {@code AIE} image lists and in {@code rs}, bookmarks its position in
 * the interim document and finally removes the shape from that document.
 *
 * <p>A shape qualifies when it carries image data, is not an OLE equation, the text of the
 * node preceding its enclosing table (if any) does not contain {@code AIE.docName}, and
 * {@code AIE.supplementaryFigure} is not set. Files are named {@code GA<n>.pdf} when
 * {@code AIE.filearg} contains "graphicalabstract" (case- and whitespace-insensitive),
 * otherwise {@code FX<n>.pdf}.
 *
 * <p>A failure while processing one shape is logged and does not stop the remaining shapes.
 *
 * @param interimdoc the document to scan; matching shapes are removed from it
 * @throws Exception if collecting the shapes or creating the layout collector fails
 */
public static void fixedImage(Document interimdoc) throws Exception {
    // getChildNodes returns a live collection; iterate over a snapshot so that the
    // shape.remove() call below cannot shift the collection and skip shapes.
    Node[] shapeNodes = interimdoc.getChildNodes(NodeType.SHAPE, true).toArray();
    LayoutCollector collector = new LayoutCollector(interimdoc);
    int imageIndex = 1;
    for (Node node : shapeNodes) {
        Shape shape = (Shape) node;
        String text = textBeforeEnclosingTable(shape);
        try {
            //25.05.2022 - Mahe - 27
            boolean mathType = isOleEquation(shape);
            // NOTE(review): hasImage() only admits shapes that carry image data. A drawing
            // built purely from vector shapes (e.g. a flowchart of auto-shapes in a group)
            // never passes this filter — likely why such figures are not exported; confirm
            // against the input document before widening the filter.
            if (!shape.hasImage() || text.contains(AIE.docName) || mathType || AIE.supplementaryFigure) {
                continue;
            }

            // Locale.ROOT keeps the keyword check independent of the JVM default locale.
            String lwFilearg = AIE.filearg.toLowerCase(java.util.Locale.ROOT).replaceAll("\\s", "");
            String prefix = lwFilearg.contains("graphicalabstract") ? "GA" : "FX";
            String imgName = prefix + imageIndex;
            String pdf = AIE.pdfFolder + imgName + ".pdf";
            imageIndex++;

            // Create an intermediate document to where shape will be imported to.
            Document itermDoc = (Document) interimdoc.deepClone(false);
            // Use section imported from the source document to keep the same page size and orientation.
            itermDoc.appendChild(itermDoc.importNode(
                    shape.getAncestor(NodeType.SECTION),
                    false,
                    ImportFormatMode.USE_DESTINATION_STYLES));
            // Add required nodes since we did not import child nodes from the source document.
            itermDoc.ensureMinimum();

            // Climb to the outermost group so a grouped picture is exported with its whole group.
            Node shapeNode = shape;
            while (shapeNode.getParentNode().getNodeType() == NodeType.GROUP_SHAPE) {
                shapeNode = shapeNode.getParentNode();
            }

            // Import shape and put it into the document.
            Node importedShape = itermDoc.importNode(shapeNode, true, ImportFormatMode.USE_DESTINATION_STYLES);
            itermDoc.getFirstSection().getBody().getFirstParagraph().appendChild(importedShape);
            // Save as PDF.
            itermDoc.save(pdf);

            int width = (int) shape.getWidth();
            int height = (int) shape.getHeight();
            int pageNO = collector.getStartPageIndex(shape);
            AIE.extractedimage.add(imgName);
            AIE.allimages.add(imgName);
            String extracteddetails = Kromatrix.fixedImgCreatejson(imgName, width, height, pageNO, pdf, AIE.allimages);
            // NOTE(review): rs is not assigned anywhere in the visible code; if it is still
            // null here the failure is only logged below, after the PDF was already written.
            rs.setExtracteddetails(extracteddetails);

            // Added bookmark in interim document.
            Paragraph pa = shape.getParentParagraph();
            if (pa == null) {
                // A shape nested in a group has no immediate parent paragraph; fall back to
                // the paragraph that hosts the outermost group.
                pa = (Paragraph) shapeNode.getAncestor(NodeType.PARAGRAPH);
            }
            AIE.insertBookmark(interimdoc, pa, imgName);
            // Remove the figure.
            shape.remove();
        } catch (Exception e) {
            // Keep going with the next shape, but preserve the stack trace for diagnosis.
            logger.warn("Failed to export image shape to PDF", e);
        }
    }
}

/**
 * Returns the plain text of the node that immediately precedes the table enclosing
 * {@code shape}, or {@code "NoMatch"} when the shape has no parent paragraph, is not inside
 * a table, the table has no previous sibling, or the text conversion fails.
 */
private static String textBeforeEnclosingTable(Shape shape) {
    final String noMatch = "NoMatch";
    Paragraph paragraph = shape.getParentParagraph();
    if (paragraph == null) {
        return noMatch;
    }
    Node table = paragraph.getAncestor(NodeType.TABLE);
    if (table == null) {
        return noMatch;
    }
    Node previous = table.getPreviousSibling();
    if (previous == null) {
        return noMatch;
    }
    try {
        return previous.toString(SaveFormat.TEXT);
    } catch (Exception e) {
        logger.warn("Failed to read the text preceding the enclosing table", e);
        return noMatch;
    }
}
Input file: Manuscript RATE-protocol FINALR3.docx (73.5 KB)