Dear team,
We are extracting images from a DOCX file, but in the case below the image is extracted as a completely black image. Please find the input and output files attached, and the source code below:
// Figure-caption handling for the current `paragraph`.
//
// When the paragraph text looks like a figure caption ("fig...", "Scheme", "Plate", "Abb...")
// and the surrounding-node checks below pass, this block walks backwards over the preceding
// sibling paragraphs that contain shapes and wraps them, together with the caption, in a
// bookmark named "Bookmark" + bookmark (the counter is then incremented).
// Side-by-side ("parallel") figures are delegated to ParallelImage.parallelImage and
// supplementary captions to AIE.insertBookmark; both paths `continue`.
// NOTE(review): the `continue` statements imply this fragment sits inside a loop over
// paragraphs; the loop header is not part of this excerpt.
if ((paragraph.toString(SaveFormat.TEXT).toLowerCase().trim().startsWith("fig")
        || paragraph.toString(SaveFormat.TEXT).startsWith("Scheme")
        || paragraph.toString(SaveFormat.TEXT).startsWith("Plate")
        || paragraph.toString(SaveFormat.TEXT).startsWith("Abb")
        || paragraph.toString(SaveFormat.TEXT).startsWith("Abbildung"))
        && !paragraph.toString(SaveFormat.TEXT).toLowerCase().startsWith("abbreviations")
        // for duplicate figure caption it-15:
        // either the next sibling is not itself a caption (does not match `matches`), or it is a
        // caption paragraph that carries shapes / is followed by another non-table node, or the
        // last paragraph of the section body matches `matches`.
        && (paragraph.getNextSibling() != null
                && !paragraph.getNextSibling().toString(SaveFormat.TEXT).trim().matches(matches)
            || (paragraph.getNextSibling() != null
                && paragraph.getNextSibling().getNodeType() != NodeType.TABLE
                && paragraph.getNextSibling().toString(SaveFormat.TEXT).trim().matches(matches)
                && (((Paragraph)paragraph.getNextSibling()).getChildNodes(NodeType.SHAPE, true)
                        .getCount() > 0
                    || (paragraph.getNextSibling().getNextSibling()) != null
                        && paragraph.getNextSibling().getNextSibling()
                                .getNodeType() != NodeType.TABLE
                        // the two alternatives below (count == 0 / count > 0) cover every case;
                        // kept as written so the evaluation order and casts stay unchanged
                        && ((((Paragraph)paragraph.getNextSibling().getNextSibling())
                                .getChildNodes(NodeType.SHAPE, true).getCount() == 0)
                            // this condition added by pavi-14-12-2021 for duplicate captions
                            || (((Paragraph)paragraph.getNextSibling().getNextSibling())
                                .getChildNodes(NodeType.SHAPE, true).getCount() > 0))))
            || paragraph.getParentSection().getBody().getLastParagraph().getText().trim()
                    .matches(matches))
        // for duplicate figure caption: the previous sibling must exist and not be a table,
        // unless the first paragraph of the section body matches `matches`
        && ((paragraph.getPreviousSibling() != null
                && paragraph.getPreviousSibling().getNodeType() != NodeType.TABLE)
            || paragraph.getParentSection().getBody().getFirstParagraph().getText().trim()
                    .matches(matches))
        && paragraph.getNodeType() != NodeType.TABLE
        && paragraph.getParentNode().getNodeType() != NodeType.CELL
        && !paragraph.toString(SaveFormat.TEXT).contains(AIE.docName)
        // condition added by pavi -14-12-2021
        // FIX: the two negated tests were joined with `||`, which is always true because no
        // text starts with both "Figure Captions" and "Figures"; `&&` actually excludes
        // paragraphs that start with either heading.
        && !paragraph.toString(SaveFormat.TEXT).trim().startsWith("Figure Captions")
        && !paragraph.toString(SaveFormat.TEXT).trim().startsWith("Figures"))
{
    // parallel check it-15
    ArrayList<String> cap = new ArrayList<>();
    cap.add("Fig");
    cap.add("Scheme");
    cap.add("Plate");
    String captionName = paragraph.toString(SaveFormat.TEXT).trim();

    // First prefix from `cap` that the caption starts with, or "" when none matches.
    // The prefixes are mutually exclusive, so at most one can match.
    String toreplace = cap.stream()
            .filter(prefix -> captionName.matches(prefix + ".*"))
            .findFirst()
            .orElse("");

    // Replacing the prefix with itself on a clone is used only for the returned replacement
    // count, i.e. how many times the prefix occurs in the paragraph; the document is untouched.
    int replaces = toreplace.isEmpty()
            ? 0
            : paragraph.deepClone(true).getRange().replace(toreplace, toreplace);

    //17.05.2022 - Mahe - 26
    if (replaces > 1)
    {
        // Split on runs of 2+ whitespace characters followed by "Fig".
        String[] parts = captionName.split("\\s{2,}Fig");
        for (int i = 0; i < parts.length; i++)
        {
            String tail = "";
            if (parts[i].length() > 4)
            {
                // FIX: the original took substring(length - 10) behind a `length > 4` guard,
                // which throws StringIndexOutOfBoundsException for parts of 5..9 characters.
                // Clamp the start index so shorter parts yield the whole part instead.
                tail = parts[i].substring(Math.max(0, parts[i].length() - 10));
            }
            tail = tail.replace(" ", "").replace("\t", "");
            if (tail.length() == 0 || parts.length == 1)
            {
                // Treat the caption as a single (non-parallel) caption.
                replaces = 1;
            }
        }
    }

    // Count fields of type 12 in the caption.
    // NOTE(review): 12 is presumably FieldType.FIELD_SEQUENCE (SEQ fields) — confirm and
    // replace the literal with the named constant.
    int seqcount = 0;
    for (Field field : paragraph.getRange().getFields())
    {
        if (field.getType() == 12)
        {
            seqcount += 1;
        }
    }

    //Mahe - 20.04.2022 - 2
    // parallelPreviouscheck: previous sibling holds more than one shape and the caption holds none.
    // parallelFigcheck:      previous sibling holds no shape and the caption holds more than one.
    boolean parallelPreviouscheck = false;
    boolean parallelFigcheck = false;
    try
    {
        parallelPreviouscheck = ((Paragraph)paragraph.getPreviousSibling())
                .getChildNodes(NodeType.SHAPE, true).getCount() > 1
                && paragraph.getChildNodes(NodeType.SHAPE, true).getCount() == 0;
        parallelFigcheck = ((Paragraph)paragraph.getPreviousSibling())
                .getChildNodes(NodeType.SHAPE, true).getCount() == 0
                && paragraph.getChildNodes(NodeType.SHAPE, true).getCount() > 1;
    }
    catch (Exception e)
    {
        // A null or non-paragraph previous sibling ends up here; both flags keep the values
        // assigned so far.
        // NOTE(review): "{0}" is only a placeholder for MessageFormat-style loggers — confirm
        // against the logger actually in use.
        logger.info("Exception Occur, {0}", e.getMessage());
    }

    boolean isanchore = false;
    if ((replaces > 1 && (seqcount == 2 || seqcount == 0)) && parallelPreviouscheck)
    {
        seqcount = 0;
        //parallel check it 16
        try
        {
            if (ParallelImage.parallelImage(toreplace, isanchore, paragraph,
                    (Paragraph)paragraph.getPreviousSibling(), doc))
            {
                continue;
            }
        }
        catch (IndexOutOfBoundsException e)
        {
            logger.info("Exception Occur, {0}", e.getMessage());
            e.printStackTrace();
        }
    }
    else if ((replaces > 1 && (seqcount == 2 || seqcount == 0)) && parallelFigcheck)
    {
        seqcount = 0;
        isanchore = true;
        if (ParallelImage.parallelImage(toreplace, isanchore, paragraph, paragraph, doc))
        {
            continue;
        }
    }
    // parallel check

    // supplymentry check sample: JCIS_SRE_2020_1_2nd_revision.docx
    if (AIE.supplymentryCheck(paragraph.toString(SaveFormat.TEXT).trim()))
    {
        AIE.insertBookmark(interimdoc, paragraph, AIE.fileName);
        continue;
    }
    // supplymentry check

    Node previousPara = paragraph.getPreviousSibling();
    boolean hasImage = false;
    boolean isLabel;
    try
    {
        // Skip backwards over paragraphs that have neither shapes nor text.
        while (((Paragraph)previousPara).getChildNodes(NodeType.SHAPE, true).getCount() == 0
                && previousPara.toString(SaveFormat.TEXT).trim().length() == 0)
        {
            previousPara = previousPara.getPreviousSibling();
        }
        isLabel = AIE.findLabel(previousPara, doc);
    }
    catch (NullPointerException e)
    {
        // Ran off the start of the sibling list (previousPara is null).
        isLabel = false;
    }
    catch (ClassCastException e)
    {
        // previousPara is a non-paragraph node.
        // NOTE(review): 9 / 10 / 5 are presumably NodeType.BOOKMARK_START / BOOKMARK_END /
        // TABLE — confirm and replace the literals with the named constants.
        isLabel = false;
        if (previousPara.getNodeType() == 9 || previousPara.getNodeType() == 10)
        {
            previousPara.remove();
            previousPara = paragraph.getPreviousSibling();
        }
        else if (previousPara.getNodeType() == 5)
        {
            continue;
        }
    }
    // When the node found is a label, step over it.
    previousPara = isLabel ? previousPara.getPreviousSibling() : previousPara;

    int shapeCount = 0;
    try
    {
        NodeCollection<Shape> shapes = ((Paragraph)previousPara).getChildNodes(NodeType.SHAPE, true);
        for (Shape shape : shapes)
        {
            // NOTE(review): -2 is presumably ShapeType.OLE_OBJECT — confirm.
            if (shape.getShapeType() != -2)
            {
                shapeCount++;
            }
            if (shapeCount >= 1)
            {
                hasImage = true;
                break;
            }
        }
    }
    catch (Exception e)
    {
        logger.info("Exception occur, {0}", e.getMessage());
        e.printStackTrace();
    }

    // Keep walking backwards while the previous node is a paragraph with shapes that is not
    // itself a caption (does not match `matches`); previousPara ends on the first node that
    // fails these tests.
    while (previousPara != null && previousPara.getNodeType() == NodeType.PARAGRAPH
            && (!AIE.removeEquationShapes((Paragraph)previousPara) || shapeCount > 0)
            && ((Paragraph)previousPara).getChildNodes(NodeType.SHAPE, true).getCount() > 0
            && !previousPara.toString(SaveFormat.TEXT).trim().matches(matches))
    {
        hasImage = true;
        try
        {
            shapeCount = 0;
            previousPara = previousPara.getPreviousSibling();
            // Skip paragraphs that have neither shapes nor text.
            while (((Paragraph)previousPara).getChildNodes(NodeType.SHAPE, true).getCount() == 0
                    && previousPara.toString(SaveFormat.TEXT).trim().length() == 0)
            {
                previousPara = previousPara.getPreviousSibling();
            }
            NodeCollection<Shape> shapesd = ((Paragraph)previousPara).getChildNodes(NodeType.SHAPE, true);
            for (Shape shape : shapesd)
            {
                if (shape.getShapeType() != -2)
                {
                    shapeCount++;
                }
                if (shapeCount >= 1)
                {
                    break;
                }
            }
            isLabel = AIE.findLabel(previousPara, doc);
        }
        catch (Exception e)
        {
            // A null or non-paragraph node ends the walk: the loop condition fails next time.
            isLabel = false;
            e.printStackTrace();
        }
        previousPara = isLabel ? previousPara.getPreviousSibling() : previousPara;
    }

    try
    {
        String bookStart = "Bookmark";
        // new
        // Move forward again over paragraphs that have neither shapes nor text, so that
        // previousPara ends on the node just before the first figure paragraph.
        while (previousPara != null
                && ((Paragraph)previousPara.getNextSibling()).getChildNodes(NodeType.SHAPE, true)
                        .getCount() == 0
                && previousPara.getNextSibling().toString(SaveFormat.TEXT).trim().length() == 0)
        {
            previousPara = previousPara.getNextSibling();
        }
        // new
        if (previousPara == null)
        {
            // Nothing precedes the figure: start the bookmark in a new paragraph inserted
            // before the first child of the section body.
            Node node = (paragraph).getParentNode().insertBefore(new Paragraph(doc),
                    paragraph.getParentSection().getBody().getFirstChild());
            // figure caption in text box sample:Revised manuscript with no changes marked
            // If the caption paragraph itself holds shapes, end the bookmark in a new
            // paragraph inserted after it; otherwise end it in the caption.
            Node endNode = paragraph.getChildNodes(NodeType.SHAPE, true).getCount() > 0
                    ? paragraph.getParentNode().insertAfter(new Paragraph(doc), paragraph)
                    : paragraph;
            // figure caption in text box
            builder.moveTo(node);
            builder.startBookmark(bookStart + bookmark);
            builder.moveTo(endNode);
            builder.endBookmark(bookStart + bookmark);
            bookmark++;
        }
        else if (((Paragraph)previousPara.getNextSibling()).getChildNodes(NodeType.SHAPE, true)
                .getCount() != 0
                // for equation removal sample:Paper-many-chains-revised2-20_11.docx
                && !AIE.removeEquationShapes((Paragraph)previousPara.getNextSibling())
                // for equation removal
                || hasImage)
        {
            // Start the bookmark in a new paragraph inserted just before the first figure
            // paragraph (the node after previousPara).
            previousPara = previousPara.getNextSibling();
            Node node = ((Paragraph)previousPara).getParentNode().insertBefore(new Paragraph(doc),
                    previousPara);
            // figure caption in text box sample:Revised manuscript with no changes marked
            Node endNode = paragraph.getChildNodes(NodeType.SHAPE, true).getCount() > 0
                    ? paragraph.getParentNode().insertAfter(new Paragraph(doc), paragraph)
                    : paragraph;
            // figure caption in text box
            builder.moveTo(node);
            builder.startBookmark(bookStart + bookmark);
            builder.moveTo(endNode);
            builder.endBookmark(bookStart + bookmark);
            bookmark++;
        }
    }
    catch (NullPointerException e)
    {
        // A missing next sibling ends up here; no bookmark is written for this caption.
        logger.info("Exception occurs, {0}", e.getMessage());
    }
}
Input and output: 23.zip (569.4 KB)