We're sorry, Aspose doesn't work properly without JavaScript enabled.

Free Support Forum - aspose.com

Image Extraction 13

Dear team,

We are extracting images from a DOCX file, but in the case below the image is extracted as a black image. Please find the input and output files attached, along with the source code.

// Decides whether `paragraph` is treated as a figure caption by the body that follows.
// `matches` is a regex defined outside this fragment — presumably the caption pattern; confirm.
if ((paragraph.toString(SaveFormat.TEXT).toLowerCase().trim().startsWith("fig")
					|| paragraph.toString(SaveFormat.TEXT).startsWith("Scheme")
					|| paragraph.toString(SaveFormat.TEXT).startsWith("Plate")
					|| paragraph.toString(SaveFormat.TEXT).startsWith("Abb")
					|| paragraph.toString(SaveFormat.TEXT).startsWith("Abbildung"))
					// "Abbreviations" also starts with "Abb" but is not a caption keyword.
					&& !paragraph.toString(SaveFormat.TEXT).toLowerCase().startsWith("abbreviations")
					// for duplicate figure caption it-15:
					// accept when the next sibling does not match the pattern, ...
					&& (paragraph.getNextSibling() != null
							&& !paragraph.getNextSibling().toString(SaveFormat.TEXT).trim().matches(matches)
							// ... or when it matches, is not a table, and either it or the sibling after it
							// passes the shape tests below, ...
							|| (paragraph.getNextSibling() != null
									&& paragraph.getNextSibling().getNodeType() != NodeType.TABLE
									&& paragraph.getNextSibling().toString(SaveFormat.TEXT).trim().matches(matches)
									&& (((Paragraph)paragraph.getNextSibling()).getChildNodes(NodeType.SHAPE, true)
											.getCount() > 0
											|| (paragraph.getNextSibling().getNextSibling()) != null
													&& paragraph.getNextSibling().getNextSibling()
															.getNodeType() != NodeType.TABLE
													// NOTE(review): "== 0 || > 0" accepts every shape count, so this
													// pair only has an effect through the Paragraph cast. Kept as written.
													&& ((((Paragraph)paragraph.getNextSibling().getNextSibling())
															.getChildNodes(NodeType.SHAPE, true).getCount() == 0)

															// this condition added by pavi-14-12-2021 for duplicate captions
															|| (((Paragraph)paragraph.getNextSibling().getNextSibling())
																	.getChildNodes(NodeType.SHAPE, true).getCount() > 0))))
							// ... or when the last paragraph of the section body matches the pattern.
							|| paragraph.getParentSection().getBody().getLastParagraph().getText().trim()
									.matches(matches))
					// for duplicate figure caption: need a non-table previous sibling, unless the first
					// paragraph of the section body matches the pattern.
					&& ((paragraph.getPreviousSibling() != null
							&& paragraph.getPreviousSibling().getNodeType() != NodeType.TABLE)
							|| paragraph.getParentSection().getBody().getFirstParagraph().getText().trim()
									.matches(matches))
					&& paragraph.getNodeType() != NodeType.TABLE
					// Paragraphs directly inside a table cell are skipped.
					&& paragraph.getParentNode().getNodeType() != NodeType.CELL
					&& !paragraph.toString(SaveFormat.TEXT).contains(AIE.docName)

					// condition added by pavi -14-12-2021: skip the "Figure Captions" / "Figures" headings.
					// The original joined the two negations with ||, which is always true because no text
					// starts with both prefixes; && is required for the exclusion to take effect.
					// NOTE(review): this also excludes captions that begin with "Figures" (e.g. "Figures 1-2").
					&& !paragraph.toString(SaveFormat.TEXT).trim().startsWith("Figure Captions")
					&& !paragraph.toString(SaveFormat.TEXT).trim().startsWith("Figures"))

//|| ((paragraph.getNextSibling() == null) && (builder.getCurrentParagraph().isEndOfDocument()))


{



	// parallel check it-15: find which caption keyword this paragraph starts with.
	ArrayList<String> cap = new ArrayList<>();
	cap.add("Fig");
	cap.add("Scheme");
	cap.add("Plate");
	String captionName = paragraph.toString(SaveFormat.TEXT).trim();

	// The keywords are mutually exclusive prefixes, so at most one can match; "" means none matched.
	String toreplace = cap.stream().filter(i -> captionName.matches(i + ".*")).findFirst().orElse("");

	// Replaces the keyword with itself on a throw-away clone and keeps the returned int.
	// NOTE(review): relies on Range.replace returning the number of replacements made — confirm.
	int replaces = toreplace.isEmpty() ? 0 : paragraph.deepClone(true).getRange().replace(toreplace, toreplace);


	//17.05.2022 - Mahe - 26
	// When the keyword occurs more than once, split the text on "<2+ whitespace>Fig" and fall back to
	// a single caption (replaces = 1) if the split produced one part or a part has a blank tail.
	if (replaces > 1)
	{
		String[] parts = captionName.split("\\s{2,}Fig");
		for (String part : parts)
		{
			String tail = "";
			if (part.length() > 4)
			{
				// Take at most the last 10 characters. The original used length() - 10 directly and
				// threw StringIndexOutOfBoundsException for parts of 5 to 9 characters.
				tail = part.substring(Math.max(0, part.length() - 10));
			}
			tail = tail.replace(" ", "").replace("\t", "");
			if (tail.isEmpty() || parts.length == 1)
			{
				replaces = 1;
			}
		}
	}


	// Tally the fields in this paragraph's range whose type code equals 12.
	// NOTE(review): 12 is presumably the SEQ (caption numbering) field type — confirm against FieldType.
	int seqcount = 0;
	for (Field field : paragraph.getRange().getFields())
	{
		seqcount += (field.getType() == 12) ? 1 : 0;
	}


	//Mahe - 20.04.2022 - 2
	// parallelPreviouscheck: previous paragraph holds more than one shape and this paragraph holds none.
	// parallelFigcheck:      previous paragraph holds no shape and this paragraph holds more than one.
	boolean parallelPreviouscheck = false;
	boolean parallelFigcheck = false;

	// Both flags stay false when there is no previous sibling or it is not a Paragraph. The original
	// reached the same result by catching the NullPointerException / ClassCastException (and logging
	// it); an explicit type check avoids using exceptions for control flow and no longer hides
	// unrelated failures behind a broad catch.
	Node parallelPrevNode = paragraph.getPreviousSibling();
	if (parallelPrevNode instanceof Paragraph)
	{
		int parallelPrevShapes = ((Paragraph)parallelPrevNode).getChildNodes(NodeType.SHAPE, true).getCount();
		int parallelOwnShapes = paragraph.getChildNodes(NodeType.SHAPE, true).getCount();

		parallelPreviouscheck = parallelPrevShapes > 1 && parallelOwnShapes == 0;
		parallelFigcheck = parallelPrevShapes == 0 && parallelOwnShapes > 1;
	}


	// Hand the paragraph to ParallelImage when the keyword occurs more than once and the paragraph
	// has either zero or two type-12 fields. A true result means nothing more is done for this
	// paragraph in the current iteration.
	boolean isanchore = false;
	final boolean parallelCandidate = replaces > 1 && (seqcount == 2 || seqcount == 0);
	if (parallelCandidate && parallelPreviouscheck)
	{
		seqcount = 0;
		//parallel check it 16: the shapes live in the previous paragraph
		try
		{
			boolean parallelHandled = ParallelImage.parallelImage(toreplace, isanchore, paragraph,
					(Paragraph)paragraph.getPreviousSibling(), doc);
			if (parallelHandled)
			{
				continue;
			}
		}
		catch (IndexOutOfBoundsException e)
		{
			logger.info("Exception Occur, {0}", e.getMessage());
			e.printStackTrace();
		}
	}
	else if (parallelCandidate && parallelFigcheck)
	{
		// the shapes live in the caption paragraph itself
		seqcount = 0;
		isanchore = true;
		boolean parallelHandled = ParallelImage.parallelImage(toreplace, isanchore, paragraph, paragraph, doc);
		if (parallelHandled)
		{
			continue;
		}
	}
	// parallel check
	// supplymentry check sample: JCIS_SRE_2020_1_2nd_revision.docx
	if (AIE.supplymentryCheck(paragraph.toString(SaveFormat.TEXT).trim()))
	{
		AIE.insertBookmark(interimdoc, paragraph, AIE.fileName);

		continue;
	}
	// supplymentry check
	// Locate the nearest meaningful node above the caption: start at the previous sibling and walk
	// backwards past paragraphs that contain no shapes and only whitespace text.
	Node previousPara = paragraph.getPreviousSibling();

	boolean hasImage = false;
	boolean isLabel;
	try
	{
		// The cast/dereference doubles as the loop's exit for "ran out of siblings" (NullPointerException)
		// and "sibling is not a Paragraph" (ClassCastException); both are handled below.
		while (((Paragraph)previousPara).getChildNodes(NodeType.SHAPE, true).getCount() == 0
				&& previousPara.toString(SaveFormat.TEXT).trim().length() == 0)
		{
			previousPara = previousPara.getPreviousSibling();

		}
		// NOTE(review): exceptions thrown inside AIE.findLabel are also routed to the catches below — confirm intended.
		isLabel = AIE.findLabel(previousPara, doc);
	}
	catch (NullPointerException e)
	{
		// No usable sibling above the caption (previousPara is null at this point when the scan ran off the start).
		isLabel = false;
	}
	catch (ClassCastException e)
	{
		isLabel = false;
		// NOTE(review): 9 and 10 are presumably NodeType.BOOKMARK_START / BOOKMARK_END — confirm.
		// Such a node is removed and the scan position is reset to the caption's (new) previous sibling.
		if (previousPara.getNodeType() == 9 || previousPara.getNodeType() == 10)
		{
			previousPara.remove();
			previousPara = paragraph.getPreviousSibling();
		}
		// NOTE(review): 5 is presumably NodeType.TABLE — confirm. The caption is skipped in that case.
		else if (previousPara.getNodeType() == 5)
		{
			continue;
		}
	}

	// When the node found was reported as a label by AIE.findLabel, step over it to the node above.
	previousPara = isLabel ? previousPara.getPreviousSibling() : previousPara;

	// Look for the first shape in previousPara whose shape-type code is not -2; finding one sets
	// shapeCount to 1 and hasImage to true.
	// NOTE(review): -2 is presumably ShapeType.OLE_OBJECT — confirm.
	int shapeCount = 0;
	try
	{
		NodeCollection<Shape> shapes = ((Paragraph)previousPara).getChildNodes(NodeType.SHAPE, true);
		for (Shape shape : shapes)
		{
			if (shape.getShapeType() == -2)
			{
				continue;
			}
			shapeCount++;
			hasImage = true;
			break;
		}
	}
	catch (Exception e)
	{
		// Reached, for example, when previousPara is null or is not a Paragraph.
		logger.info("Exception occur, {0}", e.getMessage());
		e.printStackTrace();
	}

	// Extend the scan upwards across consecutive image paragraphs. The loop runs while previousPara
	// is a Paragraph that contains at least one shape, whose text does not match `matches`, and for
	// which either AIE.removeEquationShapes returns false or the last count found a qualifying shape.
	// NOTE(review): AIE.removeEquationShapes is evaluated on every loop test; its name suggests it may
	// modify the paragraph — confirm side effects.
	while (previousPara != null && previousPara.getNodeType() == NodeType.PARAGRAPH
			&& (!AIE.removeEquationShapes((Paragraph)previousPara) || shapeCount > 0)

			&& ((Paragraph)previousPara).getChildNodes(NodeType.SHAPE, true).getCount() > 0
			&& !previousPara.toString(SaveFormat.TEXT).trim().matches(matches))
	{

		hasImage = true;
		try
		{
			shapeCount = 0;
			// Step to the paragraph above the one just accepted.
			previousPara = previousPara.getPreviousSibling();

			// Skip blank, shape-free paragraphs; a null or non-Paragraph sibling throws and is
			// handled by the catch below.
			while (((Paragraph)previousPara).getChildNodes(NodeType.SHAPE, true).getCount() == 0
					&& previousPara.toString(SaveFormat.TEXT).trim().length() == 0)
			{
				previousPara = previousPara.getPreviousSibling();
			}
			// Re-count for the new position: stops at the first shape whose type code is not -2.
			NodeCollection<Shape> shapesd = ((Paragraph)previousPara).getChildNodes(NodeType.SHAPE, true);
			for (Shape shape : shapesd)
			{
				if (shape.getShapeType() != -2)
				{

					shapeCount++;
				}
				if (shapeCount >= 1)
				{
					break;
				}
			}
			isLabel = AIE.findLabel(previousPara, doc);
		}
		catch (Exception e)
		{
			// Any failure leaves previousPara wherever the scan stopped; the loop condition then decides
			// whether to continue (it ends for null and for non-Paragraph nodes).
			isLabel = false;
			e.printStackTrace();
		}
		// Step over a node that AIE.findLabel reported as a label.
		previousPara = isLabel ? previousPara.getPreviousSibling() : previousPara;
	}

	// Wrap the figure (image paragraphs plus caption) in a bookmark named "Bookmark" + bookmark.
	// A NullPointerException anywhere in this section abandons the bookmark for this caption and is only logged.
	// NOTE(review): a ClassCastException from the Paragraph casts below is not caught here — confirm that is acceptable.
	try
	{
		String bookStart = "Bookmark";
		// new
		// Move forward again while the node after previousPara is a blank, shape-free paragraph.
		while (previousPara != null
				&& ((Paragraph)previousPara.getNextSibling()).getChildNodes(NodeType.SHAPE, true)
						.getCount() == 0
				&& previousPara.getNextSibling().toString(SaveFormat.TEXT).trim().length() == 0)
		{
			previousPara = previousPara.getNextSibling();

		}
		// new
		if (previousPara == null)
		{
			// Nothing precedes the figure: the bookmark starts at a new empty paragraph inserted
			// before the first child of the section body.
			Node node = (paragraph).getParentNode().insertBefore(new Paragraph(doc),
					paragraph.getParentSection().getBody().getFirstChild());
			// figure caption in text box sample:Revised manuscript with no changes marked
			// If the caption paragraph itself contains shapes, end the bookmark at a new empty
			// paragraph inserted after it; otherwise end it at the caption paragraph.
			Node endNode = paragraph.getChildNodes(NodeType.SHAPE, true).getCount() > 0
					? paragraph.getParentNode().insertAfter(new Paragraph(doc), paragraph)
					: paragraph;
			// figure caption in text box
			builder.moveTo(node);
			builder.startBookmark(bookStart + bookmark);
			builder.moveTo(endNode);
			builder.endBookmark(bookStart + bookmark);
			bookmark++;
		}
		// Operator precedence: (next sibling has shapes && removeEquationShapes is false) || hasImage.
		else if (((Paragraph)previousPara.getNextSibling()).getChildNodes(NodeType.SHAPE, true)
				.getCount() != 0
				// for equation removal sample:Paper-many-chains-revised2-20_11.docx
				&& !AIE.removeEquationShapes((Paragraph)previousPara.getNextSibling())
				// for equation removal
				|| hasImage)
		{


			// The node after previousPara is the first node of the figure.
			previousPara = previousPara.getNextSibling();

			// The bookmark starts at a new empty paragraph inserted directly before that node.
			Node node = ((Paragraph)previousPara).getParentNode().insertBefore(new Paragraph(doc),
					previousPara);
			// figure caption in text box sample:Revised manuscript with no changes marked
			// Same end-node rule as in the branch above.
			Node endNode = paragraph.getChildNodes(NodeType.SHAPE, true).getCount() > 0
					? paragraph.getParentNode().insertAfter(new Paragraph(doc), paragraph)
					: paragraph;
			// figure caption in text box
			builder.moveTo(node);

			builder.startBookmark(bookStart + bookmark);
			builder.moveTo(endNode);
			builder.endBookmark(bookStart + bookmark);
			bookmark++;
		}
	}
	catch (NullPointerException e)
	{
		// Reached, for example, when previousPara has no next sibling.
		logger.info("Exception occurs, {0}", e.getMessage());
	}
}

Input and output: 23.zip (569.4 KB)

@e503824 Thank you for reporting the problem to us. For the sake of correction, it has been logged as WORDSJAVA-2751. We will keep you informed and let you know once it is resolved.

Dear team,

The extraction itself works fine; we only get this issue when converting DOCX to PDF.
Please find the source code and the input/output files below.

[18:44] Mahesh M
// Minimal reproduction: load the DOCX and save it as PDF.
// String literals use straight quotes and escaped backslashes so the snippet compiles.
Document doc = new Document("D:\\AIE_Samples\\IssueFile\\August\\05.08.2022\\1\\Bookmark2.docx");
doc.save("D:\\AIE_Samples\\IssueFile\\August\\05.08.2022\\1\\out.pdf", SaveFormat.PDF);

Input : Bookmark2.docx (59.1 KB)

output : Fig0002 (3).pdf (38.3 KB)

@e503824 Yes, we have noticed that the problem occurs upon conversion to PDF. We will keep you updated and let you know once the problem is resolved or we have more information for you.