Dear team,
We are extracting images from a DOCX file using Aspose.Words for Java, but in the case below the images are extracted incorrectly. Please find the source code and the input and output files below.
Source Code :
// Build the target document for the table; docSetup copies page settings from the source.
Document tableDoc = docSetup(interimdoc, table);
// Importer that brings nodes from interimdoc into tableDoc, keeping the source formatting.
NodeImporter importers = new NodeImporter(interimdoc, tableDoc,
ImportFormatMode.KEEP_SOURCE_FORMATTING);
// Import the table (second argument 'true' presumably requests a deep import - confirm)
// and append it to the body of the first section of the target document.
tableDoc.getFirstSection().getBody().appendChild(importers.importNode(table, true));
// Save under AIE.pdfFolder; the output format presumably follows the extension in saveAS - confirm.
tableDoc.save(AIE.pdfFolder + saveAS);
/**
 * Creates the document that will receive {@code table} and copies the page
 * orientation, paper size and left/right margins of the table's section to it.
 *
 * @param interimDoc the source document that contains {@code table}
 * @param table      the table whose section page setup is copied
 * @return the prepared target document, or {@code interimDoc} itself when
 *         preparation fails (the failure is reported on {@code System.err})
 */
private static Document docSetup(Document interimDoc, Table table) {
    try {
        Document tableDoc = generateDocument(interimDoc);

        DocumentBuilder builder = new DocumentBuilder(interimDoc);
        builder.moveTo(table);
        PageSetup sourcePageSetup = builder.getCurrentSection().getPageSetup();
        PageSetup targetPageSetup = tableDoc.getFirstSection().getPageSetup();

        // Read the orientation from the same page setup as the other values.
        // The previous code cast table.getNextSibling() to Paragraph, which
        // fails when the table is followed by another table or by nothing.
        targetPageSetup.setOrientation(sourcePageSetup.getOrientation());
        targetPageSetup.setPaperSize(sourcePageSetup.getPaperSize());
        targetPageSetup.setLeftMargin(sourcePageSetup.getLeftMargin());
        targetPageSetup.setRightMargin(sourcePageSetup.getRightMargin());
        return tableDoc;
    } catch (Exception e) {
        // Keep the original fallback, but do not hide the failure.
        // NOTE(review): the visible caller appends the table to the returned
        // document and saves it, so this fallback makes it modify and save the
        // source document itself - confirm that this is intended.
        System.err.println("docSetup: could not prepare the table document, "
                + "falling back to the source document: " + e);
        return interimDoc;
    }
}
Input and output files : test.zip (171.2 KB)
@e503824 The images are extracted properly using the ImageExtractor class, which I suggested earlier. Just in case, here is the source again:
// Load the input document from disk.
Document doc = new Document("C:\\Temp\\in.docx");
// The extractor appends file names directly to this folder string, so it must end with a separator.
ImageExtractor extractor = new ImageExtractor("C:\\Temp\\");
// Walk the document; the extractor saves a file whenever it has both a pending shape and a caption.
doc.accept(extractor);
/**
 * Document visitor that pairs top-level shapes with caption paragraphs and
 * saves each group into its own file named {@code image_<n>.pdf} inside the
 * target folder.
 *
 * <p>While visiting, top-level shapes, table rows that contain images and
 * caption texts are pushed onto stacks. As soon as at least one shape and one
 * caption are pending, the pending shapes (or the pending rows, when several
 * shapes sit in a table) are imported into a temporary document and saved.
 *
 * <p>The exported nodes are remembered per file key so that
 * {@link #RemoveImagesFromSourceDocument()} can later replace them with bookmarks.
 */
private static class ImageExtractor extends DocumentVisitor {

    /** ProgId of the OLE equation objects that must not be treated as images. */
    private static final String OLE_EQUATION_PROG_ID = "Equation.DSMT4";

    /**
     * Text prefixes that mark a paragraph as a caption.
     * "Fig" also matches every text starting with "Figure".
     */
    private static final String[] CAPTION_PREFIXES = {"Fig", "Scheme", "Plate", "Flowchart"};

    // Shapes and captions are pushed on these stacks while visiting;
    // only top-level shapes are collected.
    private final Stack<ShapeBase> mTopShapes = new Stack<ShapeBase>();
    private final Stack<Row> mRows = new Stack<Row>();
    private final Stack<String> mCaptions = new Stack<String>();
    private int mShapeCounter = 0;
    private final String mTargetFolder;
    // Nodes that should be deleted and replaced with a bookmark, keyed by bookmark name.
    final HashMap<String, ArrayList<Node>> mNodesToRemove = new HashMap<String, ArrayList<Node>>();

    /**
     * @param targetFolder folder prefix for the output files; file names are
     *                     appended to it as-is, so it must end with a path separator
     */
    public ImageExtractor(String targetFolder) {
        mTargetFolder = targetFolder;
    }

    /** Remembers rows that contain at least one non-equation shape. */
    @Override
    public int visitRowStart(Row row) throws Exception {
        if (rowHasImage(row)) {
            mRows.push(row);
        }
        return super.visitRowStart(row);
    }

    /** Collects top-level group shapes and tries to export the pending shapes. */
    @Override
    public int visitGroupShapeStart(GroupShape groupShape) throws Exception {
        if (groupShape.isTopLevel()) {
            mTopShapes.push(groupShape);
        }
        saveShapeAsPdf();
        return super.visitGroupShapeStart(groupShape);
    }

    /**
     * Collects top-level shapes that contain no paragraphs and are not OLE
     * equations, then tries to export the pending shapes.
     */
    @Override
    public int visitShapeStart(Shape shape) throws Exception {
        boolean hasNoParagraphs = shape.getChildNodes(NodeType.PARAGRAPH, true).getCount() == 0;
        if (shape.isTopLevel() && hasNoParagraphs && !isOleEquation(shape)) {
            mTopShapes.push(shape);
        }
        saveShapeAsPdf();
        return super.visitShapeStart(shape);
    }

    /**
     * Records the text of caption paragraphs, tries to export the pending
     * shapes and skips the caption paragraph itself.
     */
    @Override
    public int visitParagraphStart(Paragraph paragraph) throws Exception {
        if (isCaptionParagraph(paragraph)) {
            mCaptions.push(paragraph.toString(SaveFormat.TEXT).trim());
            saveShapeAsPdf();
            return VisitorAction.SKIP_THIS_NODE;
        }
        return super.visitParagraphStart(paragraph);
    }

    /**
     * Removes images from the source document and inserts a bookmark at the image position.
     */
    public void RemoveImagesFromSourceDocument()
    {
        for (String key : mNodesToRemove.keySet()) {
            ArrayList<Node> nodesToRemove = mNodesToRemove.get(key);
            if (nodesToRemove.isEmpty()) {
                continue;
            }

            Node firstNode = nodesToRemove.get(0);
            // For a table the bookmark goes to the node that follows it.
            Node anchor = (firstNode.getNodeType() == NodeType.TABLE)
                    ? firstNode.getNextSibling()
                    : firstNode;

            // A table without a following sibling has no position for the
            // bookmark; skip the bookmark instead of passing null to moveTo.
            // NOTE(review): tables stored by saveShapeAsPdf are clones, so they
            // presumably have no siblings at all - confirm and, if so, track the
            // original rows instead.
            if (anchor != null) {
                DocumentBuilder builder = new DocumentBuilder((Document) firstNode.getDocument());
                builder.moveTo(anchor);
                builder.startBookmark(key);
                builder.endBookmark(key);
            }

            // Remove all image nodes that are still attached to a parent.
            for (Node n : nodesToRemove) {
                if (n.getParentNode() != null) {
                    n.remove();
                }
            }
        }
    }

    /**
     * Checks whether the row has shapes other than OLE equations.
     */
    private static boolean rowHasImage(Row row)
    {
        NodeCollection shapes = row.getChildNodes(NodeType.SHAPE, true);
        for (Shape s : (Iterable<Shape>) shapes) {
            if (!isOleEquation(s)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Checks whether the paragraph is likely to be an image caption: it either
     * contains a SEQ field or its run text starts with a known caption prefix.
     */
    private static boolean isCaptionParagraph(Paragraph paragraph) throws Exception {
        // Captions often contain SEQ fields.
        for (Field f : paragraph.getRange().getFields()) {
            if (f.getType() == FieldType.FIELD_SEQUENCE) {
                return true;
            }
        }

        // Only the text of the paragraph's runs is considered: for a caption
        // inside a textbox, paragraph.toString would return the same value for
        // the paragraph inside the textbox and the paragraph holding the textbox.
        StringBuilder runText = new StringBuilder();
        for (Run r : paragraph.getRuns()) {
            runText.append(r.getText());
        }
        String paraText = runText.toString();

        // More conditions might be added here to better distinguish captions.
        for (String prefix : CAPTION_PREFIXES) {
            if (paraText.startsWith(prefix)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Checks whether the shape is an embedded Equation.DSMT4 OLE object.
     */
    private static boolean isOleEquation(Shape shape)
    {
        // Constant-first equals: the ProgId may be null.
        return (shape.getOleFormat() != null)
                && OLE_EQUATION_PROG_ID.equals(shape.getOleFormat().getProgId());
    }

    /**
     * Removes every caption paragraph found below {@code container}.
     */
    private static void removeCaptionParagraphs(CompositeNode container) throws Exception {
        // Iterate over a snapshot: removing nodes while iterating the
        // collection returned by getChildNodes can skip elements.
        for (Node node : container.getChildNodes(NodeType.PARAGRAPH, true).toArray()) {
            Paragraph p = (Paragraph) node;
            if (isCaptionParagraph(p)) {
                p.remove();
            }
        }
    }

    /**
     * Saves the pending shapes as a separate document, provided that at least
     * one shape and one caption are pending; otherwise does nothing.
     */
    private void saveShapeAsPdf() throws Exception {
        if (mTopShapes.empty() || mCaptions.empty()) {
            return;
        }

        String caption = mCaptions.pop();
        System.out.println(mShapeCounter);
        System.out.println(caption);

        // Create a temporary document which will be exported.
        Document tmp = (Document) mTopShapes.peek().getDocument().deepClone(false);
        Node tmpSection = tmp.importNode(mTopShapes.peek().getAncestor(NodeType.SECTION), false, ImportFormatMode.USE_DESTINATION_STYLES);
        tmp.appendChild(tmpSection);
        tmp.ensureMinimum();

        // There might be several shapes to import under one caption.
        ArrayList<Node> nodesToImport = new ArrayList<Node>();
        if (mTopShapes.size() > 1 && !mRows.isEmpty()) {
            // Several shapes laid out in a table: export the collected rows.
            Table imagesTable = (Table) mRows.peek().getParentTable().deepClone(false);
            // Rows are popped last-first; prepending restores their original order.
            while (!mRows.isEmpty()) {
                Row r = mRows.pop();
                imagesTable.prependChild(r.deepClone(true));
            }
            nodesToImport.add(imagesTable);
        } else {
            while (!mTopShapes.isEmpty()) {
                nodesToImport.add(mTopShapes.pop());
            }
        }

        String key = "image_" + mShapeCounter;
        mNodesToRemove.put(key, nodesToImport);

        for (Node imageNode : nodesToImport) {
            Node resultImage = tmp.importNode(imageNode, true, ImportFormatMode.USE_DESTINATION_STYLES);
            if (resultImage.isComposite()) {
                resultImage.getRange().unlinkFields();
                removeCaptionParagraphs((CompositeNode) resultImage);
            }
            if (resultImage.getNodeType() == NodeType.TABLE) {
                tmp.getFirstSection().getBody().prependChild(resultImage);
            } else {
                tmp.getFirstSection().getBody().getFirstParagraph().prependChild(resultImage);
            }
        }

        // Format the output file path.
        String outFilePath = mTargetFolder + key + ".pdf";
        tmp.save(outFilePath);
        mShapeCounter++;

        // Empty stacks.
        mTopShapes.clear();
        mRows.clear();
        mCaptions.clear();
    }
}