Error - Some hyperlinks are missing when reading a DOCX file

Wuttipol · January 24, 2016, 10:22pm

Dear Dev. Team.

I found an error when writing code to list all the links in a DOCX file. The document contains 77 links, but with Aspose I can read only 26. Please kindly help to check.

Wuttipol · January 24, 2016, 10:33pm

Here is the code.

/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
  */
  package LNApplication.test;

import com.aspose.words.FieldStart;
import com.aspose.words.FieldType;
import com.aspose.words.Node;
import com.aspose.words.NodeList;
import com.aspose.words.NodeType;
import com.aspose.words.Run;
import com.aspose.words.Shape;
import java.io.File;
import java.util.ArrayList;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Debug harness that lists the external hyperlinks found in a DOCX file.
 *
 * <p>Two sources of links are inspected: HYPERLINK fields (via the {@link Hyperlink}
 * facade) and the HRef of shapes that carry an image.
 *
 * @author WMads2429175
 */
public class TestAllDebug {

    /**
     * Runs the hyperlink scan and logs any failure at SEVERE level.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            new TestAllDebug().testLinkInDocx();
        } catch (Exception ex) {
            Logger.getLogger(TestAllDebug.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

    /**
     * Opens the hard-coded cover page document, prints every non-local hyperlink
     * target and every image-shape HRef, then prints the totals.
     *
     * @throws Exception if the document cannot be loaded or a hyperlink field
     *                   cannot be parsed by {@link Hyperlink}
     */
    public void testLinkInDocx() throws Exception {
        File coverpageFile = new File("C:\\My Box Sync\\Notes Database\\ShareFolder\\Data\\Lotus Notes - Design simulation committee\\CAE Solutions\\2010_log\\(Coverpage) - 2010_log.docx");
        ArrayList<String> urlList = new ArrayList<String>();
        int cnt = 0;
        System.out.println("check link for:" + coverpageFile.getAbsolutePath());

        com.aspose.words.Document doc = new com.aspose.words.Document(coverpageFile.getAbsolutePath());

        // Pass 1: hyperlinks stored as HYPERLINK fields.
        NodeList fieldStarts = doc.selectNodes("//FieldStart");
        for (FieldStart fieldStart : (Iterable<FieldStart>) fieldStarts) {
            if (fieldStart.getFieldType() != FieldType.FIELD_HYPERLINK) {
                continue;
            }

            // The field is a hyperlink field, use the "facade" class to help to deal with the field.
            Hyperlink hyperlink = new Hyperlink(fieldStart);

            // Local hyperlinks (links to bookmarks inside the document) are neither printed nor counted.
            if (hyperlink.isLocal()) {
                continue;
            }

            String target = hyperlink.getTarget();
            System.out.println("found hyperlink 1:" + target);
            cnt++;
            urlList.add(target);
        }

        // Pass 2: hyperlinks attached to shapes.
        // NOTE(review): only shapes for which hasImage() is true are reported; a shape
        // without an image that has an HRef is skipped - confirm this is intended.
        for (Shape shape : (Iterable<Shape>) doc.getChildNodes(NodeType.SHAPE, true)) {
            String href = shape.getHRef();
            if (shape.hasImage() && href != null && !href.isEmpty()) {
                System.out.println(href);
                System.out.println("found hyperlink 2:" + href);
                cnt++;
                urlList.add(href);
            }
        }

        if (urlList.size() > 0) {
            System.out.println("Getting from file:" + coverpageFile.getAbsolutePath());
            System.out.println("Size:" + urlList.size());
            System.out.println("cnt:" + cnt);
        } else {
            System.out.println("No link for " + coverpageFile.getAbsolutePath());
        }
    }
}
/**
 * Facade over a HYPERLINK field in a document node tree.
 *
 * <p>The facade locates the field start, field separator and (optionally) field end
 * nodes among the siblings of the given field start, parses the field code found
 * between start and separator, and exposes the target and the "local" flag.
 */
class Hyperlink {

    /**
     * Matches a hyperlink field code such as {@code HYPERLINK \l "target"}.
     * Group 1 is the optional {@code \l} flag, group 2 is the quoted target.
     */
    private static final Pattern G_REGEX = Pattern.compile(
            "\\S+" + // one or more non spaces HYPERLINK or other word in other languages
            "\\s+" + // one or more spaces
            "(?:\"\"\\s+)?" + // non capturing optional "" and one or more spaces, found in one of the customers files.
            "(\\\\l\\s+)?" + // optional \l flag followed by one or more spaces
            "\"" + // one opening double quote
            "([^\"]+)" + // one or more chars except double quote (hyperlink target)
            "\"" // one closing double quote
    );

    private final Node mFieldStart;
    private final Node mFieldSeparator;
    /** May be null when no field end is found among the separator's following siblings. */
    private final Node mFieldEnd;
    private boolean mIsLocal;
    private String mTarget;

    /**
     * Builds the facade from the start node of a hyperlink field.
     *
     * @param fieldStart start node of a field whose type is {@code FieldType.FIELD_HYPERLINK}
     * @throws IllegalArgumentException if {@code fieldStart} is null or not a hyperlink field
     * @throws IllegalStateException if no field separator follows the start node among its
     *                               siblings, or the field code does not match the expected pattern
     */
    Hyperlink(FieldStart fieldStart) throws Exception {
        if (fieldStart == null) {
            throw new IllegalArgumentException("fieldStart");
        }
        if (fieldStart.getFieldType() != FieldType.FIELD_HYPERLINK) {
            throw new IllegalArgumentException("Field start type must be FieldHyperlink.");
        }

        mFieldStart = fieldStart;

        // Find the field separator node.
        mFieldSeparator = findNextSibling(mFieldStart, NodeType.FIELD_SEPARATOR);
        if (mFieldSeparator == null) {
            throw new IllegalStateException("Cannot find field separator.");
        }

        // Find the field end node. Normally field end will always be found, but in the example document
        // there happens to be a paragraph break included in the hyperlink and this puts the field end
        // in the next paragraph. It will be much more complicated to handle fields which span several
        // paragraphs correctly, but in this case allowing field end to be null is enough for our purposes.
        mFieldEnd = findNextSibling(mFieldSeparator, NodeType.FIELD_END);

        // Field code looks something like [ HYPERLINK "http:\\www.myurl.com" ], but it can consist of several runs.
        String fieldCode = getTextSameParent(mFieldStart.getNextSibling(), mFieldSeparator);
        Matcher matcher = G_REGEX.matcher(fieldCode.trim());
        if (!matcher.find()) {
            // Without this check Matcher.group() would fail with an opaque "No match found".
            throw new IllegalStateException("Cannot parse hyperlink field code: " + fieldCode);
        }

        // The link is local if \l is present in the field code.
        String localFlag = matcher.group(1);
        mIsLocal = (localFlag != null) && !localFlag.isEmpty();
        mTarget = matcher.group(2);
    }

    /**
     * Gets the display name of the hyperlink: the concatenated text of the nodes from
     * the field separator (inclusive) up to the field end (exclusive), or up to the last
     * sibling when no field end was found.
     */
    String getName() throws Exception {
        return getTextSameParent(mFieldSeparator, mFieldEnd);
    }

    /**
     * Sets the display name of the hyperlink. This is best-effort: when the node right
     * after the field separator is not a {@code Run}, nothing is changed.
     *
     * @param value new display text
     */
    void setName(String value) throws Exception {
        // Hyperlink display name is stored in the field result which is a Run
        // node between field separator and field end.
        Node firstResultNode = mFieldSeparator.getNextSibling();
        if (!(firstResultNode instanceof Run)) {
            return;
        }
        Run fieldResult = (Run) firstResultNode;
        fieldResult.setText(value);

        // But sometimes the field result can consist of more than one run, delete these runs.
        removeSameParent(fieldResult.getNextSibling(), mFieldEnd);
    }

    /**
     * Gets the target url or bookmark name of the hyperlink.
     */
    String getTarget() throws Exception {
        return mTarget;
    }

    /**
     * Sets the target url or bookmark name of the hyperlink and rewrites the field code.
     *
     * @param value new target
     */
    void setTarget(String value) throws Exception {
        mTarget = value;
        System.out.println("setting target to:" + mTarget);
        updateFieldCode();
    }

    /**
     * True if the hyperlink's target is a bookmark inside the document. False
     * if the hyperlink is a url.
     */
    boolean isLocal() throws Exception {
        return mIsLocal;
    }

    /**
     * Sets the "local" flag and rewrites the field code.
     *
     * @param value true to mark the target as a bookmark inside the document
     */
    void isLocal(boolean value) throws Exception {
        mIsLocal = value;
        updateFieldCode();
    }

    /**
     * Rewrites the field code from the current target and "local" flag.
     */
    private void updateFieldCode() throws Exception {
        // Field code is stored in a Run node between field start and field separator.
        Run fieldCode = (Run) mFieldStart.getNextSibling();

        fieldCode.setText(java.text.MessageFormat.format("HYPERLINK {0}\"{1}\"", ((mIsLocal) ? "\\l " : ""), mTarget));

        // But sometimes the field code can consist of more than one run, delete these runs.
        removeSameParent(fieldCode.getNextSibling(), mFieldSeparator);
        System.out.println("Changed to be " + mTarget);
    }

    /**
     * Goes through siblings starting from the start node (inclusive) until it finds a
     * node of the specified type.
     *
     * @return the first matching node, or null if the siblings are exhausted
     */
    private static Node findNextSibling(Node startNode, int nodeType) throws Exception {
        for (Node node = startNode; node != null; node = node.getNextSibling()) {
            if (node.getNodeType() == nodeType) {
                return node;
            }
        }
        return null;
    }

    /**
     * Retrieves text from the start node up to but not including the end node. When the
     * end node is null, text is collected through the last sibling.
     *
     * @throws IllegalArgumentException if both nodes are non-null and have different parents
     */
    private static String getTextSameParent(Node startNode, Node endNode) throws Exception {
        if ((startNode != null) && (endNode != null)
                && (startNode.getParentNode() != endNode.getParentNode())) {
            throw new IllegalArgumentException("Start and end nodes are expected to have the same parent.");
        }

        StringBuilder builder = new StringBuilder();
        // Stop on null as well as on endNode: endNode may legitimately be null (missing field end).
        for (Node child = startNode; (child != null) && (child != endNode); child = child.getNextSibling()) {
            builder.append(child.getText());
        }

        return builder.toString();
    }

    /**
     * Removes nodes from the start node up to but not including the end node. Does nothing
     * when the start node is null.
     *
     * @throws IllegalArgumentException if both nodes are non-null and have different parents
     */
    private static void removeSameParent(Node startNode, Node endNode) throws Exception {
        if (startNode == null) {
            return;
        }
        if ((endNode != null) && (startNode.getParentNode() != endNode.getParentNode())) {
            throw new IllegalArgumentException("Start and end nodes are expected to have the same parent.");
        }

        Node curChild = startNode;
        while ((curChild != null) && (curChild != endNode)) {
            // Capture the successor first; removing the node detaches it from its siblings.
            Node nextChild = curChild.getNextSibling();
            curChild.remove();
            curChild = nextChild;
        }
    }
}

Wuttipol · January 24, 2016, 10:36pm

Here is my code.

tahir.manzoor · January 26, 2016, 1:23am

Hi Wuttipol,

Thanks for your inquiry. We have tested the scenario using the latest version of Aspose.Words for Java (15.12.0) and could not reproduce the issue. Perhaps you are using Aspose.Words without a license. Please note that some limitations apply in evaluation mode: for example, Aspose.Words injects an evaluation watermark at the top of the document, and the document’s content is truncated after a certain number of paragraphs during import or export.

Please request a temporary license here:
Get temporary license

Please read about applying a license here:
Applying a License

Wuttipol · January 26, 2016, 1:31am

OK, Aspose team.
Let me try this jar file.
Actually, we do have a license.