文本坐标提取

SunHao158 · April 11, 2024, 12:52pm

如何遍历文档中的所有段落，并将每个段落中每一行的坐标 (x1, y1, x2, y2) 提取出来，存到 list 里？

alexey.noskov · April 11, 2024, 6:33pm

@SunHao158 如您所知，MS Word 文档是流式文档，文档本身并没有页面或行的概念。打开文档的应用程序（例如 MS Word）会动态构建文档布局，Aspose.Words 也使用自己的布局引擎来完成同样的工作。LayoutCollector 和 LayoutEnumerator 类提供了对文档布局信息的有限访问。
例如，以下代码演示了将文档内容拆分为行的基本技术：

Document doc = new Document("C:\\Temp\\in.docx");

// Break every Run node into pieces that contain at most one word.
// Each cut is made right after a space, so a piece keeps its trailing space.
// The node array is a snapshot taken before splitting, so runs created by
// SplitRun are not revisited by the outer loop.
Node[] runs = doc.getChildNodes(NodeType.RUN, true).toArray();
for (int i = 0; i < runs.length; i++)
{
    Run current = (Run)runs[i];
    int spaceIndex = current.getText().indexOf(' ');
    while (spaceIndex >= 0)
    {
        // Continue with the remainder that SplitRun returns.
        current = SplitRun(current, spaceIndex + 1);
        spaceIndex = current.getText().indexOf(' ');
    }
}

// Surround every run with its own bookmark so that LayoutCollector and
// LayoutEnumerator can be used to locate it in the layout.
runs = doc.getChildNodes(NodeType.RUN, true).toArray();

// Names of the temporary bookmarks, in the order the runs were visited.
ArrayList<String> tmpBookmakrs = new ArrayList<String>();
int bkIndex = 0;
for (int i = 0; i < runs.length; i++)
{
    Node r = runs[i];

    // LayoutCollector and LayoutEnumerator do not work with nodes in
    // headers/footers or in textboxes, so such runs get no bookmark.
    boolean layoutAccessible = r.getAncestor(NodeType.HEADER_FOOTER) == null
        && r.getAncestor(NodeType.SHAPE) == null;
    if (layoutAccessible)
    {
        String bookmarkName = "r" + bkIndex;
        BookmarkStart start = new BookmarkStart(doc, bookmarkName);
        BookmarkEnd end = new BookmarkEnd(doc, bookmarkName);

        // The bookmark start goes directly before the run and the end directly after it.
        r.getParentNode().insertBefore(start, r);
        r.getParentNode().insertAfter(end, r);

        tmpBookmakrs.add(bookmarkName);
        bkIndex++;
    }
}

// Now we can use collector and enumerator to get runs per line in MS Word document.
LayoutCollector collector = new LayoutCollector(doc);
LayoutEnumerator enumerator = new LayoutEnumerator(doc);

// Layout entity of the line whose header was printed last; used to detect
// the moment the bookmarks move on to a different line.
Object currentLine = null;
for (String bkName : tmpBookmakrs)
{
    Bookmark bk = doc.getRange().getBookmarks().get(bkName);
    if (bk == null)
        continue;

    // NOTE(review): getEntity is assumed to be able to return null for nodes
    // that have no layout entity; skip such bookmarks instead of passing null
    // to setCurrent.
    Object entity = collector.getEntity(bk.getBookmarkStart());
    if (entity == null)
        continue;

    // Climb from the bookmark's entity up to the line that contains it.
    // moveParent() reports whether it actually moved; stop when there is no
    // parent left so a missing LINE ancestor cannot cause an endless loop.
    enumerator.setCurrent(entity);
    boolean lineFound = true;
    while (enumerator.getType() != LayoutEntityType.LINE)
    {
        if (!enumerator.moveParent())
        {
            lineFound = false;
            break;
        }
    }
    if (!lineFound)
        continue;

    if (!enumerator.getCurrent().equals(currentLine))
    {
        currentLine = enumerator.getCurrent();

        System.out.println();
        System.out.println("-------=========Start Of Line=========-------");
        // Here you can get coordinates of the line.
        System.out.println(enumerator.getRectangle());
    }

    // The wrapped run is the node right after the bookmark start; print its text.
    Node nextNode = bk.getBookmarkStart().getNextSibling();
    if (nextNode != null && nextNode.getNodeType() == NodeType.RUN)
        System.out.print(((Run)nextNode).getText());
}

/**
 * Splits a run in two at the given character position.
 *
 * A deep clone of the run is inserted directly after it. The original run
 * keeps the text before {@code position}; the clone receives the text from
 * {@code position} onwards.
 *
 * @param run      the run to split; it is modified in place
 * @param position index in the run's text at which the second part starts
 * @return the newly inserted run holding the second part of the text
 */
private static Run SplitRun(Run run, int position)
{
    String fullText = run.getText();

    Run tail = (Run)run.deepClone(true);
    run.getParentNode().insertAfter(tail, run);

    tail.setText(fullText.substring(position));
    run.setText(fullText.substring(0, position));
    return tail;
}