How to identify a specific paragraph in newspaper-style document

AlpeshChaudhariDev · June 3, 2024, 7:50am

Hi team,

I have a document that contains newspaper-style paragraphs. I have added controls for selecting paragraphs on both sides. I want to identify (get) the starting and ending paragraphs or runs on the left side, and I also want to identify (get) the starting and ending paragraphs or runs on the right side. How can I achieve this ?

Document Sample :
Sample.docx (108.4 KB)

alexey.noskov · June 3, 2024, 11:38am

@AlpeshChaudhariDev @ANDREA.FARRIS As you may know, MS Word documents are flow documents by nature, so there is neither a “page” nor a “column” concept in the document itself. The consumer application reflows the document content on the fly.
You can use the LayoutCollector and LayoutEnumerator classes to calculate the position of nodes on the page in the main document body. Then you can compare the X coordinates to determine whether a node has moved to the next column.
For example, this technique is used in the following code to split tables at the points where they break across a page or a column:

/// <summary>
/// Splits the top-level tables of the main document body at the points where the layout
/// breaks them across a page or a text column.
/// </summary>
/// <param name="doc">The document whose tables are split in place.</param>
public static void SplitTableByPagesAndColumns(Document doc)
{
    LayoutCollector collector = new LayoutCollector(doc);
    LayoutEnumerator enumerator = new LayoutEnumerator(doc);

    // Take a snapshot before mutating the document: each split inserts a cloned table, and
    // the collection returned by GetChildNodes is live, so enumerating it directly would
    // also walk over the clones that the inner loop below has already processed.
    Node[] tables = doc.GetChildNodes(NodeType.Table, true).ToArray();
    foreach (Node node in tables)
    {
        Table table = (Table)node;

        // Process only top level tables in the main document's body.
        if (table.ParentNode.NodeType != NodeType.Body)
        {
            continue;
        }

        // Each successful split returns the continuation table, which may itself need
        // to be split again; a null result means the current table is fine as it is.
        while (table != null)
        {
            table = SplitTalbeTextColumns(table, collector, enumerator);
            if (table != null)
            {
                // The document was changed, so the cached layout is stale and must be
                // rebuilt. When nothing was split the (expensive) update is skipped.
                collector.Clear();
                doc.UpdatePageLayout();
            }
        }
    }
}

/// <summary>
/// Splits <paramref name="table"/> at the first row that the layout places on a later page
/// or at a different X position (i.e. in another text column) than the first row.
/// </summary>
/// <param name="table">The table to examine and, if required, split in place.</param>
/// <param name="collector">Layout collector used to map rows to pages and layout entities.</param>
/// <param name="enumerator">Layout enumerator used to read row rectangles; its current position is changed.</param>
/// <returns>
/// The newly inserted table that holds the continuation (the first row, any leading heading
/// rows, and the rows starting at the break), or <c>null</c> when no split was required.
/// </returns>
private static Table SplitTalbeTextColumns(Table table, LayoutCollector collector, LayoutEnumerator enumerator)
{
    // The scan below starts at the second row, so a table with fewer than two rows can
    // never be split. Returning early also avoids dereferencing FirstRow on an empty table.
    if (table.Rows.Count < 2)
    {
        return null;
    }

    int startPageIndex = collector.GetStartPageIndex(table.FirstRow);
    double startRowX = GetLayoutRowLeft(table.FirstRow, collector, enumerator);

    int breakIndex = -1;
    int firstDataRowIndex = -1;

    // Find the index of the row where the table breaks, and the index of the first
    // non-heading row. Row 0 is never examined: it always stays at the top of both parts.
    for (int i = 1; i < table.Rows.Count; i++)
    {
        Row row = table.Rows[i];
        if (firstDataRowIndex < 0 && !row.RowFormat.HeadingFormat)
        {
            firstDataRowIndex = i;
        }

        // A row ending on a later page, or starting at a different left edge than the
        // first row, marks the break.
        if (collector.GetEndPageIndex(row) > startPageIndex
            || GetLayoutRowLeft(row, collector, enumerator) != startRowX)
        {
            breakIndex = i;
            break;
        }
    }

    if (breakIndex < 0)
    {
        return null;
    }

    Table clone = (Table)table.Clone(true);

    // Insert the cloned table after the main table, separated by a paragraph that
    // starts with a column break.
    Paragraph para = new Paragraph(table.Document);
    para.AppendChild(new Run(table.Document, ControlChar.ColumnBreak + "Continuation of the table"));

    table.ParentNode.InsertAfter(para, table);
    para.ParentNode.InsertAfter(clone, para);

    // Remove the breaking row and everything after it from the main table.
    while (table.Rows.Count > breakIndex)
    {
        table.LastRow.Remove();
    }

    // Remove from the cloned table the data rows that stayed in the main table, i.e. rows
    // [firstDataRowIndex, breakIndex). Rows before firstDataRowIndex are kept as the header.
    // When no data row was seen before the break there is nothing to remove.
    if (firstDataRowIndex > 0)
    {
        for (int i = firstDataRowIndex; i < breakIndex; i++)
        {
            clone.Rows.RemoveAt(firstDataRowIndex);
        }
    }

    return clone;
}

/// <summary>
/// Returns the left X coordinate of the layout row entity that contains the first paragraph
/// of the first cell of <paramref name="row"/>.
/// </summary>
private static double GetLayoutRowLeft(Row row, LayoutCollector collector, LayoutEnumerator enumerator)
{
    enumerator.Current = collector.GetEntity(row.FirstCell.FirstParagraph);

    // Walk up from the paragraph's layout entity to the enclosing row entity. Stop with an
    // error instead of spinning forever if the top is reached without finding a row.
    while (enumerator.Type != LayoutEntityType.Row)
    {
        if (!enumerator.MoveParent())
        {
            throw new System.InvalidOperationException("The layout entity of the table row could not be located.");
        }
    }

    return enumerator.Rectangle.Left;
}