Replacing line breaks with paragraph breaks using replace evaluator

amitkishore · September 24, 2009, 1:54am

Hi,

My reqiurement is to convert the line breaks in a docuemnt to paragraph breaks so as to have proper numbering.

private static void NumberAllParagraphs(Node beginingNode, Node endingNode)
{
    Node currentNode = beginingNode;
    Paragraph paragraph = currentNode as Paragraph;
    Regex regex = new Regex(ControlChar.LineBreak);
    ReplaceEvaluator repEval = new ReplaceEvaluator(ReplaceActionPerformed);
    while (currentNode != endingNode)
    {
        if (currentNode is Paragraph)
        {
            //paragraph.Range.Replace(ControlChar.LineBreak, ControlChar.ParagraphBreak , true, false);
            //paragraph.Range.Replace(regex, repEval, true);
            (currentNode as Paragraph).Range.Replace(regex, repEval, true);
        }
        currentNode = currentNode.NextSibling;
    }
    //if (currentNode is Paragraph) (currentNode as Paragraph).Range.Replace(ControlChar.LineBreak, ControlChar.ParagraphBreak, true, false);
    // if (currentNode is Paragraph) (currentNode as Paragraph).Range.Replace(regex, repEval, true);
}

static ReplaceAction ReplaceActionPerformed(object sender, ReplaceEvaluatorArgs e)
{
    DocumentBuilder builder = new DocumentBuilder(e.MatchNode.Document);
    builder.MoveTo(e.MatchNode);
    // builder.InsertHtml("");
    builder.Write("/r");
    e.MatchNode.Remove();
    return ReplaceAction.Skip;
    //int index = e.MatchNode.Document.FirstSection.Body.Paragraphs.IndexOf(builder.CurrentParagraph);
    ////insert next paragraph content to current paragraph.
    //foreach (Run run in e.MatchNode.Document.FirstSection.Body.Paragraphs[index + 1].Runs)
    //{
    // builder.CurrentParagraph.AppendChild(run);
    //}
    ////remove next paragraph
    //e.MatchNode.Document.FirstSection.Body.Paragraphs[index + 1].Remove();
    // return ReplaceAction.Replace;
}

as it can be seen, I tried different things in the ReplaceActionPerformed. but none of that is working…actually when I change the builder.Write("") to some simple text string it works, so it is not wokring for paragraph breaks in particular and by the way for any special characters.

Any workaround that can solve the isse…

Thanks in Adv

Amit

alexey.noskov · September 24, 2009, 6:29am

Hi

Thanks for your inquiry. In your code you missed that line break can be placed not at the beginning of the matched run. So you should split the matched node. Please try using the following code:

// Open source document.
Document doc = new Document(@"Test001\in.doc");
// Search for line breaks
doc.Range.Replace(new Regex(ControlChar.LineBreak), new ReplaceEvaluator(ReplaceActionPerformed), false);
// Save output document.
doc.Save(@"Test001\out.doc");

static ReplaceAction ReplaceActionPerformed(object sender, ReplaceEvaluatorArgs e)
{
    // Create document builder.
    DocumentBuilder builder = new DocumentBuilder((Document)e.MatchNode.Document);
    // This is a Run node that contains either the beginning or the complete match.
    Node currentNode = e.MatchNode;
    // The first (and may be the only) run can contain text before the match, 
    // in this case it is necessary to split the run.
    if (e.MatchOffset > 0)
        currentNode = SplitRun((Run)currentNode, e.MatchOffset);
    // We should remove LineBreak. 
    Run currentRun = (Run)currentNode;
    currentRun.Text = currentRun.Text.Substring(1);
    // Move to the run.
    builder.MoveTo(currentRun);
    // Insert paragraph break.
    builder.Writeln();
    return ReplaceAction.Skip;
}

/// 
/// Splits text of the specified run into two runs.
/// Inserts the new run just after the specified run.
/// 
private static Run SplitRun(Run run, int position)
{
    Run afterRun = (Run)run.Clone(true);
    afterRun.Text = run.Text.Substring(position);
    run.Text = run.Text.Substring(0, position);
    run.ParentNode.InsertAfter(afterRun, run);
    return afterRun;
}

Hope this helps.

Best regards.

amitkishore · September 25, 2009, 1:40am

Thanks a lot for your inputs.

However even this is not working for me. I tried removing all the line breaks into paragraph breaks using InsertHtml and then using the span tags as well.but still the formatting is screwed up.

string cleanHTML = AsposeUtils.CleanHTMLForContract(language.Text,true );
documentBuilder.InsertHtml(cleanHTML);

public static string CleanHTMLForContract(string htmlString, bool isPara)
{
    HtmlDocument htmlDocument = new HtmlDocument();
    //string cleanString = ReplaceSoftReturns(htmlString);
    htmlDocument.LoadHtml(htmlString);
    if (!isPara)
    {
        RemoveExtraPTags(htmlDocument, true);
        RemoveBreakBeforeList(htmlDocument.DocumentNode.ChildNodes);
    }
    else
    {
        // replace with . 
        ReplaceBRWithP(htmlDocument);
        RemoveExtraPTags(htmlDocument, true);
        RemoveBreakBeforeList(htmlDocument.DocumentNode.ChildNodes);
    }
    return htmlDocument.DocumentNode.OuterHtml;
}

private static void RemoveExtraPTags(HtmlDocument document, bool numberAllParagraphs)
{
    // xpath "//td/p" finds p tags that a are direct decendants of all td tags
    HtmlNodeCollection tableNodes = document.DocumentNode.SelectNodes("//td/p");
    // xpath "//li/p" finds p tags that a are direct decendants of all li tags
    HtmlNodeCollection lineNodes = document.DocumentNode.SelectNodes("//li/p");
    bool appendBR = false;
    //if (tableNodes != null) ConvertPToSpan(tableNodes,appendBR);
    if (lineNodes != null) ConvertPToSpan(lineNodes, appendBR);
}

private static void ConvertPToSpan(HtmlNodeCollection nodes, bool appendBR)
{
    ConvertPToSpan(nodes, null, appendBR);
}

private static void ConvertPToSpan(HtmlNodeCollection nodes, HtmlNodeCollection excludeNodes, bool appendBR)
{
    foreach (HtmlNode node in nodes)
    {
        if (excludeNodes != null)
        {
            bool inExcludedList = false;
            foreach (HtmlNode excludedNode in excludeNodes)
            {
                if (excludedNode.Id == node.Id)
                {
                    inExcludedList = true;
                    break;
                }
            }
            if (inExcludedList) continue;
        }
        if (OPEN_P.IsMatch(node.OuterHtml) && CLOSED_P.IsMatch(node.OuterHtml))
        {
            string replacementNodeHtml = node.OuterHtml;
            // We are replacing with span for the case when style and other attributes where
            // inlined in the p-tag
            replacementNodeHtml = OPEN_P.Replace(replacementNodeHtml, "<span");
            replacementNodeHtml = CLOSED_P.Replace(replacementNodeHtml, "");
            HtmlNode newNode = HtmlNode.CreateNode(replacementNodeHtml);
            if (appendBR)
            {
                HtmlNode brNode = HtmlNode.CreateNode("");
                node.ParentNode.InsertAfter(brNode, node); // add br node
            }
            node.ParentNode.ReplaceChild(newNode, node);
        }
    }
}

private static void ReplaceBRWithP(HtmlDocument htmlDocument)
{
    HtmlNode body = htmlDocument.DocumentNode.SelectSingleNode("//body");
    string strBefore = body.InnerHtml;
    //string[] strTokens;
    Regex regex = new Regex(@"");
    string[] strTokens = regex.Split(strBefore);
    if (strTokens.Length > 1)
    {
        System.Text.StringBuilder strBuild = new System.Text.StringBuilder();
        foreach (string token in strTokens)
        {
            strBuild.AppendFormat(@"{0}", token);
            //if (token.Length > 1) 
            //{
            // strBuild.AppendFormat(@"{0}", token); 
            //}
        }
        body.InnerHtml = strBuild.ToString();
    }
    //throw new NotImplementedException();
}

private static void RemoveBreakBeforeList(HtmlNodeCollection nodes)
{
    HtmlNode lastElement = null;
    foreach (HtmlNode node in nodes)
    {
        if (node.NodeType != HtmlNodeType.Element) continue;
        if (lastElement != null && lastElement.Name == "br" && (node.Name == "ol" || node.Name == "ul"))
        {
            lastElement.ParentNode.RemoveChild(lastElement);
        }
        lastElement = node;
        if (node.HasChildNodes)
        {
            RemoveBreakBeforeList(node.ChildNodes);
        }
    }
}

then after that I used the method

NumberAllParagraphs(beginingNode,endingNode);

using the code provided by you…

Any pointers how I can achieve this?

Thanks in advance

Amit

AndreyN · September 25, 2009, 3:09am

Hi

Thank you for additional information. Could you please attach your input, output and expected documents here? I will investigate the problem on my side and provide you more information.

Best regards,

amitkishore · September 25, 2009, 3:50am

Hi ,

Thanks a lot for your quick response.

I am attaching herewith the sample documents:

Beforechanges( this is the sample doc before change)
Afterchanges( this is the sample doc after changes)- desired output.
Draft Contract Summary( it is generated by the existing code)- it should have the same formatting as Afterchanges.

I have moved back to MS word 2003 for this purpose as this is the format required to be used.

I will really appreciate if you can provide me the solution pointers to the above issue.

Thanks in adv.

regards,

Amit

alexey.noskov · September 25, 2009, 2:43pm

Hi

Thank you for additional information. First of all, I think you should remove inserted paragraphs is they are empty. I modified the code:

static ReplaceAction ReplaceActionPerformed(object sender, ReplaceEvaluatorArgs e)
{
    // Create document builder.
    DocumentBuilder builder = new DocumentBuilder((Document)e.MatchNode.Document);
    // This is a Run node that contains either the beginning or the complete match.
    Node currentNode = e.MatchNode;
    // The first (and may be the only) run can contain text before the match, 
    // in this case it is necessary to split the run.
    if (e.MatchOffset > 0)
        currentNode = SplitRun((Run)currentNode, e.MatchOffset);
    // We should remove LineBreak. 
    Run currentRun = (Run)currentNode;
    currentRun.Text = currentRun.Text.Substring(1);
    // Move to the run.
    builder.MoveTo(currentRun);
    // Insert paragraph break.
    builder.Writeln();
    if (string.IsNullOrEmpty(builder.CurrentParagraph.ToTxt().Trim()))
        builder.CurrentParagraph.Remove();
    return ReplaceAction.Skip;
}

However, this modification will not do all you need. In your desired result after “Definitions” item list level is changed, but in other places where line breaks will be replaced with paragraph breaks list level is not changed. How are you going to determine whether changing if the list level is needed or not?

Another thing text starting with “If multiple Force Account Equipment Rates apply” is not a list item, because the source paragraph is not list item.

Best regards.

amitkishore · September 30, 2009, 12:53am

Thanks much for your inputs. I tried modifying the stuff and came to one point.

I am able to get the formatting now using the below method

public static void AlignAndFormatNode(Node referenceNode, Node currentNode, Style paragraphStyle, Style fontStyle, bool isNumber)
{
    if (currentNode is Paragraph)
    {
        Paragraph paragraph = currentNode as Paragraph;
        // If a list item, incorporate list with privious list
        //if (paragraph.IsListItem)
        //{
        // if (paragraph.ListFormat.List.ListId
        // != (referenceNode as Paragraph).ListFormat.List.ListId)
        // {
        // paragraph.ListFormat.List = (referenceNode as Paragraph).ListFormat.List;
        // paragraph.ListFormat.ListLevelNumber = paragraph.ListFormat.ListLevelNumber
        // + (referenceNode as Paragraph).ListFormat.ListLevelNumber + 1;
        // }
        //}
        if (!isNumber)
        {
            if (paragraph.IsListItem)
            {
                if (paragraph.ListFormat.List.ListId
                != (referenceNode as Paragraph).ListFormat.List.ListId)
                {
                    paragraph.ListFormat.List = (referenceNode as Paragraph).ListFormat.List;
                    paragraph.ListFormat.ListLevelNumber = paragraph.ListFormat.ListLevelNumber
                        + (referenceNode as Paragraph).ListFormat.ListLevelNumber + 1;
                }
            }
        }
        else
        {
            if (paragraph.IsListItem)
            {
                if (paragraph.ListFormat.List.ListId
                != (referenceNode as Paragraph).ListFormat.List.ListId)
                {
                    paragraph.ListFormat.List = (referenceNode as Paragraph).ListFormat.List;
                    paragraph.ListFormat.ListLevelNumber = paragraph.ListFormat.ListLevelNumber
                        + (referenceNode as Paragraph).ListFormat.ListLevelNumber + 2;
                }
            }
            else
            {
                paragraph.ListFormat.List = (referenceNode as Paragraph).ListFormat.List;
                paragraph.ListFormat.ListLevelNumber = paragraph.ListFormat.ListLevelNumber
                    + (referenceNode as Paragraph).ListFormat.ListLevelNumber + 1;
            }
        }
        paragraph.ParagraphFormat.SpaceAfterAuto = false;
        paragraph.ParagraphFormat.SpaceAfter = 0;
        if (paragraphStyle != null)
        {
            paragraph.ParagraphFormat.SpaceAfterAuto = paragraphStyle.ParagraphFormat.SpaceAfterAuto;
            paragraph.ParagraphFormat.SpaceAfter = paragraphStyle.ParagraphFormat.SpaceAfter;
            paragraph.ParagraphFormat.Alignment = paragraphStyle.ParagraphFormat.Alignment;
            //paragraph = (Paragraph)paragraph.PreviousSibling;
            //paragraph.ParagraphFormat.Style = paragraphStyle.ParagraphFormat.Style;
        }
        if (fontStyle != null)
        {
            foreach (Run run in paragraph.GetChildNodes(NodeType.Run, true))
            {
                run.Font.Size = fontStyle.Font.Size;
                run.Font.Name = fontStyle.Font.Name;
            }
        }
    }
    else if (currentNode is Table)
    {
        Table table = currentNode as Table;
        foreach (Row row in table.Rows)
        {
            row.RowFormat.LeftIndent = (referenceNode as Paragraph).ParagraphFormat.LeftIndent; ;
        }
        if (fontStyle != null)
        {
            foreach (Run run in table.GetChildNodes(NodeType.Run, true))
            {
                run.Font.Size = fontStyle.Font.Size;
                run.Font.Name = fontStyle.Font.Name;
            }
        }
    }
}

I am attaching a sample generated document. Now the issue is there is some extra space coming.

Any idea how this can be rectified?

thanks in adv

Amit

alexey.noskov · September 30, 2009, 2:34am

Hi Amit,

Thank you for additional information. Could you also show me code where you use this method or create simple application (this would be better), which will allow me to reproduce the problem on my side? I will check the issue and provide you more information.

Best regards.

amitkishore · September 30, 2009, 3:42am

Hi,

Thanks a lot. PFA the code sections

private static void WriteLanguageText(DocumentBuilder documentBuilder, SectionStyle sectionStyle, Language language)
{
    Paragraph beginingParagraph = documentBuilder.CurrentParagraph;
    Node beginingNode = beginingParagraph as Node;
    string cleanHTML = CleanHTMLForContract(language.Text, true);// attached is the sample doc for language(html)
    documentBuilder.InsertHtml(cleanHTML);
    Node endingNode = documentBuilder.CurrentParagraph as Node;
    Node currentNode = beginingNode;
    while (currentNode != endingNode)
    {
        AlignAndFormatNode(beginingNode, currentNode, sectionStyle.ParagraphStyle, sectionStyle.FontStyle, true);
        currentNode = currentNode.NextSibling;
    }
    AlignAndFormatNode(beginingNode, currentNode, sectionStyle.ParagraphStyle, sectionStyle.FontStyle, true);
    // Writeln to strat a new list UNLESS if the previous inserting of HTML contained a list
    // This is just black magic to get the formatting work
    // Note: I am sure that you could simplify this logic. But it works and I am afraid to
    // change it.
    if (currentNode is Paragraph
    && (currentNode as Paragraph).IsListItem)
    {
        if (beginingParagraph.IsListItem
        && (currentNode as Paragraph).ListFormat.ListLevelNumber > beginingParagraph.ListFormat.ListLevelNumber)
        {
            //no nothing
        }
        else
        {
            documentBuilder.Writeln();
        }
    }
    else
    {
        documentBuilder.Writeln();
    }
    documentBuilder.ListFormat.List = beginingParagraph.ListFormat.List;
    documentBuilder.ListFormat.ListLevelNumber = beginingParagraph.ListFormat.ListLevelNumber;
}

public static void AlignAndFormatNode(Node referenceNode, Node currentNode, Style paragraphStyle, Style fontStyle, bool isNumber)
{
    if (currentNode is Paragraph)
    {
        Paragraph paragraph = currentNode as Paragraph;
        // If a list item, incorporate list with privious list
        //if (paragraph.IsListItem)
        //{
        // if (paragraph.ListFormat.List.ListId
        // != (referenceNode as Paragraph).ListFormat.List.ListId)
        // {
        // paragraph.ListFormat.List = (referenceNode as Paragraph).ListFormat.List;
        // paragraph.ListFormat.ListLevelNumber = paragraph.ListFormat.ListLevelNumber
        // + (referenceNode as Paragraph).ListFormat.ListLevelNumber + 1;
        // }
        //}
        if (!isNumber)
        {
            if (paragraph.IsListItem)
            {
                if (paragraph.ListFormat.List.ListId
                != (referenceNode as Paragraph).ListFormat.List.ListId)
                {
                    paragraph.ListFormat.List = (referenceNode as Paragraph).ListFormat.List;
                    paragraph.ListFormat.ListLevelNumber = paragraph.ListFormat.ListLevelNumber
                        +(referenceNode as Paragraph).ListFormat.ListLevelNumber + 1;
                }
            }
        }
        else
        {
            if (paragraph.IsListItem)
            {
                if (paragraph.ListFormat.List.ListId
                != (referenceNode as Paragraph).ListFormat.List.ListId)
                {
                    paragraph.ListFormat.List = (referenceNode as Paragraph).ListFormat.List;
                    paragraph.ListFormat.ListLevelNumber = paragraph.ListFormat.ListLevelNumber
                        +(referenceNode as Paragraph).ListFormat.ListLevelNumber + 2;
                }
            }
            else
            {
                paragraph.ListFormat.List = (referenceNode as Paragraph).ListFormat.List;
                paragraph.ListFormat.ListLevelNumber = paragraph.ListFormat.ListLevelNumber
                    +(referenceNode as Paragraph).ListFormat.ListLevelNumber + 1;
            }
        }
        paragraph.ParagraphFormat.SpaceAfterAuto = false;
        paragraph.ParagraphFormat.SpaceAfter = 0;
        if (paragraphStyle != null)
        {
            paragraph.ParagraphFormat.SpaceAfterAuto = paragraphStyle.ParagraphFormat.SpaceAfterAuto;
            paragraph.ParagraphFormat.SpaceAfter = paragraphStyle.ParagraphFormat.SpaceAfter;
            paragraph.ParagraphFormat.Alignment = paragraphStyle.ParagraphFormat.Alignment;
            //paragraph = (Paragraph)paragraph.PreviousSibling;
            //paragraph.ParagraphFormat.Style = paragraphStyle.ParagraphFormat.Style;
        }
        if (fontStyle != null)
        {
            foreach (Run run in paragraph.GetChildNodes(NodeType.Run, true))
            {
                run.Font.Size = fontStyle.Font.Size;
                run.Font.Name = fontStyle.Font.Name;
            }
        }
    }
    else if (currentNode is Table)
    {
        Table table = currentNode as Table;
        foreach (Row row in table.Rows)
        {
            row.RowFormat.LeftIndent = (referenceNode as Paragraph).ParagraphFormat.LeftIndent; ;
        }
        if (fontStyle != null)
        {
            foreach (Run run in table.GetChildNodes(NodeType.Run, true))
            {
                run.Font.Size = fontStyle.Font.Size;
                run.Font.Name = fontStyle.Font.Name;
            }
        }
    }
}

public static string CleanHTMLForContract(string htmlString, bool isPara)
{
    HtmlDocument htmlDocument = new HtmlDocument();
    //string cleanString = ReplaceSoftReturns(htmlString);
    htmlDocument.LoadHtml(htmlString);
    if (!isPara)
    {
        RemoveExtraPTags(htmlDocument, true);
        RemoveBreakBeforeList(htmlDocument.DocumentNode.ChildNodes);
    }
    else
    {
        //RemoveExtraPTags(htmlDocument, true);
        // replace with . 
        ReplaceBRWithP(htmlDocument);
        RemoveBreakBeforeList(htmlDocument.DocumentNode.ChildNodes);
    }
    return htmlDocument.DocumentNode.OuterHtml;
}

private static void RemoveExtraPTags(HtmlDocument document, bool numberAllParagraphs)
{
    // xpath "//td/p" finds p tags that a are direct decendants of all td tags
    HtmlNodeCollection tableNodes = document.DocumentNode.SelectNodes("//td/p");
    // xpath "//li/p" finds p tags that a are direct decendants of all li tags
    HtmlNodeCollection lineNodes = document.DocumentNode.SelectNodes("//li/p");
    bool appendBR = false;
    //if (tableNodes != null) ConvertPToSpan(tableNodes,appendBR);
    if (lineNodes != null) ConvertPToSpan(lineNodes, appendBR);
}

private static void ConvertPToSpan(HtmlNodeCollection nodes, bool appendBR)
{
    ConvertPToSpan(nodes, null, appendBR);
}

private static void ConvertPToSpan(HtmlNodeCollection nodes, HtmlNodeCollection excludeNodes, bool appendBR)
{
    foreach (HtmlNode node in nodes)
    {
        if (excludeNodes != null)
        {
            bool inExcludedList = false;
            foreach (HtmlNode excludedNode in excludeNodes)
            {
                if (excludedNode.Id == node.Id)
                {
                    inExcludedList = true;
                    break;
                }
            }
            if (inExcludedList) continue;
        }
        if (OPEN_P.IsMatch(node.OuterHtml) && CLOSED_P.IsMatch(node.OuterHtml))
        {
            string replacementNodeHtml = node.OuterHtml;
            // We are replacing with span for the case when style and other attributes where
            // inlined in the p-tag
            replacementNodeHtml = OPEN_P.Replace(replacementNodeHtml, "<span");
            replacementNodeHtml = CLOSED_P.Replace(replacementNodeHtml, "");
            HtmlNode newNode = HtmlNode.CreateNode(replacementNodeHtml);
            if (appendBR)
            {

                HtmlNode brNode = HtmlNode.CreateNode("   ");
                node.ParentNode.InsertAfter(brNode, node); // add br node
            }
            node.ParentNode.ReplaceChild(newNode, node);
        }
    }
}

private static void ReplaceBRWithP(HtmlDocument htmlDocument)
{
    HtmlNode body = htmlDocument.DocumentNode.SelectSingleNode("//body");
    string strBefore = body.InnerHtml;
    //string[] strTokens;
    Regex regex = new Regex(@"");
    string[] strTokens = regex.Split(strBefore);
    if (strTokens.Length > 1)
    {
        System.Text.StringBuilder strBuild = new System.Text.StringBuilder();
        foreach (string token in strTokens)
        {
            strBuild.AppendFormat(@"{0}", token);
            //if (token.Length > 1) 
            //{
            // strBuild.AppendFormat(@"{0}", token); 
            //}
        }
        body.InnerHtml = strBuild.ToString();
    }
    //throw new NotImplementedException();
}

private static void RemoveBreakBeforeList(HtmlNodeCollection nodes)
{
    HtmlNode lastElement = null;
    foreach (HtmlNode node in nodes)
    {
        if (node.NodeType != HtmlNodeType.Element) continue;
        if (lastElement != null && lastElement.Name == "br" && (node.Name == "ol" || node.Name == "ul"))
        {
            lastElement.ParentNode.RemoveChild(lastElement);
        }
        lastElement = node;
        if (node.HasChildNodes)
        {
            RemoveBreakBeforeList(node.ChildNodes);
        }
    }
}

alexey.noskov · September 30, 2009, 11:57pm

Hi

Thanks you for additional information. I will take a look on your issue shortly. Please expect an answer in few hours.

Best regards.

alexey.noskov · October 1, 2009, 8:09am

Hi

Thank you for addition information. Unfortunately, I cannot run your code on my side, since you use external assemblies. It seems you use HTML Agility Pack or something else to process your HTML. Could you please simplify your code and create simple application, which I can run on my side. Sorry for inconvenience.

Best regards.