Find Tags in Word Document using Regex Pattern & Add Unique Keywords in List Dictionary using C# .NET

saranyasrinivasan92 · June 2, 2020, 7:26am

Step 1: Find the start tags in the input document that match the pattern $…{
Example tag: $abcstart{
Step 2: Add the unique tags to a dictionary.
Input document:
Tags.zip (18.3 KB)

expected output : in dictionary
$abcstart{
$defstart{
$xyzstart{

Please suggest with code sample

awais.hafeez · June 2, 2020, 9:51am

@saranyasrinivasan92,

You can build logic on the following code to get the desired output:

Document doc = new Document("E:\\Temp\\Tags\\Tags.docx");

// The handler collects each distinct match; nothing is actually replaced.
ReplaceHandler handler = new ReplaceHandler();
FindReplaceOptions opts = new FindReplaceOptions();
opts.ReplacingCallback = handler;

// A start tag is: one or more '$', immediately followed by a non-space character,
// then any text that contains no '$', '{' or '}', and finally the opening '{'.
// A greedy pattern such as @"(\$.*)\{" must not be used here: ".*" runs to the LAST '{'
// in the paragraph, so "$$$A START{...}A END$$$ or $$$B START{" is returned as one match.
string searchPattern = @"\$+[^\s${}][^${}]*\{";

// Build the Regex once instead of once per paragraph.
Regex tagRegex = new Regex(searchPattern);
foreach (Paragraph para in doc.GetChildNodes(NodeType.Paragraph, true))
{
    para.Range.Replace(tagRegex, "", opts);
}

// Print the unique tags in the order they were first found.
foreach (string str in handler.list)
{
    Console.WriteLine(str);
}

/// <summary>
/// Find/replace callback that records every distinct matched string.
/// It always returns <see cref="ReplaceAction.Skip"/>, so no replacement text is applied.
/// </summary>
private class ReplaceHandler : IReplacingCallback
{
    /// <summary>Distinct, trimmed match texts in the order they were first encountered.</summary>
    public ArrayList list = new ArrayList();

    /// <summary>
    /// Handles a single match: splits the runs so that the match starts and ends on run
    /// boundaries, then stores the matched text in <see cref="list"/> if it is not there yet.
    /// </summary>
    /// <param name="e">Match details: the node where the match starts, the offset inside it, and the regex match.</param>
    /// <returns>Always <see cref="ReplaceAction.Skip"/>.</returns>
    public ReplaceAction Replacing(ReplacingArgs e)
    {
        // This is a Run node that contains either the beginning or the complete match.
        Node currentNode = e.MatchNode;

        // The first (and maybe the only) run can contain text before the match;
        // in that case split it so the match begins at the start of a run.
        if (e.MatchOffset > 0)
        {
            currentNode = SplitRun((Run)currentNode, e.MatchOffset);
        }

        // Walk over the runs that lie completely inside the match.
        int remainingLength = e.Match.Value.Length;
        while (remainingLength > 0 && currentNode != null)
        {
            int runLength = currentNode.GetText().Length;
            if (runLength > remainingLength)
            {
                // This run extends past the end of the match; it is split below.
                break;
            }

            remainingLength -= runLength;

            // Select the next Run node.
            // Have to loop because there could be other nodes such as BookmarkStart etc.
            do
            {
                currentNode = currentNode.NextSibling;
            }
            while (currentNode != null && currentNode.NodeType != NodeType.Run);
        }

        // Split the last run that contains the match if there is any text left.
        if (currentNode != null && remainingLength > 0)
        {
            SplitRun((Run)currentNode, remainingLength);
        }

        // Group 0 is the whole match, including the trailing '{'.
        string value = e.Match.Groups[0].Value.Trim();
        if (!list.Contains(value))
        {
            list.Add(value);
        }

        return ReplaceAction.Skip;
    }

    /// <summary>
    /// Splits <paramref name="run"/> in two at <paramref name="position"/>: the original run keeps
    /// the first <paramref name="position"/> characters, and a clone holding the rest of the text
    /// is inserted right after it.
    /// </summary>
    /// <param name="run">The run to split; it must have a parent node.</param>
    /// <param name="position">Zero-based character index at which the text is cut.</param>
    /// <returns>The newly inserted run that holds the text from <paramref name="position"/> onwards.</returns>
    private static Run SplitRun(Run run, int position)
    {
        Run afterRun = (Run)run.Clone(true);
        afterRun.Text = run.Text.Substring(position);
        run.Text = run.Text.Substring(0, position);
        run.ParentNode.InsertAfter(afterRun, run);
        return afterRun;
    }
}

saranyasrinivasan92 · June 18, 2020, 11:18am

It's not working properly. Please check samplesource.zip (42.6 KB)

current output : $$$PATIP START{pre-admission services}PATIP END$$$ or $$$PATOP START{
Expected output :$$$PATIP START{
$$$PATOP START{

saranyasrinivasan92 · June 18, 2020, 11:20am

It's not working properly. Please check samplesource.zip (42.6 KB)

current output : $$$PATIP START{pre-admission services}PATIP END$$$ or $$$PATOP START{
Expected output :$$$PATIP START{
$$$PATOP START{

awais.hafeez · June 19, 2020, 5:52am

@saranyasrinivasan92,

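The problem occurs because the regular expression @"(\$.*)\{" is greedy: ".*" extends to the last "{" in the paragraph, so several tags and the text between them are returned as a single match. You need to improve the regular expression so that it stops at the first "{" and captures only the desired group of text. Alternatively, you can manually parse the text like this: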

Document doc = new Document("E:\\Temp\\source\\Input.docx");

// Pass 1: break every run into single-character runs so that each character can be
// examined as its own node. SplitRun is the helper shown with ReplaceHandler earlier in
// this thread; it has to be accessible from this scope.
// ToArray() takes a snapshot, so the runs inserted by SplitRun are not visited again.
Node[] runs = doc.GetChildNodes(NodeType.Run, true).ToArray();
for (int i = 0; i < runs.Length; i++)
{
    Run currentNode = (Run)runs[i];
    int length = currentNode.Text.Length;

    // Each split leaves one character behind and returns the remainder.
    for (int x = 1; x < length; x++)
    {
        currentNode = SplitRun(currentNode, 1);
    }
}

// Pass 2: collect the unique tags. A tag starts at a "$" run and ends at the next "{" run.
// The document is no longer modified, so a snapshot array is used and the end of the
// array is handled explicitly.
ArrayList list = new ArrayList();
Node[] charRuns = doc.GetChildNodes(NodeType.Run, true).ToArray();
for (int i = 0; i < charRuns.Length; i++)
{
    Run start = (Run)charRuns[i];
    if (start.Text != "$")
    {
        continue;
    }

    // A tag never spans paragraphs; remember where this candidate started.
    Node startParagraph = start.ParentParagraph;

    bool flag = true;
    int x = i;
    string match = "";
    Run run = start;
    while (run != null && run.Text != "{")
    {
        match = match + run.Text;

        // "$" followed by a space is a closing marker such as "END$$$ or", not a start tag.
        if (match.Contains("$ "))
        {
            flag = false;
            break;
        }

        x++;
        run = x < charRuns.Length ? (Run)charRuns[x] : null;

        if (run != null && run.ParentParagraph != startParagraph)
        {
            // Reached the next paragraph without finding "{": give up on this candidate.
            // Step back one run so the outer loop examines the first run of the new
            // paragraph as a possible tag start instead of skipping it.
            x--;
            run = null;
        }
    }

    if (run != null && flag)
    {
        // Append the closing "{" itself.
        match = match + run.Text;
        if (!list.Contains(match))
        {
            list.Add(match);
        }
    }

    // Resume after the last run that was consumed.
    i = x;
}

foreach (string str in list)
{
    Console.WriteLine(str);
}