Step 1: find the start tags in the input document that match the pattern $…{
example tags: $abcstart{
Step 2: add the unique tags to a dictionary
Input document:
Tags.zip (18.3 KB)
expected output : in dictionary
$abcstart{
$defstart{
$xyzstart{
Please suggest with code sample
@saranyasrinivasan92,
You can build logic on the following code to get the desired output:
// Load the document whose paragraphs are scanned for start tags such as "$abcstart{".
Document doc = new Document("E:\\Temp\\Tags\\Tags.docx");
// The callback collects every distinct match into handler.list.
ReplaceHandler handler = new ReplaceHandler();
FindReplaceOptions opts = new FindReplaceOptions();
opts.ReplacingCallback = handler;
// One or more '$', then any run of characters other than '$', '{' or '}', then the opening '{'.
// The previous pattern used a greedy ".*", which ran from the first '$' to the LAST '{' in the
// paragraph and therefore swallowed whole "…START{…}…END$$$ or $$$…START{" spans as one match.
// Excluding '$', '{' and '}' from the tag name makes every match stop at its own '{'.
// Group 1 still holds the tag without the trailing brace, as before.
string searchPattern = @"(\$+[^${}]*)\{";
// The expression is the same for every paragraph, so build it once outside the loop.
Regex tagRegex = new Regex(searchPattern);
foreach (Paragraph para in doc.GetChildNodes(NodeType.Paragraph, true))
{
    para.Range.Replace(tagRegex, "", opts);
}
// Print the unique tags that the callback gathered.
foreach (string str in handler.list)
{
    Console.WriteLine(str);
}
// Replacing callback that records each distinct matched text in `list`.
// It always returns ReplaceAction.Skip; while walking a match it splits runs at the
// match boundaries via SplitRun.
private class ReplaceHandler : IReplacingCallback
{
    // Distinct (trimmed) match strings, in the order they were first seen.
    public ArrayList list = new ArrayList();

    // Called for every regex match. Splits runs so that the match starts and ends on run
    // boundaries, stores the trimmed match text if it is not already in `list`, and
    // returns ReplaceAction.Skip.
    public ReplaceAction Replacing(ReplacingArgs e)
    {
        // Node holding the start of the match (or the whole match).
        Node node = e.MatchNode;

        // When the match begins part-way through the run, cut the run at that offset and
        // continue from the piece that starts with the match.
        if (e.MatchOffset > 0)
        {
            node = SplitRun((Run)node, e.MatchOffset);
        }

        // Nodes that lie entirely inside the match. Collected here but not consumed
        // anywhere else in this method.
        ArrayList matchedRuns = new ArrayList();

        // Number of matched characters not yet accounted for by a visited node.
        int charsLeft = e.Match.Value.Length;

        while (charsLeft > 0 && node != null)
        {
            int nodeLength = node.GetText().Length;
            if (nodeLength > charsLeft)
            {
                // This node extends past the end of the match; it is handled below.
                break;
            }

            matchedRuns.Add(node);
            charsLeft -= nodeLength;

            // Advance to the next Run sibling, stepping over non-Run nodes
            // (for example BookmarkStart).
            node = node.NextSibling;
            while (node != null && node.NodeType != NodeType.Run)
            {
                node = node.NextSibling;
            }
        }

        // The match ends inside this node: cut it where the match ends.
        if (node != null && charsLeft > 0)
        {
            SplitRun((Run)node, charsLeft);
            matchedRuns.Add(node);
        }

        // Keep only the first occurrence of each trimmed match string.
        string matchedText = e.Match.Groups[0].Value.Trim();
        if (!list.Contains(matchedText))
        {
            list.Add(matchedText);
        }

        return ReplaceAction.Skip;
    }

    // Cuts `run` in two at character index `position`. The original run keeps the text
    // before `position`; a clone carrying the remaining text is inserted right after it
    // and returned.
    private static Run SplitRun(Run run, int position)
    {
        Run tail = (Run)run.Clone(true);
        string fullText = run.Text;
        tail.Text = fullText.Substring(position);
        run.Text = fullText.Substring(0, position);
        run.ParentNode.InsertAfter(tail, run);
        return tail;
    }
}
It's not working properly. Please check samplesource.zip (42.6 KB)
current output : $$$PATIP START{pre-admission services}PATIP END$$$ or $$$PATOP START{
Expected output :$$$PATIP START{
$$$PATOP START{
It's not working properly. Please check samplesource.zip (42.6 KB)
current output : $$$PATIP START{pre-admission services}PATIP END$$$ or $$$PATOP START{
Expected output :$$$PATIP START{
$$$PATOP START{
@saranyasrinivasan92,
The problem occurs because of the incorrect regular expression @"(\$.*)\{"
. You need to improve the regular expression so that it captures the desired group of text. Alternatively, you can manually parse the text like this:
Document doc = new Document("E:\\Temp\\source\\Input.docx");

// Pass 1: break every run into single-character runs so that tags can be matched one
// character at a time. A snapshot array is iterated while SplitRun inserts new runs
// into the document.
Node[] runs = doc.GetChildNodes(NodeType.Run, true).ToArray();
foreach (Node node in runs)
{
    Run piece = (Run)node;
    // A run of N characters needs N - 1 cuts; each cut leaves one character behind and
    // continues with the remainder returned by SplitRun.
    for (int remaining = piece.Text.Length; remaining > 1; remaining--)
    {
        piece = SplitRun(piece, 1);
    }
}

// Pass 2: starting at each "$" run, accumulate characters up to and including the next
// "{" run and keep the distinct results.
ArrayList list = new ArrayList();
NodeCollection runNodes = doc.GetChildNodes(NodeType.Run, true);
for (int index = 0; index < runNodes.Count; index++)
{
    Run candidate = (Run)runNodes[index];
    if (candidate.Text != "$")
    {
        continue;
    }

    int cursor = index;
    string tag = "";
    bool rejected = false;

    while (candidate != null && candidate.Text != "{")
    {
        tag += candidate.Text;

        // A '$' immediately followed by a space means this is not a tag start; give up
        // on this candidate.
        if (tag.Contains("$ "))
        {
            rejected = true;
            break;
        }

        cursor++;
        candidate = (Run)runNodes[cursor];
    }

    // candidate is non-null here only if the loop stopped on a "{" run (or was rejected).
    if (candidate != null && !rejected)
    {
        tag += candidate.Text;
        if (!list.Contains(tag))
        {
            list.Add(tag);
        }
    }

    // Resume the outer scan after the last run examined.
    index = cursor;
}

foreach (string str in list)
{
    Console.WriteLine(str);
}