Hello,
I have a Word document from which I need to extract a few lines of text. The text I need to extract is located between two strings: “must haves” and “could haves”. Does anyone know how I can achieve this?
Hello,
I have a Word document from which I need to extract a few lines of text. The text I need to extract is located between two strings: “must haves” and “could haves”. Does anyone know how I can achieve this?
In your case, we suggest you read the following article about extracting content.
How to Extract Selected Content Between Nodes in a Document
You can use the find-and-replace feature of Aspose.Words to find the text and insert a bookmark before each of your strings. After that, you can extract the content between the bookmarks.
You can use the following FindAndInsertBookmark class to find the text and bookmark it.
/// <summary>
/// Find-and-replace callback that leaves the matched text in place and instead
/// inserts a bookmark node next to it: a <see cref="BookmarkStart"/> before the
/// first run of the match, or a <see cref="BookmarkEnd"/> after the last run.
/// </summary>
/// <remarks>
/// Every match is named <c>bmname + i</c>, after which <c>i</c> is incremented.
/// A "start" instance and an "end" instance created with the same prefix and the
/// same initial counter therefore produce the same name for their n-th matches.
/// </remarks>
public class FindAndInsertBookmark : IReplacingCallback
{
    // Prefix of every bookmark name produced by this instance.
    private readonly string bmname;

    /// <summary>Counter appended to the name prefix; incremented after each inserted bookmark.</summary>
    public int i = 1;

    // true: insert BookmarkStart before the match; false: insert BookmarkEnd after it.
    private readonly bool isStart;

    /// <summary>
    /// Creates a callback that bookmarks every match it is invoked for.
    /// </summary>
    /// <param name="bmname">Prefix of the generated bookmark names.</param>
    /// <param name="isStart">
    /// <c>true</c> to insert a <see cref="BookmarkStart"/> before each match,
    /// <c>false</c> to insert a <see cref="BookmarkEnd"/> after each match.
    /// </param>
    /// <param name="i">Initial value of the counter appended to <paramref name="bmname"/>.</param>
    public FindAndInsertBookmark(string bmname, bool isStart, int i)
    {
        this.bmname = bmname;
        this.isStart = isStart;
        this.i = i;
    }

    /// <summary>
    /// Isolates the matched text into whole runs and inserts the bookmark node
    /// next to the first or last of them.
    /// </summary>
    /// <param name="e">Match information supplied by the replace engine.</param>
    /// <returns>Always <see cref="ReplaceAction.Skip"/>, so the matched text itself is left untouched.</returns>
    ReplaceAction IReplacingCallback.Replacing(ReplacingArgs e)
    {
        // The node that contains either the beginning or the complete match.
        Node currentNode = e.MatchNode;

        // The first (and maybe the only) run can contain text before the match;
        // split it so that the match begins exactly at a run boundary.
        if (e.MatchOffset > 0)
        {
            currentNode = SplitRun((Run)currentNode, e.MatchOffset);
        }

        // Only the first and the last node of the match are ever used, so track
        // those two instead of collecting every node in a list.
        Node firstNode = null;
        Node lastNode = null;

        // Walk over all nodes that are completely covered by the match.
        int remainingLength = e.Match.Value.Length;
        while (remainingLength > 0
            && currentNode != null
            && currentNode.GetText().Length <= remainingLength)
        {
            if (firstNode == null)
            {
                firstNode = currentNode;
            }

            lastNode = currentNode;
            remainingLength -= currentNode.GetText().Length;

            // Advance to the next Run; other siblings such as BookmarkStart
            // may sit in between and are skipped.
            do
            {
                currentNode = currentNode.NextSibling;
            }
            while (currentNode != null && currentNode.NodeType != NodeType.Run);
        }

        // The match ends inside this run: split off the trailing text so that
        // the run holds only matched text.
        if (currentNode != null && remainingLength > 0)
        {
            SplitRun((Run)currentNode, remainingLength);
            if (firstNode == null)
            {
                firstNode = currentNode;
            }

            lastNode = currentNode;
        }

        // A zero-length match covers no node; there is nothing to bookmark, and
        // indexing an empty collection here used to throw.
        if (firstNode == null)
        {
            return ReplaceAction.Skip;
        }

        if (isStart)
        {
            Run run = (Run)firstNode;
            run.ParentNode.InsertBefore(new BookmarkStart(run.Document, bmname + i), run);
        }
        else
        {
            Run run = (Run)lastNode;
            run.ParentNode.InsertAfter(new BookmarkEnd(run.Document, bmname + i), run);
        }

        i++;

        // Tell the replace engine to do nothing: the document has already been
        // modified as intended.
        return ReplaceAction.Skip;
    }

    /// <summary>
    /// Splits the text of the specified run into two runs.
    /// Inserts the new run just after the specified run.
    /// </summary>
    /// <param name="run">Run to split; keeps the text before <paramref name="position"/>.</param>
    /// <param name="position">Character index at which the text is divided.</param>
    /// <returns>The newly inserted run holding the text from <paramref name="position"/> onwards.</returns>
    private static Run SplitRun(Run run, int position)
    {
        Run afterRun = (Run)run.Clone(true);
        afterRun.Text = run.Text.Substring(position);
        run.Text = run.Text.Substring(0, position);
        run.ParentNode.InsertAfter(afterRun, run);
        return afterRun;
    }
}
Thank you for your reply, but I don’t understand.
Sorry if I’m being stupid, but I don’t understand how this extracts the text between the strings “must haves” and “could haves”.
Could you explain?
Please ZIP and attach your input and expected output documents. We will then provide you with a complete code example according to your requirements.
Thank you very much for helping me.
I have attached a ZIP with the input and output files to this message.
There is also a file with some more explanation.
Aspose files.zip (347.3 KB)
Please use the following code example to get the desired list items under the specified text. We also suggest you read the following article.
Find and Replace
// Load the input document.
var sourceDocument = new Document(MyDir + "aspose input.docx");

// The callback gathers paragraphs while the replace engine reports matches.
var listItemCollector = new FindAndGetListItems();
var replaceOptions = new FindReplaceOptions { ReplacingCallback = listItemCollector };

// Run the search; the callback decides what happens for each match.
sourceDocument.Range.Replace("must have&p", "", replaceOptions);

// Print the text of every paragraph the callback collected.
foreach (object item in listItemCollector.arrayList)
{
    var listParagraph = (Paragraph)item;
    Console.WriteLine(listParagraph.ToString(SaveFormat.Text));
}
/// <summary>
/// Find-and-replace callback that, for every match, gathers the run of
/// list-item paragraphs that directly follows the parent of the match node.
/// </summary>
public class FindAndGetListItems : IReplacingCallback
{
    /// <summary>Paragraphs collected so far, in the order they were encountered.</summary>
    public ArrayList arrayList = new ArrayList();

    /// <summary>
    /// Collects consecutive list-item paragraphs that come after the parent of the match node.
    /// </summary>
    /// <param name="e">Match information supplied by the replace engine.</param>
    /// <returns>Always <see cref="ReplaceAction.Skip"/>; the matched text is not replaced.</returns>
    ReplaceAction IReplacingCallback.Replacing(ReplacingArgs e)
    {
        // Start at the sibling after the match node's parent (presumably the
        // paragraph holding the searched text) and stop at the first node that
        // is not a list-item paragraph.
        for (Node candidate = e.MatchNode.ParentNode.NextSibling;
             IsListItemParagraph(candidate);
             candidate = candidate.NextSibling)
        {
            arrayList.Add(candidate);
        }

        return ReplaceAction.Skip;
    }

    // True when the node exists, is a paragraph, and that paragraph is a list item.
    private static bool IsListItemParagraph(Node node)
    {
        if (node == null)
        {
            return false;
        }

        if (node.NodeType != NodeType.Paragraph)
        {
            return false;
        }

        return ((Paragraph)node).IsListItem;
    }
}