Thank you for your quick response. I believe there was a misunderstanding of my original question: that is the code to replace with regex in Java, yes - but I am using Python.
To be clear, according to the instructions on the official site, Python can be used to access “Aspose.Words for Java” through the JPype library. So I am writing my code in Python, which then is translated and passed to Java.
The examples in the Python guide are very helpful, but do not show how to correctly use a regex pattern in Python with the “replace” function. I have tried writing the regex pattern in the usual Python way (r"o[r|s|f]t"
) but it does not produce a change (i.e., Aspose.Words for Java does not find a phrase to replace in the text). I believe this is because Aspose.Words for Java is searching for “o[r|s|f]t” as a literal string instead of a regex pattern.
I have attached a zip file (aspose_words_python_regex.zip (27.7 KB)) of the code and input/output documents.
- AsposePythonRegexExample_Before.docx: input file
- AsposePythonRegexExample_After.docx: desired output file
- AsposePythonRegexExample_ActualAfter.docx: the actual output file with undesired behavior
- python_regex_example.py: source code
I have also included the code here. The ExtractContent
class is directly from the Aspose Python examples (init.py). My code is at the bottom, in the if "__main__":
section:
import jpype
import os.path
import re
class ExtractContent:
def __init__(self, dataDir):
self.dataDir = dataDir
self.Document = jpype.JClass("com.aspose.words.Document")
self.DocumentBuilder = jpype.JClass("com.aspose.words.DocumentBuilder")
self.NodeType = jpype.JClass("com.aspose.words.NodeType")
self.NodeImporter = jpype.JClass("com.aspose.words.NodeImporter")
self.ImportFormatMode = jpype.JClass("com.aspose.words.ImportFormatMode")
self.Collections = jpype.JClass("java.util.Collections")
self.SaveFormat = jpype.JClass("com.aspose.words.SaveFormat")
def extractContents(self, startNode, endNode, isInclusive):
# First check that the nodes passed to this method are valid for use.
self.verifyParameterNodes(startNode, endNode)
# Create a list to store the extracted nodes.
nodes = []
# Keep a record of the original nodes passed to this method so we can split marker nodes if needed.
originalStartNode = startNode
originalEndNode = endNode
# Extract content based on block level nodes (paragraphs and tables). Traverse through parent nodes to find them.
# We will split the content of first and last nodes depending if the marker nodes are inline
while startNode.getParentNode().getNodeType() != self.NodeType.BODY:
startNode = startNode.getParentNode()
while (endNode.getParentNode().getNodeType() != self.NodeType.BODY):
endNode = endNode.getParentNode()
print(str(originalStartNode) + " = " + str(startNode))
print(str(originalEndNode) + " = " + str(endNode))
isExtracting = True
isStartingNode = True
# The current node we are extracting from the document.
currNode = startNode
# Begin extracting content. Process all block level nodes and specifically split the first and last nodes when needed so paragraph formatting is retained.
# Method is little more complex than a regular extractor as we need to factor in extracting using inline nodes, fields, bookmarks etc as to make it really useful.
while (isExtracting):
# Clone the current node and its children to obtain a copy.
cloneNode = currNode.deepClone(True)
isEndingNode = currNode.equals(endNode)
if (isStartingNode or isEndingNode):
# We need to process each marker separately so pass it off to a separate method instead.
if (isStartingNode):
self.processMarker(cloneNode, nodes, originalStartNode, isInclusive, isStartingNode, isEndingNode)
isStartingNode = False
# Conditional needs to be separate as the block level start and end markers maybe the same node.
if (isEndingNode):
self.processMarker(cloneNode, nodes, originalEndNode, isInclusive, isStartingNode, isEndingNode)
isExtracting = False
else:
# Node is not a start or end marker, simply add the copy to the list.
nodes.append(cloneNode)
# Move to the next node and extract it. If next node is null that means the rest of the content is found in a different section.
if (currNode.getNextSibling() is None and isExtracting):
# Move to the next section.
nextSection = currNode.getAncestor(self.NodeType.SECTION).getNextSibling()
currNode = nextSection.getBody().getFirstChild()
else:
# Move to the next node in the body.
currNode = currNode.getNextSibling()
# Return the nodes between the node markers.
return nodes
# ExEnd
# ExStart
# ExId:ExtractBetweenNodes_Helpers
# ExSummary:The helper methods used by the ExtractContent method.
#
# Checks the input parameters are correct and can be used. Throws an exception if there is any problem.
#
def verifyParameterNodes(self, startNode, endNode):
# The order in which these checks are done is important.
if (startNode is None):
raise ValueError('Start node cannot be null')
if (endNode is None):
raise ValueError('End node cannot be null')
if (startNode.getDocument() != endNode.getDocument()):
raise ValueError('Start node and end node must belong to the same document')
if (startNode.getAncestor(self.NodeType.BODY) is None or endNode.getAncestor(self.NodeType.BODY) is None):
raise ValueError('Start node and end node must be a child or descendant of a body')
# Check the end node is after the start node in the DOM tree
# First check if they are in different sections, then if they're not check their position in the body of the same section they are in.
startSection = startNode.getAncestor(self.NodeType.SECTION)
endSection = endNode.getAncestor(self.NodeType.SECTION)
startIndex = startSection.getParentNode().indexOf(startSection)
endIndex = endSection.getParentNode().indexOf(endSection)
if (startIndex == endIndex):
if (startSection.getBody().indexOf(startNode) > endSection.getBody().indexOf(endNode)):
raise ValueError('The end node must be after the start node in the body')
elif (startIndex > endIndex):
raise ValueError('The section of end node must be after the section start node')
def isInline(self, node):
# Test if the node is desendant of a Paragraph or Table node and also is not a paragraph or a table a paragraph inside a comment class which is decesant of a pararaph is possible.
return ((node.getAncestor(self.NodeType.PARAGRAPH) is not None or node.getAncestor(
self.NodeType.TABLE) is not None) and not (
node.getNodeType() == self.NodeType.PARAGRAPH or node.getNodeType() == self.NodeType.TABLE))
def processMarker(self, cloneNode, nodes, node, isInclusive, isStartMarker, isEndMarker):
# If we are dealing with a block level node just see if it should be included and add it to the list.
if (not (self.isInline(node))):
# Don't add the node twice if the markers are the same node
if (not (isStartMarker and isEndMarker)):
if (isInclusive):
nodes.append(cloneNode)
return
# If a marker is a FieldStart node check if it's to be included or not.
# We assume for simplicity that the FieldStart and FieldEnd appear in the same paragraph.
if (node.getNodeType() == self.NodeType.FIELD_START):
# If the marker is a start node and is not be included then skip to the end of the field.
# If the marker is an end node and it is to be included then move to the end field so the field will not be removed.
if ((isStartMarker and not (isInclusive)) or (not (isStartMarker) and isInclusive)):
while ((node.getNextSibling() is not None) and (node.getNodeType() != self.NodeType.FIELD_END)):
node = node.getNextSibling()
# If either marker is part of a comment then to include the comment itself we need to move the pointer forward to the Comment
# node found after the CommentRangeEnd node.
if (node.getNodeType() == self.NodeType.COMMENT_RANGE_END):
while (node.getNextSibling() is not None and node.getNodeType() != self.NodeType.COMMENT):
node = node.getNextSibling()
# Find the corresponding node in our cloned node by index and return it.
# If the start and end node are the same some child nodes might already have been removed. Subtract the
# difference to get the right index.
indexDiff = node.getParentNode().getChildNodes().getCount() - cloneNode.getChildNodes().getCount()
# Child node count identical.
if (indexDiff == 0):
node = cloneNode.getChildNodes().get(node.getParentNode().indexOf(node))
else:
node = cloneNode.getChildNodes().get(node.getParentNode().indexOf(node) - indexDiff)
# Remove the nodes up to/from the marker.
isProcessing = True
isRemoving = isStartMarker
nextNode = cloneNode.getFirstChild()
while (isProcessing and nextNode is not None):
currentNode = nextNode
isSkip = False
if (currentNode.equals(node)):
if (isStartMarker):
isProcessing = False
if (isInclusive):
isRemoving = False
else:
isRemoving = True
if (isInclusive):
isSkip = True
nextNode = nextNode.getNextSibling()
if (isRemoving and not (isSkip)):
currentNode.remove()
# After processing the composite node may become empty. If it has don't include it.
if (not (isStartMarker and isEndMarker)):
if (cloneNode.hasChildNodes()):
nodes.append(cloneNode)
def generateDocument(self, srcDoc, nodes):
# Create a blank document.
dstDoc = self.Document()
# Remove the first paragraph from the empty document.
dstDoc.getFirstSection().getBody().removeAllChildren()
# Import each node from the list into the new document. Keep the original formatting of the node.
importer = self.NodeImporter(srcDoc, dstDoc, self.ImportFormatMode.KEEP_SOURCE_FORMATTING)
for node in nodes:
importNode = importer.importNode(node, True)
dstDoc.getFirstSection().getBody().appendChild(importNode)
# Return the generated document.
return dstDoc
if "__main__":
# initialize JPype
jarpath = os.path.join(os.path.abspath("."), "lib")
jpype.startJVM("/usr/lib/jvm/java-1.8.0-openjdk-amd64/jre/lib/amd64/server/libjvm.so", "-Djava.ext.dirs=%s" % jarpath)
# intitialize extract object and document
extractObject = ExtractContent('')
doc = extractObject.Document(extractObject.dataDir + "AsposePythonRegexExample_Before.docx")
FindReplaceDirection =jpype.JClass("com.aspose.words.FindReplaceDirection")
FindReplaceOptions =jpype.JClass("com.aspose.words.FindReplaceOptions")
# define start and end of search
startObj = doc.getFirstSection().getBody().getFirstParagraph()
endObj = doc.getLastSection().getBody().getLastParagraph()
extractedNodes = extractObject.extractContents(startObj, endObj, True)
#search through nodes
for node in extractedNodes:
if "Paragraph" in str(node.__class__):
# this is the node's text
text_to_search = node.getRange().getText()
print("text before regex: " + text_to_search)
# here is the replace function with the desired regex pattern (pattern, replace_string, FindReplaceOptions)
node.getRange().replace(r'o[r|s|f]t', "XXX", FindReplaceOptions(FindReplaceDirection.FORWARD))
print("text after regex: " + node.getRange().getText())
print("expected text after regex: " + "Aspose.Words suppXXXs mXXX elements of MicrosXXX Word documents.")
# save document
dstDoc = extractObject.generateDocument(doc, extractedNodes)
dstDoc.save(extractObject.dataDir + "AsposePythonRegexExample_ActualAfter.docx")