Convert Word to JSON in a parent-child hierarchy using Aspose.Words for Python

Hello,

I want to convert a Word document to a JSON structure with Python. I want to preserve the relationship and hierarchy of titles and nested lists. Is this possible with Aspose.Words using Python?

Regards,
SM

@ln22 There is no built-in method for converting a document to a JSON structure using Aspose.Words. But you can use an approach similar to the one suggested in the following topic to achieve this:
https://forum.aspose.com/t/html-to-json/281651

@alexey.noskov Could you help in converting that post's code to Aspose.Words for Python via .NET?

@ln22 You can try using code like the following (it assumes `import aspose.words as aw` at the top of the script):

def get_json(doc: aw.Document):
    """Render *doc* as an indented, JSON-like string.

    The output starts with the root brace from ``open_json``, followed by an
    element for the document (indent level 2) that contains one element per
    section (indent level 3). Each section's body is rendered recursively by
    ``handle_container``. Closing braces are written at levels 2 (section),
    1 (document) and 0 (root).

    Returns:
        The concatenated text of all emitted lines.
    """
    pieces = [open_json(), open_element(get_node_name(doc), 2)]
    for section in doc.sections:
        pieces.append(open_element(get_node_name(section), 3))
        pieces.append(handle_container(section.as_section().body, 3))
        # A section with no following sibling is the last one: no comma.
        pieces.append(close_element(2, section.next_sibling == None))
    pieces.append(close_element(1, True))
    pieces.append(close_element(0, True))
    return "".join(pieces)

def handle_container(container, indent):
    """Render a composite node and, recursively, all of its children.

    Args:
        container: Node that may hold child nodes (it must expose
            ``has_child_nodes``, ``get_child_nodes`` and ``next_sibling``).
        indent: Indent level of the caller; the element itself is opened one
            level deeper when it has children.

    Returns:
        The text for this element: a single ``{ }`` line when the container
        has no children, otherwise an opening line, the rendered children and
        a closing brace line.
    """
    if not container.has_child_nodes:
        return open_and_close_element(container, indent, (container.next_sibling == None))

    indent += 1
    pieces = [open_element(get_node_name(container), indent)]
    for node in container.get_child_nodes(aw.NodeType.ANY, False):
        # The cast presumably fails for leaf nodes (TODO confirm against the
        # Aspose.Words API); such nodes are routed to handle_node. Only
        # ordinary errors are caught, so KeyboardInterrupt and SystemExit
        # are no longer swallowed as they were by the former bare ``except``.
        try:
            child_container = node.as_composite_node()
        except Exception:
            child_container = None

        if child_container != None:
            pieces.append(handle_container(child_container, indent))
        else:
            pieces.append(handle_node(node, indent))
    indent -= 1
    pieces.append(close_element(indent, (container.next_sibling == None)))
    return "".join(pieces)

def handle_node(node, indent):
    """Render a leaf node; only RUN nodes produce output.

    Args:
        node: The node to render.
        indent: Indent level of the caller.

    Returns:
        For a RUN node, an element holding a single ``"text"`` entry with the
        run's text; for every other node type, an empty string.
    """
    is_run = node.node_type == aw.NodeType.RUN
    if not is_run:
        return ""

    inner = indent + 1
    opening = open_element(get_node_name(node), inner)
    # The "text" entry is written at the same level as the opening line.
    text_line = write_element("text", node.as_run().text, inner, True)
    closing = close_element(indent, (node.next_sibling == None))
    return opening + text_line + closing


def open_json():
    """Return the line that opens the root JSON object."""
    root_open = "{\n"
    return root_open

def open_element(name: str, indent):
    """Return the indented line ``"<name>" : {`` that opens a named element."""
    return f"{get_indent(indent)}{get_quoted(name)} : {{\n"

def open_and_close_element(node: aw.Node, indent, isLast):
    """Return a one-line empty element ``"<node name>" : { }`` for *node*.

    A trailing comma is appended unless *isLast* is true.
    """
    padding = get_indent(indent)
    key = get_quoted(get_node_name(node))
    return f"{padding}{key} : {{ }}{get_comma(isLast)}\n"

def write_element(name, value, indent, isLast):
    """Return an indented ``"<name>" : "<value>"`` line.

    A trailing comma is appended unless *isLast* is true.
    """
    padding = get_indent(indent)
    key = get_quoted(name)
    quoted_value = get_quoted(value)
    return f"{padding}{key} : {quoted_value}{get_comma(isLast)}\n"

def close_element(indent, isLast):
    """Return an indented closing-brace line, with a comma unless *isLast*."""
    return f"{get_indent(indent)}}}{get_comma(isLast)}\n"

def get_node_name(node : aw.Node):
    """Return the value used to label *node* in the output: its ``node_type``."""
    node_kind = node.node_type
    return node_kind

def get_quoted(value):
    """Return ``str(value)`` as a double-quoted, JSON-escaped string literal.

    Quotes, backslashes and control characters in the text are escaped so the
    result is always a valid JSON string; plain text is returned unchanged
    between double quotes. Non-ASCII characters are kept as-is.

    Args:
        value: Any object; it is converted with ``str`` before quoting.

    Returns:
        The quoted string, including the surrounding double quotes.
    """
    # Local import: the surrounding sample has no import block to extend.
    import json

    return json.dumps(str(value), ensure_ascii=False)

def get_comma(is_last_element):
    """Return ``""`` for the last element of a group, ``","`` otherwise."""
    return "" if is_last_element else ","

def get_indent(indent):
    """Return the leading whitespace for *indent* levels (two spaces each).

    Zero or negative levels produce an empty string.
    """
    width = indent * 2
    return " " * width
# Load the sample document and print its JSON-like representation.
doc = aw.Document("C:\\Temp\\in.docx")
json_text = get_json(doc)
print(json_text)

@alexey.noskov

Would you be able to have this get_json function output a Python dictionary object instead of a string? I have been trying to convert it into a dictionary and cannot seem to do it.

@ln22 The code is provided for demonstration purposes, to show the technique you can use to convert a document to a JSON string. You are free to modify it to get the output in the form required by your application.

1 Like