Merging a lot of Word documents into a single PDF: how to use threading/parallel processing

a.boubleh · August 23, 2024, 9:52am

I have been struggling to make my app use parallel processing to speed up the merge process. Here is my C++ code.

using namespace Aspose::Words;
using namespace System;
using namespace System::IO;
using namespace System::Collections::Generic;

// Merges every *.docx file found in a hard-coded folder into a single document,
// in the order returned by Directory::GetFiles, and saves the result as
// "output.pdf". Progress goes to stdout; a caught Exception is reported on stderr.
// The process returns 0 in both cases (unchanged from the original behaviour).
int main() {
    try {
        String folderPath = u"C:\\Users\\baymane\\Projects\\facture-jumlee2\\temp\\2024-07-03_11_52_46\\202405050101\\docx\\";

        auto docxFiles = Directory::GetFiles(folderPath, u"*.docx");
        auto finalDoc = MakeObject<Document>();
        // A newly created Document already contains one empty section. Remove it so
        // the merged output does not start with a blank page.
        finalDoc->RemoveAllChildren();

        // Start at 0 and increment before printing, so the first file is reported
        // as "document 1" and the last one as "document <length>".
        int i = 0;
        int length = docxFiles->get_Length();
        for (auto docxFile : docxFiles) {
            i++;
            auto tempDoc = MakeObject<Document>(docxFile);
            finalDoc->AppendDocument(tempDoc, ImportFormatMode::KeepSourceFormatting);
            std::cout << "Reached document " << i << " on "  << length << std::endl;
        }

        // If the folder held no .docx files the document is now empty; restore the
        // minimal section/body/paragraph structure before saving.
        finalDoc->EnsureMinimum();

        // Save the merged document as a PDF
        finalDoc->Save(u"output.pdf", SaveFormat::Pdf);

        std::cout << "Finished." << std::endl;
    }
    catch (const Exception& e) {
        std::cerr << "An error occurred: " << e->get_Message().ToUtf8String() << std::endl;
    }

    return 0;
}

I have also developed a Java app that does the same thing. I tried using ExecutorService and threads, but my app crashes with a memory exception even though I still have enough RAM. I looked up the problem and found no leads.

Here is the Java Code for parallel processing:

package or.aspose;
import com.aspose.words.*;
import java.io.IOException;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.Array;
import java.util.*;
import java.io.File;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.stream.Stream;

import static or.aspose.helpers.DocxHelper.isSectionEmpty;
import static or.aspose.helpers.Helpers.*;

/**
 * Command-line tool that finds folders containing DOCX files under a root folder,
 * merges the documents of each folder into one PDF, and spreads the folders over a
 * fixed number of worker tasks.
 *
 * <p>Each {@code Document} instance is created and used inside a single task, so no
 * Aspose document object is shared between threads.
 */
public class Main {
    /** Folder the merged PDFs are written to by {@link #generatePDFs(String, int)}. */
    private static final String DEFAULT_DESTINATION_FOLDER = "C:\\Users\\baymane\\IdeaProjects\\test\\resources";

    /**
     * Worker body: merges every folder assigned to one task, one after another.
     *
     * @param taskId                identifier used only in log output and the result string
     * @param docxFoldersPath       relative folder paths assigned to this task
     * @param originalFolderPath    root folder the relative paths are appended to
     * @param destinationFolderPath folder the resulting PDFs are written to
     * @return {@code "Task <id> completed"} on success, {@code "Task <id> interrupted"} if the
     *         worker thread was interrupted
     * @throws RuntimeException wrapping any other exception raised while merging
     */
    public static String processDocxFolderPath(int taskId,List<String> docxFoldersPath, String originalFolderPath, String destinationFolderPath) {
        try {
            // The former 5-second "simulation" sleep was removed: it only delayed real work.
            for (String folder : docxFoldersPath) {
                System.out.println("Task_" + taskId+ ": Processing " + folder);
                generatePDFsForFolder(taskId,originalFolderPath,folder, destinationFolderPath);
            }
        } catch (InterruptedException e) {
            // Restore the interrupt flag so the pool/caller can see the cancellation,
            // and do not claim the task completed.
            Thread.currentThread().interrupt();
            return "Task " + taskId + " interrupted";
        } catch (Exception e) {
            throw new RuntimeException("Task " + taskId + " failed", e);
        }
        return "Task " + taskId + " completed";
    }

    /**
     * Distributes the elements of {@code list} round-robin over {@code splits} chunks.
     * Element {@code i} goes to chunk {@code i % splits}; chunks may be empty when the
     * list has fewer elements than {@code splits}.
     *
     * @param list   items to distribute
     * @param splits number of chunks to create; must be positive
     * @return a list of exactly {@code splits} mutable chunks
     * @throws IllegalArgumentException if {@code splits} is zero or negative
     */
    public static List<List<String>> divideArray(List<String> list, int splits) {
        if (splits <= 0) {
            // Previously this surfaced as ArithmeticException / IndexOutOfBoundsException.
            throw new IllegalArgumentException("splits must be positive, got " + splits);
        }
        List<List<String>> chunks = new ArrayList<>(splits);
        for (int c = 0; c < splits; c++) {
            chunks.add(new ArrayList<>());
        }
        for (int i = 0; i < list.size(); i++) {
            chunks.get(i % splits).add(list.get(i));
        }
        return chunks;
    }

    /**
     * Finds the folders containing DOCX files under {@code originalFolderPath}, splits them
     * into {@code numTasks} groups and merges each group on its own pool thread. Task
     * failures are printed and do not stop the remaining tasks.
     *
     * @param originalFolderPath root folder to scan
     * @param numTasks           number of worker tasks (and pool threads); must be positive
     * @throws IllegalArgumentException if {@code numTasks} is zero or negative
     * @throws Exception                if applying the license or scanning the folder fails
     */
    public static void generatePDFs(String originalFolderPath, int numTasks) throws Exception {
        if (numTasks <= 0) {
            throw new IllegalArgumentException("numTasks must be positive, got " + numTasks);
        }
        License wordLicense = new License();
        wordLicense.setLicense("Aspose.WordsforJava.lic");
        ArrayList<String> docxPaths = new ArrayList<>();
        Set<String> foundFolders = new HashSet<>(); // Using Set to avoid duplicate relative paths

        findDocxFiles(originalFolderPath, docxPaths, foundFolders);
        List<List<String>> splitFoundFolders =  divideArray(foundFolders.stream().toList(), numTasks);
        for (List<String> folders : splitFoundFolders) {
            System.out.print("[");
            for (String folder : folders) {
                System.out.print(folder + ",");
            }
            System.out.println("]");
        }
        String destinationFolderPath = DEFAULT_DESTINATION_FOLDER;
        ExecutorService executorService = Executors.newFixedThreadPool(numTasks);

        List<String> results = new ArrayList<>();
        try {
            List<CompletableFuture<String>> futures = new ArrayList<>();
            for (int i = 0; i < numTasks; i++) {
                final int taskId = i;
                futures.add(CompletableFuture.supplyAsync(
                        () -> processDocxFolderPath(taskId, splitFoundFolders.get(taskId), originalFolderPath, destinationFolderPath),
                        executorService));
            }
            for (CompletableFuture<String> future : futures) {
                try {
                    results.add(future.get()); // Blocking call to get the result
                } catch (InterruptedException e) {
                    // The caller was asked to stop: cancel outstanding work and keep the flag set.
                    executorService.shutdownNow();
                    Thread.currentThread().interrupt();
                    break;
                } catch (ExecutionException e) {
                    // Best effort: report the failed task and keep collecting the others.
                    e.printStackTrace();
                }
            }
        } finally {
            // Always release the pool threads, even if submitting or waiting failed.
            executorService.shutdown();
        }

        // Print results
        System.out.println("Parallel execution results:");
        results.forEach(System.out::println);
    }

    /**
     * Merges all DOCX files of one folder into a single PDF named after the folder's relative
     * path (path separators replaced by underscores) inside {@code targetFolder}.
     *
     * @param taskId             identifier used only in log output
     * @param originalPath       root folder
     * @param folderRelativePath folder path relative to {@code originalPath}; the two are concatenated as-is
     * @param targetFolder       folder the PDF is written to
     * @throws Exception if loading, merging or saving a document fails
     */
    public static void generatePDFsForFolder(int taskId,String originalPath,String folderRelativePath, String targetFolder) throws Exception {
        Document mergedDoc = new Document();
        // A new Document starts with one empty section; drop it so the PDF has no blank first page.
        mergedDoc.removeAllChildren();
        // Call the method to find DOCX files and their containing folders
        List<String> docxPaths = listDocxFiles(originalPath + folderRelativePath);
        for (int i = 0; i < docxPaths.size(); i++) {
            System.out.println("Task_" + taskId + ":" + "merging document " + (i+1) + "/" + docxPaths.size() + " in folder " + originalPath  + folderRelativePath);
            Document doc1 = new Document(docxPaths.get(i));
            doc1.cleanup();
            // Remove empty sections. Walk the collection backwards by index: removing from a
            // collection while a for-each loop is iterating over it is unsafe, and going
            // backwards keeps the remaining indices valid.
            SectionCollection sections = doc1.getSections();
            for (int s = sections.getCount() - 1; s >= 0; s--) {
                if (isSectionEmpty(sections.get(s))) {
                    sections.removeAt(s);
                }
            }
            if (sections.getCount() == 0) {
                // Nothing left to append from this file.
                continue;
            }
            mergedDoc.appendDocument(doc1, ImportFormatMode.KEEP_SOURCE_FORMATTING);
        }
        // If no content was appended, restore the minimal section/body/paragraph structure before saving.
        mergedDoc.ensureMinimum();
        //createFoldersInDestination(targetFolder, foundFolders);
        String documentName = targetFolder + "\\" + folderRelativePath.replace("/","_").replace("\\","_") + ".pdf";
        mergedDoc.save(documentName, SaveFormat.PDF);
        System.out.println("Task_" + taskId + ": Documents merged as " + documentName);
    }

    /**
     * Entry point: prints the JVM's maximum heap size in bytes, then runs the merge with a
     * single task. Each task builds a whole merged document in memory, so the heap limit
     * printed here (adjustable with {@code -Xmx}) bounds how many tasks can run at once.
     */
    public static void main(String[] args) throws Exception {
        //getFoldersOfDocxText();
        System.out.println(Runtime.getRuntime().maxMemory());
        generatePDFs("C:\\Users\\baymane\\Projects\\facture-jumlee2\\temp", 1);

    }
}

Is there any mistake I'm making? Is there an example of threading/parallel processing of Word documents in Java? I would appreciate any help. Thanks.

alexey.noskov · August 23, 2024, 11:26am

@a.boubleh Please note, Aspose.Words is thread-safe as long as only one thread works on a document at a time. The typical scenario is to have one thread working on one document. Different threads can safely work on different documents at the same time.

Also, you can use Merger class to merge documents.