Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ability to use multiple word instances to process multiple documents concurrently #103

Open
MarcellRoos opened this issue May 3, 2024 · 1 comment

Comments

@MarcellRoos
Copy link

I've been trying to speed up the conversion process by implementing some basic multithreading but it seems like the single instance of word is my bottleneck. Is there a workaround for this?

@mrchengshunlong
Copy link

I also encountered the same problem, and hope someone can discuss it together.
I use ProcessPoolExecutor to accomplish run at the same time, in order to avoid Word's threads cannot be closed normally, I add "pythoncom.CoInitialize()", "doc.Close(0)", "word.Quit()", "pythoncom.CoUninitialize()". howerver, The CPU usage is still 20-30% , usually 2-3 processes running at the same time, the rest of the process created, but while waiting, I wonder if it is due to the conversion ceiling of MS office itself, here is my code:

# Only set up multiple processes to convert DOCX to PDF
from pathlib import Path
from tqdm.auto import tqdm
import win32com.client
import pythoncom
import time
import os
import concurrent.futures

def convert(docx_filepath):
    """Convert a single .docx file to a PDF placed next to it.

    Runs in a worker process: initializes COM for this process, starts a
    dedicated Word instance, exports the document as PDF, and tears down
    the Word process and COM even when opening or exporting fails.

    Args:
        docx_filepath: Path (str or os.PathLike) to the source .docx file.
    """
    pythoncom.CoInitialize()
    try:
        # DispatchEx forces a NEW Word process instead of attaching to a
        # running one, so each worker process gets its own instance.
        word = win32com.client.DispatchEx('word.application')
        try:
            wdFormatPDF = 17  # WdExportFormat: wdExportFormatPDF

            docx_filepath = Path(docx_filepath).resolve()
            pdf_filepath = docx_filepath.with_suffix('.pdf')
            doc = word.Documents.Open(str(docx_filepath))
            try:
                doc.ExportAsFixedFormat(str(pdf_filepath), wdFormatPDF, False, 0)
            except Exception as e:
                print(f"Error converting {docx_filepath}: {e}")
            finally:
                doc.Close(0)  # 0 = wdDoNotSaveChanges
        finally:
            # Previously skipped if Documents.Open raised, leaking a
            # WINWORD.EXE process per failed file.
            word.Quit()
    finally:
        pythoncom.CoUninitialize()

if __name__ == "__main__":
    start_time = time.time()
    directory = "result/"  # replace_with_your_directory_path
    # Recursively collect every .docx file under `directory`.
    docx_files = [os.path.join(root, f)
                  for root, dirs, files in os.walk(directory)
                  for f in files if f.endswith('.docx')]

    # Cap at 32 workers, but keep at least 1: ProcessPoolExecutor raises
    # ValueError for max_workers=0 when the directory yields no files.
    max_workers = max(1, min(32, len(docx_files)))

    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
        with tqdm(total=len(docx_files), desc="Converting DOCX to PDF") as pbar:
            # One future per file; tick the progress bar as each completes.
            futures = {executor.submit(convert, docx_file): docx_file for docx_file in docx_files}
            for future in concurrent.futures.as_completed(futures):
                pbar.update(1)

    end_time = time.time()
    print(f"Full processing time: {end_time - start_time:.2f} seconds")

To avoid wasting time starting and stopping a 'word.Application' instance for every file, I later added a for loop inside the convert function so one instance handles a whole batch, but the difference in total time is not great.

# Also set to resize batches and the number of process pools to optimize performance
from tqdm.auto import tqdm
import win32com.client
import pythoncom
import time
import os
import concurrent.futures

def convert_batch(docx_filepaths):
    """Convert a batch of .docx files to PDF, reusing one Word instance.

    Runs in a worker process: a single Word process is started once and
    shared across the whole batch to avoid per-file startup cost. A failure
    on one file is reported and the batch continues; Word and COM are torn
    down even when a file fails to open.

    Args:
        docx_filepaths: Iterable of paths (str or os.PathLike) to .docx files.
    """
    # This snippet's top-level imports omit pathlib, which made the
    # original raise NameError on first use of Path.
    from pathlib import Path

    pythoncom.CoInitialize()
    try:
        # DispatchEx forces a new Word process per worker.
        word = win32com.client.DispatchEx('word.application')
        try:
            wdFormatPDF = 17  # WdExportFormat: wdExportFormatPDF

            for docx_filepath in docx_filepaths:
                src = Path(docx_filepath).resolve()
                dst = src.with_suffix('.pdf')
                try:
                    # Open inside the per-file handler so one bad file no
                    # longer aborts the rest of the batch.
                    doc = word.Documents.Open(str(src))
                    try:
                        doc.ExportAsFixedFormat(str(dst), wdFormatPDF, False, 0)
                    finally:
                        doc.Close(0)  # 0 = wdDoNotSaveChanges
                except Exception as e:
                    print(f"Error converting {src}: {e}")
        finally:
            # Previously skipped if any Open raised, leaking WINWORD.EXE.
            word.Quit()
    finally:
        pythoncom.CoUninitialize()


if __name__ == "__main__":
    start_time = time.time()
    directory = "result/"  # replace_with_your_directory_path
    # Recursively collect every .docx file under `directory`.
    docx_files = [os.path.join(root, f)
                  for root, dirs, files in os.walk(directory)
                  for f in files if f.endswith('.docx')]

    # Keep at least 1 batch: with an empty directory the original divided
    # by zero computing batch_size and passed max_workers=0 to the pool.
    num_batches = max(1, min(32, len(docx_files)))
    # Ceiling division, floored at 1 so the slicing step is never zero.
    batch_size = max(1, len(docx_files) // num_batches
                     + (1 if len(docx_files) % num_batches != 0 else 0))
    batches = [docx_files[i:i + batch_size] for i in range(0, len(docx_files), batch_size)]

    with concurrent.futures.ProcessPoolExecutor(max_workers=num_batches) as executor:
        with tqdm(total=len(docx_files), desc="Converting DOCX to PDF") as pbar:
            # One future per batch; advance the bar by the batch's size.
            futures = {executor.submit(convert_batch, batch): batch for batch in batches}
            for future in concurrent.futures.as_completed(futures):
                pbar.update(len(futures[future]))

    end_time = time.time()
    print(f"Full processing time: {end_time - start_time:.2f} seconds")

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants