class DocumentConverter
A class that converts various document formats (Word, Excel, PowerPoint, OpenDocument, Visio) to PDF using LibreOffice's headless conversion capabilities, with support for parallel processing and directory structure preservation.
/tf/active/vicechatdev/pdfconverter.py
15 - 190
moderate
Purpose
DocumentConverter provides a robust solution for batch converting office documents to PDF format. It recursively scans an input directory for supported document types, converts them to PDF using LibreOffice in headless mode, and preserves the original directory structure in the output location. The class supports concurrent conversions for improved performance, handles PDF files by copying them directly, tracks conversion errors with detailed reporting, and manages temporary files during the conversion process. It's designed for scenarios requiring bulk document conversion with error tracking and progress monitoring.
Source Code
class DocumentConverter:
    """Convert various document formats to PDF using LibreOffice.

    The converter scans ``input_dir`` recursively, converts every supported
    document with LibreOffice in headless mode and writes the result under
    ``output_dir`` at the same relative location. PDF sources are copied
    unchanged, and documents whose target PDF already exists are skipped.

    The surrounding module must provide a ``logger`` object as well as the
    names ``os``, ``shutil``, ``subprocess``, ``Path`` and
    ``ThreadPoolExecutor``.
    """

    # Supported file extensions (matched case-insensitively)
    SUPPORTED_EXTENSIONS = [
        # Word documents
        '.doc', '.docx', '.docm', '.dot', '.dotx', '.dotm', '.rtf',
        # Excel documents
        '.xls', '.xlsx', '.xlsm', '.xlt', '.xltx', '.xltm', '.xlsb',
        # PowerPoint documents
        '.ppt', '.pptx', '.pptm', '.pot', '.potx', '.potm', '.pps', '.ppsx',
        # Other formats
        '.odt', '.ods', '.odp', '.vsd', '.vsdx',
        # Include PDF to handle PDF files in the source
        '.pdf',
    ]

    # Name of the scratch directory created below output_dir
    TEMP_DIR_NAME = "_temp"

    # Internal per-document outcomes; convert_document() maps them to a bool
    _CONVERTED = "converted"
    _SKIPPED = "skipped"
    _FAILED = "failed"

    def __init__(self, input_dir, output_dir, max_workers=1, timeout=120):
        """
        Initialize the converter

        Args:
            input_dir: Directory with source documents
            output_dir: Directory to save PDF files
            max_workers: Maximum number of concurrent conversions
            timeout: Seconds to wait for one LibreOffice run before giving up
        """
        self.input_dir = Path(input_dir).absolute()
        self.output_dir = Path(output_dir).absolute()
        self.max_workers = max_workers
        self.timeout = timeout
        self.error_details = {}  # Store detailed error information

        # Create output directory if not exists
        os.makedirs(self.output_dir, exist_ok=True)

    def find_documents(self):
        """Find all supported documents in input directory

        The tree is walked once; only regular files are returned and the
        extension comparison ignores case, so ``REPORT.DOCX`` is found too.

        Returns:
            Sorted list of Path objects.
        """
        supported = {ext.lower() for ext in self.SUPPORTED_EXTENSIONS}
        return sorted(
            path
            for path in self.input_dir.rglob('*')
            if path.suffix.lower() in supported and path.is_file()
        )

    def get_relative_output_path(self, input_file):
        """Determine the output path that preserves the original directory structure

        The target directory is created as a side effect. Sources that differ
        only by extension (``a.docx`` / ``a.xlsx``) map to the same ``a.pdf``;
        whichever is processed later is treated as already converted.

        Raises:
            ValueError: if input_file is not located below input_dir.
        """
        # Get the relative path from the input_dir
        rel_path = input_file.relative_to(self.input_dir)

        # Calculate the output directory path preserving folder structure
        output_dir = self.output_dir / rel_path.parent

        # Create the directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Return the full output path with .pdf extension
        return output_dir / f"{input_file.stem}.pdf"

    def _record_error(self, input_file, error, return_code=None):
        """Store failure details for input_file in error_details."""
        details = {'file': str(input_file), 'error': error}
        if return_code is not None:
            details['return_code'] = return_code
        self.error_details[str(input_file)] = details

    def _run_libreoffice(self, input_file, output_file):
        """Run one LibreOffice conversion in a private scratch directory.

        Returns _CONVERTED or _FAILED; raises subprocess.TimeoutExpired when
        LibreOffice does not finish within self.timeout seconds.
        """
        # Local import keeps this class self-contained with respect to the
        # module's existing import list.
        import tempfile

        temp_root = self.output_dir / self.TEMP_DIR_NAME
        os.makedirs(temp_root, exist_ok=True)

        # One scratch directory per conversion: same-named documents from
        # different folders can no longer overwrite each other's output.
        job_dir = Path(tempfile.mkdtemp(dir=temp_root))
        try:
            pdf_dir = job_dir / "out"
            os.makedirs(pdf_dir)

            cmd = ['libreoffice', '--headless']
            if self.max_workers > 1:
                # Concurrent LibreOffice processes must not share a user
                # profile, so each parallel job gets its own.
                profile_dir = job_dir / "profile"
                cmd.append(f'-env:UserInstallation={profile_dir.as_uri()}')
            cmd += [
                '--convert-to',
                'pdf',
                '--outdir',
                str(pdf_dir),
                str(input_file),
            ]

            process = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=self.timeout,
            )

            if process.returncode != 0:
                logger.error(f"Error converting {input_file.name}: {process.stderr}")
                self._record_error(input_file, process.stderr, process.returncode)
                return self._FAILED

            # LibreOffice can exit with 0 without writing anything, so the
            # presence of the PDF is the real success criterion.
            temp_output = pdf_dir / f"{input_file.stem}.pdf"
            if not temp_output.exists():
                error_msg = f"Conversion produced no output for {input_file.name}"
                logger.error(error_msg)
                self._record_error(input_file, error_msg)
                return self._FAILED

            # Ensure target directory exists, then move the file into place
            os.makedirs(output_file.parent, exist_ok=True)
            shutil.move(str(temp_output), str(output_file))
            logger.info(f"Successfully converted {input_file.name} to {output_file}")
            return self._CONVERTED
        finally:
            # Best-effort cleanup; leftovers are removed by convert_all()
            shutil.rmtree(job_dir, ignore_errors=True)

    def _convert(self, input_file):
        """Process one document and return _CONVERTED, _SKIPPED or _FAILED."""
        input_file = Path(input_file)
        try:
            # Inside the try block so that a file outside input_dir is
            # reported as a failed conversion instead of raising ValueError.
            output_file = self.get_relative_output_path(input_file)

            # Skip if already converted
            if output_file.exists():
                logger.info(f"Skipping {input_file.name} - already exists at {output_file}")
                return self._SKIPPED

            logger.info(f"Converting {input_file} to {output_file}")

            # Special handling for PDF files - just copy them
            if input_file.suffix.lower() == '.pdf':
                shutil.copy2(input_file, output_file)
                logger.info(f"Copied PDF file {input_file.name} to {output_file}")
                return self._CONVERTED

            return self._run_libreoffice(input_file, output_file)

        except subprocess.TimeoutExpired as e:
            logger.error(f"Timeout converting {input_file.name}")
            self._record_error(input_file, str(e))
            return self._FAILED
        except Exception as e:
            # Boundary handler: one broken document must not abort the batch
            logger.error(f"Error converting {input_file.name}: {str(e)}")
            self._record_error(input_file, str(e))
            return self._FAILED

    def convert_document(self, input_file):
        """Convert a document to PDF using LibreOffice

        Args:
            input_file: Path of a document located below input_dir

        Returns:
            True if a PDF was produced (converted or copied), False if the
            conversion failed or the file was skipped because its PDF
            already exists. Failures are recorded in error_details.
        """
        return self._convert(input_file) == self._CONVERTED

    def convert_all(self):
        """Convert all documents in parallel

        Returns:
            Tuple (success_count, failure_count). Documents skipped because
            their PDF already exists are counted in neither number.
        """
        documents = self.find_documents()
        logger.info(f"Found {len(documents)} documents to convert")

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            outcomes = list(executor.map(self._convert, documents))

        success_count = outcomes.count(self._CONVERTED)
        failure_count = outcomes.count(self._FAILED)
        skipped_count = outcomes.count(self._SKIPPED)

        # Clean up any temporary directory
        temp_dir = self.output_dir / self.TEMP_DIR_NAME
        if temp_dir.exists():
            shutil.rmtree(temp_dir, ignore_errors=True)

        logger.info(
            f"Conversion complete: {success_count} succeeded, "
            f"{failure_count} failed, {skipped_count} skipped"
        )

        # Print detailed error information
        if self.error_details:
            logger.info(f"\n{'='*80}\nDETAILED ERROR REPORT\n{'='*80}")
            for idx, details in enumerate(self.error_details.values(), 1):
                logger.info(f"\nError #{idx}:")
                logger.info(f"File: {details['file']}")
                logger.info(f"Error: {details['error']}")
                if 'return_code' in details:
                    logger.info(f"Return code: {details['return_code']}")
                logger.info('-' * 50)

        return success_count, failure_count
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
bases |
- | - |
Parameter Details
input_dir: Path to the directory containing source documents to convert. Can be a string or Path object. The converter will recursively search this directory for all supported file types. The path is converted to an absolute path internally.
output_dir: Path to the directory where converted PDF files will be saved. Can be a string or Path object. The directory structure from input_dir is preserved in this location. The directory is created automatically if it doesn't exist. The path is converted to an absolute path internally.
max_workers: Maximum number of concurrent document conversions to run in parallel using ThreadPoolExecutor. Default is 1 (sequential processing). Higher values can improve performance but increase system resource usage. Should be tuned based on available CPU cores and memory.
Return Value
Instantiation returns a DocumentConverter object. The convert_all() method returns a tuple of (success_count, failure_count) indicating how many documents were successfully converted and how many failed. The convert_document() method returns a boolean (True for success, False for failure or skip). The find_documents() method returns a list of Path objects representing all supported documents found. The get_relative_output_path() method returns a Path object representing the output file path.
Class Interface
Methods
__init__(self, input_dir, output_dir, max_workers=1)
Purpose: Initialize the DocumentConverter with input/output directories and concurrency settings
Parameters:
input_dir: Directory path containing source documents to convert
output_dir: Directory path where converted PDFs will be saved
max_workers: Maximum number of concurrent conversions (default: 1)
Returns: None - initializes the instance
find_documents(self) -> list
Purpose: Recursively search the input directory for all supported document types
Returns: List of Path objects representing all found documents with supported extensions
get_relative_output_path(self, input_file) -> Path
Purpose: Calculate the output PDF path that preserves the original directory structure relative to input_dir
Parameters:
input_file: Path object of the input document
Returns: Path object representing the output PDF file location with preserved directory structure
convert_document(self, input_file) -> bool
Purpose: Convert a single document to PDF using LibreOffice, or copy if already PDF
Parameters:
input_file: Path object of the document to convert
Returns: True if conversion succeeded, False if failed or file was skipped (already exists)
convert_all(self) -> tuple
Purpose: Convert all documents found in input_dir to PDF using parallel processing
Returns: Tuple of (success_count, failure_count) indicating conversion results
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| SUPPORTED_EXTENSIONS | list | Class variable containing all supported file extensions including Word (.doc, .docx), Excel (.xls, .xlsx), PowerPoint (.ppt, .pptx), OpenDocument (.odt, .ods, .odp), Visio (.vsd, .vsdx), and PDF (.pdf) | class |
| input_dir | Path | Absolute path to the directory containing source documents to convert | instance |
| output_dir | Path | Absolute path to the directory where converted PDF files are saved | instance |
| max_workers | int | Maximum number of concurrent document conversions allowed | instance |
| error_details | dict | Dictionary storing detailed error information for failed conversions, keyed by file path with values containing file path, error message, and optionally return code | instance |
Dependencies
os, subprocess, logging, concurrent.futures, pathlib, shutil
Required Imports
import os
import subprocess
import logging
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
import shutil
Usage Example
# os, subprocess, shutil and ThreadPoolExecutor are not used directly below;
# they must be in scope because DocumentConverter itself relies on them.
import os
import logging
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
import subprocess
import shutil

# Setup logging - DocumentConverter logs through a module-level `logger`
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Basic usage - sequential conversion
converter = DocumentConverter(
    input_dir='/path/to/documents',
    output_dir='/path/to/pdfs',
    max_workers=1
)

# Find all documents first (optional)
documents = converter.find_documents()
print(f'Found {len(documents)} documents')

# Convert all documents
success, failures = converter.convert_all()
print(f'Converted {success} documents, {failures} failed')

# Check for errors
if converter.error_details:
    for file_id, error_info in converter.error_details.items():
        print(f"Error in {error_info['file']}: {error_info['error']}")

# Parallel conversion with 4 workers
parallel_converter = DocumentConverter(
    input_dir='/path/to/documents',
    output_dir='/path/to/pdfs',
    max_workers=4
)
success, failures = parallel_converter.convert_all()

# Convert a single document. The file has to be located below input_dir:
# the output location is derived from its path relative to input_dir, and a
# path outside of it cannot be mapped into output_dir.
single_file = Path('/path/to/documents/document.docx')
result = converter.convert_document(single_file)
Best Practices
- Always ensure LibreOffice is installed before instantiating the class
- Configure a logger before using the class as it relies on a module-level 'logger' variable
- Start with max_workers=1 for testing, then increase based on system resources and performance needs
- Monitor the error_details attribute after conversion to handle failed conversions
- The class creates a temporary directory (_temp) in output_dir during conversion, which is cleaned up automatically
- PDF files in the input directory are copied rather than converted, preserving the original file
- Already converted files (existing PDFs in output_dir) are skipped to avoid redundant work
- The 120-second timeout per document can be adjusted by modifying the timeout parameter in subprocess.run
- Directory structure from input_dir is preserved in output_dir, making it easy to maintain organization
- Use convert_all() for batch processing or convert_document() for individual file conversion
- The class is not thread-safe for multiple instances operating on the same output_dir simultaneously
- Ensure adequate disk space as temporary files are created during conversion
- Error details are stored in memory in error_details dictionary - for very large batches, consider processing in chunks
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- class DocumentConverter_v1 (88.6% similar)
- class PDFConverter (87.5% similar)
- class PDFConverter_v1 (79.6% similar)
- class ControlledDocumentConverter (73.6% similar)
- function test_libreoffice_conversion (71.1% similar)