class DocumentConverter_v1
A class that converts various document formats (Word, Excel, PowerPoint, images) to PDF format using LibreOffice, unoconv, or PIL.
/tf/active/vicechatdev/document_auditor/src/document_converter.py
8 - 136
moderate
Purpose
DocumentConverter provides a unified interface for converting multiple document and image formats to PDF. It handles Microsoft Office documents (.docx, .doc, .ppt, .pptx, .xlsx, .xls) using LibreOffice, with unoconv as a fallback, and image formats (.png, .jpg, .jpeg, .tiff, .bmp, .gif) using PIL. Inputs that are already .pdf are copied to the output path unchanged. The class detects the file type from the file extension and applies the appropriate conversion method, with built-in error handling and logging.
Source Code
class DocumentConverter:
    """Convert office documents and images to PDF.

    Office formats are rendered by shelling out to LibreOffice, with unoconv
    as a fallback. Images are re-saved as PDF through the module-level
    ``Image`` name (``from PIL import Image``). Inputs that are already PDF
    are copied to the output path unchanged.
    """

    # Extensions handed to the LibreOffice/unoconv pipeline.
    _OFFICE_EXTENSIONS = frozenset(
        {'.docx', '.doc', '.ppt', '.pptx', '.xlsx', '.xls'}
    )
    # Extensions handed to the image pipeline.
    _IMAGE_EXTENSIONS = frozenset(
        {'.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif'}
    )

    def __init__(self):
        """Attach a logger named after the defining module."""
        self.logger = logging.getLogger(__name__)

    def convert_to_pdf(self, input_path: str, output_path: str) -> str:
        """
        Convert document to PDF if it's not already in PDF format

        The converter is chosen from the lower-cased extension of
        ``input_path``. The directory part of ``output_path`` is created
        when it does not exist yet, whichever converter is used.

        Args:
            input_path (str): Path to the input document
            output_path (str): Path where PDF will be saved

        Returns:
            str: Path to the PDF document (``output_path`` unchanged)

        Raises:
            FileNotFoundError: ``input_path`` does not exist, or LibreOffice
                reported success without producing the expected PDF.
            ValueError: The file extension is not supported.
            RuntimeError: Both LibreOffice and unoconv failed.
        """
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"Input file not found: {input_path}")

        ext = os.path.splitext(input_path)[1].lower()

        if ext == '.pdf':
            if os.path.exists(output_path) and os.path.samefile(input_path, output_path):
                # Source and destination are the same file: shutil.copy
                # would raise SameFileError, and there is nothing to do.
                self.logger.info(f"File is already PDF at {output_path}, nothing to copy")
            else:
                self._ensure_output_dir(output_path)
                self.logger.info(f"File is already PDF, copying to {output_path}")
                shutil.copy(input_path, output_path)
        elif ext in self._OFFICE_EXTENSIONS:
            # Word, Excel and PowerPoint all go through the same pipeline.
            self.logger.info(f"Converting office document to PDF: {input_path}")
            self._convert_with_libreoffice(input_path, output_path)
        elif ext in self._IMAGE_EXTENSIONS:
            self._ensure_output_dir(output_path)
            self.logger.info(f"Converting image to PDF: {input_path}")
            self._convert_image_to_pdf(input_path, output_path)
        else:
            raise ValueError(f"Unsupported file format: {ext}")

        return output_path

    @staticmethod
    def _ensure_output_dir(pdf_path):
        """Create the directory that will contain ``pdf_path`` if needed."""
        output_dir = os.path.dirname(pdf_path)
        # A bare filename has an empty dirname; os.makedirs('') would raise.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    def _convert_with_libreoffice(self, word_path, pdf_path):
        """Convert an office document to PDF using headless LibreOffice.

        LibreOffice writes ``<input stem>.pdf`` into a temporary directory;
        that file is then moved to ``pdf_path``. If the ``libreoffice``
        command exits with an error or cannot be started at all, the
        conversion is retried with unoconv.

        Raises:
            FileNotFoundError: LibreOffice exited successfully but the
                expected PDF is not in the temporary directory.
            RuntimeError: Propagated from the unoconv fallback.
        """
        self._ensure_output_dir(pdf_path)

        # LibreOffice names its output after the input file's stem.
        base_name_without_ext = os.path.splitext(os.path.basename(word_path))[0]

        with tempfile.TemporaryDirectory() as temp_dir:
            cmd = [
                'libreoffice',
                '--headless',
                '--convert-to', 'pdf',
                '--outdir', temp_dir,
                word_path
            ]
            self.logger.info(f"Running command: {' '.join(cmd)}")

            # Only the subprocess call sits inside the try, so that the
            # FileNotFoundError raised below for a missing output file is
            # not mistaken for a missing executable.
            try:
                result = subprocess.run(
                    cmd,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    text=True,
                    check=True
                )
            except subprocess.CalledProcessError as e:
                self.logger.error(f"LibreOffice conversion failed: {e}")
                self.logger.error(f"STDOUT: {e.stdout}")
                self.logger.error(f"STDERR: {e.stderr}")
                converted = False
            except FileNotFoundError as e:
                # The libreoffice executable itself could not be launched.
                self.logger.error(f"LibreOffice executable not found: {e}")
                converted = False
            else:
                self.logger.debug(f"LibreOffice conversion output: {result.stdout}")
                expected_output = os.path.join(temp_dir, f"{base_name_without_ext}.pdf")
                if not os.path.exists(expected_output):
                    # List files in temp dir for debugging
                    files_in_temp = os.listdir(temp_dir)
                    self.logger.error(f"Expected output {expected_output} not found. Files in temp dir: {files_in_temp}")
                    raise FileNotFoundError(f"Converted PDF not found: {expected_output}")
                shutil.move(expected_output, pdf_path)
                converted = True

        if not converted:
            # Try alternate method if LibreOffice fails
            self._convert_with_unoconv(word_path, pdf_path)

    def _convert_with_unoconv(self, word_path, pdf_path):
        """Alternate conversion method using unoconv.

        Raises:
            RuntimeError: unoconv exited with an error, is not installed, or
                finished without creating ``pdf_path``.
        """
        self.logger.info("Trying unoconv for conversion")
        cmd = ['unoconv', '-f', 'pdf', '-o', pdf_path, word_path]

        try:
            result = subprocess.run(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                check=True
            )
        except subprocess.CalledProcessError as e:
            self.logger.error(f"unoconv conversion failed: {e}")
            self.logger.error(f"STDOUT: {e.stdout}")
            self.logger.error(f"STDERR: {e.stderr}")
            raise RuntimeError("All document conversion methods failed") from e
        except FileNotFoundError as e:
            self.logger.error("unoconv command not found. Please install with: sudo apt-get install unoconv")
            raise RuntimeError("Document conversion failed and unoconv not installed") from e

        self.logger.debug(f"unoconv conversion output: {result.stdout}")

        # Checked outside the try so a missing output file is reported as
        # such instead of being logged as "unoconv command not found".
        if not os.path.exists(pdf_path):
            self.logger.error(f"unoconv failed to create PDF: {pdf_path}")
            raise RuntimeError(f"unoconv failed to create PDF: {pdf_path}")

    def _convert_image_to_pdf(self, image_path, pdf_path):
        """Convert image file to PDF.

        The image is converted to RGB mode and written with the PDF format
        requested explicitly, so the result does not depend on the extension
        of ``pdf_path``. Any exception is logged and re-raised.
        """
        try:
            # The context manager closes the underlying file handle.
            with Image.open(image_path) as img:
                img.convert('RGB').save(pdf_path, format='PDF')
        except Exception as e:
            self.logger.error(f"Error converting image to PDF: {e}")
            raise
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| bases | - | - | |
Parameter Details
__init__: No parameters required. The constructor initializes a logger instance for tracking conversion operations.
Return Value
Instantiation returns a DocumentConverter object. The main method convert_to_pdf() returns a string containing the path to the converted PDF file. Private methods (_convert_with_libreoffice, _convert_with_unoconv, _convert_image_to_pdf) do not return values but modify files on disk.
Class Interface
Methods
__init__(self)
Purpose: Initialize the DocumentConverter with a logger instance
Returns: None
convert_to_pdf(self, input_path: str, output_path: str) -> str
Purpose: Main public method to convert various document formats to PDF, automatically detecting file type and applying appropriate conversion method
Parameters:
input_path: String path to the input document file. Supported formats: .pdf, .docx, .doc, .ppt, .pptx, .xlsx, .xls, .png, .jpg, .jpeg, .tiff, .bmp, .gif
output_path: String path where the converted PDF file will be saved. Directory must exist or be creatable
Returns: String containing the path to the converted PDF file (same as output_path parameter)
_convert_with_libreoffice(self, word_path: str, pdf_path: str) -> None
Purpose: Private method to convert Office documents (Word, Excel, PowerPoint) to PDF using LibreOffice in headless mode
Parameters:
word_path: String path to the input Office document file
pdf_path: String path where the converted PDF will be saved
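Returns: None. If the LibreOffice command exits with an error (subprocess.CalledProcessError), the error is logged and the conversion falls back to unoconv. Raises FileNotFoundError if LibreOffice reports success but the expected PDF is not found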
_convert_with_unoconv(self, word_path: str, pdf_path: str) -> None
Purpose: Private fallback method to convert Office documents to PDF using unoconv when LibreOffice conversion fails
Parameters:
word_path: String path to the input Office document file
pdf_path: String path where the converted PDF will be saved
Returns: None. Raises RuntimeError if unoconv conversion fails or unoconv is not installed
_convert_image_to_pdf(self, image_path: str, pdf_path: str) -> None
Purpose: Private method to convert image files to PDF format using PIL/Pillow library
Parameters:
image_path: String path to the input image file. Supported formats: .png, .jpg, .jpeg, .tiff, .bmp, .gif
pdf_path: String path where the converted PDF will be saved
Returns: None. Raises exceptions if image cannot be opened or saved
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| logger | logging.Logger | Logger instance for tracking conversion operations, errors, and debug information. Initialized with the module's __name__ | instance |
Dependencies
os, logging, shutil, subprocess, tempfile, PIL
Required Imports
import os
import logging
import shutil
import subprocess
import tempfile
from PIL import Image
Usage Example
import logging
from document_converter import DocumentConverter
# Configure logging
logging.basicConfig(level=logging.INFO)
# Create converter instance
converter = DocumentConverter()
# Convert a Word document to PDF
try:
pdf_path = converter.convert_to_pdf(
input_path='/path/to/document.docx',
output_path='/path/to/output.pdf'
)
print(f'PDF created at: {pdf_path}')
except FileNotFoundError as e:
print(f'File not found: {e}')
except ValueError as e:
print(f'Unsupported format: {e}')
except RuntimeError as e:
print(f'Conversion failed: {e}')
# Convert an image to PDF
pdf_path = converter.convert_to_pdf(
input_path='/path/to/image.png',
output_path='/path/to/image.pdf'
)
# Copy existing PDF (no conversion needed)
pdf_path = converter.convert_to_pdf(
input_path='/path/to/existing.pdf',
output_path='/path/to/copy.pdf'
)
Best Practices
- Always wrap convert_to_pdf() calls in try-except blocks to handle FileNotFoundError, ValueError, and RuntimeError exceptions
- Ensure LibreOffice is installed before attempting to convert Office documents
- Check that output directory exists or has write permissions before conversion
- The class is stateless except for the logger, so a single instance can be reused for multiple conversions
- For batch conversions, create one DocumentConverter instance and reuse it
- Monitor disk space when converting large documents as temporary files are created during conversion
- The _convert_with_libreoffice method uses temporary directories that are automatically cleaned up
- If LibreOffice conversion fails, the class automatically attempts unoconv as fallback
- Images are converted to RGB mode before being saved as PDF to ensure compatibility
- Log output at INFO level to track conversion progress and DEBUG level for detailed diagnostics
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
-
class PDFConverter 89.8% similar
-
class DocumentConverter 88.6% similar
-
class PDFConverter_v1 87.7% similar
-
class ControlledDocumentConverter 68.3% similar
-
class DocumentExtractor 64.8% similar