class PDFConverter
A class that converts various document formats (Word, PowerPoint, Excel, images) to PDF format using LibreOffice and ReportLab libraries.
/tf/active/vicechatdev/msg_to_eml.py
262 - 410
moderate
Purpose
PDFConverter provides a unified interface for converting multiple document types to PDF. It handles Word documents (.doc, .docx), PowerPoint presentations (.ppt, .pptx), Excel spreadsheets (.xls, .xlsx), and images (.jpg, .png, etc.) by routing them to appropriate conversion methods. The class uses LibreOffice for office documents and ReportLab/PIL for image conversions. It manages temporary directories for intermediate files and handles file path resolution, output directory creation, and error handling during conversion.
Source Code
class PDFConverter:
    """Convert office documents and images to PDF.

    Files are routed by extension: images are rendered with ReportLab/Pillow,
    PDFs are copied as-is, and everything else (Word, PowerPoint, Excel and
    unknown types) is handed to the ``libreoffice`` command-line tool.

    Failures inside the private converters are reported through the
    module-level ``logger`` and signalled by a ``None`` return value.
    """

    # Supported file extensions by type
    WORD_EXTENSIONS = ['.doc', '.docx', '.docm', '.dot', '.dotx', '.dotm', '.rtf', '.odt']
    PPT_EXTENSIONS = ['.ppt', '.pptx', '.pptm', '.pot', '.potx', '.potm', '.pps', '.ppsx', '.odp']
    EXCEL_EXTENSIONS = ['.xls', '.xlsx', '.xlsm', '.xlt', '.xltx', '.xltm', '.xlsb', '.ods']
    IMAGE_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.tif']

    # Seconds a single LibreOffice run may take before it is abandoned.
    LIBREOFFICE_TIMEOUT = 60

    def __init__(self, temp_dir=None):
        """Initialize the PDF converter with an optional temp directory.

        Args:
            temp_dir: Directory used as scratch space for conversions. When
                falsy, a new directory is created with ``tempfile.mkdtemp()``.
                The directory is created if it does not exist; it is not
                removed by this class.
        """
        self.temp_dir = temp_dir if temp_dir else tempfile.mkdtemp()
        os.makedirs(self.temp_dir, exist_ok=True)

    def convert_to_pdf(self, input_path, output_path):
        """Convert a document to PDF format.

        Args:
            input_path: Path of the file to convert.
            output_path: Path the PDF should be written to. Its parent
                directory is created when missing.

        Returns:
            ``output_path`` on success, ``None`` if the conversion failed.

        Raises:
            FileNotFoundError: If ``input_path`` does not exist.
        """
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"Input file not found: {input_path}")

        # Create output directory if it doesn't exist
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        ext = os.path.splitext(input_path)[1].lower()

        # If already a PDF, just copy it
        if ext == '.pdf':
            # shutil.copy2 raises SameFileError when source and destination
            # are the same file; in that case there is nothing to do.
            same_file = (
                os.path.exists(output_path)
                and os.path.samefile(input_path, output_path)
            )
            if not same_file:
                shutil.copy2(input_path, output_path)
            return output_path

        if ext in self.IMAGE_EXTENSIONS:
            return self._convert_image_to_pdf(input_path, output_path)

        # Word, PowerPoint and Excel files all go through LibreOffice, which
        # also serves as the fallback for unknown extensions.
        return self._convert_with_libreoffice(input_path, output_path)

    def _convert_with_libreoffice(self, input_path, output_path):
        """Convert a document to PDF using LibreOffice.

        LibreOffice is run against a fresh scratch directory inside
        ``self.temp_dir`` and the result is then moved to ``output_path``.
        Converting in isolation means an unrelated ``<name>.pdf`` next to the
        destination is never overwritten, and a stale ``output_path`` left by
        an earlier run cannot be mistaken for a successful conversion.

        Args:
            input_path: Path of the document to convert.
            output_path: Path the PDF should be written to.

        Returns:
            ``output_path`` on success, ``None`` on failure or timeout.
        """
        work_dir = None
        try:
            # Absolute paths to avoid directory issues
            abs_input = os.path.abspath(input_path)
            scratch_root = os.path.abspath(self.temp_dir)
            os.makedirs(scratch_root, exist_ok=True)
            work_dir = tempfile.mkdtemp(prefix='lo_convert_', dir=scratch_root)

            cmd = [
                'libreoffice',
                '--headless',
                '--convert-to', 'pdf',
                '--outdir', work_dir,
                abs_input,
            ]
            process = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=self.LIBREOFFICE_TIMEOUT,
            )

            if process.returncode != 0:
                logger.error(f"LibreOffice conversion failed: {process.stderr}")
                return None

            # LibreOffice creates output with original filename but .pdf extension
            stem = os.path.splitext(os.path.basename(input_path))[0]
            produced = os.path.join(work_dir, stem + '.pdf')
            if not os.path.exists(produced):
                # The scratch directory started empty, so a lone PDF in it can
                # only be the result of this run even if it is named differently.
                pdf_names = [
                    name for name in os.listdir(work_dir)
                    if name.lower().endswith('.pdf')
                ]
                if len(pdf_names) != 1:
                    logger.error(f"Expected output not found: {output_path}")
                    return None
                produced = os.path.join(work_dir, pdf_names[0])

            target_dir = os.path.dirname(output_path)
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)
            shutil.move(produced, output_path)
            return output_path

        except subprocess.TimeoutExpired:
            logger.error(f"Timeout while converting: {input_path}")
            return None
        except Exception as e:
            logger.error(f"Error in LibreOffice conversion: {str(e)}")
            return None
        finally:
            # Never leave per-conversion scratch directories behind.
            if work_dir is not None:
                shutil.rmtree(work_dir, ignore_errors=True)

    def _convert_image_to_pdf(self, input_path, output_path):
        """Convert an image to a single-page, letter-size PDF.

        The page shows the file name as a heading followed by the image,
        scaled (preserving aspect ratio) to fill the space left on the page.

        Args:
            input_path: Path of the image file.
            output_path: Path the PDF should be written to.

        Returns:
            ``output_path`` on success, ``None`` if anything went wrong.
        """
        try:
            from xml.sax.saxutils import escape

            from reportlab.lib.pagesizes import letter
            from reportlab.platypus import SimpleDocTemplate, Image as RLImage, Paragraph
            from reportlab.lib.styles import getSampleStyleSheet
            from PIL import Image

            # Only the dimensions are needed; close the file handle right away.
            with Image.open(input_path) as img:
                img_width, img_height = img.size

            margin = 72
            doc = SimpleDocTemplate(
                output_path,
                pagesize=letter,
                rightMargin=margin,
                leftMargin=margin,
                topMargin=margin,
                bottomMargin=margin,
            )

            # Paragraph text is parsed as markup, so characters such as '&'
            # or '<' in a file name must be escaped first.
            styles = getSampleStyleSheet()
            title = Paragraph(escape(os.path.basename(input_path)), styles['Heading2'])

            # The drawable area is the page minus margins minus the frame
            # padding (ReportLab frames default to 6pt padding per side).
            frame_padding = 6
            page_width, page_height = letter
            avail_width = page_width - 2 * margin - 2 * frame_padding
            avail_height = page_height - 2 * margin - 2 * frame_padding

            # Reserve room for the heading so heading and image share the
            # page; 1pt of slack guards against rounding at the frame edge.
            _, title_height = title.wrap(avail_width, avail_height)
            title_height += title.getSpaceBefore() + title.getSpaceAfter()
            avail_height -= title_height + 1

            # Scale image to fit available space
            ratio = min(avail_width / img_width, avail_height / img_height)

            elements = [
                title,
                RLImage(input_path, width=img_width * ratio, height=img_height * ratio),
            ]
            doc.build(elements)
            return output_path
        except Exception as e:
            logger.error(f"Error converting image to PDF: {str(e)}")
            return None
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
bases |
- | - |
Parameter Details
temp_dir: Optional path to a temporary directory for storing intermediate conversion files. If not provided, a new temporary directory is created using tempfile.mkdtemp(). The directory is created if it doesn't exist. This is useful for controlling where temporary files are stored during conversion operations.
Return Value
Instantiation returns a PDFConverter object. The convert_to_pdf method returns the output_path string if conversion succeeds, or None if conversion fails. The private methods _convert_with_libreoffice and _convert_image_to_pdf also return the output_path on success or None on failure.
Class Interface
Methods
__init__(self, temp_dir=None) -> None
Purpose: Initialize the PDF converter with an optional temporary directory for intermediate files
Parameters:
temp_dir: Optional string path to temporary directory. If None, creates a new temp directory using tempfile.mkdtemp()
Returns: None - initializes the instance
convert_to_pdf(self, input_path: str, output_path: str) -> str | None
Purpose: Main public method to convert any supported document format to PDF
Parameters:
input_path: String path to the input file to convert. Must exist or FileNotFoundError is raised
output_path: String path where the PDF output should be saved. Directory is created if it doesn't exist
Returns: String path to the output PDF file on success, or None if conversion fails
_convert_with_libreoffice(self, input_path: str, output_path: str) -> str | None
Purpose: Private method to convert office documents (Word, Excel, PowerPoint) to PDF using LibreOffice command-line tool
Parameters:
input_path: String path to the input office document
output_path: String path where the PDF output should be saved
Returns: String path to the output PDF file on success, or None if conversion fails or times out
_convert_image_to_pdf(self, input_path: str, output_path: str) -> str | None
Purpose: Private method to convert image files to PDF using ReportLab and PIL, with automatic scaling to fit letter-size pages
Parameters:
input_path: String path to the input image file
output_path: String path where the PDF output should be saved
Returns: String path to the output PDF file on success, or None if conversion fails
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| WORD_EXTENSIONS | list[str] | Class variable containing supported Word document file extensions including .doc, .docx, .docm, .dot, .dotx, .dotm, .rtf, .odt | class |
| PPT_EXTENSIONS | list[str] | Class variable containing supported PowerPoint presentation file extensions including .ppt, .pptx, .pptm, .pot, .potx, .potm, .pps, .ppsx, .odp | class |
| EXCEL_EXTENSIONS | list[str] | Class variable containing supported Excel spreadsheet file extensions including .xls, .xlsx, .xlsm, .xlt, .xltx, .xltm, .xlsb, .ods | class |
| IMAGE_EXTENSIONS | list[str] | Class variable containing supported image file extensions including .jpg, .jpeg, .png, .gif, .bmp, .tiff, .tif | class |
| temp_dir | str | Instance variable storing the path to the temporary directory used for intermediate conversion files. Set during initialization and created if it doesn't exist | instance |
Dependencies
os, tempfile, shutil, subprocess, reportlab, PIL, logging
Required Imports
import os
import tempfile
import shutil
import subprocess
import logging
Conditional/Optional Imports
These imports are only needed under specific conditions:
from reportlab.lib.pagesizes import letter
Condition: only when converting images to PDF using _convert_image_to_pdf method
Required (conditional)
from reportlab.platypus import SimpleDocTemplate, Image as RLImage, Paragraph
Condition: only when converting images to PDF using _convert_image_to_pdf method
Required (conditional)
from reportlab.lib.styles import getSampleStyleSheet
Condition: only when converting images to PDF using _convert_image_to_pdf method
Required (conditional)
from reportlab.lib.units import inch
Condition: only when converting images to PDF using _convert_image_to_pdf method
Required (conditional)
from PIL import Image
Condition: only when converting images to PDF using _convert_image_to_pdf method
Required (conditional)
Usage Example
import os
import tempfile
import logging

# Setup logger (required by the class): PDFConverter reports conversion
# errors through a module-level ``logger``.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Create converter with default temp directory
converter = PDFConverter()

# Or specify custom temp directory
converter = PDFConverter(temp_dir='/tmp/my_conversions')

# Convert a Word document to PDF
input_file = 'document.docx'
output_file = 'output.pdf'
result = converter.convert_to_pdf(input_file, output_file)

# convert_to_pdf returns the output path on success and None on failure
if result:
    print(f'Successfully converted to {result}')
else:
    print('Conversion failed')

# Convert an image to PDF
image_result = converter.convert_to_pdf('photo.jpg', 'photo.pdf')

# Convert Excel spreadsheet
excel_result = converter.convert_to_pdf('data.xlsx', 'data.pdf')

# If input is already PDF, it will be copied
pdf_result = converter.convert_to_pdf('existing.pdf', 'copy.pdf')
Best Practices
- Always check the return value of convert_to_pdf() - it returns None on failure
- Ensure LibreOffice is installed and accessible in system PATH before using for office document conversions
- The temp_dir is created during initialization but not automatically cleaned up - manage cleanup manually if needed
- Input files must exist before calling convert_to_pdf() or FileNotFoundError will be raised
- Output directories are created automatically if they don't exist
- The class has a 60-second timeout for LibreOffice conversions to prevent hanging
- For image conversions, the original filename is added as a title in the PDF
- If the input file is already a PDF, it is simply copied to the output path
- Private methods (_convert_with_libreoffice, _convert_image_to_pdf) should not be called directly - use convert_to_pdf() instead
- The class uses class-level constants for supported file extensions - these can be referenced but should not be modified
- Error logging requires a module-level 'logger' object to be configured
- LibreOffice conversion creates intermediate files with predictable names that may need to be moved to the final output path
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- class DocumentConverter_v1 (89.8% similar)
- class DocumentConverter (87.5% similar)
- class PDFConverter_v1 (86.7% similar)
- class ControlledDocumentConverter (70.2% similar)
- class PDFAConverter (66.2% similar)