class PDFConverter_v1
A comprehensive document-to-PDF converter class that handles multiple file formats (Word, Excel, PowerPoint, images) with multiple conversion methods and automatic fallbacks for reliability.
/tf/active/vicechatdev/CDocs/utils/pdf_utils.py
55 - 561
complex
Purpose
PDFConverter provides a unified interface for converting various document formats to PDF. It supports Word documents (.doc, .docx, .rtf, .odt), PowerPoint presentations (.ppt, .pptx, .odp), Excel spreadsheets (.xls, .xlsx, .ods), and images (.jpg, .png, .gif, .bmp, .tiff). The class implements a fallback strategy, attempting multiple conversion methods (LibreOffice, unoconv, Python libraries) to ensure successful conversion even when some tools are unavailable. It manages temporary files and handles edge cases like already-PDF files.
Source Code
class PDFConverter:
    """
    Converts various document formats to PDF

    This class provides methods to convert different document formats
    (Word, Excel, PowerPoint, images) to PDF format using various
    conversion methods, with fallbacks for reliability.

    Office documents are first attempted with the ``libreoffice`` command
    line, then with ``unoconv``, and finally with a Python-library fallback
    where one exists (docx2pdf for Word, pandas + ReportLab for Excel).
    Images are converted with PIL, falling back to PyMuPDF.
    """

    # Supported file extensions by type
    WORD_EXTENSIONS = ['.doc', '.docx', '.docm', '.dot', '.dotx', '.dotm', '.rtf', '.odt']
    PPT_EXTENSIONS = ['.ppt', '.pptx', '.pptm', '.pot', '.potx', '.potm', '.pps', '.ppsx', '.odp']
    EXCEL_EXTENSIONS = ['.xls', '.xlsx', '.xlsm', '.xlt', '.xltx', '.xltm', '.xlsb', '.ods']
    IMAGE_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.tif']

    # Timeout in seconds for each external converter process (3 minutes)
    _SUBPROCESS_TIMEOUT = 180

    def __init__(self, temp_dir: Optional[str] = None):
        """
        Initialize the PDF converter

        Parameters
        ----------
        temp_dir : str, optional
            Directory for temporary files. If not provided, a new directory
            is created with ``tempfile.mkdtemp()``. The directory is not
            removed by this class.
        """
        self.temp_dir = temp_dir if temp_dir else tempfile.mkdtemp()
        os.makedirs(self.temp_dir, exist_ok=True)

    def convert_to_pdf(self, input_path: str, output_path: str) -> str:
        """
        Convert a document to PDF format

        Parameters
        ----------
        input_path : str
            Path to the input document
        output_path : str
            Path where the PDF will be saved

        Returns
        -------
        str
            Path to the generated PDF file

        Raises
        ------
        FileNotFoundError
            If the input file doesn't exist
        ValueError
            If the file format is not supported
        RuntimeError
            If the conversion fails
        """
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"Input file not found: {input_path}")

        # Create output directory if it doesn't exist
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # The (case-insensitive) extension selects the conversion method
        ext = Path(input_path).suffix.lower()

        # Check if already a PDF
        if ext == '.pdf':
            logger.info(f"File is already a PDF, copying: {input_path}")
            try:
                shutil.copy(input_path, output_path)
            except shutil.SameFileError:
                # Input and output are the same file: nothing to copy, and
                # the requested PDF already exists at output_path.
                logger.info(f"Input and output are the same PDF, nothing to do: {output_path}")
            return output_path

        # Convert based on file type
        if ext in self.WORD_EXTENSIONS:
            return self._convert_word_to_pdf(input_path, output_path)
        if ext in self.PPT_EXTENSIONS:
            return self._convert_presentation_to_pdf(input_path, output_path)
        if ext in self.EXCEL_EXTENSIONS:
            return self._convert_spreadsheet_to_pdf(input_path, output_path)
        if ext in self.IMAGE_EXTENSIONS:
            return self._convert_image_to_pdf(input_path, output_path)
        raise ValueError(f"Unsupported file format: {ext}")

    def _convert_with_office_tools(self, input_path: str, output_path: str) -> bool:
        """
        Try the external office converters in order of preference

        LibreOffice is attempted first; unoconv is only run if LibreOffice
        did not produce a PDF.

        Parameters
        ----------
        input_path : str
            Path to the input document
        output_path : str
            Path where the PDF will be saved

        Returns
        -------
        bool
            True if either tool produced the PDF, False otherwise
        """
        if self._convert_with_libreoffice(input_path, output_path):
            return True
        return self._convert_with_unoconv(input_path, output_path)

    def _convert_word_to_pdf(self, input_path: str, output_path: str) -> str:
        """
        Convert a Word document to PDF

        Parameters
        ----------
        input_path : str
            Path to the Word document
        output_path : str
            Path where the PDF will be saved

        Returns
        -------
        str
            Path to the generated PDF

        Raises
        ------
        RuntimeError
            If all conversion methods fail
        """
        logger.info(f"Converting Word document to PDF: {input_path}")

        # Try LibreOffice first, then unoconv
        if self._convert_with_office_tools(input_path, output_path):
            return output_path

        # Try Python libraries as last resort
        try:
            # Import here to avoid dependency if not used
            from docx2pdf import convert

            convert(input_path, output_path)
            if os.path.exists(output_path):
                logger.info(f"Converted using docx2pdf: {output_path}")
                return output_path
        except ImportError:
            logger.warning("docx2pdf not installed, skipping this conversion method")
        except Exception as e:
            logger.error(f"Error using docx2pdf: {str(e)}")

        raise RuntimeError(f"Failed to convert Word document to PDF: {input_path}")

    def _convert_presentation_to_pdf(self, input_path: str, output_path: str) -> str:
        """
        Convert a PowerPoint presentation to PDF

        Parameters
        ----------
        input_path : str
            Path to the PowerPoint presentation
        output_path : str
            Path where the PDF will be saved

        Returns
        -------
        str
            Path to the generated PDF

        Raises
        ------
        RuntimeError
            If all conversion methods fail
        """
        logger.info(f"Converting PowerPoint to PDF: {input_path}")

        # Try LibreOffice first, then unoconv. There is no Python-library
        # fallback for presentations, so these two are the only options.
        if self._convert_with_office_tools(input_path, output_path):
            return output_path

        raise RuntimeError(f"Failed to convert PowerPoint presentation to PDF: {input_path}")

    def _convert_spreadsheet_to_pdf(self, input_path: str, output_path: str) -> str:
        """
        Convert an Excel spreadsheet to PDF

        Parameters
        ----------
        input_path : str
            Path to the Excel spreadsheet
        output_path : str
            Path where the PDF will be saved

        Returns
        -------
        str
            Path to the generated PDF

        Raises
        ------
        RuntimeError
            If all conversion methods fail
        """
        logger.info(f"Converting Excel to PDF: {input_path}")

        # Try LibreOffice first, then unoconv
        if self._convert_with_office_tools(input_path, output_path):
            return output_path

        # Render the sheets as plain tables as last resort
        try:
            return self._convert_excel_via_csv(input_path, output_path)
        except Exception as e:
            logger.error(f"Error in Excel CSV conversion: {str(e)}")

        raise RuntimeError(f"Failed to convert Excel spreadsheet to PDF: {input_path}")

    def _convert_image_to_pdf(self, input_path: str, output_path: str) -> str:
        """
        Convert an image to PDF

        Parameters
        ----------
        input_path : str
            Path to the image file
        output_path : str
            Path where the PDF will be saved

        Returns
        -------
        str
            Path to the generated PDF

        Raises
        ------
        RuntimeError
            If PIL is unavailable or if both PIL and PyMuPDF fail
        """
        logger.info(f"Converting image to PDF: {input_path}")

        if PILImage is None:
            logger.error("PIL library not available, cannot convert image to PDF")
            raise RuntimeError("PIL library required for image conversion")

        try:
            # The context manager releases the underlying file handle even
            # if the conversion or the save fails.
            with PILImage.open(input_path) as img:
                # Convert to RGB mode if not already (required for PDF)
                rgb_img = img if img.mode == 'RGB' else img.convert('RGB')
                rgb_img.save(output_path, 'PDF', resolution=100.0)

            if os.path.exists(output_path):
                logger.info(f"Converted image to PDF: {output_path}")
                return output_path
        except Exception as e:
            logger.error(f"Error converting image to PDF: {str(e)}")

        # Try with PyMuPDF as fallback
        try:
            image_doc = fitz.open(input_path)
            try:
                pdf_bytes = image_doc.convert_to_pdf()
            finally:
                image_doc.close()

            pdf_doc = fitz.open("pdf", pdf_bytes)
            try:
                pdf_doc.save(output_path)
            finally:
                pdf_doc.close()

            if os.path.exists(output_path):
                logger.info(f"Converted image to PDF with PyMuPDF: {output_path}")
                return output_path
        except Exception as e:
            logger.error(f"Error converting image to PDF with PyMuPDF: {str(e)}")

        raise RuntimeError(f"Failed to convert image to PDF: {input_path}")

    def _convert_with_libreoffice(self, input_path: str, output_path: str) -> bool:
        """
        Convert a document to PDF using LibreOffice

        LibreOffice writes ``<input stem>.pdf`` into a scratch directory;
        the result is then moved to ``output_path``.

        Parameters
        ----------
        input_path : str
            Path to the input document
        output_path : str
            Path where the PDF will be saved

        Returns
        -------
        bool
            True if conversion succeeded, False otherwise
        """
        try:
            # Get base filename without extension
            base_name = os.path.basename(input_path)
            base_name_without_ext = os.path.splitext(base_name)[0]

            # Create a temporary directory for conversion
            with tempfile.TemporaryDirectory() as work_dir:
                # Run LibreOffice to convert the file
                cmd = [
                    'libreoffice',
                    '--headless',
                    '--norestore',
                    '--nofirststartwizard',
                    '--convert-to', 'pdf',
                    '--outdir', work_dir,
                    input_path,
                ]
                logger.info(f"Running command: {' '.join(cmd)}")

                try:
                    result = subprocess.run(
                        cmd,
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE,
                        text=True,
                        timeout=self._SUBPROCESS_TIMEOUT,
                    )
                except FileNotFoundError:
                    # Only the process launch is covered here, so this
                    # unambiguously means the executable is missing.
                    logger.error("libreoffice command not found.")
                    return False

                # Check if the conversion was successful
                if result.returncode != 0:
                    logger.error(f"LibreOffice conversion failed: {result.stderr}")
                    return False

                # Check if the converted file exists
                expected_output = os.path.join(work_dir, f"{base_name_without_ext}.pdf")
                if os.path.exists(expected_output):
                    # Move the file to the desired location
                    shutil.move(expected_output, output_path)
                    logger.info(f"Converted to PDF with LibreOffice: {output_path}")
                    return True

                # Try to find any PDF file in the output directory; sorted so
                # the pick does not depend on directory listing order.
                pdf_files = sorted(
                    name for name in os.listdir(work_dir)
                    if name.lower().endswith('.pdf')
                )
                if pdf_files:
                    # Move the first PDF file found
                    shutil.move(os.path.join(work_dir, pdf_files[0]), output_path)
                    logger.info(f"Found alternative PDF output: {pdf_files[0]}")
                    return True

                logger.error(f"Expected output {expected_output} not found")
                return False
        except subprocess.TimeoutExpired:
            logger.error("LibreOffice conversion timed out")
            return False
        except Exception as e:
            logger.error(f"Error in LibreOffice conversion: {str(e)}")
            return False

    def _convert_with_unoconv(self, input_path: str, output_path: str) -> bool:
        """
        Convert a document to PDF using unoconv

        Parameters
        ----------
        input_path : str
            Path to the input document
        output_path : str
            Path where the PDF will be saved

        Returns
        -------
        bool
            True if conversion succeeded, False otherwise
        """
        try:
            cmd = [
                'unoconv',
                '-f', 'pdf',
                '-o', output_path,
                input_path,
            ]
            logger.info(f"Running command: {' '.join(cmd)}")

            result = subprocess.run(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                timeout=self._SUBPROCESS_TIMEOUT,
            )

            # Check if the conversion was successful
            if result.returncode != 0:
                logger.error(f"unoconv conversion failed: {result.stderr}")
                return False

            if os.path.exists(output_path):
                logger.info(f"Converted to PDF with unoconv: {output_path}")
                return True

            logger.error(f"unoconv did not create the output file: {output_path}")
            return False
        except subprocess.TimeoutExpired:
            logger.error("unoconv conversion timed out")
            return False
        except FileNotFoundError:
            logger.error("unoconv command not found.")
            return False
        except Exception as e:
            logger.error(f"Error in unoconv conversion: {str(e)}")
            return False

    def _convert_excel_via_csv(self, input_path: str, output_path: str) -> str:
        """
        Convert Excel to PDF by rendering each sheet as a plain table

        Despite the name (kept for compatibility), no CSV files are written:
        each sheet is read with pandas and laid out as a ReportLab table.
        Workbook formatting is not carried over.

        Parameters
        ----------
        input_path : str
            Path to the Excel file
        output_path : str
            Path where the PDF will be saved

        Returns
        -------
        str
            Path to the generated PDF

        Raises
        ------
        RuntimeError
            If pandas is not installed or the conversion fails
        """
        # Import pandas for Excel handling
        try:
            import pandas as pd
        except ImportError:
            logger.error("pandas not installed, cannot convert Excel via CSV")
            raise RuntimeError("pandas library required for Excel CSV conversion")

        # Paragraph text is parsed as XML-like markup, so sheet names must
        # be escaped before being embedded in it.
        from xml.sax.saxutils import escape

        try:
            # Create a PDF document
            pdf_buffer = io.BytesIO()
            doc = SimpleDocTemplate(
                pdf_buffer,
                pagesize=A4,
                leftMargin=cm,
                rightMargin=cm,
                topMargin=cm,
                bottomMargin=cm,
            )

            # Get styles
            styles = getSampleStyleSheet()
            heading_style = styles['Heading2']
            normal_style = styles['Normal']

            # Create story for the PDF
            story = []

            # The context manager closes the workbook handle when done
            with pd.ExcelFile(input_path) as excel_file:
                sheet_names = excel_file.sheet_names
                last_index = len(sheet_names) - 1

                # Process each sheet
                for index, sheet_name in enumerate(sheet_names):
                    # Add sheet title
                    story.append(Paragraph(f"Sheet: {escape(str(sheet_name))}", heading_style))
                    story.append(Spacer(1, 0.2 * inch))

                    # Read sheet data
                    df = pd.read_excel(excel_file, sheet_name=sheet_name)

                    if not df.empty:
                        # Header row followed by the data rows
                        data = [df.columns.tolist()]
                        data.extend(df.values.tolist())

                        # Create table
                        table = Table(data)

                        # Add style to table
                        table_style = TableStyle([
                            ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
                            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
                            ('ALIGN', (0, 0), (-1, 0), 'CENTER'),
                            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
                            ('FONTSIZE', (0, 0), (-1, 0), 10),
                            ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
                            ('BACKGROUND', (0, 1), (-1, -1), colors.white),
                            ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
                            ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
                        ])

                        # Add zebra striping for better readability
                        for row in range(1, len(data), 2):
                            table_style.add('BACKGROUND', (0, row), (-1, row), colors.lightgrey)

                        table.setStyle(table_style)
                        story.append(table)
                    else:
                        story.append(Paragraph("(Empty sheet)", normal_style))

                    # Add page break after each sheet except the last one
                    if index != last_index:
                        story.append(PageBreak())

            # Build the PDF
            doc.build(story)

            # Save the PDF
            with open(output_path, 'wb') as f:
                f.write(pdf_buffer.getvalue())

            logger.info(f"Converted Excel to PDF via pandas: {output_path}")
            return output_path
        except Exception as e:
            logger.error(f"Error converting Excel via CSV: {str(e)}")
            raise RuntimeError(f"Failed to convert Excel via CSV: {str(e)}") from e
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
bases |
- | - |
Parameter Details
temp_dir: Optional directory path for storing temporary files during conversion. If not provided, a new temporary directory is created with tempfile.mkdtemp(). A provided directory is created if it doesn't exist. This is useful for controlling where intermediate files are stored during multi-step conversions.
Return Value
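Instantiation returns a PDFConverter object. The main convert_to_pdf method returns a string containing the path to the successfully generated PDF file; it raises FileNotFoundError if the input file doesn't exist and ValueError if the file format is unsupported. The format-specific private methods (_convert_word_to_pdf, _convert_presentation_to_pdf, _convert_spreadsheet_to_pdf, _convert_image_to_pdf, _convert_excel_via_csv) return the output path as a string and raise RuntimeError if all of their conversion attempts fail. The tool-specific helpers (_convert_with_libreoffice, _convert_with_unoconv) return a boolean (True/False for success/failure) and log errors instead of raising.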
Class Interface
Methods
__init__(self, temp_dir: Optional[str] = None)
Purpose: Initialize the PDF converter with optional temporary directory
Parameters:
temp_dir: Optional directory path for temporary files; uses system temp if not provided
Returns: None (constructor)
convert_to_pdf(self, input_path: str, output_path: str) -> str
Purpose: Main public method to convert any supported document format to PDF
Parameters:
input_path: Path to the input document (Word, Excel, PowerPoint, image, or PDF)
output_path: Desired path for the output PDF file
Returns: String path to the generated PDF file
_convert_word_to_pdf(self, input_path: str, output_path: str) -> str
Purpose: Convert Word documents to PDF using LibreOffice, unoconv, or docx2pdf as fallbacks
Parameters:
input_path: Path to Word document (.doc, .docx, .rtf, .odt, etc.)
output_path: Path where PDF will be saved
Returns: String path to the generated PDF file
_convert_presentation_to_pdf(self, input_path: str, output_path: str) -> str
Purpose: Convert PowerPoint presentations to PDF using LibreOffice or unoconv
Parameters:
input_path: Path to PowerPoint file (.ppt, .pptx, .odp, etc.)
output_path: Path where PDF will be saved
Returns: String path to the generated PDF file
_convert_spreadsheet_to_pdf(self, input_path: str, output_path: str) -> str
Purpose: Convert Excel spreadsheets to PDF using LibreOffice, unoconv, or CSV conversion as fallbacks
Parameters:
input_path: Path to Excel file (.xls, .xlsx, .ods, etc.)
output_path: Path where PDF will be saved
Returns: String path to the generated PDF file
_convert_image_to_pdf(self, input_path: str, output_path: str) -> str
Purpose: Convert image files to PDF using PIL or PyMuPDF as fallback
Parameters:
input_path: Path to image file (.jpg, .png, .gif, .bmp, .tiff, etc.)
output_path: Path where PDF will be saved
Returns: String path to the generated PDF file
_convert_with_libreoffice(self, input_path: str, output_path: str) -> bool
Purpose: Attempt conversion using LibreOffice command-line interface
Parameters:
input_path: Path to input document
output_path: Path where PDF will be saved
Returns: Boolean: True if conversion succeeded, False otherwise
_convert_with_unoconv(self, input_path: str, output_path: str) -> bool
Purpose: Attempt conversion using unoconv command-line tool
Parameters:
input_path: Path to input document
output_path: Path where PDF will be saved
Returns: Boolean: True if conversion succeeded, False otherwise
_convert_excel_via_csv(self, input_path: str, output_path: str) -> str
Purpose: Convert Excel to PDF by reading with pandas and rendering with ReportLab as last resort fallback
Parameters:
input_path: Path to Excel file
output_path: Path where PDF will be saved
Returns: String path to the generated PDF file
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| WORD_EXTENSIONS | List[str] | Class variable containing supported Word document file extensions | class |
| PPT_EXTENSIONS | List[str] | Class variable containing supported PowerPoint presentation file extensions | class |
| EXCEL_EXTENSIONS | List[str] | Class variable containing supported Excel spreadsheet file extensions | class |
| IMAGE_EXTENSIONS | List[str] | Class variable containing supported image file extensions | class |
| temp_dir | str | Instance variable storing the path to the temporary directory used for intermediate conversion files | instance |
Dependencies
os, io, logging, tempfile, shutil, subprocess, pathlib, reportlab, fitz, pikepdf, PIL, docx2pdf, pandas
Required Imports
import os
import io
import tempfile
import shutil
import subprocess
from pathlib import Path
from reportlab.lib import colors
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.units import inch, cm
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
import fitz
Conditional/Optional Imports
These imports are only needed under specific conditions:
from PIL import Image as PILImage
Condition: Required for image to PDF conversion (_convert_image_to_pdf method)
Required (conditional)
from docx2pdf import convert
Condition: Used as fallback for Word document conversion if LibreOffice and unoconv fail
Optional
import pandas as pd
Condition: Required for Excel to PDF conversion via CSV fallback method (_convert_excel_via_csv)
Optional
Usage Example
# Basic usage
from CDocs.utils.pdf_utils import PDFConverter  # module lives in CDocs/utils/pdf_utils.py; adjust to your package layout
# Initialize converter with default temp directory
converter = PDFConverter()
# Convert a Word document
try:
pdf_path = converter.convert_to_pdf('document.docx', 'output.pdf')
print(f'PDF created at: {pdf_path}')
except FileNotFoundError as e:
print(f'File not found: {e}')
except ValueError as e:
print(f'Unsupported format: {e}')
except RuntimeError as e:
print(f'Conversion failed: {e}')
# Convert with custom temp directory
converter = PDFConverter(temp_dir='/tmp/my_conversions')
# Convert various formats
converter.convert_to_pdf('presentation.pptx', 'slides.pdf')
converter.convert_to_pdf('spreadsheet.xlsx', 'data.pdf')
converter.convert_to_pdf('image.jpg', 'photo.pdf')
# Already PDF files are simply copied
converter.convert_to_pdf('existing.pdf', 'copy.pdf')
Best Practices
- Always wrap convert_to_pdf calls in try-except blocks to handle FileNotFoundError, ValueError, and RuntimeError exceptions
- Ensure LibreOffice is installed for best conversion results across all document types
- The class attempts multiple conversion methods automatically, so no manual fallback handling is needed
- For production use, provide a custom temp_dir to control temporary file storage and cleanup
- The converter creates output directories automatically if they don't exist
- PDF files are simply copied, not re-converted, for efficiency
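- The LibreOffice and unoconv subprocess calls each have a 3-minute timeout to prevent hanging on large or corrupted files; the Python-library fallbacks (docx2pdf, pandas/ReportLab, PIL, PyMuPDF) have no timeout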
- Image conversion requires PIL/Pillow; the class will raise RuntimeError if not available
- Excel conversion via CSV is a last resort and may not preserve formatting perfectly
- The class is stateless except for temp_dir, so a single instance can be reused for multiple conversions
- The class never deletes temp_dir (neither a custom directory nor one it created itself with tempfile.mkdtemp()), so clean it up manually to avoid disk space issues
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
-
class DocumentConverter_v1 87.7% similar
-
class PDFConverter 86.7% similar
-
class DocumentConverter 79.6% similar
-
class ControlledDocumentConverter 67.1% similar
-
class PDFAConverter 65.6% similar