PDFConverter - Code Extractor

class PDFConverter

Maturity: 54

A class that converts various document formats (Word, PowerPoint, Excel, images) to PDF, using the LibreOffice command-line tool for office documents and the ReportLab and PIL libraries for images.

File:
/tf/active/vicechatdev/msg_to_eml.py

Lines:
262 - 410

Complexity:
moderate

Purpose

PDFConverter provides a unified interface for converting multiple document types to PDF. It handles Word documents (.doc, .docx), PowerPoint presentations (.ppt, .pptx), Excel spreadsheets (.xls, .xlsx), and images (.jpg, .png, etc.) by routing them to appropriate conversion methods. The class uses LibreOffice for office documents and ReportLab/PIL for image conversions. It manages temporary directories for intermediate files and handles file path resolution, output directory creation, and error handling during conversion.

Source Code

class PDFConverter:
    """Convert office documents and images to PDF.

    Office formats (and any unrecognised extension) are handed to the
    ``libreoffice`` command-line tool, images are laid out on a letter-size
    page with ReportLab and PIL, and files that already are PDFs are copied
    through unchanged.
    """

    # Supported file extensions by type
    WORD_EXTENSIONS = ['.doc', '.docx', '.docm', '.dot', '.dotx', '.dotm', '.rtf', '.odt']
    PPT_EXTENSIONS = ['.ppt', '.pptx', '.pptm', '.pot', '.potx', '.potm', '.pps', '.ppsx', '.odp']
    EXCEL_EXTENSIONS = ['.xls', '.xlsx', '.xlsm', '.xlt', '.xltx', '.xltm', '.xlsb', '.ods']
    IMAGE_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.tif']

    # Seconds to wait for a single LibreOffice run before giving up.
    LIBREOFFICE_TIMEOUT = 60

    def __init__(self, temp_dir=None):
        """Initialize the converter.

        Args:
            temp_dir: Directory used for intermediate conversion output.
                When falsy, a fresh directory is created with
                ``tempfile.mkdtemp()``. The directory is created if missing.
        """
        self.temp_dir = temp_dir if temp_dir else tempfile.mkdtemp()
        os.makedirs(self.temp_dir, exist_ok=True)

    def convert_to_pdf(self, input_path, output_path):
        """Convert a document to PDF, choosing the backend by file extension.

        Args:
            input_path: Path of the file to convert.
            output_path: Path the PDF should be written to; its parent
                directory is created when missing.

        Returns:
            ``output_path`` on success, ``None`` when the backend fails.

        Raises:
            FileNotFoundError: If ``input_path`` does not exist.
        """
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"Input file not found: {input_path}")

        # Create output directory if it doesn't exist
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        ext = os.path.splitext(input_path)[1].lower()

        # If already a PDF, just copy it. Copying a file onto itself makes
        # shutil raise SameFileError, so that case is treated as done.
        if ext == '.pdf':
            already_in_place = (
                os.path.exists(output_path)
                and os.path.samefile(input_path, output_path)
            )
            if not already_in_place:
                shutil.copy2(input_path, output_path)
            return output_path

        # Images have a dedicated path; Word/PowerPoint/Excel files and any
        # unknown extension all go through LibreOffice.
        if ext in self.IMAGE_EXTENSIONS:
            return self._convert_image_to_pdf(input_path, output_path)
        return self._convert_with_libreoffice(input_path, output_path)

    def _convert_with_libreoffice(self, input_path, output_path):
        """Convert a document to PDF using the LibreOffice command-line tool.

        LibreOffice names its output after the input file, so the conversion
        is run into a private scratch directory below ``self.temp_dir`` and
        the result is then moved to ``output_path``. This keeps LibreOffice
        from overwriting an unrelated ``<input-stem>.pdf`` in the destination
        directory and ensures a stale file at ``output_path`` is never
        mistaken for a fresh result.

        Args:
            input_path: Path of the document to convert.
            output_path: Path the PDF should end up at.

        Returns:
            ``output_path`` on success; ``None`` if LibreOffice fails, times
            out, or produces no PDF (the reason is logged).
        """
        scratch_dir = None
        try:
            # Absolute paths to avoid directory issues
            abs_input = os.path.abspath(input_path)
            abs_output = os.path.abspath(output_path)

            os.makedirs(self.temp_dir, exist_ok=True)
            scratch_dir = os.path.abspath(
                tempfile.mkdtemp(prefix='lo_convert_', dir=self.temp_dir)
            )

            cmd = [
                'libreoffice',
                '--headless',
                '--convert-to', 'pdf',
                '--outdir', scratch_dir,
                abs_input,
            ]

            process = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=self.LIBREOFFICE_TIMEOUT,
            )

            if process.returncode != 0:
                logger.error(f"LibreOffice conversion failed: {process.stderr}")
                return None

            # LibreOffice creates output with original filename but .pdf extension
            produced = os.path.join(
                scratch_dir,
                os.path.splitext(os.path.basename(abs_input))[0] + '.pdf',
            )
            if not os.path.exists(produced):
                logger.error(f"Expected output not found: {output_path}")
                return None

            # The destination directory may not exist when this method is
            # called directly rather than through convert_to_pdf().
            os.makedirs(os.path.dirname(abs_output), exist_ok=True)
            shutil.move(produced, abs_output)
            return output_path

        except subprocess.TimeoutExpired:
            logger.error(f"Timeout while converting: {input_path}")
            return None
        except Exception as e:
            logger.error(f"Error in LibreOffice conversion: {str(e)}")
            return None
        finally:
            # Best-effort cleanup of the per-conversion scratch directory.
            if scratch_dir:
                shutil.rmtree(scratch_dir, ignore_errors=True)

    def _convert_image_to_pdf(self, input_path, output_path):
        """Convert an image to a one-image, letter-size PDF.

        The file name is placed as a heading and the image is scaled,
        preserving its aspect ratio, to fill the space left in the page
        frame below that heading.

        Args:
            input_path: Path of the image file.
            output_path: Path the PDF should be written to.

        Returns:
            ``output_path`` on success; ``None`` on any error (logged).
        """
        try:
            from xml.sax.saxutils import escape
            from reportlab.lib.pagesizes import letter
            from reportlab.platypus import SimpleDocTemplate, Image as RLImage, Paragraph
            from reportlab.lib.styles import getSampleStyleSheet
            from PIL import Image

            # Only the dimensions are needed; close the file handle right away.
            with Image.open(input_path) as img:
                img_width, img_height = img.size

            doc = SimpleDocTemplate(
                output_path,
                pagesize=letter,
                rightMargin=72,
                leftMargin=72,
                topMargin=72,
                bottomMargin=72
            )

            # Paragraph text is parsed as markup, so escape &, < and > that
            # may legitimately appear in file names.
            styles = getSampleStyleSheet()
            title = Paragraph(escape(os.path.basename(input_path)), styles['Heading2'])

            # The drawable area is the margin box minus the frame's default
            # 6pt padding on each side. Vertically, the heading and its
            # trailing space must be subtracted too, otherwise a tall image
            # no longer fits in the frame and the build fails.
            frame_padding = 2 * 6
            avail_width = doc.width - frame_padding
            frame_height = doc.height - frame_padding
            _, title_height = title.wrap(avail_width, frame_height)
            # 1pt of slack guards against floating-point rounding.
            avail_height = frame_height - title_height - title.getSpaceAfter() - 1

            # Scale image to fit available space
            ratio = min(avail_width / img_width, avail_height / img_height)
            img_obj = RLImage(input_path, width=img_width * ratio, height=img_height * ratio)

            doc.build([title, img_obj])

            return output_path

        except Exception as e:
            logger.error(f"Error converting image to PDF: {str(e)}")
            return None

Parameters

Name	Type	Default	Kind
`bases`	-	-

Parameter Details

temp_dir: Optional path to a temporary directory for storing intermediate conversion files. If not provided, a new temporary directory is created using tempfile.mkdtemp(). The directory is created if it doesn't exist. This is useful for controlling where temporary files are stored during conversion operations.

Return Value

Instantiation returns a PDFConverter object. The convert_to_pdf method returns the output_path string if conversion succeeds, or None if conversion fails; it raises FileNotFoundError if input_path does not exist. The private methods _convert_with_libreoffice and _convert_image_to_pdf also return the output_path on success or None on failure.

Class Interface

Methods

`__init__(self, temp_dir=None) -> None`

Purpose: Initialize the PDF converter with an optional temporary directory for intermediate files

Parameters:

temp_dir: Optional string path to temporary directory. If None, creates a new temp directory using tempfile.mkdtemp()

Returns: None - initializes the instance

`convert_to_pdf(self, input_path: str, output_path: str) -> str | None`

Purpose: Main public method to convert any supported document format to PDF

Parameters:

input_path: String path to the input file to convert. Must exist or FileNotFoundError is raised
output_path: String path where the PDF output should be saved. Directory is created if it doesn't exist

Returns: String path to the output PDF file on success, or None if conversion fails

`_convert_with_libreoffice(self, input_path: str, output_path: str) -> str | None`

Purpose: Private method to convert office documents (Word, Excel, PowerPoint) to PDF using the LibreOffice command-line tool; convert_to_pdf also uses it as the fallback for unrecognised file extensions

Parameters:

input_path: String path to the input office document
output_path: String path where the PDF output should be saved

Returns: String path to the output PDF file on success, or None if conversion fails or times out

`_convert_image_to_pdf(self, input_path: str, output_path: str) -> str | None`

Purpose: Private method to convert image files to PDF using ReportLab and PIL, with automatic scaling to fit letter-size pages

Parameters:

input_path: String path to the input image file
output_path: String path where the PDF output should be saved

Returns: String path to the output PDF file on success, or None if conversion fails

Attributes

Name	Type	Description	Scope
`WORD_EXTENSIONS`	list[str]	Class variable containing supported Word document file extensions including .doc, .docx, .docm, .dot, .dotx, .dotm, .rtf, .odt	class
`PPT_EXTENSIONS`	list[str]	Class variable containing supported PowerPoint presentation file extensions including .ppt, .pptx, .pptm, .pot, .potx, .potm, .pps, .ppsx, .odp	class
`EXCEL_EXTENSIONS`	list[str]	Class variable containing supported Excel spreadsheet file extensions including .xls, .xlsx, .xlsm, .xlt, .xltx, .xltm, .xlsb, .ods	class
`IMAGE_EXTENSIONS`	list[str]	Class variable containing supported image file extensions including .jpg, .jpeg, .png, .gif, .bmp, .tiff, .tif	class
`temp_dir`	str	Instance variable storing the path to the temporary directory used for intermediate conversion files. Set during initialization and created if it doesn't exist	instance

Dependencies

os
tempfile
shutil
subprocess
reportlab
PIL
logging

Required Imports

import os
import tempfile
import shutil
import subprocess
import logging