DocumentConverter_v1 - Code Extractor

class DocumentConverter_v1

Maturity: 50

A class that converts various document formats (Word, Excel, PowerPoint, images) to PDF format using LibreOffice, unoconv, or PIL.

File:
/tf/active/vicechatdev/document_auditor/src/document_converter.py

Lines:
8 - 136

Complexity:
moderate

Purpose

DocumentConverter provides a unified interface for converting multiple document and image formats to PDF. It handles Microsoft Office documents (.docx, .doc, .ppt, .pptx, .xlsx, .xls) using LibreOffice or unoconv as fallback, and image formats (.png, .jpg, .jpeg, .tiff, .bmp, .gif) using PIL. The class automatically detects file types and applies the appropriate conversion method, with built-in error handling and logging.

Source Code

class DocumentConverter:
    """Converts various document formats to PDF.

    Dispatch is based on the (case-insensitive) file extension of the input:

    * ``.pdf`` files are copied to the output path unchanged.
    * Office documents (``.docx``, ``.doc``, ``.ppt``, ``.pptx``, ``.xlsx``,
      ``.xls``) are converted with the ``libreoffice`` command line tool,
      falling back to ``unoconv`` when LibreOffice fails or cannot be started.
    * Images (``.png``, ``.jpg``, ``.jpeg``, ``.tiff``, ``.bmp``, ``.gif``)
      are converted with PIL.
    """

    def __init__(self):
        # Logger named after the defining module; used by every conversion step.
        self.logger = logging.getLogger(__name__)

    @staticmethod
    def _ensure_output_dir(target_path):
        """Create the parent directory of ``target_path`` if it is missing.

        A bare file name (no directory component) refers to the current
        working directory, so there is nothing to create in that case;
        calling ``os.makedirs('')`` would raise.
        """
        parent_dir = os.path.dirname(target_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    def convert_to_pdf(self, input_path, output_path):
        """
        Convert document to PDF if it's not already in PDF format

        Args:
            input_path (str): Path to the input document
            output_path (str): Path where PDF will be saved. Missing parent
                directories are created.

        Returns:
            str: Path to the PDF document (the ``output_path`` argument)

        Raises:
            FileNotFoundError: If ``input_path`` does not exist, or LibreOffice
                ran successfully but did not produce the expected PDF.
            ValueError: If the file extension is not supported.
            RuntimeError: If both LibreOffice and unoconv fail.
        """
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"Input file not found: {input_path}")

        ext = os.path.splitext(input_path)[1].lower()

        if ext == '.pdf':
            # Already a PDF, just copy
            self.logger.info(f"File is already PDF, copying to {output_path}")
            self._ensure_output_dir(output_path)
            shutil.copy(input_path, output_path)
        elif ext in ['.docx', '.doc', '.ppt', '.pptx', '.xlsx', '.xls']:
            # Convert Word, Excel or Powerpoint document using LibreOffice
            self.logger.info(f"Converting Word document to PDF: {input_path}")
            self._convert_with_libreoffice(input_path, output_path)
        elif ext in ['.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif']:
            # Convert image to PDF
            self.logger.info(f"Converting image to PDF: {input_path}")
            self._convert_image_to_pdf(input_path, output_path)
        else:
            raise ValueError(f"Unsupported file format: {ext}")

        return output_path

    def _convert_with_libreoffice(self, word_path, pdf_path):
        """Convert an Office document to PDF using LibreOffice.

        LibreOffice is run headless with a temporary ``--outdir``; the result
        is expected there under the input's base name with a ``.pdf`` suffix
        and is then moved to ``pdf_path``.

        If the ``libreoffice`` process exits with an error, or the executable
        cannot be started at all, the failure is logged and the conversion is
        retried with :meth:`_convert_with_unoconv`.

        Args:
            word_path (str): Path to the input Office document
            pdf_path (str): Path where the PDF will be saved

        Raises:
            FileNotFoundError: If LibreOffice exits successfully but the
                expected PDF is not present in the temporary directory.
            RuntimeError: If the unoconv fallback fails as well.
        """
        self._ensure_output_dir(pdf_path)

        # Get base filename without extension
        base_name = os.path.basename(word_path)
        base_name_without_ext = os.path.splitext(base_name)[0]

        converted = False

        # Create a temporary directory for conversion
        with tempfile.TemporaryDirectory() as temp_dir:
            # Run LibreOffice to convert the file
            cmd = [
                'libreoffice',
                '--headless',
                '--convert-to', 'pdf',
                '--outdir', temp_dir,
                word_path,
            ]

            self.logger.info(f"Running command: {' '.join(cmd)}")
            try:
                # Only the process launch is guarded here, so that a
                # FileNotFoundError from a missing executable cannot be
                # confused with the "output missing" error raised below.
                result = subprocess.run(
                    cmd,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    text=True,
                    check=True
                )
            except subprocess.CalledProcessError as e:
                self.logger.error(f"LibreOffice conversion failed: {e}")
                self.logger.error(f"STDOUT: {e.stdout}")
                self.logger.error(f"STDERR: {e.stderr}")
            except FileNotFoundError as e:
                # The libreoffice executable itself could not be started.
                self.logger.error(f"LibreOffice conversion failed: {e}")
            else:
                self.logger.debug(f"LibreOffice conversion output: {result.stdout}")

                # Check if the converted file exists
                expected_output = os.path.join(temp_dir, f"{base_name_without_ext}.pdf")
                if not os.path.exists(expected_output):
                    # List files in temp dir for debugging
                    files_in_temp = os.listdir(temp_dir)
                    self.logger.error(f"Expected output {expected_output} not found. Files in temp dir: {files_in_temp}")
                    raise FileNotFoundError(f"Converted PDF not found: {expected_output}")

                # Move the file to the desired location
                shutil.move(expected_output, pdf_path)
                converted = True

        if not converted:
            # Try alternate method if LibreOffice fails
            self._convert_with_unoconv(word_path, pdf_path)

    def _convert_with_unoconv(self, word_path, pdf_path):
        """Alternate conversion method using unoconv.

        Args:
            word_path (str): Path to the input Office document
            pdf_path (str): Path where the PDF will be saved

        Raises:
            RuntimeError: If unoconv exits with an error, is not installed,
                or finishes without creating ``pdf_path``.
        """
        self.logger.info("Trying unoconv for conversion")
        cmd = ['unoconv', '-f', 'pdf', '-o', pdf_path, word_path]

        try:
            result = subprocess.run(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                check=True
            )
        except subprocess.CalledProcessError as e:
            self.logger.error(f"unoconv conversion failed: {e}")
            self.logger.error(f"STDOUT: {e.stdout}")
            self.logger.error(f"STDERR: {e.stderr}")
            raise RuntimeError("All document conversion methods failed") from e
        except FileNotFoundError as e:
            # Raised by subprocess.run when the executable cannot be found.
            self.logger.error("unoconv command not found. Please install with: sudo apt-get install unoconv")
            raise RuntimeError("Document conversion failed and unoconv not installed") from e

        self.logger.debug(f"unoconv conversion output: {result.stdout}")

        # Checked outside the try block so a missing output file is not
        # misreported as "unoconv not installed". RuntimeError is kept as the
        # exception type callers see for this failure.
        if not os.path.exists(pdf_path):
            self.logger.error(f"unoconv failed to create PDF: {pdf_path}")
            raise RuntimeError(f"unoconv failed to create PDF: {pdf_path}")

    def _convert_image_to_pdf(self, image_path, pdf_path):
        """Convert image file to PDF.

        The image is converted to RGB mode and saved in PDF format.

        Args:
            image_path (str): Path to the input image
            pdf_path (str): Path where the PDF will be saved

        Raises:
            Exception: Any error raised while opening, converting or saving
                the image is logged and re-raised unchanged.
        """
        self._ensure_output_dir(pdf_path)
        try:
            # Context manager closes the underlying image file handle.
            with Image.open(image_path) as img:
                img_rgb = img.convert('RGB')
                # Name the format explicitly so the output does not depend on
                # pdf_path carrying a ".pdf" suffix.
                img_rgb.save(pdf_path, format='PDF')
        except Exception as e:
            self.logger.error(f"Error converting image to PDF: {e}")
            raise

Parameters

Name	Type	Default	Kind
`bases`	-	-

Parameter Details

__init__: No parameters required. The constructor initializes a logger instance for tracking conversion operations.

Return Value

Instantiation returns a DocumentConverter object. The main method convert_to_pdf() returns a string containing the path to the converted PDF file. Private methods (_convert_with_libreoffice, _convert_with_unoconv, _convert_image_to_pdf) do not return values but modify files on disk.

Class Interface

Methods

`__init__(self)`

Purpose: Initialize the DocumentConverter with a logger instance

Returns: None

`convert_to_pdf(self, input_path: str, output_path: str) -> str`

Purpose: Main public method to convert various document formats to PDF, automatically detecting file type and applying appropriate conversion method

Parameters:

input_path: String path to the input document file. Supported formats: .pdf, .docx, .doc, .ppt, .pptx, .xlsx, .xls, .png, .jpg, .jpeg, .tiff, .bmp, .gif
output_path: String path where the converted PDF file will be saved. Directory must exist or be creatable

Returns: String containing the path to the converted PDF file (same as output_path parameter)

`_convert_with_libreoffice(self, word_path: str, pdf_path: str) -> None`

Purpose: Private method to convert Office documents (Word, Excel, PowerPoint) to PDF using LibreOffice in headless mode

Parameters:

word_path: String path to the input Office document file
pdf_path: String path where the converted PDF will be saved

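Returns: None. If the LibreOffice process fails, the resulting subprocess.CalledProcessError is caught and logged, and conversion falls back to unoconv. Raises FileNotFoundError if LibreOffice exits successfully but the expected PDF is not produced.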

`_convert_with_unoconv(self, word_path: str, pdf_path: str) -> None`

Purpose: Private fallback method to convert Office documents to PDF using unoconv when LibreOffice conversion fails

Parameters:

word_path: String path to the input Office document file
pdf_path: String path where the converted PDF will be saved

Returns: None. Raises RuntimeError if unoconv conversion fails or unoconv is not installed

`_convert_image_to_pdf(self, image_path: str, pdf_path: str) -> None`

Purpose: Private method to convert image files to PDF format using PIL/Pillow library

Parameters:

image_path: String path to the input image file. Supported formats: .png, .jpg, .jpeg, .tiff, .bmp, .gif
pdf_path: String path where the converted PDF will be saved

Returns: None. Raises exceptions if image cannot be opened or saved

Attributes

Name	Type	Description	Scope
`logger`	logging.Logger	Logger instance for tracking conversion operations, errors, and debug information. Initialized with the module's __name__	instance

Dependencies

os
logging
shutil
subprocess
tempfile
PIL

Required Imports

import os
import logging
import shutil
import subprocess
import tempfile
from PIL import Image

Usage Example

import logging
from document_converter import DocumentConverter

# Configure logging
logging.basicConfig(level=logging.INFO)

# Create converter instance
converter = DocumentConverter()

# Convert a Word document to PDF
try:
    pdf_path = converter.convert_to_pdf(
        input_path='/path/to/document.docx',
        output_path='/path/to/output.pdf'
    )
    print(f'PDF created at: {pdf_path}')
except FileNotFoundError as e:
    print(f'File not found: {e}')
except ValueError as e:
    print(f'Unsupported format: {e}')
except RuntimeError as e:
    print(f'Conversion failed: {e}')

# Convert an image to PDF
pdf_path = converter.convert_to_pdf(
    input_path='/path/to/image.png',
    output_path='/path/to/image.pdf'
)

# Copy existing PDF (no conversion needed)
pdf_path = converter.convert_to_pdf(
    input_path='/path/to/existing.pdf',
    output_path='/path/to/copy.pdf'
)

Best Practices

Always wrap convert_to_pdf() calls in try-except blocks to handle FileNotFoundError, ValueError, and RuntimeError exceptions
Ensure LibreOffice is installed before attempting to convert Office documents
Check that the output directory exists and is writable before conversion
The class is stateless except for the logger, so a single instance can be reused for multiple conversions
For batch conversions, create one DocumentConverter instance and reuse it
Monitor disk space when converting large documents as temporary files are created during conversion
The _convert_with_libreoffice method uses temporary directories that are automatically cleaned up
If LibreOffice conversion fails, the class automatically attempts unoconv as fallback
Images are converted to RGB mode before being saved as PDF to ensure compatibility
Log output at INFO level to track conversion progress and DEBUG level for detailed diagnostics

Similar Components

AI-powered semantic similarity - components with related functionality:

class PDFConverter 89.8% similar

A class that converts various document formats (Word, PowerPoint, Excel, images) to PDF format using LibreOffice and ReportLab libraries.
From: /tf/active/vicechatdev/msg_to_eml.py
class DocumentConverter 88.6% similar

A class that converts various document formats (Word, Excel, PowerPoint, OpenDocument, Visio) to PDF using LibreOffice's headless conversion capabilities, with support for parallel processing and directory structure preservation.
From: /tf/active/vicechatdev/pdfconverter.py
class PDFConverter_v1 87.7% similar

A comprehensive document-to-PDF converter class that handles multiple file formats (Word, Excel, PowerPoint, images) with multiple conversion methods and automatic fallbacks for reliability.
From: /tf/active/vicechatdev/CDocs/utils/pdf_utils.py
class ControlledDocumentConverter 68.3% similar

A comprehensive document converter class that transforms controlled documents into archived PDFs with signature pages, audit trails, hash-based integrity verification, and PDF/A compliance for long-term archival.
From: /tf/active/vicechatdev/CDocs/utils/document_converter.py
class DocumentExtractor 64.8% similar

A document text extraction class that supports multiple file formats including Word, PowerPoint, PDF, and plain text files, with automatic format detection and conversion capabilities.
From: /tf/active/vicechatdev/leexi/document_extractor.py

← Back to Browse

Assistant

Hi! I can help improve this code. Tell me what you'd like to enhance (e.g., "add error handling", "optimize performance", "improve readability", "add type hints").

Code Comparison

Original Code

                            class DocumentConverter:
    """Converts various document formats to PDF"""
    
    def __init__(self):
        self.logger = logging.getLogger(__name__)
    
    def convert_to_pdf(self, input_path, output_path):
        """
        Convert document to PDF if it's not already in PDF format
        
        Args:
            input_path (str): Path to the input document
            output_path (str): Path where PDF will be saved
            
        Returns:
            str: Path to the PDF document
        """
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"Input file not found: {input_path}")
        
        ext = os.path.splitext(input_path)[1].lower()
        
        if ext == '.pdf':
            # Already a PDF, just copy
            self.logger.info(f"File is already PDF, copying to {output_path}")
            shutil.copy(input_path, output_path)
        elif ext in ['.docx', '.doc','.ppt','.pptx', '.xlsx', '.xls']:
            # Convert Word, Excel or Powerpoint document using LibreOffice
            self.logger.info(f"Converting Word document to PDF: {input_path}")
            self._convert_with_libreoffice(input_path, output_path)
        elif ext in ['.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif']:
            # Convert image to PDF
            self.logger.info(f"Converting image to PDF: {input_path}")
            self._convert_image_to_pdf(input_path, output_path)
        else:
            raise ValueError(f"Unsupported file format: {ext}")
        
        return output_path
    
    def _convert_with_libreoffice(self, word_path, pdf_path):
        """Convert Word document to PDF using LibreOffice"""
        try:
            # Create output directory if it doesn't exist
            output_dir = os.path.dirname(pdf_path)
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            
            # Get base filename without extension
            base_name = os.path.basename(word_path)
            base_name_without_ext = os.path.splitext(base_name)[0]
            
            # Create a temporary directory for conversion
            with tempfile.TemporaryDirectory() as temp_dir:
                # Run LibreOffice to convert the file
                cmd = [
                    'libreoffice',
                    '--headless',
                    '--convert-to', 'pdf',
                    '--outdir', temp_dir,
                    word_path
                ]
                
                self.logger.info(f"Running command: {' '.join(cmd)}")
                result = subprocess.run(
                    cmd,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    text=True,
                    check=True
                )
                
                self.logger.debug(f"LibreOffice conversion output: {result.stdout}")
                
                # Check if the converted file exists
                expected_output = os.path.join(temp_dir, f"{base_name_without_ext}.pdf")
                if os.path.exists(expected_output):
                    # Move the file to the desired location
                    shutil.move(expected_output, pdf_path)
                else:
                    # List files in temp dir for debugging
                    files_in_temp = os.listdir(temp_dir)
                    self.logger.error(f"Expected output {expected_output} not found. Files in temp dir: {files_in_temp}")
                    raise FileNotFoundError(f"Converted PDF not found: {expected_output}")
                
        except subprocess.CalledProcessError as e:
            self.logger.error(f"LibreOffice conversion failed: {e}")
            self.logger.error(f"STDOUT: {e.stdout}")
            self.logger.error(f"STDERR: {e.stderr}")
            
            # Try alternate method if LibreOffice fails
            self._convert_with_unoconv(word_path, pdf_path)
    
    def _convert_with_unoconv(self, word_path, pdf_path):
        """Alternate conversion method using unoconv"""
        try:
            self.logger.info("Trying unoconv for conversion")
            cmd = ['unoconv', '-f', 'pdf', '-o', pdf_path, word_path]
            
            result = subprocess.run(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                check=True
            )
            
            self.logger.debug(f"unoconv conversion output: {result.stdout}")
            
            if not os.path.exists(pdf_path):
                raise FileNotFoundError(f"unoconv failed to create PDF: {pdf_path}")
                
        except subprocess.CalledProcessError as e:
            self.logger.error(f"unoconv conversion failed: {e}")
            self.logger.error(f"STDOUT: {e.stdout}")
            self.logger.error(f"STDERR: {e.stderr}")
            raise RuntimeError("All document conversion methods failed")
        except FileNotFoundError:
            self.logger.error("unoconv command not found. Please install with: sudo apt-get install unoconv")
            raise RuntimeError("Document conversion failed and unoconv not installed")
    
    def _convert_image_to_pdf(self, image_path, pdf_path):
        """Convert image file to PDF"""
        try:
            img = Image.open(image_path)
            img_rgb = img.convert('RGB')
            img_rgb.save(pdf_path)
        except Exception as e:
            self.logger.error(f"Error converting image to PDF: {e}")
            raise
                        

Improved Code

🔍 Code Extractor

class DocumentConverter_v1

Purpose

Source Code

Parameters

Parameter Details

Return Value

Class Interface

Methods

`init(self)`

`convert_to_pdf(self, input_path: str, output_path: str) -> str`

`_convert_with_libreoffice(self, word_path: str, pdf_path: str) -> None`

`_convert_with_unoconv(self, word_path: str, pdf_path: str) -> None`

`_convert_image_to_pdf(self, image_path: str, pdf_path: str) -> None`

Attributes

Dependencies

Required Imports

Usage Example

Best Practices

Tags

Similar Components

class PDFConverter 89.8% similar

class DocumentConverter 88.6% similar

class PDFConverter_v1 87.7% similar

class ControlledDocumentConverter 68.3% similar

class DocumentExtractor 64.8% similar

class DocumentConverter_v1

Purpose

Source Code

Parameters

Parameter Details

Return Value

Class Interface

Methods

__init__(self)

convert_to_pdf(self, input_path: str, output_path: str) -> str

_convert_with_libreoffice(self, word_path: str, pdf_path: str) -> None

_convert_with_unoconv(self, word_path: str, pdf_path: str) -> None

_convert_image_to_pdf(self, image_path: str, pdf_path: str) -> None

Attributes

Dependencies

Required Imports

Usage Example

Best Practices

Tags

Similar Components

class PDFConverter 89.8% similar

class DocumentConverter 88.6% similar

class PDFConverter_v1 87.7% similar

class ControlledDocumentConverter 68.3% similar

class DocumentExtractor 64.8% similar

✨ Improve Code: DocumentConverter_v1

Code Comparison

`init(self)`

`convert_to_pdf(self, input_path: str, output_path: str) -> str`

`_convert_with_libreoffice(self, word_path: str, pdf_path: str) -> None`

`_convert_with_unoconv(self, word_path: str, pdf_path: str) -> None`

`_convert_image_to_pdf(self, image_path: str, pdf_path: str) -> None`