PDFAConverter - Code Extractor

class PDFAConverter

Maturity: 51

A class that converts PDF files to PDF/A format for long-term archiving and compliance, supporting multiple compliance levels (1b, 2b, 3b) with fallback conversion methods.

File:
/tf/active/vicechatdev/document_auditor/src/utils/pdf_utils.py

Lines:
8 - 145

Complexity:
moderate

Purpose

PDFAConverter provides robust PDF to PDF/A conversion functionality for archival and compliance purposes. It attempts multiple conversion strategies: first using pikepdf for simple conversions, then falling back to Ghostscript for more robust processing, and finally copying the original file if all methods fail. The class also provides validation capabilities to check if a PDF meets PDF/A compliance standards. This is useful for organizations that need to ensure long-term document preservation and meet regulatory requirements for document archiving.

Source Code

class PDFAConverter:
    """Convert PDF files to PDF/A for archiving and compliance.

    Conversion is attempted with pikepdf first, then with the Ghostscript
    command-line tool (``gs``). If both fail, the input is copied unchanged
    to the output path so callers always receive a file; in that case the
    result is NOT PDF/A and a warning is logged.
    """

    def __init__(self):
        # Module-scoped logger; every conversion step reports through it.
        self.logger = logging.getLogger(__name__)

    def convert_to_pdfa(self, input_path: str, output_path: str,
                        compliance_level: str = '2b') -> str:
        """
        Convert PDF to PDF/A format

        Args:
            input_path (str): Path to the input PDF
            output_path (str): Path where PDF/A will be saved
            compliance_level (str): PDF/A compliance level ('1b', '2b', '3b').
                Any other value is replaced by '2b' with a logged warning.

        Returns:
            str: ``output_path``. The file there is the converted document,
            or an unmodified copy of the input if every method failed.

        Raises:
            FileNotFoundError: If ``input_path`` does not exist.
        """
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"Input PDF not found: {input_path}")

        # Validate compliance level
        if compliance_level not in ('1b', '2b', '3b'):
            self.logger.warning(f"Invalid compliance level: {compliance_level}. Using '2b'.")
            compliance_level = '2b'

        # Method 1: pikepdf (simpler but less robust). This is a deliberate
        # best-effort boundary, so any failure just moves on to the next method.
        try:
            self._convert_with_pikepdf(input_path, output_path, compliance_level)
            return output_path
        except Exception as e:
            self.logger.warning(f"pikepdf conversion failed: {e}. Trying alternative method...")

        # Method 2: Ghostscript, when the binary is installed and working.
        if self._is_ghostscript_available():
            try:
                return self._convert_with_ghostscript(input_path, output_path, compliance_level)
            except Exception as e:
                self.logger.error(f"Ghostscript conversion failed: {e}")

        # Fallback: hand back the original document if all else fails.
        self.logger.warning("All PDF/A conversion methods failed. Using original PDF.")
        try:
            shutil.copy(input_path, output_path)
        except shutil.SameFileError:
            # In-place request: the original is already at output_path, so
            # there is nothing to copy and nothing to fail on.
            pass
        return output_path

    def _convert_with_pikepdf(self, input_path: str, output_path: str,
                              compliance_level: str) -> str:
        """Write a copy of the PDF carrying PDF/A identification metadata.

        Only the XMP ``pdfaid:part`` / ``pdfaid:conformance`` markers are set;
        page content, fonts and colour spaces are saved as pikepdf read them.

        Args:
            input_path (str): Path to the input PDF
            output_path (str): Path where the tagged PDF will be saved
            compliance_level (str): Two-character level such as '2b'

        Returns:
            str: ``output_path``
        """
        part = compliance_level[0]
        conformance = compliance_level[1].upper()

        with pikepdf.open(input_path) as pdf:
            # Set metadata for PDF/A identification
            with pdf.open_metadata() as meta:
                meta["pdfaid:part"] = part
                meta["pdfaid:conformance"] = conformance

                # Backfill a missing creation date from the modification date.
                # Never overwrite an existing CreateDate and never write an
                # empty value, which is not a valid XMP date.
                if "xmp:CreateDate" not in meta:
                    modify_date = meta.get("xmp:ModifyDate")
                    if modify_date:
                        meta["xmp:CreateDate"] = modify_date

            pdf.save(output_path)

        self.logger.info(f"Converted to PDF/A-{part}-{conformance} using pikepdf: {output_path}")
        return output_path

    def _convert_with_ghostscript(self, input_path: str, output_path: str,
                                  compliance_level: str) -> str:
        """Convert using Ghostscript (more robust but requires ghostscript)

        Ghostscript writes into a private scratch directory created next to
        the destination; the result is copied to ``output_path`` only after
        ``gs`` exits successfully, and the scratch directory is removed on
        every exit path.

        NOTE(review): Ghostscript's PDF/A instructions also mention a
        PDFA_def.ps definition file with an ICC profile; none is passed
        here - confirm the output validates for your documents.

        Args:
            input_path (str): Path to the input PDF
            output_path (str): Path where the converted file will be saved
            compliance_level (str): Two-character level such as '2b'; its
                first character selects the PDF/A part given to ``-dPDFA``.

        Returns:
            str: ``output_path``

        Raises:
            subprocess.CalledProcessError: If ``gs`` exits with a non-zero status.
        """
        # Local import keeps this method self-contained.
        import tempfile

        target_dir = os.path.dirname(output_path) or '.'

        # A unique scratch directory avoids collisions between concurrent
        # conversions that share an output directory.
        with tempfile.TemporaryDirectory(prefix="gs_pdfa_", dir=target_dir) as work_dir:
            temp_output = os.path.join(work_dir, "gs_pdfa_temp.pdf")

            # Ghostscript command; -dPDFA carries the requested part (1, 2 or 3).
            gs_cmd = [
                "gs", f"-dPDFA={compliance_level[0]}", "-dBATCH", "-dNOPAUSE", "-dNOOUTERSAVE",
                "-sProcessColorModel=DeviceCMYK", "-sDEVICE=pdfwrite",
                "-dPDFACompatibilityPolicy=1",
                f"-sOutputFile={temp_output}", input_path,
            ]

            try:
                subprocess.run(gs_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            except subprocess.CalledProcessError as e:
                self.logger.error(f"Ghostscript command failed: {e}")
                raise

            # Copy result to final destination; the scratch directory and its
            # contents are cleaned up when the with-block exits.
            shutil.copy(temp_output, output_path)

        self.logger.info(f"Converted to PDF/A-{compliance_level} using Ghostscript: {output_path}")
        return output_path

    def _is_ghostscript_available(self) -> bool:
        """Check if Ghostscript is available on the system

        Returns:
            bool: True only if ``gs --version`` could be started and exited
            with status 0.
        """
        try:
            probe = subprocess.run(["gs", "--version"],
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        except (subprocess.SubprocessError, OSError):
            # OSError covers a missing binary as well as a non-executable one.
            return False
        return probe.returncode == 0

    def validate_pdfa(self, pdf_path: str) -> bool:
        """
        Check whether a PDF declares itself as PDF/A

        This looks only for the ``pdfaid:part`` and ``pdfaid:conformance``
        XMP entries; it does not verify fonts, colour spaces or any other
        conformance requirement.

        Args:
            pdf_path (str): Path to the PDF file

        Returns:
            bool: True if both markers are present, False otherwise
            (including when the file cannot be opened or read).
        """
        try:
            with pikepdf.open(pdf_path) as pdf:
                with pdf.open_metadata() as meta:
                    # Check for PDF/A metadata markers
                    if "pdfaid:part" in meta and "pdfaid:conformance" in meta:
                        part = meta["pdfaid:part"]
                        conformance = meta["pdfaid:conformance"]
                        self.logger.info(f"PDF/A-{part}{conformance} compliance detected in {pdf_path}")
                        return True

                    self.logger.info(f"No PDF/A compliance detected in {pdf_path}")
                    return False
        except Exception as e:
            # By contract this method reports failure as False instead of raising.
            self.logger.error(f"Error validating PDF/A compliance: {e}")
            return False

Parameters

Name	Type	Default	Kind
`bases`	-	-

Parameter Details

No constructor parameters: The __init__ method takes no parameters. It only initializes an internal logger instance for tracking conversion operations and errors.

Return Value

Instantiation returns a PDFAConverter object. The main convert_to_pdfa method returns a string containing the path to the converted PDF/A file. The validate_pdfa method returns a boolean indicating whether the PDF is PDF/A compliant (True) or not (False).

Class Interface

Methods

`__init__(self)`

Purpose: Initialize the PDFAConverter instance with a logger

Returns: None - initializes the instance

`convert_to_pdfa(self, input_path: str, output_path: str, compliance_level: str = '2b') -> str`

Purpose: Convert a PDF file to PDF/A format with specified compliance level using multiple fallback methods

Parameters:

input_path: Path to the input PDF file to be converted
output_path: Path where the PDF/A compliant file will be saved
compliance_level: PDF/A compliance level - must be '1b', '2b', or '3b' (default: '2b')

Returns: String containing the path to the converted PDF/A file (same as output_path parameter)

`_convert_with_pikepdf(self, input_path: str, output_path: str, compliance_level: str) -> str`

Purpose: Internal method to convert PDF to PDF/A using the pikepdf library by setting appropriate metadata

Parameters:

input_path: Path to the input PDF file
output_path: Path where the converted file will be saved
compliance_level: PDF/A compliance level ('1b', '2b', or '3b')

Returns: String containing the path to the converted PDF/A file

`_convert_with_ghostscript(self, input_path: str, output_path: str, compliance_level: str) -> str`

Purpose: Internal method to convert PDF to PDF/A using Ghostscript command-line tool for more robust conversion

Parameters:

input_path: Path to the input PDF file
output_path: Path where the converted file will be saved
compliance_level: PDF/A compliance level ('1b', '2b', or '3b')

Returns: String containing the path to the converted PDF/A file

`_is_ghostscript_available(self) -> bool`

Purpose: Internal method to check if Ghostscript is installed and available on the system

Returns: Boolean - True if Ghostscript is available, False otherwise

`validate_pdfa(self, pdf_path: str) -> bool`

Purpose: Validate whether a PDF file is PDF/A compliant by checking for required metadata markers

Parameters:

pdf_path: Path to the PDF file to validate

Returns: Boolean - True if the PDF is PDF/A compliant (has pdfaid:part and pdfaid:conformance metadata), False otherwise

Attributes

Name	Type	Description	Scope
`logger`	logging.Logger	Logger instance for tracking conversion operations, warnings, and errors throughout the conversion process	instance

Dependencies

logging
os
pikepdf
subprocess
tempfile
shutil

Required Imports

import logging
import os
import pikepdf
import subprocess
import tempfile
import shutil

Conditional/Optional Imports

These imports are only needed under specific conditions:

Ghostscript system binary (gs command)

Condition: only needed if pikepdf conversion fails and more robust conversion is required

Optional

Usage Example

# Basic usage
# NOTE(review): the class is documented as living in src/utils/pdf_utils.py;
# confirm the import path below matches how the package is installed.
from pdfa_converter import PDFAConverter

# Instantiate the converter
converter = PDFAConverter()

# Convert a PDF to PDF/A-2b (default)
# The returned path is always output_path, even when every conversion method
# failed and the original file was copied instead - check the logs.
output_path = converter.convert_to_pdfa(
    input_path='document.pdf',
    output_path='document_pdfa.pdf'
)

# Convert with specific compliance level
output_path = converter.convert_to_pdfa(
    input_path='document.pdf',
    output_path='document_pdfa.pdf',
    compliance_level='3b'
)

# Validate if a PDF is PDF/A compliant
# validate_pdfa only looks for the pdfaid:part / pdfaid:conformance metadata.
is_compliant = converter.validate_pdfa('document_pdfa.pdf')
if is_compliant:
    print('Document is PDF/A compliant')
else:
    print('Document is not PDF/A compliant')

# Handle conversion errors
# FileNotFoundError is raised when the input path does not exist.
try:
    output = converter.convert_to_pdfa('input.pdf', 'output.pdf', '2b')
    print(f'Conversion successful: {output}')
except FileNotFoundError as e:
    print(f'Input file not found: {e}')

Best Practices

Always check that input files exist before calling convert_to_pdfa to avoid FileNotFoundError
Use compliance level '2b' (default) for most modern archiving needs; '1b' for maximum compatibility with older systems; '3b' when the archived PDF must embed attachments in arbitrary file formats
Install Ghostscript on the system for more robust conversion capabilities, especially for complex PDFs
The class uses a fallback strategy: pikepdf -> Ghostscript -> copy original. Monitor logs to understand which method was used
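Call validate_pdfa after conversion to confirm the output carries the PDF/A identification metadata (pdfaid:part and pdfaid:conformance); it checks only these markers, so use a dedicated PDF/A validator when full conformance must be proven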
Ensure sufficient disk space in the output directory as temporary files may be created during Ghostscript conversion
The logger attribute can be configured externally for custom logging behavior before calling conversion methods
Invalid compliance levels automatically fall back to '2b' with a warning logged
The class is stateless between method calls, so a single instance can be reused for multiple conversions
Handle exceptions appropriately as conversion may fail for corrupted or protected PDFs

Similar Components

AI-powered semantic similarity - components with related functionality:

class ControlledDocumentConverter 73.2% similar

A comprehensive document converter class that transforms controlled documents into archived PDFs with signature pages, audit trails, hash-based integrity verification, and PDF/A compliance for long-term archival.
From: /tf/active/vicechatdev/CDocs/utils/document_converter.py
class PDFConverter 66.2% similar

A class that converts various document formats (Word, PowerPoint, Excel, images) to PDF format using LibreOffice and ReportLab libraries.
From: /tf/active/vicechatdev/msg_to_eml.py
class PDFConverter_v1 65.6% similar

A comprehensive document-to-PDF converter class that handles multiple file formats (Word, Excel, PowerPoint, images) with multiple conversion methods and automatic fallbacks for reliability.
From: /tf/active/vicechatdev/CDocs/utils/pdf_utils.py
class DocumentConverter 63.2% similar

A class that converts various document formats (Word, Excel, PowerPoint, OpenDocument, Visio) to PDF using LibreOffice's headless conversion capabilities, with support for parallel processing and directory structure preservation.
From: /tf/active/vicechatdev/pdfconverter.py
class DocumentProcessor 61.1% similar

A comprehensive document processing class that converts documents to PDF, adds audit trails, applies security features (watermarks, signatures, hashing), and optionally converts to PDF/A format with document protection.
From: /tf/active/vicechatdev/document_auditor/src/document_processor.py

← Back to Browse

Assistant

Hi! I can help improve this code. Tell me what you'd like to enhance (e.g., "add error handling", "optimize performance", "improve readability", "add type hints").

Code Comparison

Original Code

                            class PDFAConverter:
    """Converts PDFs to PDF/A format for archiving and compliance

    Tries pikepdf first, then Ghostscript, and finally copies the input
    unchanged to the output path if both conversion methods fail.
    """
    
    def __init__(self):
        # Logger named after this module; used by every method below.
        self.logger = logging.getLogger(__name__)
    
    def convert_to_pdfa(self, input_path, output_path, compliance_level='2b'):
        """
        Convert PDF to PDF/A format
        
        Args:
            input_path (str): Path to the input PDF
            output_path (str): Path where PDF/A will be saved
            compliance_level (str): PDF/A compliance level ('1b', '2b', '3b');
                any other value is replaced by '2b' with a warning
            
        Returns:
            str: output_path; if every method failed the file there is an
            unmodified copy of the input

        Raises:
            FileNotFoundError: If input_path does not exist
        """
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"Input PDF not found: {input_path}")
        
        # Validate compliance level
        if compliance_level not in ['1b', '2b', '3b']:
            self.logger.warning(f"Invalid compliance level: {compliance_level}. Using '2b'.")
            compliance_level = '2b'
        
        # Method 1: Try using pikepdf's convert method (simpler but less robust)
        # Any exception here is logged and the next method is tried.
        try:
            self._convert_with_pikepdf(input_path, output_path, compliance_level)
            return output_path
        except Exception as e:
            self.logger.warning(f"pikepdf conversion failed: {e}. Trying alternative method...")
        
        # Method 2: Try using Ghostscript for more robust conversion
        if self._is_ghostscript_available():
            try:
                return self._convert_with_ghostscript(input_path, output_path, compliance_level)
            except Exception as e:
                self.logger.error(f"Ghostscript conversion failed: {e}")
        
        # Fallback: Just copy the original if all else fails
        # NOTE(review): shutil.copy raises SameFileError when input_path and
        # output_path refer to the same file.
        self.logger.warning("All PDF/A conversion methods failed. Using original PDF.")
        shutil.copy(input_path, output_path)
        return output_path
    
    def _convert_with_pikepdf(self, input_path, output_path, compliance_level):
        """Convert using pikepdf

        Sets the pdfaid:part and pdfaid:conformance metadata entries and
        saves the document to output_path. Returns output_path.
        """
        with pikepdf.open(input_path) as pdf:
            # Add PDF/A identifier
            # First character is the part number, second the conformance letter.
            pdf_version = f"PDF/A-{compliance_level[0]}"
            pdf_conformance = compliance_level[1].upper()
            
            # Set metadata for PDF/A compliance
            with pdf.open_metadata() as meta:
                meta["pdfaid:part"] = compliance_level[0]
                meta["pdfaid:conformance"] = pdf_conformance
                
                # Set XMP properties
                # CreateDate is overwritten with ModifyDate, or with "" when
                # ModifyDate is absent.
                meta["xmp:CreateDate"] = meta.get("xmp:ModifyDate", "")
            
            # Save the document with the metadata set above
            pdf.save(output_path)
        
        self.logger.info(f"Converted to {pdf_version}-{pdf_conformance} using pikepdf: {output_path}")
        return output_path
    
    def _convert_with_ghostscript(self, input_path, output_path, compliance_level):
        """Convert using Ghostscript (more robust but requires ghostscript)

        Runs gs into a temporary file next to output_path, then copies that
        file to output_path. Re-raises subprocess.CalledProcessError when gs
        exits with a non-zero status.
        """
        # The temporary file has a fixed name inside the output directory.
        temp_dir = os.path.dirname(output_path) or '.'
        temp_output = os.path.join(temp_dir, "gs_pdfa_temp.pdf")
        
        # Determine GS parameters based on compliance level
        # NOTE(review): gs_pdfa_def is never used below, and -dPDFA is passed
        # without a value, so compliance_level does not affect the command.
        gs_pdfa_def = f"PDFA-{compliance_level}"
        
        # Ghostscript command
        gs_cmd = [
            "gs", "-dPDFA", "-dBATCH", "-dNOPAUSE", "-dNOOUTERSAVE",
            "-sProcessColorModel=DeviceCMYK", "-sDEVICE=pdfwrite",
            f"-dPDFACompatibilityPolicy=1",
            f"-sOutputFile={temp_output}", input_path
        ]
        
        try:
            subprocess.run(gs_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            
            # Copy result to final destination
            shutil.copy(temp_output, output_path)
            
            # Clean up temporary file
            os.remove(temp_output)
            
            self.logger.info(f"Converted to PDF/A-{compliance_level} using Ghostscript: {output_path}")
            return output_path
            
        except subprocess.CalledProcessError as e:
            # Only a failed gs run is handled here; the temporary file is
            # removed before the error is re-raised.
            self.logger.error(f"Ghostscript command failed: {e}")
            if os.path.exists(temp_output):
                os.remove(temp_output)
            raise
    
    def _is_ghostscript_available(self):
        """Check if Ghostscript is available on the system

        Returns True when "gs --version" can be started; the exit status of
        the command is not inspected.
        """
        try:
            subprocess.run(["gs", "--version"], 
                          stdout=subprocess.PIPE, 
                          stderr=subprocess.PIPE)
            return True
        except (subprocess.SubprocessError, FileNotFoundError):
            return False

    def validate_pdfa(self, pdf_path):
        """
        Validate if a PDF is PDF/A compliant

        Only the presence of the pdfaid:part and pdfaid:conformance metadata
        entries is checked.
        
        Args:
            pdf_path (str): Path to the PDF file
            
        Returns:
            bool: True if both markers are present, False otherwise
            (including when an exception occurs while reading the file)
        """
        try:
            with pikepdf.open(pdf_path) as pdf:
                with pdf.open_metadata() as meta:
                    # Check for PDF/A metadata markers
                    has_part = "pdfaid:part" in meta
                    has_conformance = "pdfaid:conformance" in meta
                    
                    if has_part and has_conformance:
                        part = meta["pdfaid:part"]
                        conformance = meta["pdfaid:conformance"]
                        self.logger.info(f"PDF/A-{part}{conformance} compliance detected in {pdf_path}")
                        return True
                    else:
                        self.logger.info(f"No PDF/A compliance detected in {pdf_path}")
                        return False
        except Exception as e:
            # Any failure is logged and reported as "not compliant".
            self.logger.error(f"Error validating PDF/A compliance: {e}")
            return False
                        

Improved Code

🔍 Code Extractor

class PDFAConverter

Purpose

Source Code

Parameters

Parameter Details

Return Value

Class Interface

Methods

`__init__(self)`

`convert_to_pdfa(self, input_path: str, output_path: str, compliance_level: str = '2b') -> str`

`_convert_with_pikepdf(self, input_path: str, output_path: str, compliance_level: str) -> str`

`_convert_with_ghostscript(self, input_path: str, output_path: str, compliance_level: str) -> str`

`_is_ghostscript_available(self) -> bool`

`validate_pdfa(self, pdf_path: str) -> bool`

Attributes

Dependencies

Required Imports

Conditional/Optional Imports

Usage Example

Best Practices

Tags

Similar Components

class ControlledDocumentConverter 73.2% similar

class PDFConverter 66.2% similar

class PDFConverter_v1 65.6% similar

class DocumentConverter 63.2% similar

class DocumentProcessor 61.1% similar

class PDFAConverter

Purpose

Source Code

Parameters

Parameter Details

Return Value

Class Interface

Methods

__init__(self)

convert_to_pdfa(self, input_path: str, output_path: str, compliance_level: str = '2b') -> str

_convert_with_pikepdf(self, input_path: str, output_path: str, compliance_level: str) -> str

_convert_with_ghostscript(self, input_path: str, output_path: str, compliance_level: str) -> str

_is_ghostscript_available(self) -> bool

validate_pdfa(self, pdf_path: str) -> bool

Attributes

Dependencies

Required Imports

Conditional/Optional Imports

Usage Example

Best Practices

Tags

Similar Components

class ControlledDocumentConverter 73.2% similar

class PDFConverter 66.2% similar

class PDFConverter_v1 65.6% similar

class DocumentConverter 63.2% similar

class DocumentProcessor 61.1% similar

✨ Improve Code: PDFAConverter

Code Comparison

`__init__(self)`

`convert_to_pdfa(self, input_path: str, output_path: str, compliance_level: str = '2b') -> str`

`_convert_with_pikepdf(self, input_path: str, output_path: str, compliance_level: str) -> str`

`_convert_with_ghostscript(self, input_path: str, output_path: str, compliance_level: str) -> str`

`_is_ghostscript_available(self) -> bool`

`validate_pdfa(self, pdf_path: str) -> bool`