InputProcessor - Code Extractor

class InputProcessor

Maturity: 51

A class that handles extraction and encoding of images from various input formats including PDFs (single or multi-page) and standard image files, converting them to base64-encoded strings with metadata.

File:
/tf/active/vicechatdev/e-ink-llm/input_processor.py

Lines:
13 - 151

Complexity:
moderate

Purpose

InputProcessor provides a unified interface for processing different image input formats. It can extract images from PDF files (either first page only or all pages), encode standard image files to base64, and return comprehensive metadata about the processed content. The class supports multi-page PDF processing with text extraction and content analysis, making it suitable for document processing pipelines that need to convert visual content for API consumption or further processing.

Source Code

class InputProcessor:
    """Handles extraction and encoding of images from various input formats.

    PDF files are rendered to PNG (first page only, or every page through
    ``MultiPagePDFProcessor`` when multi-page mode is enabled). Any other
    file is opened with PIL, flattened onto a white background if it has
    transparency, downscaled to fit ``MAX_IMAGE_SIZE`` and re-encoded as
    JPEG. Every result is returned as base64 text together with a metadata
    dictionary.
    """

    # Lower-case extensions (with leading dot) accepted by is_supported_file.
    SUPPORTED_EXTENSIONS = frozenset({
        '.pdf', '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tif', '.tiff', '.webp',
    })

    # Longest edge, in pixels, allowed for an encoded image file.
    MAX_IMAGE_SIZE = 2048

    # JPEG quality used when re-encoding image files.
    JPEG_QUALITY = 85

    # Zoom factor applied on both axes when rasterising a PDF page.
    PDF_ZOOM = 2.0

    # Pillow modes the JPEG writer accepts; anything else is converted to RGB.
    _JPEG_WRITABLE_MODES = frozenset({'1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr'})

    def __init__(self, enable_multi_page: bool = True, max_pages: int = 50):
        """
        Initialize input processor

        Args:
            enable_multi_page: Enable multi-page PDF processing
            max_pages: Maximum pages to process in multi-page mode
        """
        self.enable_multi_page = enable_multi_page
        self.max_pages = max_pages
        # The multi-page helper is only built when it will actually be used.
        self.multi_page_processor = (
            MultiPagePDFProcessor(max_pages=max_pages) if enable_multi_page else None
        )

    def extract_image(self, file_path: str) -> Union[Tuple[str, dict], Tuple[List[str], dict]]:
        """
        Extract image(s) from PDF or encode image file to base64

        Args:
            file_path: Path to the input file

        Returns:
            For single page: Tuple of (base64_encoded_image, metadata_dict)
            For multi-page: Tuple of (list_of_base64_images, metadata_dict)

        Raises:
            Exception: If the file cannot be opened, rendered or encoded.
        """
        file_path = Path(file_path)

        # Everything that is not a PDF is handed to PIL as an image file.
        if file_path.suffix.lower() != '.pdf':
            return self._encode_image(file_path)

        if self.enable_multi_page:
            return self._extract_multi_page_pdf(file_path)
        return self._extract_from_pdf(file_path)

    @staticmethod
    def _extract_from_pdf(pdf_path: Path) -> Tuple[str, dict]:
        """Extract first page of PDF as a base64-encoded PNG.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            Tuple of (base64_encoded_png, metadata_dict). The metadata holds
            'source_type', 'source_file', 'page_count' and 'dimensions'
            (width, height of the rendered page in pixels).

        Raises:
            Exception: If the PDF cannot be opened or rendered; the original
                error is attached as ``__cause__``.
        """
        try:
            # The context manager closes the document even when rendering fails.
            with fitz.open(pdf_path) as doc:
                page_count = len(doc)
                if page_count == 0:
                    raise ValueError("PDF contains no pages")

                # Render page as image (zoomed for e-ink clarity)
                zoom = InputProcessor.PDF_ZOOM
                pix = doc[0].get_pixmap(matrix=fitz.Matrix(zoom, zoom))

                # The pixmap already serialises to PNG, so no PIL round-trip
                # is needed to obtain the bytes or the dimensions.
                img_b64 = base64.b64encode(pix.tobytes("png")).decode()

                metadata = {
                    'source_type': 'pdf',
                    'source_file': str(pdf_path),
                    'page_count': page_count,
                    'dimensions': (pix.width, pix.height)
                }

            return img_b64, metadata

        except Exception as e:
            raise Exception(f"Error processing PDF {pdf_path}: {str(e)}") from e

    def _extract_multi_page_pdf(self, pdf_path: Path) -> Tuple[List[str], dict]:
        """Extract all pages from PDF using multi-page processor.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            Tuple of (list_of_base64_images, metadata_dict). The metadata is
            whatever the multi-page processor returned, extended with
            'pages' (per-page number, text, dimensions and has_content flag),
            'total_text_length' and 'content_pages'.

        Raises:
            Exception: If page extraction fails; the original error is
                attached as ``__cause__``.
        """
        try:
            # enable_multi_page may have been switched on after construction.
            if self.multi_page_processor is None:
                self.multi_page_processor = MultiPagePDFProcessor(max_pages=self.max_pages)

            pages, metadata = self.multi_page_processor.extract_all_pages(pdf_path)

            page_images = []
            page_details = []
            total_text_length = 0
            content_pages = 0

            # Single pass so each page's text is measured and stripped once.
            for page in pages:
                text = page.text_content or ''
                has_content = bool(text.strip())

                page_images.append(page.image_b64)
                page_details.append({
                    'page_number': page.page_number,
                    'text_content': page.text_content,
                    'dimensions': page.dimensions,
                    'has_content': has_content
                })
                total_text_length += len(text)
                content_pages += int(has_content)

            enhanced_metadata = {
                **metadata,
                'pages': page_details,
                'total_text_length': total_text_length,
                'content_pages': content_pages
            }

            return page_images, enhanced_metadata

        except Exception as e:
            raise Exception(f"Error processing multi-page PDF {pdf_path}: {str(e)}") from e

    @staticmethod
    def _encode_image(image_path: Path) -> Tuple[str, dict]:
        """Encode image file to a base64 JPEG.

        Args:
            image_path: Path to the image file

        Returns:
            Tuple of (base64_encoded_jpeg, metadata_dict). The metadata holds
            'source_type', 'source_file', 'original_format' (the lower-cased
            file suffix) and 'dimensions' (size after any downscaling).

        Raises:
            Exception: If the image cannot be opened, converted or saved;
                the original error is attached as ``__cause__``.
        """
        try:
            # Open and process image
            with PILImage.open(image_path) as img:
                # Palette images are expanded first so that any transparency
                # becomes a real alpha channel.
                if img.mode in ('P', 'PA'):
                    img = img.convert('RGBA')

                if img.mode in ('RGBA', 'LA'):
                    # JPEG has no alpha channel: flatten onto white.
                    background = PILImage.new('RGB', img.size, (255, 255, 255))
                    background.paste(img, mask=img.split()[-1])
                    img = background
                elif img.mode not in InputProcessor._JPEG_WRITABLE_MODES:
                    # Modes such as 'I;16', 'I' or 'F' cannot be saved as JPEG.
                    img = img.convert('RGB')

                # Resize if too large (optimize for API limits)
                max_size = InputProcessor.MAX_IMAGE_SIZE
                if img.width > max_size or img.height > max_size:
                    img.thumbnail((max_size, max_size), PILImage.Resampling.LANCZOS)

                # Convert to base64
                buffer = io.BytesIO()
                img.save(buffer, format='JPEG', quality=InputProcessor.JPEG_QUALITY)
                img_b64 = base64.b64encode(buffer.getvalue()).decode()

                metadata = {
                    'source_type': 'image',
                    'source_file': str(image_path),
                    'original_format': image_path.suffix.lower(),
                    'dimensions': img.size
                }

                return img_b64, metadata

        except Exception as e:
            raise Exception(f"Error processing image {image_path}: {str(e)}") from e

    @staticmethod
    def is_supported_file(file_path: Union[str, Path]) -> bool:
        """Check if file type is supported, judged by extension only.

        Args:
            file_path: Path (or path string) of the file to check

        Returns:
            True when the lower-cased suffix is in SUPPORTED_EXTENSIONS.
        """
        return Path(file_path).suffix.lower() in InputProcessor.SUPPORTED_EXTENSIONS

Parameters

Name	Type	Default	Kind
`bases`	-	-

Parameter Details

enable_multi_page: Boolean flag that determines whether to process all pages of a PDF (True) or only the first page (False). When enabled, uses MultiPagePDFProcessor for comprehensive page extraction. Default is True.

max_pages: Integer specifying the maximum number of pages to process from a multi-page PDF. This limit prevents excessive processing time and memory usage for very large documents. Default is 50 pages.

Return Value

Instantiation returns an InputProcessor object. The main extract_image method returns different types based on input: For single-page mode or image files, returns a tuple of (base64_string, metadata_dict). For multi-page PDFs, returns a tuple of (list_of_base64_strings, enhanced_metadata_dict). Metadata includes source_type, source_file, dimensions, and for multi-page PDFs, detailed per-page information including text content and content analysis.

Class Interface

Methods

`__init__(self, enable_multi_page: bool = True, max_pages: int = 50)`

Purpose: Initialize the InputProcessor with configuration for PDF processing mode

Parameters:

enable_multi_page: Boolean to enable multi-page PDF processing (default: True)
max_pages: Maximum number of pages to process in multi-page mode (default: 50)

Returns: None - initializes the instance

`extract_image(self, file_path: str) -> Union[Tuple[str, dict], Tuple[List[str], dict]]`

Purpose: Main method to extract and encode images from PDF or image files to base64 format

Parameters:

file_path: String path to the input file (PDF or image format)

Returns: For single page/image: Tuple of (base64_encoded_image, metadata_dict). For multi-page PDF: Tuple of (list_of_base64_images, enhanced_metadata_dict)

`_extract_from_pdf(pdf_path: Path) -> Tuple[str, dict]` static

Purpose: Extract the first page of a PDF as a base64-encoded image with metadata

Parameters:

pdf_path: Path object pointing to the PDF file

Returns: Tuple of (base64_encoded_image, metadata_dict) containing source info, page count, and dimensions

`_extract_multi_page_pdf(self, pdf_path: Path) -> Tuple[List[str], dict]`

Purpose: Extract all pages from a PDF using the multi-page processor with text extraction

Parameters:

pdf_path: Path object pointing to the PDF file

Returns: Tuple of (list_of_base64_images, enhanced_metadata_dict) with per-page text content and analysis

`_encode_image(image_path: Path) -> Tuple[str, dict]` static

Purpose: Encode a standard image file to base64 with automatic format conversion and resizing

Parameters:

image_path: Path object pointing to the image file

Returns: Tuple of (base64_encoded_image, metadata_dict) with source info, format, and dimensions

`is_supported_file(file_path: Path) -> bool` static

Purpose: Check if a file type is supported by the processor based on file extension

Parameters:

file_path: Path object to check for supported extension

Returns: Boolean indicating whether the file type is supported (PDF or common image formats)

Attributes

Name	Type	Description	Scope
`enable_multi_page`	bool	Flag indicating whether multi-page PDF processing is enabled	instance
`max_pages`	int	Maximum number of pages to process from multi-page PDFs	instance
`multi_page_processor`	Optional[MultiPagePDFProcessor]	Instance of MultiPagePDFProcessor for handling multi-page PDFs, or None if multi-page processing is disabled	instance

Dependencies

base64
asyncio
json
pathlib
PIL
fitz
io
typing
os
multi_page_processor

Required Imports

import base64
import asyncio
import json
from pathlib import Path
from PIL import Image as PILImage
import fitz
import io
from typing import Optional, Tuple, List, Union
import os
from multi_page_processor import MultiPagePDFProcessor, PageAnalysis

Conditional/Optional Imports

These imports are only needed under specific conditions:

from multi_page_processor import MultiPagePDFProcessor, PageAnalysis

Condition: required when enable_multi_page=True for multi-page PDF processing

Required (conditional)

Usage Example

# Basic usage with single-page PDF processing
from pathlib import Path
from input_processor import InputProcessor

# Initialize processor for single-page mode (PDFs yield only their first page)
processor = InputProcessor(enable_multi_page=False)

# Process a PDF (first page only): returns one base64 string plus a metadata dict
image_b64, metadata = processor.extract_image('document.pdf')
print(f"Processed {metadata['source_type']}: {metadata['dimensions']}")

# Process an image file: also returns one base64 string plus a metadata dict
image_b64, metadata = processor.extract_image('photo.jpg')
print(f"Image dimensions: {metadata['dimensions']}")

# Initialize for multi-page processing, passing max_pages=10 to the multi-page processor
multi_processor = InputProcessor(enable_multi_page=True, max_pages=10)

# Process multi-page PDF: the first element is now a list of base64 strings
page_images, metadata = multi_processor.extract_image('report.pdf')
print(f"Processed {len(page_images)} pages")
# metadata['pages'] holds one dict per page; 'has_content' is True when the
# page's extracted text is not blank
for page_info in metadata['pages']:
    print(f"Page {page_info['page_number']}: {page_info['has_content']}")

# Check if file is supported before processing (extension check only)
file_path = Path('document.pdf')
if InputProcessor.is_supported_file(file_path):
    result = processor.extract_image(str(file_path))

Best Practices

Always check if a file is supported using is_supported_file() before calling extract_image() to avoid exceptions
Choose enable_multi_page=False for simple use cases where only the first page is needed to improve performance
Set an appropriate max_pages limit based on your memory constraints and processing requirements
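Image files (not PDF pages) whose width or height exceeds 2048px are automatically downscaled to fit within 2048x2048, preserving aspect ratio, and are re-encoded as JPEG at quality 85 to optimize for API limits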
Images with transparency (RGBA, LA, P modes) are automatically converted to RGB with white background
In single-page mode, the first PDF page is rendered at 2x zoom on both axes (twice the default rendering resolution) for better quality suitable for e-ink displays
Handle exceptions from extract_image() as file processing can fail due to corrupted files or unsupported formats
For multi-page PDFs, the returned metadata includes text_content per page which can be used for content analysis
The class is stateless after initialization - you can reuse the same instance for multiple files
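Base64 strings are returned without a `data:` prefix and are ready for direct use in API calls; for data URIs, prepend the matching MIME prefix yourself (single-page PDF extraction produces PNG, image files produce JPEG)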

Similar Components

AI-powered semantic similarity - components with related functionality:

class DocumentProcessor_v3 68.5% similar

A comprehensive PDF document processor that handles text extraction, OCR (Optical Character Recognition), layout analysis, table detection, and metadata extraction from PDF files.
From: /tf/active/vicechatdev/invoice_extraction/core/document_processor.py
class DocumentProcessor 67.8% similar

A comprehensive document processing class that converts documents to PDF, adds audit trails, applies security features (watermarks, signatures, hashing), and optionally converts to PDF/A format with document protection.
From: /tf/active/vicechatdev/document_auditor/src/document_processor.py
class MultiPagePDFProcessor 67.1% similar

A class for processing multi-page PDF documents with context-aware analysis, OCR, and summarization capabilities.
From: /tf/active/vicechatdev/e-ink-llm/multi_page_processor.py
class DocumentProcessor_v2 62.1% similar

A document processing class that extracts text from PDF and Word documents using llmsherpa as the primary method with fallback support for PyPDF2, pdfplumber, and python-docx.
From: /tf/active/vicechatdev/contract_validity_analyzer/utils/document_processor_old.py
class DocumentProcessor_v1 61.6% similar

A document processing class that extracts text from PDF and Word documents using llmsherpa as the primary method with fallback support for PyPDF2, pdfplumber, and python-docx.
From: /tf/active/vicechatdev/contract_validity_analyzer/utils/document_processor_new.py

← Back to Browse

Assistant

Hi! I can help improve this code. Tell me what you'd like to enhance (e.g., "add error handling", "optimize performance", "improve readability", "add type hints").

Code Comparison

Original Code

                            class InputProcessor:
    """Handles extraction and encoding of images from various input formats.

    PDFs are rendered to PNG (first page only, or all pages through the
    multi-page processor); other files are opened with PIL and re-encoded
    as JPEG. Results are returned as base64 text plus a metadata dict.
    """
    
    def __init__(self, enable_multi_page: bool = True, max_pages: int = 50):
        """
        Initialize input processor
        
        Args:
            enable_multi_page: Enable multi-page PDF processing
            max_pages: Maximum pages to process in multi-page mode
        """
        self.enable_multi_page = enable_multi_page
        self.max_pages = max_pages
        # The multi-page helper is only constructed when multi-page mode is on;
        # otherwise the attribute is None.
        self.multi_page_processor = MultiPagePDFProcessor(max_pages=max_pages) if enable_multi_page else None
    
    def extract_image(self, file_path: str) -> Union[Tuple[str, dict], Tuple[List[str], dict]]:
        """
        Extract image(s) from PDF or encode image file to base64
        
        Dispatch is by file suffix only: '.pdf' (case-insensitive) goes to one
        of the PDF paths, every other suffix is treated as an image file.
        
        Args:
            file_path: Path to the input file
            
        Returns:
            For single page: Tuple of (base64_encoded_image, metadata_dict)
            For multi-page: Tuple of (list_of_base64_images, metadata_dict)
        
        Raises:
            Exception: Propagated from the helper that handles the file.
        """
        file_path = Path(file_path)
        
        if file_path.suffix.lower() == '.pdf':
            if self.enable_multi_page:
                return self._extract_multi_page_pdf(file_path)
            else:
                return self._extract_from_pdf(file_path)
        else:
            return self._encode_image(file_path)
    
    @staticmethod
    def _extract_from_pdf(pdf_path: Path) -> Tuple[str, dict]:
        """Extract first page of PDF as a base64-encoded PNG.
        
        Args:
            pdf_path: Path to the PDF file
        
        Returns:
            Tuple of (base64_encoded_png, metadata_dict) where the metadata
            holds 'source_type', 'source_file', 'page_count' and 'dimensions'.
        
        Raises:
            Exception: Wraps any error raised while opening or rendering.
        """
        try:
            doc = fitz.open(pdf_path)
            page = doc[0]  # Get first page
            
            # Render page as image, scaled 2x on both axes
            mat = fitz.Matrix(2.0, 2.0)  # 2x zoom for better quality
            pix = page.get_pixmap(matrix=mat)
            
            # Convert to PIL Image (used below for re-encoding and dimensions)
            img_data = pix.tobytes("png")
            img = PILImage.open(io.BytesIO(img_data))
            
            # Convert to base64
            buffer = io.BytesIO()
            img.save(buffer, format='PNG')
            img_b64 = base64.b64encode(buffer.getvalue()).decode()
            
            metadata = {
                'source_type': 'pdf',
                'source_file': str(pdf_path),
                'page_count': len(doc),
                'dimensions': (img.width, img.height)
            }
            
            # NOTE(review): close() is only reached on the success path; if any
            # step above raises, the document is not explicitly closed.
            doc.close()
            return img_b64, metadata
            
        except Exception as e:
            raise Exception(f"Error processing PDF {pdf_path}: {str(e)}")
    
    def _extract_multi_page_pdf(self, pdf_path: Path) -> Tuple[List[str], dict]:
        """Extract all pages from PDF using multi-page processor.
        
        Args:
            pdf_path: Path to the PDF file
        
        Returns:
            Tuple of (list_of_base64_images, metadata_dict). The metadata is
            the processor's own metadata extended with 'pages',
            'total_text_length' and 'content_pages'.
        
        Raises:
            Exception: Wraps any error raised during extraction.
        """
        try:
            # NOTE(review): multi_page_processor is None when the instance was
            # built with enable_multi_page=False.
            pages, metadata = self.multi_page_processor.extract_all_pages(pdf_path)
            
            # Convert to expected format: one base64 string per page
            page_images = [page.image_b64 for page in pages]
            
            # Enhanced metadata with multi-page info
            enhanced_metadata = {
                **metadata,
                'pages': [
                    {
                        'page_number': page.page_number,
                        'text_content': page.text_content,
                        'dimensions': page.dimensions,
                        # A page counts as having content when its text is not blank
                        'has_content': len(page.text_content.strip()) > 0
                    }
                    for page in pages
                ],
                # Total character count, whitespace included
                'total_text_length': sum(len(page.text_content) for page in pages),
                # Number of pages whose text is not blank
                'content_pages': sum(1 for page in pages if len(page.text_content.strip()) > 0)
            }
            
            return page_images, enhanced_metadata
            
        except Exception as e:
            raise Exception(f"Error processing multi-page PDF {pdf_path}: {str(e)}")
    
    @staticmethod
    def _encode_image(image_path: Path) -> Tuple[str, dict]:
        """Encode image file to a base64 JPEG.
        
        Args:
            image_path: Path to the image file
        
        Returns:
            Tuple of (base64_encoded_jpeg, metadata_dict) where the metadata
            holds 'source_type', 'source_file', 'original_format' (the
            lower-cased file suffix) and 'dimensions' (size after resizing).
        
        Raises:
            Exception: Wraps any error raised while opening or encoding.
        """
        try:
            # Open and process image
            with PILImage.open(image_path) as img:
                # Flatten RGBA, LA and palette images onto a white RGB background
                if img.mode in ('RGBA', 'LA', 'P'):
                    background = PILImage.new('RGB', img.size, (255, 255, 255))
                    if img.mode == 'P':
                        img = img.convert('RGBA')
                    # The last band (alpha) is used as the paste mask
                    background.paste(img, mask=img.split()[-1] if img.mode in ('RGBA', 'LA') else None)
                    img = background
                
                # Resize if too large (optimize for API limits); thumbnail()
                # shrinks in place to fit within max_size x max_size
                max_size = 2048
                if img.width > max_size or img.height > max_size:
                    img.thumbnail((max_size, max_size), PILImage.Resampling.LANCZOS)
                
                # Convert to base64 (always re-encoded as JPEG, quality 85)
                buffer = io.BytesIO()
                img.save(buffer, format='JPEG', quality=85)
                img_b64 = base64.b64encode(buffer.getvalue()).decode()
                
                metadata = {
                    'source_type': 'image',
                    'source_file': str(image_path),
                    'original_format': image_path.suffix.lower(),
                    'dimensions': img.size
                }
                
                return img_b64, metadata
                
        except Exception as e:
            raise Exception(f"Error processing image {image_path}: {str(e)}")
    
    @staticmethod
    def is_supported_file(file_path: Path) -> bool:
        """Check if file type is supported, judged by file extension only.
        
        Args:
            file_path: Path object whose suffix is checked
        
        Returns:
            True when the lower-cased suffix is one of the listed extensions.
        """
        # NOTE(review): '.tiff' is listed but '.tif' is not.
        supported_extensions = {'.pdf', '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'}
        return file_path.suffix.lower() in supported_extensions
                        

Improved Code

🔍 Code Extractor

class InputProcessor

Purpose

Source Code

Parameters

Parameter Details

Return Value

Class Interface

Methods

`init(self, enable_multi_page: bool = True, max_pages: int = 50)`

`extract_image(self, file_path: str) -> Union[Tuple[str, dict], Tuple[List[str], dict]]`

`_extract_from_pdf(pdf_path: Path) -> Tuple[str, dict]` static

`_extract_multi_page_pdf(self, pdf_path: Path) -> Tuple[List[str], dict]`

`_encode_image(image_path: Path) -> Tuple[str, dict]` static

`is_supported_file(file_path: Path) -> bool` static

Attributes

Dependencies

Required Imports

Conditional/Optional Imports

Usage Example

Best Practices

Tags

Similar Components

class DocumentProcessor_v3 68.5% similar

class DocumentProcessor 67.8% similar

class MultiPagePDFProcessor 67.1% similar

class DocumentProcessor_v2 62.1% similar

class DocumentProcessor_v1 61.6% similar

class InputProcessor

Purpose

Source Code

Parameters

Parameter Details

Return Value

Class Interface

Methods

__init__(self, enable_multi_page: bool = True, max_pages: int = 50)

extract_image(self, file_path: str) -> Union[Tuple[str, dict], Tuple[List[str], dict]]

_extract_from_pdf(pdf_path: Path) -> Tuple[str, dict] static

_extract_multi_page_pdf(self, pdf_path: Path) -> Tuple[List[str], dict]

_encode_image(image_path: Path) -> Tuple[str, dict] static

is_supported_file(file_path: Path) -> bool static

Attributes

Dependencies

Required Imports

Conditional/Optional Imports

Usage Example

Best Practices

Tags

Similar Components

class DocumentProcessor_v3 68.5% similar

class DocumentProcessor 67.8% similar

class MultiPagePDFProcessor 67.1% similar

class DocumentProcessor_v2 62.1% similar

class DocumentProcessor_v1 61.6% similar

✨ Improve Code: InputProcessor

Code Comparison

`init(self, enable_multi_page: bool = True, max_pages: int = 50)`

`extract_image(self, file_path: str) -> Union[Tuple[str, dict], Tuple[List[str], dict]]`

`_extract_from_pdf(pdf_path: Path) -> Tuple[str, dict]` static

`_extract_multi_page_pdf(self, pdf_path: Path) -> Tuple[List[str], dict]`

`_encode_image(image_path: Path) -> Tuple[str, dict]` static

`is_supported_file(file_path: Path) -> bool` static