class InvoiceProcessor
Main orchestrator class that coordinates the complete invoice processing pipeline from PDF extraction through validation to Excel generation.
/tf/active/vicechatdev/invoice_extraction/main.py
36 - 189
complex
Purpose
InvoiceProcessor serves as the central coordinator for processing invoices from multiple entities (UK, BE, AU). It manages the entire workflow: document processing, language detection, entity classification, data extraction using entity-specific extractors, validation, and Excel output generation. The class maintains shared resources (LLM client) and routes documents to appropriate entity-specific processors based on classification results.
Source Code
class InvoiceProcessor:
"""Main orchestrator for invoice processing pipeline."""
def __init__(self, config: Config):
"""
Initialize the invoice processor with configuration.
Args:
config: Configuration object
"""
self.config = config
# Initialize LLM client (shared across components)
llm_config = config.get_section("llm")
self.llm_client = LLMClient(llm_config)
# Pass LLM client to components that need it
processor_config = {**config.get_section("extractors"), "llm_client": self.llm_client}
# Initialize core components
self.document_processor = DocumentProcessor(config.get_section("extractors"))
self.entity_classifier = EntityClassifier(processor_config)
self.language_detector = LanguageDetector(processor_config)
self.excel_generator = ExcelGenerator(config.get_section("storage"))
# Initialize entity-specific extractors
self.extractors = {
'UK': UKExtractor({**config.get_section("extractors"), "llm_client": self.llm_client}),
'BE': BEExtractor({**config.get_section("extractors"), "llm_client": self.llm_client}),
'AU': AUExtractor({**config.get_section("extractors"), "llm_client": self.llm_client})
}
# Initialize entity-specific validators
self.validators = {
'UK': UKValidator(config.get_section("validators")),
'BE': BEValidator(config.get_section("validators")),
'AU': AUValidator(config.get_section("validators"))
}
logger.info(f"Invoice processor initialized with {len(self.extractors)} extractors")
def process_invoice(self, invoice_path: str) -> Dict[str, Any]:
"""
Process a single invoice through the full pipeline.
Args:
invoice_path: Path to invoice file
Returns:
Dictionary with processing results
"""
# Use performance logger to track processing time
with PerformanceLogger("process_invoice") as perf:
perf.add_metric("file", os.path.basename(invoice_path))
try:
logger.info(f"Processing invoice: {invoice_path}")
# Step 1: Extract text and structure from PDF
document = self.document_processor.process(invoice_path)
# Step 2: Detect language
language = self.language_detector.detect(document)
logger.info(f"Detected language: {language}")
perf.add_metric("language", language)
# Step 3: Classify entity
entity = self.entity_classifier.classify(document)
logger.info(f"Classified entity: {entity}")
perf.add_metric("entity", entity)
# Step 4: Extract data using entity-specific extractor
if entity not in self.extractors:
raise ValueError(f"Unknown entity: {entity}")
extraction_result = self.extractors[entity].extract(document, language)
perf.add_metric("confidence", extraction_result.get("confidence", 0))
# Step 5: Validate extracted data
validation_result = self.validators[entity].validate(extraction_result)
perf.add_metric("valid", validation_result.is_valid)
perf.add_metric("issues", len(validation_result.issues))
perf.add_metric("warnings", len(validation_result.warnings))
if not validation_result.is_valid:
logger.warning(f"Validation issues: {validation_result.issues}")
# Step 6: Generate Excel output
output_dir = self.config.get("storage.path", "output")
excel_path = self.excel_generator.generate(
extraction_result,
entity
)
result = {
'invoice_path': invoice_path,
'entity': entity,
'language': language,
'extraction_result': extraction_result,
'validation_result': validation_result.as_dict(),
'excel_path': excel_path,
'status': 'success'
}
perf.add_metric("status", "success")
return result
except Exception as e:
logger.error(f"Error processing invoice {invoice_path}: {str(e)}")
logger.debug(traceback.format_exc())
perf.add_metric("status", "error")
perf.add_metric("error_message", str(e))
return {
'invoice_path': invoice_path,
'status': 'error',
'error': str(e)
}
def process_directory(self, directory_path: str) -> List[Dict[str, Any]]:
"""
Process all PDF invoices in a directory.
Args:
directory_path: Path to directory containing invoices
Returns:
List of processing results
"""
results = []
dir_path = Path(directory_path)
if not dir_path.exists():
logger.error(f"Directory not found: {directory_path}")
return results
pdf_files = list(dir_path.glob('**/*.pdf'))
logger.info(f"Found {len(pdf_files)} PDF files in {directory_path}")
for pdf_file in pdf_files:
result = self.process_invoice(str(pdf_file))
results.append(result)
# Log usage statistics
llm_stats = self.llm_client.get_usage_stats()
logger.info(f"LLM usage: {llm_stats['total_tokens']} tokens " +
f"({llm_stats['prompt_tokens']} prompt, {llm_stats['completion_tokens']} completion)")
return results
def get_llm_usage(self) -> Dict[str, Any]:
"""Get LLM usage statistics."""
return self.llm_client.get_usage_stats()
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| bases | - | - | - |
Parameter Details
config: Configuration object containing all settings for the invoice processing pipeline. Must include sections for 'llm' (LLM client configuration), 'extractors' (document processing and extraction settings), 'validators' (validation rules), and 'storage' (output path configuration). The config object should have get_section() and get() methods to retrieve configuration values.
Return Value
Instantiation returns an InvoiceProcessor object ready to process invoices. The process_invoice() method returns a dictionary with keys: 'invoice_path' (str), 'entity' (str: UK/BE/AU), 'language' (str), 'extraction_result' (dict), 'validation_result' (dict), 'excel_path' (str), 'status' (str: success/error), and optionally 'error' (str) if processing failed. The process_directory() method returns a list of such dictionaries, one per processed invoice.
Class Interface
Methods
__init__(self, config: Config)
Purpose: Initialize the invoice processor with all required components and entity-specific extractors/validators
Parameters:
config: Configuration object containing settings for llm, extractors, validators, and storage sections
Returns: None - initializes the InvoiceProcessor instance with all components ready for processing
process_invoice(self, invoice_path: str) -> Dict[str, Any]
Purpose: Process a single invoice through the complete pipeline: extraction, language detection, entity classification, data extraction, validation, and Excel generation
Parameters:
invoice_path: String path to the PDF invoice file to process
Returns: Dictionary containing processing results with keys: invoice_path, entity, language, extraction_result, validation_result, excel_path, status ('success' or 'error'), and optionally 'error' message if processing failed
process_directory(self, directory_path: str) -> List[Dict[str, Any]]
Purpose: Process all PDF invoices found in a directory (including subdirectories) and return aggregated results with LLM usage statistics logged
Parameters:
directory_path: String path to directory containing PDF invoice files to process
Returns: List of dictionaries, each containing the same structure as process_invoice() return value, one per PDF file found
get_llm_usage(self) -> Dict[str, Any]
Purpose: Retrieve cumulative LLM usage statistics across all processing operations performed by this instance
Returns: Dictionary containing LLM usage statistics including total_tokens, prompt_tokens, and completion_tokens
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| config | Config | Configuration object passed during initialization, used to retrieve settings throughout processing | instance |
| llm_client | LLMClient | Shared LLM client instance used by all components that require language model interactions, tracks cumulative token usage | instance |
| document_processor | DocumentProcessor | Component responsible for extracting text and structure from PDF documents | instance |
| entity_classifier | EntityClassifier | Component that classifies invoices to determine which entity (UK, BE, AU) they belong to | instance |
| language_detector | LanguageDetector | Component that detects the language of invoice documents | instance |
| excel_generator | ExcelGenerator | Component that generates Excel output files from extracted invoice data | instance |
| extractors | Dict[str, Extractor] | Dictionary mapping entity codes ('UK', 'BE', 'AU') to their corresponding entity-specific extractor instances | instance |
| validators | Dict[str, Validator] | Dictionary mapping entity codes ('UK', 'BE', 'AU') to their corresponding entity-specific validator instances | instance |
Dependencies
os, argparse, logging, pathlib, sys, traceback, typing, time
Required Imports
import os
from pathlib import Path
import traceback
from typing import Dict, Any, List
from config import Config
from utils.logging_utils import get_logger, PerformanceLogger
from utils.llm_client import LLMClient
from extractors.uk_extractor import UKExtractor
from extractors.be_extractor import BEExtractor
from extractors.au_extractor import AUExtractor
from validators.uk_validator import UKValidator
from validators.be_validator import BEValidator
from validators.au_validator import AUValidator
from core.document_processor import DocumentProcessor
from core.entity_classifier import EntityClassifier
from core.language_detector import LanguageDetector
from core.excel_generator import ExcelGenerator
Usage Example
from config import Config
from invoice_processor import InvoiceProcessor
# Load configuration
config = Config('config.yaml')
# Instantiate processor
processor = InvoiceProcessor(config)
# Process a single invoice
result = processor.process_invoice('invoices/invoice_001.pdf')
if result['status'] == 'success':
print(f"Entity: {result['entity']}")
print(f"Language: {result['language']}")
print(f"Excel output: {result['excel_path']}")
print(f"Valid: {result['validation_result']['is_valid']}")
else:
print(f"Error: {result['error']}")
# Process entire directory
results = processor.process_directory('invoices/')
successful = [r for r in results if r['status'] == 'success']
print(f"Processed {len(successful)}/{len(results)} invoices successfully")
# Get LLM usage statistics
usage = processor.get_llm_usage()
print(f"Total tokens used: {usage['total_tokens']}")
Best Practices
- Always instantiate with a properly configured Config object containing all required sections (llm, extractors, validators, storage)
- The class maintains stateful components (LLM client, extractors, validators) that are reused across multiple invoice processing calls for efficiency
- Use process_invoice() for single files and process_directory() for batch processing - both methods handle errors gracefully and return structured results
- Check the 'status' field in returned dictionaries before accessing other fields to handle errors appropriately
- The LLM client is shared across all components to track cumulative token usage - call get_llm_usage() after processing to monitor costs
- Processing is logged extensively - ensure logging is configured before instantiation to capture detailed pipeline execution information
- Each invoice goes through a fixed pipeline: document processing → language detection → entity classification → extraction → validation → Excel generation
- Entity classification determines which extractor and validator are used - ensure all three entities (UK, BE, AU) have corresponding extractor and validator classes
- Validation results include is_valid flag, issues (blocking problems), and warnings (non-blocking concerns) - handle accordingly
- The PerformanceLogger context manager tracks processing time and metrics automatically - metrics are logged but not returned in results
- Excel output path is determined by the storage configuration and entity type - ensure output directory exists or is writable
- For production use, implement retry logic around process_invoice() calls as external dependencies (LLM API) may fail transiently
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- function main_v4 (75.2% similar)
- class BaseExtractor (70.6% similar)
- class DocumentProcessor_v3 (62.8% similar)
- class EntityClassifier (61.4% similar)
- class DocumentProcessor_v1 (61.2% similar)