class BaseExtractor
Abstract base class that defines the interface and shared functionality for entity-specific invoice data extractors (UK, BE, AU), providing a multi-stage extraction pipeline for invoice processing.
/tf/active/vicechatdev/invoice_extraction/extractors/base_extractor.py
10 - 387
complex
Purpose
This class serves as a template for implementing invoice data extractors for different entities/regions. It provides a comprehensive staged extraction framework that breaks down invoice processing into logical steps: structure identification, metadata extraction, vendor data, amounts, line items, payment info, and tax data. Subclasses must implement the abstract extract() method and can override individual stage methods for entity-specific logic. The class manages LLM client initialization, document structure analysis, bounding box operations, and confidence scoring.
Source Code
class BaseExtractor(ABC):
    """
    Abstract base class for entity-specific invoice data extractors.

    This class defines the common interface and shared functionality
    for all entity-specific extractors (UK, BE, AU). Subclasses must
    implement extract(); the extract_* stage methods below only return
    placeholder values and are meant to be overridden.
    """

    def __init__(self, config=None):
        """
        Initialize the extractor with configuration.

        Args:
            config: Dictionary containing configuration parameters. When it
                contains an 'llm' key, that value is passed to LLMClient.
        """
        self.config = config or {}

        # Build the LLM client at the base level so every extractor
        # constructs it the same way from the same config.
        if 'llm' in self.config:
            self.llm_client = LLMClient(self.config.get('llm'))
        else:
            # Don't initialize here if no config - leave it to subclasses
            self.llm_client = None

    @abstractmethod
    def extract(self, document: Dict[str, Any], language: str) -> Dict[str, Any]:
        """
        Extract invoice data from the document.

        Args:
            document: Processed document from DocumentProcessor
            language: Detected language of the document ('en', 'fr', 'nl')

        Returns:
            Dict containing extracted invoice fields
        """

    def extract_staged(self, document: Dict[str, Any], language: str) -> Dict[str, Any]:
        """
        Extract invoice data using a multi-stage approach.

        This is the recommended method to use from subclasses; it breaks
        extraction down into logical stages, each of which can be
        overridden independently.

        Args:
            document: Processed document from DocumentProcessor
            language: Detected language of the document

        Returns:
            Dict with the keys 'invoice', 'vendor', 'amounts', 'payment',
            'line_items', 'confidence' and 'metadata'.
        """
        # Lazy %-style arguments: the message is only formatted if emitted.
        logger.info("Starting staged extraction in %s", language)

        # Stage 1: Extract document structure (header, footer, line items section)
        structure = self.extract_structure(document)
        # Stage 2: Extract basic invoice metadata (number, dates)
        invoice_data = self.extract_invoice_metadata(document, structure)
        # Stage 3: Extract vendor information
        vendor_data = self.extract_vendor_data(document, structure)
        # Stage 4: Extract amounts and totals
        amount_data = self.extract_amounts(document, structure)
        # Stage 5: Extract line items if present
        line_items = self.extract_line_items(document, structure)
        # Stage 6: Extract payment information
        payment_data = self.extract_payment_data(document, structure)
        # Stage 7: Extract tax/VAT information (receives the stage 4 amounts)
        tax_data = self.extract_tax_data(document, structure, amount_data)

        # Combine all extracted data; tax keys are merged into 'amounts'
        # and win over amount keys with the same name.
        extraction_result = {
            'invoice': invoice_data,
            'vendor': vendor_data,
            'amounts': {**amount_data, **tax_data},
            'payment': payment_data,
            'line_items': line_items,
            'confidence': {},  # Will be populated with confidence scores
            'metadata': {
                'language': language,
                'extraction_method': self.__class__.__name__,
            },
        }

        # Add confidence scores
        extraction_result['confidence'] = self.calculate_confidence(extraction_result)
        return extraction_result

    def extract_structure(self, document: Dict[str, Any]) -> Dict[str, Any]:
        """
        Extract the document structure to identify key regions.

        Args:
            document: Processed document

        Returns:
            Dictionary with a 'bbox' entry for 'header', 'footer',
            'line_items' and 'totals', plus an 'is_structured' flag that is
            True when a table was detected on any page.
        """
        # Default implementation - override in subclasses for better accuracy
        logger.debug("Using default structure extraction")

        structure = {
            'header': {'bbox': None},
            'footer': {'bbox': None},
            'line_items': {'bbox': None},
            'totals': {'bbox': None},
            'is_structured': False,
        }

        # Treat a missing or explicitly-None 'pages' entry as "no pages".
        pages = document.get('pages') or []

        # Simple heuristic: header is top 25%, footer is bottom 25%
        if pages:
            first_page = pages[0]
            height = first_page.get('height', 1000)
            width = first_page.get('width', 800)
            structure['header']['bbox'] = [0, 0, width, height * 0.25]
            structure['footer']['bbox'] = [0, height * 0.75, width, height]
            structure['line_items']['bbox'] = [0, height * 0.3, width, height * 0.7]
            structure['totals']['bbox'] = [width * 0.5, height * 0.7, width, height * 0.8]

        # If tables were detected, use them as line items area
        for page in pages:
            tables = page.get('tables')
            if tables:
                # Use the first detected table as line items. Keep the
                # heuristic region when the table carries no bbox, so the
                # line-items area is not silently replaced by None.
                table_bbox = tables[0].get('bbox')
                if table_bbox:
                    structure['line_items']['bbox'] = table_bbox
                structure['is_structured'] = True
                break

        return structure

    def extract_invoice_metadata(self, document: Dict[str, Any], structure: Dict[str, Any]) -> Dict[str, Any]:
        """
        Extract basic invoice metadata like invoice number, dates.

        Args:
            document: Processed document
            structure: Document structure with defined regions

        Returns:
            Dictionary with invoice metadata
        """
        # Default implementation - should be overridden in subclasses
        logger.warning("Using default invoice metadata extraction - override in subclass")
        # Placeholder extraction
        return {
            'number': '',
            'issue_date': '',
            'due_date': '',
            'po_number': '',
            'reference': '',
        }

    def extract_vendor_data(self, document: Dict[str, Any], structure: Dict[str, Any]) -> Dict[str, Any]:
        """
        Extract vendor information.

        Args:
            document: Processed document
            structure: Document structure with defined regions

        Returns:
            Dictionary with vendor data
        """
        # Default implementation - should be overridden in subclasses
        logger.warning("Using default vendor data extraction - override in subclass")
        # Placeholder extraction
        return {
            'name': '',
            'vat_number': '',
            'address': '',
            'contact': '',
        }

    def extract_amounts(self, document: Dict[str, Any], structure: Dict[str, Any]) -> Dict[str, Any]:
        """
        Extract amount information.

        Args:
            document: Processed document
            structure: Document structure with defined regions

        Returns:
            Dictionary with amount data
        """
        # Default implementation - should be overridden in subclasses
        logger.warning("Using default amount extraction - override in subclass")
        # Placeholder extraction
        return {
            'subtotal': None,
            'total': None,
            'currency': '',
        }

    def extract_line_items(self, document: Dict[str, Any], structure: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Extract line items from the invoice.

        Args:
            document: Processed document
            structure: Document structure with defined regions

        Returns:
            List of dictionaries with line item data
        """
        # Default implementation - should be overridden in subclasses
        logger.warning("Using default line item extraction - override in subclass")
        # Placeholder extraction
        return []

    def extract_payment_data(self, document: Dict[str, Any], structure: Dict[str, Any]) -> Dict[str, Any]:
        """
        Extract payment information.

        Args:
            document: Processed document
            structure: Document structure with defined regions

        Returns:
            Dictionary with payment data
        """
        # Default implementation - should be overridden in subclasses
        logger.warning("Using default payment data extraction - override in subclass")
        # Placeholder extraction
        return {
            'bank_name': '',
            'account_number': '',
            'sort_code': '',
            'iban': '',
            'payment_terms': '',
        }

    def extract_tax_data(self, document: Dict[str, Any], structure: Dict[str, Any],
                         amount_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Extract tax/VAT information.

        Args:
            document: Processed document
            structure: Document structure with defined regions
            amount_data: Previously extracted amount data

        Returns:
            Dictionary with tax/VAT data
        """
        # Default implementation - should be overridden in subclasses
        logger.warning("Using default tax data extraction - override in subclass")
        # Placeholder extraction
        return {
            'vat': None,
            'vat_rate': None,
        }

    def calculate_confidence(self, extraction_result: Dict[str, Any]) -> Dict[str, float]:
        """
        Calculate confidence scores for extracted fields.

        Field names are built by joining nested keys with '_' (for example
        'invoice_number'); list entries contribute an 'item<index>_' part.

        Args:
            extraction_result: Dictionary with extracted data

        Returns:
            Dictionary mapping field names to confidence scores: 0.0 for
            None or empty-string values, 0.7 for any other value, and 0.8
            for the critical fields when they are present.
        """
        # Default implementation sets medium confidence for all fields
        confidence = {}

        # Recursively add confidence for all fields
        def add_confidence(data, prefix=''):
            if not data:
                return
            if isinstance(data, dict):
                for key, value in data.items():
                    if key == 'confidence':  # Skip confidence field itself
                        continue
                    field = f"{prefix}{key}" if prefix else key
                    if isinstance(value, (dict, list)):
                        add_confidence(value, field + '_')
                    elif value is None or value == '':
                        # Missing value: no confidence at all
                        confidence[field] = 0.0
                    else:
                        confidence[field] = 0.7  # Default medium confidence
            elif isinstance(data, list):
                for i, item in enumerate(data):
                    add_confidence(item, f"{prefix}item{i}_")

        # Calculate confidence for all fields
        add_confidence(extraction_result)

        # Critical fields get default medium-high confidence if present
        critical_fields = (
            'vendor_name', 'invoice_number', 'amounts_total',
            'amounts_vat', 'invoice_issue_date',
        )
        for field in critical_fields:
            if confidence.get(field, 0) > 0:
                confidence[field] = 0.8

        return confidence

    def get_text_in_bbox(self, document: Dict[str, Any], bbox: List[float]) -> Tuple[str, List[Dict[str, Any]]]:
        """
        Extract text and blocks contained within a bounding box.

        A block is selected when it lies fully inside bbox or when more
        than half of its own area overlaps bbox. Blocks without a 'bbox'
        are ignored.

        Args:
            document: Processed document
            bbox: Bounding box coordinates [x0, y0, x1, y1]

        Returns:
            Tuple of (extracted text, list of blocks in the region); an
            empty string and empty list when bbox is falsy.
        """
        if not bbox:
            return "", []

        x0, y0, x1, y1 = bbox
        contained_blocks = []

        # Find blocks that are within the bounding box
        for block in document.get('blocks', []):
            block_bbox = block.get('bbox')
            if not block_bbox:
                continue
            b_x0, b_y0, b_x1, b_y1 = block_bbox
            # Check if block is fully contained in bbox
            fully_inside = b_x0 >= x0 and b_y0 >= y0 and b_x1 <= x1 and b_y1 <= y1
            # Or if block significantly overlaps with bbox (>50% area)
            if fully_inside or self._calculate_overlap(bbox, block_bbox) > 0.5:
                contained_blocks.append(block)

        # Extract text from the contained blocks; a block whose 'text' is
        # missing or None contributes an empty string instead of crashing.
        text = " ".join(block.get('text') or '' for block in contained_blocks)
        return text, contained_blocks

    def _calculate_overlap(self, bbox1: List[float], bbox2: List[float]) -> float:
        """
        Calculate overlap ratio between two bounding boxes.

        Args:
            bbox1: First box as [x0, y0, x1, y1]
            bbox2: Second box as [x0, y0, x1, y1]

        Returns:
            Intersection area divided by the area of bbox2, or 0.0 when the
            boxes do not intersect or bbox2 has no area.
        """
        # Extract coordinates
        x0_1, y0_1, x1_1, y1_1 = bbox1
        x0_2, y0_2, x1_2, y1_2 = bbox2

        # Calculate intersection
        x0_i = max(x0_1, x0_2)
        y0_i = max(y0_1, y0_2)
        x1_i = min(x1_1, x1_2)
        y1_i = min(y1_1, y1_2)

        # Check if there is an intersection
        if x0_i >= x1_i or y0_i >= y1_i:
            return 0.0

        # Calculate areas
        area_intersection = (x1_i - x0_i) * (y1_i - y0_i)
        area_bbox2 = (x1_2 - x0_2) * (y1_2 - y0_2)

        # Return ratio of intersection area to bbox2 area
        return area_intersection / area_bbox2 if area_bbox2 > 0 else 0.0
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| bases | ABC | - | |
Parameter Details
config: Optional dictionary containing configuration parameters. If provided with an 'llm' key, initializes an LLMClient instance for AI-powered extraction. Can include entity-specific settings, extraction rules, or model configurations. Defaults to empty dict if not provided.
Return Value
Instantiation returns a BaseExtractor instance (or subclass instance) with initialized config and optional llm_client. The main extract() method (abstract) and extract_staged() method return a Dict containing structured invoice data with keys: 'invoice' (metadata), 'vendor' (vendor info), 'amounts' (financial data including tax), 'payment' (payment details), 'line_items' (list of items), 'confidence' (field-level confidence scores 0.0-1.0), and 'metadata' (extraction info). Individual extraction methods return specific data structures: extract_structure returns bbox regions, extract_invoice_metadata returns invoice fields dict, extract_vendor_data returns vendor fields dict, extract_amounts returns amount fields dict, extract_line_items returns list of dicts, extract_payment_data returns payment fields dict, extract_tax_data returns tax fields dict, calculate_confidence returns dict mapping field names to float scores, and get_text_in_bbox returns tuple of (text string, list of block dicts).
Class Interface
Methods
__init__(self, config=None)
Purpose: Initialize the extractor with optional configuration and set up LLM client if config contains 'llm' key
Parameters:
config: Optional dictionary with configuration parameters, particularly 'llm' for LLMClient initialization
Returns: None - initializes instance with config dict and llm_client (LLMClient or None)
extract(self, document: Dict[str, Any], language: str) -> Dict[str, Any]
Purpose: Abstract method that must be implemented by subclasses to extract invoice data from document
Parameters:
document: Processed document dict from DocumentProcessor containing pages, blocks, and tables
language: Detected language code of the document ('en', 'fr', 'nl')
Returns: Dictionary containing extracted invoice fields with structure defined by implementation
extract_staged(self, document: Dict[str, Any], language: str) -> Dict[str, Any]
Purpose: Execute multi-stage extraction pipeline breaking down extraction into logical stages (structure, metadata, vendor, amounts, line items, payment, tax)
Parameters:
document: Processed document dict from DocumentProcessor
language: Detected language code of the document
Returns: Dictionary with keys: 'invoice', 'vendor', 'amounts', 'payment', 'line_items', 'confidence', 'metadata'
extract_structure(self, document: Dict[str, Any]) -> Dict[str, Any]
Purpose: Identify key document regions (header, footer, line items, totals) using bounding boxes and table detection
Parameters:
document: Processed document dict
Returns: Dictionary with keys 'header', 'footer', 'line_items', 'totals' (each containing 'bbox'), and 'is_structured' boolean
extract_invoice_metadata(self, document: Dict[str, Any], structure: Dict[str, Any]) -> Dict[str, Any]
Purpose: Extract basic invoice metadata like invoice number, dates, PO number, and reference
Parameters:
document: Processed document dict
structure: Document structure with defined regions from extract_structure()
Returns: Dictionary with keys: 'number', 'issue_date', 'due_date', 'po_number', 'reference'
extract_vendor_data(self, document: Dict[str, Any], structure: Dict[str, Any]) -> Dict[str, Any]
Purpose: Extract vendor/supplier information from the invoice
Parameters:
document: Processed document dict
structure: Document structure with defined regions
Returns: Dictionary with keys: 'name', 'vat_number', 'address', 'contact'
extract_amounts(self, document: Dict[str, Any], structure: Dict[str, Any]) -> Dict[str, Any]
Purpose: Extract financial amounts including subtotal, total, and currency
Parameters:
document: Processed document dict
structure: Document structure with defined regions
Returns: Dictionary with keys: 'subtotal', 'total', 'currency'
extract_line_items(self, document: Dict[str, Any], structure: Dict[str, Any]) -> List[Dict[str, Any]]
Purpose: Extract individual line items from the invoice including descriptions, quantities, and prices
Parameters:
document: Processed document dict
structure: Document structure with defined regions
Returns: List of dictionaries, each representing a line item with item-specific fields
extract_payment_data(self, document: Dict[str, Any], structure: Dict[str, Any]) -> Dict[str, Any]
Purpose: Extract payment information including bank details and payment terms
Parameters:
document: Processed document dict
structure: Document structure with defined regions
Returns: Dictionary with keys: 'bank_name', 'account_number', 'sort_code', 'iban', 'payment_terms'
extract_tax_data(self, document: Dict[str, Any], structure: Dict[str, Any], amount_data: Dict[str, Any]) -> Dict[str, Any]
Purpose: Extract tax/VAT information, potentially using previously extracted amount data for validation
Parameters:
document: Processed document dict
structure: Document structure with defined regions
amount_data: Previously extracted amount data from extract_amounts()
Returns: Dictionary with keys: 'vat', 'vat_rate'
calculate_confidence(self, extraction_result: Dict[str, Any]) -> Dict[str, float]
Purpose: Calculate confidence scores (0.0-1.0) for all extracted fields, with higher scores for critical fields
Parameters:
extraction_result: Dictionary with extracted data from all stages
Returns: Dictionary mapping field names to confidence scores (0.0 for missing/empty, 0.7 default, 0.8 for critical fields)
get_text_in_bbox(self, document: Dict[str, Any], bbox: List[float]) -> Tuple[str, List[Dict[str, Any]]]
Purpose: Extract text and blocks contained within or significantly overlapping (>50%) a specified bounding box
Parameters:
document: Processed document dict with 'blocks' list
bbox: Bounding box coordinates as [x0, y0, x1, y1]
Returns: Tuple of (concatenated text string, list of block dictionaries in the region)
_calculate_overlap(self, bbox1: List[float], bbox2: List[float]) -> float
Purpose: Calculate the overlap ratio between two bounding boxes (intersection area / bbox2 area)
Parameters:
bbox1: First bounding box as [x0, y0, x1, y1]
bbox2: Second bounding box as [x0, y0, x1, y1]
Returns: Float ratio (0.0-1.0) representing how much of bbox2 is covered by bbox1
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| config | Dict[str, Any] | Configuration dictionary containing extractor settings, LLM config, and entity-specific parameters. Defaults to empty dict if not provided during initialization. | instance |
| llm_client | Optional[LLMClient] | LLM client instance for AI-powered extraction. Initialized if config contains 'llm' key, otherwise None. Shared across extractors with same config. | instance |
Dependencies
abc, logging, typing
Required Imports
from abc import ABC, abstractmethod
import logging
from typing import Dict, List, Any, Optional, Tuple
from utils.llm_client import LLMClient
Usage Example
# Subclass implementation example
from base_extractor import BaseExtractor
from typing import Dict, Any
class UKExtractor(BaseExtractor):
    """Example subclass showing how to plug UK-specific logic into the staged pipeline."""

    def __init__(self, config=None):
        # Hand the configuration straight to the base class initializer
        super().__init__(config)

    def extract(self, document: Dict[str, Any], language: str) -> Dict[str, Any]:
        """Run the staged extraction pipeline and return its combined result."""
        staged_result = self.extract_staged(document, language)
        return staged_result

    def extract_invoice_metadata(self, document: Dict[str, Any], structure: Dict[str, Any]) -> Dict[str, Any]:
        """Return fixed sample metadata; UK-specific extraction logic would replace this."""
        sample_metadata = dict(
            number='INV-12345',
            issue_date='2024-01-15',
            due_date='2024-02-15',
            po_number='PO-67890',
            reference='REF-001',
        )
        return sample_metadata
# Usage: build a config with LLM settings, then run the extractor on a sample document
llm_settings = {
    'provider': 'openai',
    'model': 'gpt-4',
    'api_key': 'your-api-key',
}
config = {'llm': llm_settings}
extractor = UKExtractor(config)

# Minimal processed document: one page without tables and two text blocks
sample_page = {'height': 1000, 'width': 800, 'tables': []}
sample_blocks = [
    {'bbox': [10, 10, 200, 50], 'text': 'Invoice #12345'},
    {'bbox': [10, 60, 200, 100], 'text': 'Total: $1,500.00'},
]
document = {'pages': [sample_page], 'blocks': sample_blocks}

result = extractor.extract(document, 'en')

# Report the extracted fields in the same order as the pipeline output is usually read
summary_lines = (
    f"Invoice Number: {result['invoice']['number']}",
    f"Total: {result['amounts']['total']}",
    f"Confidence: {result['confidence']}",
)
for summary_line in summary_lines:
    print(summary_line)
Best Practices
- Always subclass BaseExtractor and implement the abstract extract() method - direct instantiation will fail
- Override individual stage methods (extract_invoice_metadata, extract_vendor_data, etc.) for entity-specific logic rather than reimplementing the entire pipeline
- Use extract_staged() as the primary extraction method in subclass implementations for consistent multi-stage processing
- Initialize with config containing 'llm' key if AI-powered extraction is needed, otherwise llm_client will be None
- The extraction pipeline follows a specific order: structure → metadata → vendor → amounts → line items → payment → tax. Each stage can use results from previous stages
- Default implementations of stage methods return placeholder data and log warnings - always override them in production subclasses
- Use get_text_in_bbox() helper method to extract text from specific document regions identified in extract_structure()
- Confidence scores are automatically calculated but can be overridden by implementing custom calculate_confidence() logic
- Document structure expects specific format: 'pages' list with 'height', 'width', 'tables'; 'blocks' list with 'bbox' and 'text'
- Bounding boxes use [x0, y0, x1, y1] format where (x0,y0) is top-left and (x1,y1) is bottom-right
- The class is stateless between extract() calls - each extraction is independent
- Critical fields (vendor_name, invoice_number, amounts_total, amounts_vat, invoice_issue_date) automatically receive higher confidence scores (0.8) if present
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- class BaseValidator 75.1% similar
- class UKExtractor 74.5% similar
- class AUExtractor 72.8% similar
- class BEExtractor 71.7% similar
- class InvoiceProcessor 70.6% similar