class BaseValidator
Abstract base class for validating extracted invoice data with entity-specific validation rules. Provides common validation functionality for required fields, field types, date consistency, and amount calculations.
/tf/active/vicechatdev/invoice_extraction/validators/base_validator.py
62 - 320
moderate
Purpose
BaseValidator serves as the foundation for entity-specific invoice validators (UK, BE, AU). It implements common validation logic including required field checks, type validation, date consistency verification, and amount calculations. Subclasses must implement entity-specific validation rules through the _entity_specific_validation abstract method. The class uses a configurable approach with customizable required fields, field types, date formats, and numeric thresholds.
Source Code
class BaseValidator(ABC):
    """
    Abstract base class for entity-specific invoice data validators.

    Defines the common interface and the shared checks used by all entity
    validators (UK, BE, AU): required fields, field types, date consistency
    and amount consistency. Subclasses add their own rules by implementing
    ``_entity_specific_validation``.
    """

    def __init__(self, config=None):
        """
        Initialize the validator with configuration.

        Args:
            config: Optional dictionary of configuration parameters. The keys
                read here are 'required_fields', 'date_format' and
                'numeric_threshold'; anything missing falls back to a default.
        """
        self.config = config or {}

        # Map of field path -> importance. 'critical' fields raise errors,
        # every other importance value raises a warning.
        self.required_fields = self.config.get('required_fields', {})
        if not self.required_fields:
            # Default required fields if not configured
            self.required_fields = {
                'invoice.number': 'critical',
                'invoice.issue_date': 'critical',
                'vendor.name': 'critical',
                'amounts.total': 'critical',
            }

        # Expected type per field path, used by _validate_field_types
        self.field_types = {
            'invoice.number': 'string',
            'invoice.issue_date': 'date',
            'invoice.due_date': 'date',
            'invoice.po_number': 'string',
            'vendor.name': 'string',
            'vendor.vat_number': 'string',
            'vendor.address': 'string',
            'amounts.subtotal': 'decimal',
            'amounts.total': 'decimal',
            'amounts.tax': 'decimal',
            'amounts.vat': 'decimal',
            'amounts.vat_rate': 'decimal',
            'amounts.currency': 'string',
        }

        # strptime format that date strings must follow
        self.date_format = self.config.get('date_format', '%Y-%m-%d')

        # Largest tolerated difference between subtotal + tax and total
        self.numeric_threshold = self.config.get('numeric_threshold', 0.01)

    def validate(self, extraction_result: Dict[str, Any]) -> "ValidationResult":
        """
        Validate extracted invoice data.

        Runs the base checks (required fields, field types, date consistency,
        amount consistency) followed by the entity-specific check, collecting
        every issue in a single result rather than stopping at the first one.

        Args:
            extraction_result: Dictionary of extracted invoice fields

        Returns:
            ValidationResult object with issues and validity status
        """
        logger.info("Validating extracted invoice data")

        result = ValidationResult()

        # Base validations shared by every entity
        self._validate_required_fields(extraction_result, result)
        self._validate_field_types(extraction_result, result)
        self._validate_date_consistency(extraction_result, result)
        self._validate_amount_consistency(extraction_result, result)

        # Entity-specific validation supplied by the subclass
        self._entity_specific_validation(extraction_result, result)

        if not result.is_valid:
            logger.warning("Validation failed with %d issues", len(result.issues))
            for issue in result.issues:
                logger.warning(" - %s", issue)
        else:
            logger.info("Validation passed successfully")
            if result.warnings:
                logger.info("Validation passed with %d warnings", len(result.warnings))

        return result

    @abstractmethod
    def _entity_specific_validation(self, extraction_result: Dict[str, Any],
                                    result: "ValidationResult") -> None:
        """
        Perform entity-specific validation.

        Each entity validator must implement this method to check
        specific rules for that entity.

        Args:
            extraction_result: Dictionary of extracted invoice fields
            result: ValidationResult to add issues to
        """
        pass

    @staticmethod
    def _is_blank(value: Any) -> bool:
        """Return True when value is None or a whitespace-only string."""
        return value is None or (isinstance(value, str) and not value.strip())

    def _validate_required_fields(self, extraction_result: Dict[str, Any],
                                  result: "ValidationResult") -> None:
        """
        Validate that all required fields are present and non-empty.

        Args:
            extraction_result: Dictionary of extracted invoice fields
            result: ValidationResult to add issues to
        """
        for field_path, importance in self.required_fields.items():
            # Entries under 'validation.' are not extracted fields; skip them
            if field_path.startswith('validation.'):
                continue

            if self._is_blank(self._get_nested_field(extraction_result, field_path)):
                severity = 'error' if importance == 'critical' else 'warning'
                result.add_issue(field_path, "Required field is missing or empty", severity)

    def _validate_field_types(self, extraction_result: Dict[str, Any],
                              result: "ValidationResult") -> None:
        """
        Validate field types (string, decimal, date).

        Every mismatch is recorded as a warning.

        Args:
            extraction_result: Dictionary of extracted invoice fields
            result: ValidationResult to add issues to
        """
        for field_path, expected_type in self.field_types.items():
            value = self._get_nested_field(extraction_result, field_path)

            # Skip None values (these are handled by required field validation)
            if value is None:
                continue

            if expected_type == 'string':
                if not isinstance(value, str):
                    result.add_issue(
                        field_path,
                        f"Expected string, got {type(value).__name__}",
                        'warning'
                    )

            elif expected_type == 'decimal':
                if not isinstance(value, (int, float)):
                    try:
                        # Numeric strings are accepted as long as they convert
                        float(value)
                    except (ValueError, TypeError):
                        result.add_issue(
                            field_path,
                            f"Expected decimal number, got {type(value).__name__}",
                            'warning'
                        )

            elif expected_type == 'date':
                if isinstance(value, str):
                    try:
                        datetime.strptime(value, self.date_format)
                    except ValueError:
                        result.add_issue(
                            field_path,
                            f"Invalid date format, expected {self.date_format}",
                            'warning'
                        )
                else:
                    result.add_issue(
                        field_path,
                        f"Expected date string, got {type(value).__name__}",
                        'warning'
                    )

    def _validate_date_consistency(self, extraction_result: Dict[str, Any],
                                   result: "ValidationResult") -> None:
        """
        Validate date consistency (issue date must not be after due date).

        Args:
            extraction_result: Dictionary of extracted invoice fields
            result: ValidationResult to add issues to
        """
        issue_date = self._get_nested_field(extraction_result, 'invoice.issue_date')
        due_date = self._get_nested_field(extraction_result, 'invoice.due_date')

        if not (issue_date and due_date):
            return

        try:
            issue_date_obj = datetime.strptime(issue_date, self.date_format)
            due_date_obj = datetime.strptime(due_date, self.date_format)
        except (ValueError, TypeError):
            # Badly formatted or non-string dates are already reported by
            # _validate_field_types; TypeError is caught so that a non-string
            # value cannot abort the whole validation run.
            return

        if issue_date_obj > due_date_obj:
            result.add_issue(
                'invoice.due_date',
                f"Due date ({due_date}) is before issue date ({issue_date})",
                'error'
            )

    def _validate_amount_consistency(self, extraction_result: Dict[str, Any],
                                     result: "ValidationResult") -> None:
        """
        Validate that subtotal + tax matches total within numeric_threshold.

        The tax amount is read from 'amounts.tax', deferring to 'amounts.vat'
        when the tax value is falsy and a VAT value is present. The check is
        skipped when any of the three amounts is unavailable.

        Args:
            extraction_result: Dictionary of extracted invoice fields
            result: ValidationResult to add issues to
        """
        subtotal = self._get_nested_field(extraction_result, 'amounts.subtotal')
        tax = self._get_nested_field(extraction_result, 'amounts.tax')
        vat = self._get_nested_field(extraction_result, 'amounts.vat')
        total = self._get_nested_field(extraction_result, 'amounts.total')

        # A falsy tax defers to VAT, but a genuine zero tax is kept when no
        # VAT value exists so that zero-rated invoices are still checked.
        if not tax and not self._is_blank(vat):
            tax = vat

        if subtotal is None or tax is None or total is None:
            return

        try:
            subtotal_val = float(subtotal)
            tax_val = float(tax)
            total_val = float(total)
        except (ValueError, TypeError):
            # Type conversion issues are reported by _validate_field_types
            return

        expected_total = subtotal_val + tax_val
        if abs(expected_total - total_val) > self.numeric_threshold:
            result.add_issue(
                'amounts.total',
                f"Amount inconsistency: subtotal ({subtotal_val}) + tax ({tax_val}) " +
                f"≠ total ({total_val})",
                'warning'
            )

    def _get_nested_field(self, data: Dict[str, Any], field_path: str) -> Any:
        """
        Get a nested field value using a dot-notation path.

        Args:
            data: The data dictionary
            field_path: Dot-notation path to the field

        Returns:
            Field value, or None if any key along the path is missing or an
            intermediate value is not a dictionary
        """
        value = data
        for key in field_path.split('.'):
            if not (isinstance(value, dict) and key in value):
                return None
            value = value[key]
        return value
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| bases | ABC | - | |
Parameter Details
config: Optional dictionary containing configuration parameters. Supports 'required_fields' (dict mapping field paths to importance levels: 'critical' or other), 'date_format' (string, default '%Y-%m-%d'), and 'numeric_threshold' (float, default 0.01 for amount comparison tolerance). If not provided or empty, uses sensible defaults.
Return Value
Instantiation returns a BaseValidator instance (or subclass instance). The main validate() method returns a ValidationResult object containing validation issues, warnings, and an is_valid boolean status. Helper methods return None (they modify the ValidationResult in-place) or specific values (_get_nested_field returns Any or None).
Class Interface
Methods
__init__(self, config=None)
Purpose: Initialize the validator with configuration parameters and set up default field definitions
Parameters:
config: Optional dictionary with 'required_fields', 'date_format', and 'numeric_threshold' keys
Returns: None (constructor)
validate(self, extraction_result: Dict[str, Any]) -> ValidationResult
Purpose: Main validation method that orchestrates all validation checks and returns comprehensive results
Parameters:
extraction_result: Dictionary containing extracted invoice fields with nested structure (e.g., {'invoice': {...}, 'vendor': {...}, 'amounts': {...}})
Returns: ValidationResult object with issues list, warnings list, and is_valid boolean status
_entity_specific_validation(self, extraction_result: Dict[str, Any], result: ValidationResult) -> None
Purpose: Abstract method that subclasses must implement to perform entity-specific validation rules
Parameters:
extraction_result: Dictionary of extracted invoice fields
result: ValidationResult object to add issues to (modified in-place)
Returns: None (modifies result parameter in-place)
_validate_required_fields(self, extraction_result: Dict[str, Any], result: ValidationResult) -> None
Purpose: Validate that all configured required fields are present and non-empty
Parameters:
extraction_result: Dictionary of extracted invoice fields
result: ValidationResult object to add issues to
Returns: None (modifies result parameter in-place)
_validate_field_types(self, extraction_result: Dict[str, Any], result: ValidationResult) -> None
Purpose: Validate that field values match their expected types (string, decimal, date)
Parameters:
extraction_result: Dictionary of extracted invoice fields
result: ValidationResult object to add issues to
Returns: None (modifies result parameter in-place)
_validate_date_consistency(self, extraction_result: Dict[str, Any], result: ValidationResult) -> None
Purpose: Validate that issue_date is before or equal to due_date
Parameters:
extraction_result: Dictionary of extracted invoice fields
result: ValidationResult object to add issues to
Returns: None (modifies result parameter in-place)
_validate_amount_consistency(self, extraction_result: Dict[str, Any], result: ValidationResult) -> None
Purpose: Validate that subtotal + tax/vat equals total within the configured numeric threshold
Parameters:
extraction_result: Dictionary of extracted invoice fields
result: ValidationResult object to add issues to
Returns: None (modifies result parameter in-place)
_get_nested_field(self, data: Dict[str, Any], field_path: str) -> Any
Purpose: Safely retrieve a nested field value using dot-notation path without raising KeyError
Parameters:
data: Dictionary to search in
field_path: Dot-separated path to the field (e.g., 'invoice.number', 'amounts.total')
Returns: Field value if found, None if path doesn't exist or any intermediate key is missing
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| config | Dict[str, Any] | Configuration dictionary passed during initialization, defaults to empty dict if None | instance |
| required_fields | Dict[str, str] | Maps field paths to importance levels ('critical' or other). Critical fields generate errors, others generate warnings. Defaults include invoice.number, invoice.issue_date, vendor.name, amounts.total | instance |
| field_types | Dict[str, str] | Maps field paths to expected types ('string', 'decimal', 'date'). Used for type validation. Includes definitions for invoice fields, vendor fields, and amount fields | instance |
| date_format | str | Date format string for parsing and validating dates. Defaults to '%Y-%m-%d' if not configured | instance |
| numeric_threshold | float | Tolerance threshold for numeric comparisons in amount validation. Defaults to 0.01 to handle floating-point precision issues | instance |
Dependencies
logging, abc, typing, re, datetime
Required Imports
import logging
from abc import ABC, abstractmethod
from typing import Dict, List, Any, Optional, Union, Set
import re
from datetime import datetime
Usage Example
# Cannot instantiate directly (abstract class), must subclass
from base_validator import BaseValidator, ValidationResult
class UKValidator(BaseValidator):
    """Example entity validator that adds a UK VAT-number prefix rule."""

    def _entity_specific_validation(self, extraction_result, result):
        """
        Flag a vendor VAT number that does not start with 'GB'.

        Args:
            extraction_result: Dictionary of extracted invoice fields
            result: ValidationResult to add issues to
        """
        vat_number = self._get_nested_field(extraction_result, 'vendor.vat_number')
        # str() keeps a non-string VAT value (only a warning in the base type
        # check) from raising AttributeError and aborting validation here.
        if vat_number and not str(vat_number).startswith('GB'):
            result.add_issue('vendor.vat_number', 'UK VAT must start with GB', 'error')
# Build the configuration and create the validator from it
config = dict(
    required_fields=dict.fromkeys(
        ('invoice.number', 'invoice.issue_date', 'vendor.name', 'amounts.total'),
        'critical',
    ),
    date_format='%Y-%m-%d',
    numeric_threshold=0.01,
)
validator = UKValidator(config)

# Run the validator over a sample extraction result
extraction_result = dict(
    invoice=dict(number='INV-001', issue_date='2024-01-15', due_date='2024-02-15'),
    vendor=dict(name='Acme Corp', vat_number='GB123456789'),
    amounts=dict(subtotal=100.0, tax=20.0, total=120.0),
)
result = validator.validate(extraction_result)

# Report the outcome, then any warnings that were collected
print('Validation passed' if result.is_valid else f'Validation failed: {result.issues}')
if result.warnings:
    print(f'Warnings: {result.warnings}')
Best Practices
- Cannot instantiate BaseValidator directly - must create a subclass that implements _entity_specific_validation()
- Always pass a ValidationResult object to validation methods - they modify it in-place rather than returning new objects
- Configure required_fields with importance levels ('critical' for errors, other values for warnings)
- Use dot-notation for nested field paths (e.g., 'invoice.number', 'amounts.total')
- The validate() method orchestrates all validation steps in a specific order: required fields, field types, date consistency, amount consistency, then entity-specific
- Validation methods are designed to be non-blocking - they collect all issues rather than failing fast
- Use numeric_threshold to handle floating-point comparison issues in amount validation
- Date format must match the format used in extraction_result data
- The _get_nested_field() helper safely navigates nested dictionaries without raising KeyError
- Subclasses should call parent __init__ if they override it to preserve base configuration
- Log messages are automatically generated during validation - ensure logger is configured at module level
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
-
class TestBaseValidator 77.3% similar
-
class UKValidator 76.3% similar
-
class BaseExtractor 75.1% similar
-
class AUValidator 74.3% similar
-
class BEValidator 71.7% similar