class FormatNormalizer
Normalizes extracted data formats to ensure consistency. Handles: - Date format standardization - Number/currency normalization - VAT/tax number formatting - Field name standardization - Address formatting - Field value cleaning
/tf/active/vicechatdev/invoice_extraction/utils/format_normalizer.py
10 - 797
moderate
Purpose
Normalizes extracted data formats to ensure consistency. Handles: - Date format standardization - Number/currency normalization - VAT/tax number formatting - Field name standardization - Address formatting - Field value cleaning
Source Code
class FormatNormalizer:
    """
    Normalizes extracted data formats to ensure consistency.

    Handles:
    - Date format standardization
    - Number/currency normalization
    - VAT/tax number formatting
    - Field name standardization
    - Address formatting
    - Field value cleaning
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Initialize format normalizer with configuration.

        Args:
            config: Dictionary containing normalization configuration.
                Recognized keys: 'input_date_formats', 'output_date_format',
                'decimal_separator', 'thousands_separator'.
        """
        self.config = config or {}

        # Date format configuration.
        # NOTE: formats are tried in order, so an ambiguous value such as
        # "03/04/2024" is read as day/month because '%d/%m/%Y' comes first.
        self.input_date_formats = self.config.get('input_date_formats', [
            '%d/%m/%Y', '%m/%d/%Y', '%Y-%m-%d', '%d-%m-%Y',
            '%d.%m.%Y', '%m.%d.%Y', '%B %d, %Y', '%d %B %Y',
            '%b %d, %Y', '%d %b %Y', '%d-%b-%Y', '%Y/%m/%d'
        ])
        self.output_date_format = self.config.get('output_date_format', '%Y-%m-%d')

        # Currency/number configuration
        self.decimal_separator = self.config.get('decimal_separator', '.')
        self.thousands_separator = self.config.get('thousands_separator', ',')
        self.currency_symbols = {
            '$': 'USD', '€': 'EUR', '£': 'GBP', '¥': 'JPY',
            'kr': 'SEK', 'Fr.': 'CHF', 'A$': 'AUD', 'CA$': 'CAD'
        }

        # Field name mappings for standardization
        self.field_name_mappings = {
            # Invoice fields
            'invoice_number': 'invoice.number',
            'invoice_no': 'invoice.number',
            'inv_number': 'invoice.number',
            'invoice_date': 'invoice.issue_date',
            'date': 'invoice.issue_date',
            'issue_date': 'invoice.issue_date',
            'due_date': 'invoice.due_date',
            'payment_due': 'invoice.due_date',
            'po_number': 'invoice.po_number',
            'purchase_order': 'invoice.po_number',
            # Vendor fields
            'supplier': 'vendor.name',
            'supplier_name': 'vendor.name',
            'vendor_name': 'vendor.name',
            'supplier_address': 'vendor.address',
            'vendor_address': 'vendor.address',
            'vat_number': 'vendor.vat_number',
            'vat_no': 'vendor.vat_number',
            'tax_number': 'vendor.vat_number',
            'abn': 'vendor.abn',
            'company_number': 'vendor.company_number',
            # Amount fields
            'total': 'amounts.total',
            'total_amount': 'amounts.total',
            'subtotal': 'amounts.subtotal',
            'net_amount': 'amounts.subtotal',
            'vat': 'amounts.vat',
            'vat_amount': 'amounts.vat',
            'tax': 'amounts.tax',
            'tax_amount': 'amounts.tax',
            'gst': 'amounts.gst',
            'gst_amount': 'amounts.gst',
            'currency': 'amounts.currency',
            'vat_rate': 'amounts.vat_rate',
            'tax_rate': 'amounts.tax_rate',
            # Payment fields
            'bank_account': 'payment.account_number',
            'account_number': 'payment.account_number',
            'sort_code': 'payment.sort_code',
            'iban': 'payment.iban',
            'bic': 'payment.bic',
            'swift': 'payment.bic',
            'bsb': 'payment.bsb',
        }

        # Country-specific VAT/tax number formats
        self.vat_number_formats = {
            'GB': r'^GB\d{9}$|^GB\d{12}$',  # UK
            'BE': r'^BE0\d{9}$',  # Belgium
            'AU': r'^\d{11}$',  # Australia (ABN)
            'DE': r'^DE\d{9}$',  # Germany
            'FR': r'^FR[A-Z0-9]{2}\d{9}$',  # France
            'IT': r'^IT\d{11}$',  # Italy
            'ES': r'^ES[A-Z0-9]\d{8}$',  # Spain
            'NL': r'^NL\d{9}B\d{2}$',  # Netherlands
        }

    def normalize_extraction_result(self, extraction_result: Dict[str, Any]) -> Dict[str, Any]:
        """
        Normalize all fields in an extraction result.

        The input dictionary and its nested section dictionaries are not
        modified; normalized copies are returned instead.

        Args:
            extraction_result: Dictionary containing extracted invoice data

        Returns:
            Normalized extraction result
        """
        # Shallow copy; every section touched below is replaced by a fresh dict.
        result = extraction_result.copy()

        section_normalizers = (
            ('invoice', self._normalize_invoice_fields),
            ('vendor', self._normalize_vendor_fields),
            ('amounts', self._normalize_amount_fields),
            ('payment', self._normalize_payment_fields),
        )
        for section, normalize in section_normalizers:
            # Sections that are missing or not dictionaries (e.g. None) are left as-is.
            if isinstance(result.get(section), dict):
                result[section] = normalize(result[section])

        # Normalize line items
        if isinstance(result.get('line_items'), list):
            result['line_items'] = [
                self._normalize_line_item(item) if isinstance(item, dict) else item
                for item in result['line_items']
            ]

        # Resolve any field conflicts or redundancies
        result = self._resolve_field_conflicts(result)

        # Add normalization metadata on a copy so the caller's dict is untouched.
        metadata = result.get('metadata')
        metadata = dict(metadata) if isinstance(metadata, dict) else {}
        metadata['normalized'] = True
        metadata['normalization_date'] = datetime.now().strftime(self.output_date_format)
        result['metadata'] = metadata

        return result

    def _normalize_invoice_fields(self, invoice_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Normalize invoice-related fields.

        Args:
            invoice_data: Dictionary of invoice fields

        Returns:
            Normalized invoice fields
        """
        result = invoice_data.copy()

        # Invoice and PO numbers share the same prefix/symbol clean-up.
        for number_field in ('number', 'po_number'):
            if number_field in result:
                result[number_field] = self._clean_invoice_number(result[number_field])

        # Normalize dates
        for date_field in ('issue_date', 'due_date'):
            if date_field in result:
                result[date_field] = self._normalize_date(result[date_field])

        return result

    def _normalize_vendor_fields(self, vendor_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Normalize vendor-related fields.

        Args:
            vendor_data: Dictionary of vendor fields

        Returns:
            Normalized vendor fields
        """
        result = vendor_data.copy()

        # Normalize VAT number
        if 'vat_number' in result:
            result['vat_number'] = self._normalize_tax_number(result['vat_number'], 'vat')

        # Normalize ABN (Australian Business Number)
        if 'abn' in result:
            result['abn'] = self._normalize_tax_number(result['abn'], 'abn')

        # Normalize company number
        if 'company_number' in result:
            result['company_number'] = self._clean_value(result['company_number'])

        # Normalize address
        if 'address' in result:
            result['address'] = self._normalize_address(result['address'])

        # Clean vendor name
        if 'name' in result:
            result['name'] = self._clean_value(result['name'])

        return result

    def _normalize_amount_fields(self, amount_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Normalize amount-related fields.

        Args:
            amount_data: Dictionary of amount fields

        Returns:
            Normalized amount fields
        """
        result = amount_data.copy()

        # Normalize monetary amounts and rates
        numeric_fields = (
            'subtotal', 'total', 'vat', 'tax', 'gst',
            'vat_rate', 'tax_rate', 'gst_rate',
        )
        for field in numeric_fields:
            if field in result:
                result[field] = self._normalize_number(result[field])

        # Normalize currency
        if 'currency' in result:
            result['currency'] = self._normalize_currency(result['currency'])

        # Ensure tax field consistency
        if 'vat' in result and 'tax' not in result:
            result['tax'] = result['vat']
        elif 'tax' in result and 'vat' not in result:
            result['vat'] = result['tax']
        elif 'gst' in result and 'tax' not in result:
            result['tax'] = result['gst']

        # Ensure rate field consistency
        if 'vat_rate' in result and 'tax_rate' not in result:
            result['tax_rate'] = result['vat_rate']
        elif 'tax_rate' in result and 'vat_rate' not in result:
            result['vat_rate'] = result['tax_rate']

        return result

    def _normalize_payment_fields(self, payment_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Normalize payment-related fields.

        Args:
            payment_data: Dictionary of payment fields

        Returns:
            Normalized payment fields
        """
        result = payment_data.copy()

        # Normalize IBAN (remove spaces and format)
        if 'iban' in result:
            result['iban'] = self._normalize_iban(result['iban'])

        # Normalize BIC/SWIFT. _clean_value returns None for empty input, so
        # only upper-case when there is something to upper-case.
        if 'bic' in result:
            bic = self._clean_value(result['bic'])
            result['bic'] = bic.upper() if bic else bic

        # Normalize sort code (UK)
        if 'sort_code' in result:
            result['sort_code'] = self._normalize_sort_code(result['sort_code'])

        # Normalize BSB (Australia)
        if 'bsb' in result:
            result['bsb'] = self._normalize_bsb(result['bsb'])

        # Normalize account number (remove spaces); tolerate None and numbers.
        if 'account_number' in result and result['account_number'] is not None:
            result['account_number'] = re.sub(r'\s', '', str(result['account_number']))

        # Normalize payment terms
        if 'payment_terms' in result:
            result['payment_terms'] = self._clean_value(result['payment_terms'])

        return result

    def _normalize_line_item(self, line_item: Dict[str, Any]) -> Dict[str, Any]:
        """
        Normalize a single line item.

        Args:
            line_item: Dictionary representing a line item

        Returns:
            Normalized line item
        """
        result = line_item.copy()

        # Normalize description
        if 'description' in result:
            result['description'] = self._clean_value(result['description'])

        # Normalize numeric and rate fields
        numeric_fields = (
            'quantity', 'unit_price', 'amount',
            'tax_amount', 'gst_amount', 'vat_amount',
            'vat_rate', 'tax_rate', 'gst_rate',
        )
        for field in numeric_fields:
            if field in result:
                result[field] = self._normalize_number(result[field])

        # Ensure tax amount consistency
        if 'vat_amount' in result and 'tax_amount' not in result:
            result['tax_amount'] = result['vat_amount']
        elif 'tax_amount' in result and 'vat_amount' not in result:
            result['vat_amount'] = result['tax_amount']
        elif 'gst_amount' in result and 'tax_amount' not in result:
            result['tax_amount'] = result['gst_amount']

        return result

    def _normalize_date(self, date_str: Union[str, None]) -> Optional[str]:
        """
        Normalize date to standard format.

        Args:
            date_str: Date string in various formats. datetime objects (and
                other objects exposing ``strftime``) are formatted directly.

        Returns:
            Date in the configured output format, None for empty input, or
            the input unchanged when it cannot be parsed.
        """
        if not date_str:
            return None

        if isinstance(date_str, datetime):
            return date_str.strftime(self.output_date_format)

        if not isinstance(date_str, str):
            # e.g. datetime.date instances; anything else cannot be parsed.
            if hasattr(date_str, 'strftime'):
                return date_str.strftime(self.output_date_format)
            logger.warning(f"Could not normalize date: {date_str}")
            return date_str

        text = date_str.strip()

        # Already in target format
        if re.match(r'^\d{4}-\d{2}-\d{2}$', text):
            try:
                # Validate it's a legitimate date
                datetime.strptime(text, self.output_date_format)
                return text
            except ValueError:
                pass

        # Try the text as given, then with ordinals removed (1st, 2nd, 3rd, ...).
        candidates = [text]
        without_ordinals = re.sub(r'(\d+)(st|nd|rd|th)\b', r'\1', text, flags=re.IGNORECASE)
        if without_ordinals != text:
            candidates.append(without_ordinals)

        for candidate in candidates:
            for fmt in self.input_date_formats:
                try:
                    parsed = datetime.strptime(candidate, fmt)
                except (ValueError, TypeError):
                    continue
                return parsed.strftime(self.output_date_format)

        # If all parsing attempts fail, hand back the caller's value untouched.
        logger.warning(f"Could not normalize date: {date_str}")
        return date_str

    def _normalize_number(self, value: Union[str, int, float, None]) -> Optional[float]:
        """
        Normalize numeric value to float.

        Strings may carry currency symbols, thousands separators, a leading
        or trailing minus sign, or accounting-style parentheses for negatives.
        A lone separator is always read as the decimal point ("1,234" is
        1.234), matching the historical behaviour of this method.

        Args:
            value: Numeric value in various formats

        Returns:
            Normalized float value (strings are rounded to 2 decimals) or
            None if invalid
        """
        if value is None:
            return None

        if isinstance(value, (int, float)):
            return float(value)

        if not isinstance(value, str):
            logger.warning(f"Unexpected type for numeric value: {type(value)}")
            return None

        # Remove currency symbols; map the Unicode minus sign to ASCII.
        text = self._remove_currency_symbols(value).replace('\u2212', '-')

        # Keep only characters that matter for sign and magnitude.
        core = re.sub(r'[^\d.,()\-]', '', text)
        negative = (
            core.startswith('-')
            or core.endswith('-')
            or (core.startswith('(') and core.endswith(')'))
        )
        digits = re.sub(r'[^\d.,]', '', core)

        has_comma = ',' in digits
        has_dot = '.' in digits
        if has_comma and has_dot:
            if digits.rindex(',') > digits.rindex('.'):
                # European format like "1.234,56"
                digits = digits.replace('.', '').replace(',', '.')
            else:
                # Format like "1,234.56"
                digits = digits.replace(',', '')
        elif has_comma:
            if digits.count(',') > 1 and re.fullmatch(r'\d{1,3}(?:,\d{3})+', digits):
                # Several commas can only be thousands grouping: "1,234,567"
                digits = digits.replace(',', '')
            else:
                # Format like "1234,56"
                digits = digits.replace(',', '.')
        elif digits.count('.') > 1 and re.fullmatch(r'\d{1,3}(?:\.\d{3})+', digits):
            # Several dots can only be thousands grouping: "1.234.567"
            digits = digits.replace('.', '')

        try:
            number = float(digits)
        except (ValueError, TypeError):
            logger.warning(f"Could not normalize number: {value}")
            return None

        if negative and number:
            number = -number

        # Round to 2 decimal places for monetary values
        return round(number, 2)

    def _normalize_currency(self, currency: Union[str, None]) -> Optional[str]:
        """
        Normalize currency to standard 3-letter code.

        Args:
            currency: Currency string (e.g., $, USD, dollars)

        Returns:
            Normalized 3-letter currency code, None for empty input, or the
            input unchanged when it cannot be normalized.
        """
        if not currency:
            return None

        if not isinstance(currency, str):
            logger.warning(f"Could not normalize currency: {currency}")
            return currency

        text = currency.strip()
        if not text:
            return None

        # Already a standard 3-letter code
        if re.match(r'^[A-Z]{3}$', text):
            return text

        # Handle currency symbols, longest first so "A$"/"CA$" win over "$".
        symbols_longest_first = sorted(
            self.currency_symbols.items(),
            key=lambda item: len(item[0]),
            reverse=True,
        )
        for symbol, code in symbols_longest_first:
            if symbol in text:
                return code

        # Try to match currency names, longest first so "AUSTRALIAN DOLLAR"
        # wins over the bare "DOLLAR".
        currency_upper = text.upper()
        currency_map = {
            'DOLLAR': 'USD',
            'DOLLARS': 'USD',
            'US DOLLAR': 'USD',
            'EURO': 'EUR',
            'EUROS': 'EUR',
            'POUND': 'GBP',
            'POUNDS': 'GBP',
            'STERLING': 'GBP',
            'YEN': 'JPY',
            'AUSTRALIAN DOLLAR': 'AUD',
            'CANADIAN DOLLAR': 'CAD',
            'FRANC': 'CHF',
            'KRONA': 'SEK'
        }
        for name in sorted(currency_map, key=len, reverse=True):
            if name in currency_upper:
                return currency_map[name]

        # Try using pycountry
        try:
            currency_obj = pycountry.currencies.lookup(currency_upper)
            if currency_obj:
                return currency_obj.alpha_3
        except (LookupError, AttributeError):
            pass

        # Return as-is if we can't normalize
        logger.warning(f"Could not normalize currency: {currency}")
        return currency

    def _normalize_tax_number(self, tax_number: Union[str, None], tax_type: str) -> Optional[str]:
        """
        Normalize tax identification number based on type and country.

        Args:
            tax_number: Tax number string (numeric input is converted to text)
            tax_type: Type of tax ID ('vat', 'abn', etc.)

        Returns:
            Normalized tax number or None for empty input. Numbers that do
            not match the expected country format are logged and still
            returned in cleaned form.
        """
        if not tax_number:
            return None

        # Remove all non-alphanumeric characters
        clean_number = re.sub(r'[^a-zA-Z0-9]', '', str(tax_number))

        # VAT number normalization
        if tax_type == 'vat':
            # Try to detect country code from first 2 characters
            if len(clean_number) >= 2 and clean_number[:2].isalpha():
                clean_number = clean_number.upper()
                country_code = clean_number[:2]

                if country_code == 'BE':
                    # Legacy 9-digit Belgian numbers gain a leading zero
                    # before validation and formatting.
                    digits = clean_number[2:]
                    if len(digits) == 9 and digits.isdigit():
                        clean_number = country_code + '0' + digits

                # Apply country-specific format check if available
                pattern = self.vat_number_formats.get(country_code)
                if pattern and not re.match(pattern, clean_number):
                    logger.warning(f"VAT number {clean_number} does not match expected format for {country_code}")

                # Belgian numbers are grouped 4.3.3 so all ten digits survive.
                if country_code == 'BE' and len(clean_number) == 12:
                    return f"BE{clean_number[2:6]}.{clean_number[6:9]}.{clean_number[9:12]}"

            # Other countries (or no country code detected): cleaned number
            return clean_number

        # ABN normalization (Australian Business Number)
        if tax_type == 'abn':
            if len(clean_number) == 11:
                return f"{clean_number[:2]} {clean_number[2:5]} {clean_number[5:8]} {clean_number[8:11]}"
            return clean_number

        # Default - return cleaned number
        return clean_number

    def _normalize_iban(self, iban: Union[str, None]) -> Optional[str]:
        """
        Normalize IBAN format.

        Args:
            iban: IBAN string

        Returns:
            Upper-cased IBAN grouped with spaces, the cleaned value without
            grouping when it fails the basic format check, or None for
            empty input.
        """
        if not iban:
            return None

        # Remove spaces and convert to uppercase
        clean_iban = re.sub(r'\s', '', str(iban)).upper()

        # Check if it's a valid IBAN (basic format check)
        if len(clean_iban) < 4 or not clean_iban[:2].isalpha():
            logger.warning(f"Invalid IBAN format: {iban}")
            return clean_iban

        # UK IBANs keep the 6-character sort code together.
        if clean_iban[:2] == 'GB' and len(clean_iban) == 22:
            return f"{clean_iban[:4]} {clean_iban[4:8]} {clean_iban[8:14]} {clean_iban[14:18]} {clean_iban[18:22]}"

        # Generic formatting with spaces every 4 characters
        return ' '.join(clean_iban[i:i + 4] for i in range(0, len(clean_iban), 4))

    def _normalize_sort_code(self, sort_code: Union[str, None]) -> Optional[str]:
        """
        Normalize UK sort code.

        Args:
            sort_code: Sort code string

        Returns:
            Normalized sort code (XX-XX-XX format), the input unchanged when
            it does not contain exactly six digits, or None for empty input.
        """
        if not sort_code:
            return None

        # Remove all non-numeric characters
        digits = re.sub(r'[^0-9]', '', str(sort_code))

        # Check length
        if len(digits) != 6:
            logger.warning(f"Invalid sort code length: {sort_code}")
            return sort_code

        # Format as XX-XX-XX
        return f"{digits[:2]}-{digits[2:4]}-{digits[4:6]}"

    def _normalize_bsb(self, bsb: Union[str, None]) -> Optional[str]:
        """
        Normalize Australian BSB number.

        Args:
            bsb: BSB string

        Returns:
            Normalized BSB (XXX-XXX format), the input unchanged when it
            does not contain exactly six digits, or None for empty input.
        """
        if not bsb:
            return None

        # Remove all non-numeric characters
        digits = re.sub(r'[^0-9]', '', str(bsb))

        # Check length
        if len(digits) != 6:
            logger.warning(f"Invalid BSB length: {bsb}")
            return bsb

        # Format as XXX-XXX
        return f"{digits[:3]}-{digits[3:6]}"

    def _normalize_address(self, address: Union[str, None]) -> Optional[str]:
        """
        Normalize address format.

        Args:
            address: Address string

        Returns:
            Normalized address string
        """
        if not address:
            return None

        # Clean up whitespace
        normalized = self._clean_value(address)
        if not normalized:
            return normalized

        # Collapse runs of commas (with or without spaces between them) into
        # a single comma followed by one space.
        return re.sub(r'\s*,(?:\s*,)*\s*', ', ', normalized)

    def _clean_invoice_number(self, invoice_number: Union[str, None]) -> Optional[str]:
        """
        Clean invoice number by removing common prefixes and symbols.

        A prefix is only removed when it is not immediately followed by
        another letter, so "INVENTORY-5" is left alone while "INV-5",
        "INV5" and "Invoice No. 5" all become "5".

        Args:
            invoice_number: Invoice number string

        Returns:
            Cleaned invoice number string (original letter case preserved)
        """
        if not invoice_number:
            return None

        # Convert to string if not already
        text = invoice_number if isinstance(invoice_number, str) else str(invoice_number)

        # Longest alternatives first so "INVOICE NO" is not cut short by "INV".
        prefix = re.match(
            r'\s*(?:INVOICE\s+NUMBER|INVOICE\s+NO|INVOICE#|INVOICE|INV#|INV|#)(?![A-Za-z])',
            text,
            re.IGNORECASE,
        )
        remainder = text[prefix.end():] if prefix else text

        # Remove special characters from start/end
        return remainder.strip('-_:#. \t')

    def _clean_value(self, value: Union[str, None]) -> Optional[str]:
        """
        General purpose string cleaning.

        Args:
            value: String value to clean

        Returns:
            Cleaned string value; None for empty/falsy input; ``str(value)``
            for non-string input
        """
        if not value:
            return None

        if not isinstance(value, str):
            return str(value)

        # Normalize whitespace
        collapsed = ' '.join(value.split())

        # Remove leading/trailing punctuation
        return collapsed.strip('.,;:-_() \t')

    def _remove_currency_symbols(self, value: str) -> str:
        """
        Remove currency symbols from a string.

        Args:
            value: String value with possible currency symbols

        Returns:
            String with currency symbols removed
        """
        # Multi-character symbols come first so "A$" is removed as a unit
        # instead of leaving a stray "A" once "$" is gone.
        symbols = ['CA$', 'A$', 'C$', 'Fr.', 'kr', '$', '€', '£', '¥', '₹', '₽']

        result = value
        for symbol in symbols:
            result = result.replace(symbol, '')

        return result.strip()

    def _resolve_field_conflicts(self, extraction_result: Dict[str, Any]) -> Dict[str, Any]:
        """
        Resolve any conflicts or redundancies in the extraction result.

        Args:
            extraction_result: Extraction result dictionary

        Returns:
            Extraction result with conflicts resolved. The input and its
            'amounts' dictionary are left unmodified.
        """
        result = extraction_result.copy()

        amounts = result.get('amounts')
        if not isinstance(amounts, dict):
            return result

        # Work on a copy of the nested dict as well.
        amounts = amounts.copy()
        result['amounts'] = amounts

        # Handle tax vs. vat terminology
        if 'tax' in amounts and 'vat' in amounts:
            tax = amounts['tax']
            vat = amounts['vat']
            if tax != vat:
                # Prefer the non-None value, or the higher value if both are set
                if tax is None:
                    amounts['tax'] = vat
                elif vat is None:
                    amounts['vat'] = tax
                else:
                    try:
                        tax_is_higher = float(tax) > float(vat)
                    except (TypeError, ValueError):
                        logger.warning(f"Could not compare tax and vat amounts: {tax}, {vat}")
                    else:
                        if tax_is_higher:
                            amounts['vat'] = tax
                        else:
                            amounts['tax'] = vat

        # Handle tax vs. gst terminology
        if 'tax' in amounts and 'gst' in amounts:
            tax = amounts['tax']
            gst = amounts['gst']
            if tax != gst:
                if tax is None:
                    # Never overwrite a real value with None.
                    amounts['tax'] = gst
                elif gst is None:
                    amounts['gst'] = tax
                else:
                    metadata = result.get('metadata')
                    entity = metadata.get('entity', '') if isinstance(metadata, dict) else ''
                    # For Australian entities, prefer GST
                    if 'AU' in str(entity):
                        amounts['tax'] = gst
                    else:
                        amounts['gst'] = tax

        return result

    def standardize_field_names(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Convert non-standard field names to standard format using mappings.

        Mapped keys are placed at their dotted path (e.g. 'invoice_no'
        becomes result['invoice']['number']). Unmapped keys are kept as-is;
        when an unmapped dictionary shares a name with a section already
        built from mapped keys, the two are merged, with later entries in
        ``data`` overriding earlier ones.

        Args:
            data: Dictionary with possibly non-standard field names

        Returns:
            Dictionary with standardized field names
        """
        result: Dict[str, Any] = {}

        for key, value in data.items():
            std_path = self.field_name_mappings.get(key)

            if std_path is None:
                if isinstance(value, dict):
                    # Copy so later mapped fields never write into the
                    # caller's nested dictionary.
                    existing = result.get(key)
                    if isinstance(existing, dict):
                        result[key] = {**existing, **value}
                    else:
                        result[key] = dict(value)
                else:
                    result[key] = value
                continue

            # Split into parts for nested structure and walk to the parent
            parts = std_path.split('.')
            current = result
            for part in parts[:-1]:
                child = current.get(part)
                if not isinstance(child, dict):
                    child = {}
                    current[part] = child
                current = child

            # Set the value at the final location
            current[parts[-1]] = value

        return result
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| config | Optional[Dict[str, Any]] | None | positional or keyword |
Parameter Details
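config: Optional dictionary containing normalization configuration (type Optional[Dict[str, Any]]); an empty dictionary is used when it is omitted or None.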
Return Value
Returns unspecified type
Class Interface
Methods
__init__(self, config)
Purpose: Initialize format normalizer with configuration. Args: config: Dictionary containing normalization configuration
Parameters:
config: Type: Optional[Dict[str, Any]]
Returns: None
normalize_extraction_result(self, extraction_result) -> Dict[str, Any]
Purpose: Normalize all fields in an extraction result. Args: extraction_result: Dictionary containing extracted invoice data Returns: Normalized extraction result
Parameters:
extraction_result: Type: Dict[str, Any]
Returns: Returns Dict[str, Any]
_normalize_invoice_fields(self, invoice_data) -> Dict[str, Any]
Purpose: Normalize invoice-related fields. Args: invoice_data: Dictionary of invoice fields Returns: Normalized invoice fields
Parameters:
invoice_data: Type: Dict[str, Any]
Returns: Returns Dict[str, Any]
_normalize_vendor_fields(self, vendor_data) -> Dict[str, Any]
Purpose: Normalize vendor-related fields. Args: vendor_data: Dictionary of vendor fields Returns: Normalized vendor fields
Parameters:
vendor_data: Type: Dict[str, Any]
Returns: Returns Dict[str, Any]
_normalize_amount_fields(self, amount_data) -> Dict[str, Any]
Purpose: Normalize amount-related fields. Args: amount_data: Dictionary of amount fields Returns: Normalized amount fields
Parameters:
amount_data: Type: Dict[str, Any]
Returns: Returns Dict[str, Any]
_normalize_payment_fields(self, payment_data) -> Dict[str, Any]
Purpose: Normalize payment-related fields. Args: payment_data: Dictionary of payment fields Returns: Normalized payment fields
Parameters:
payment_data: Type: Dict[str, Any]
Returns: Returns Dict[str, Any]
_normalize_line_item(self, line_item) -> Dict[str, Any]
Purpose: Normalize a single line item. Args: line_item: Dictionary representing a line item Returns: Normalized line item
Parameters:
line_item: Type: Dict[str, Any]
Returns: Returns Dict[str, Any]
_normalize_date(self, date_str) -> Optional[str]
Purpose: Normalize date to standard format. Args: date_str: Date string in various formats Returns: Date in standard format or None if invalid
Parameters:
date_str: Type: Union[str, None]
Returns: Returns Optional[str]
_normalize_number(self, value) -> Optional[float]
Purpose: Normalize numeric value to float. Args: value: Numeric value in various formats Returns: Normalized float value or None if invalid
Parameters:
value: Type: Union[str, int, float, None]
Returns: Returns Optional[float]
_normalize_currency(self, currency) -> Optional[str]
Purpose: Normalize currency to standard 3-letter code. Args: currency: Currency string (e.g., $, USD, dollars) Returns: Normalized 3-letter currency code or None if invalid
Parameters:
currency: Type: Union[str, None]
Returns: Returns Optional[str]
_normalize_tax_number(self, tax_number, tax_type) -> Optional[str]
Purpose: Normalize tax identification number based on type and country. Args: tax_number: Tax number string tax_type: Type of tax ID ('vat', 'abn', etc.) Returns: Normalized tax number or None if invalid
Parameters:
tax_number: Type: Union[str, None]; tax_type: Type: str
Returns: Returns Optional[str]
_normalize_iban(self, iban) -> Optional[str]
Purpose: Normalize IBAN format. Args: iban: IBAN string Returns: Normalized IBAN or None if invalid
Parameters:
iban: Type: Union[str, None]
Returns: Returns Optional[str]
_normalize_sort_code(self, sort_code) -> Optional[str]
Purpose: Normalize UK sort code. Args: sort_code: Sort code string Returns: Normalized sort code (XX-XX-XX format) or None if invalid
Parameters:
sort_code: Type: Union[str, None]
Returns: Returns Optional[str]
_normalize_bsb(self, bsb) -> Optional[str]
Purpose: Normalize Australian BSB number. Args: bsb: BSB string Returns: Normalized BSB (XXX-XXX format) or None if invalid
Parameters:
bsb: Type: Union[str, None]
Returns: Returns Optional[str]
_normalize_address(self, address) -> Optional[str]
Purpose: Normalize address format. Args: address: Address string Returns: Normalized address string
Parameters:
address: Type: Union[str, None]
Returns: Returns Optional[str]
_clean_invoice_number(self, invoice_number) -> Optional[str]
Purpose: Clean invoice number by removing common prefixes and symbols. Args: invoice_number: Invoice number string Returns: Cleaned invoice number string
Parameters:
invoice_number: Type: Union[str, None]
Returns: Returns Optional[str]
_clean_value(self, value) -> Optional[str]
Purpose: General purpose string cleaning. Args: value: String value to clean Returns: Cleaned string value
Parameters:
value: Type: Union[str, None]
Returns: Returns Optional[str]
_remove_currency_symbols(self, value) -> str
Purpose: Remove currency symbols from a string. Args: value: String value with possible currency symbols Returns: String with currency symbols removed
Parameters:
value: Type: str
Returns: Returns str
_resolve_field_conflicts(self, extraction_result) -> Dict[str, Any]
Purpose: Resolve any conflicts or redundancies in the extraction result. Args: extraction_result: Extraction result dictionary Returns: Extraction result with conflicts resolved
Parameters:
extraction_result: Type: Dict[str, Any]
Returns: Returns Dict[str, Any]
standardize_field_names(self, data) -> Dict[str, Any]
Purpose: Convert non-standard field names to standard format using mappings. Args: data: Dictionary with possibly non-standard field names Returns: Dictionary with standardized field names
Parameters:
data: Type: Dict[str, Any]
Returns: Returns Dict[str, Any]
Required Imports
import re
import logging
from datetime import datetime
from typing import Dict
from typing import Any
from typing import Optional
from typing import Union
import pycountry
Usage Example
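# Example usage:
# normalizer = FormatNormalizer(config={'output_date_format': '%Y-%m-%d'})
# result = normalizer.normalize_extraction_result(extraction_result)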
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
-
function clean_text 49.7% similar
-
class UKValidator 47.9% similar
-
class BaseValidator 46.7% similar
-
class BEExtractor 46.3% similar
-
class BEValidator 46.2% similar