FormatNormalizer - Code Extractor

class FormatNormalizer

Maturity: 29

Normalizes extracted invoice data formats to ensure consistency. Handles: date format standardization; number/currency normalization; VAT/tax number formatting; field name standardization; address formatting; and field value cleaning.

File:
/tf/active/vicechatdev/invoice_extraction/utils/format_normalizer.py

Lines:
10 - 797

Complexity:
moderate

Purpose

Source Code

class FormatNormalizer:
    """
    Normalizes extracted data formats to ensure consistency.
    
    Handles:
    - Date format standardization
    - Number/currency normalization
    - VAT/tax number formatting
    - Field name standardization
    - Address formatting
    - Field value cleaning
    """
    
    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Initialize format normalizer with configuration.
        
        Args:
            config: Dictionary containing normalization configuration
        """
        self.config = config or {}
        
        # Date format configuration
        self.input_date_formats = self.config.get('input_date_formats', [
            '%d/%m/%Y', '%m/%d/%Y', '%Y-%m-%d', '%d-%m-%Y',
            '%d.%m.%Y', '%m.%d.%Y', '%B %d, %Y', '%d %B %Y',
            '%b %d, %Y', '%d %b %Y', '%d-%b-%Y', '%Y/%m/%d'
        ])
        self.output_date_format = self.config.get('output_date_format', '%Y-%m-%d')
        
        # Currency/number configuration
        self.decimal_separator = self.config.get('decimal_separator', '.')
        self.thousands_separator = self.config.get('thousands_separator', ',')
        self.currency_symbols = {
            '$': 'USD', '€': 'EUR', '£': 'GBP', '¥': 'JPY',
            'kr': 'SEK', 'Fr.': 'CHF', 'A$': 'AUD', 'CA$': 'CAD'
        }
        
        # Field name mappings for standardization
        self.field_name_mappings = {
            # Invoice fields
            'invoice_number': 'invoice.number',
            'invoice_no': 'invoice.number',
            'inv_number': 'invoice.number',
            'invoice_date': 'invoice.issue_date',
            'date': 'invoice.issue_date',
            'issue_date': 'invoice.issue_date',
            'due_date': 'invoice.due_date',
            'payment_due': 'invoice.due_date',
            'po_number': 'invoice.po_number',
            'purchase_order': 'invoice.po_number',
            
            # Vendor fields
            'supplier': 'vendor.name',
            'supplier_name': 'vendor.name',
            'vendor_name': 'vendor.name',
            'supplier_address': 'vendor.address',
            'vendor_address': 'vendor.address',
            'vat_number': 'vendor.vat_number',
            'vat_no': 'vendor.vat_number',
            'tax_number': 'vendor.vat_number',
            'abn': 'vendor.abn',
            'company_number': 'vendor.company_number',
            
            # Amount fields
            'total': 'amounts.total',
            'total_amount': 'amounts.total',
            'subtotal': 'amounts.subtotal',
            'net_amount': 'amounts.subtotal',
            'vat': 'amounts.vat',
            'vat_amount': 'amounts.vat',
            'tax': 'amounts.tax',
            'tax_amount': 'amounts.tax',
            'gst': 'amounts.gst',
            'gst_amount': 'amounts.gst',
            'currency': 'amounts.currency',
            'vat_rate': 'amounts.vat_rate',
            'tax_rate': 'amounts.tax_rate',
            
            # Payment fields
            'bank_account': 'payment.account_number',
            'account_number': 'payment.account_number',
            'sort_code': 'payment.sort_code',
            'iban': 'payment.iban',
            'bic': 'payment.bic',
            'swift': 'payment.bic',
            'bsb': 'payment.bsb',
        }
        
        # Country-specific VAT/tax number formats
        self.vat_number_formats = {
            'GB': r'^GB\d{9}$|^GB\d{12}$',                      # UK
            'BE': r'^BE0\d{9}$',                                # Belgium
            'AU': r'^\d{11}$',                                  # Australia (ABN)
            'DE': r'^DE\d{9}$',                                # Germany
            'FR': r'^FR[A-Z0-9]{2}\d{9}$',                     # France
            'IT': r'^IT\d{11}$',                               # Italy
            'ES': r'^ES[A-Z0-9]\d{8}$',                        # Spain
            'NL': r'^NL\d{9}B\d{2}$',                          # Netherlands
        }
    
    def normalize_extraction_result(self, extraction_result: Dict[str, Any]) -> Dict[str, Any]:
        """
        Normalize all fields in an extraction result.
        
        Args:
            extraction_result: Dictionary containing extracted invoice data
            
        Returns:
            Normalized extraction result
        """
        # Create a copy to avoid modifying the original
        result = extraction_result.copy()
        
        # Normalize invoice fields
        if 'invoice' in result:
            result['invoice'] = self._normalize_invoice_fields(result['invoice'])
            
        # Normalize vendor fields
        if 'vendor' in result:
            result['vendor'] = self._normalize_vendor_fields(result['vendor'])
            
        # Normalize amounts
        if 'amounts' in result:
            result['amounts'] = self._normalize_amount_fields(result['amounts'])
            
        # Normalize payment information
        if 'payment' in result:
            result['payment'] = self._normalize_payment_fields(result['payment'])
            
        # Normalize line items
        if 'line_items' in result and isinstance(result['line_items'], list):
            result['line_items'] = [
                self._normalize_line_item(item) 
                for item in result['line_items']
            ]
            
        # Resolve any field conflicts or redundancies
        result = self._resolve_field_conflicts(result)
        
        # Add normalization metadata
        if 'metadata' not in result:
            result['metadata'] = {}
        result['metadata']['normalized'] = True
        result['metadata']['normalization_date'] = datetime.now().strftime(self.output_date_format)
        
        return result
    
    def _normalize_invoice_fields(self, invoice_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Normalize invoice-related fields.
        
        Args:
            invoice_data: Dictionary of invoice fields
            
        Returns:
            Normalized invoice fields
        """
        result = invoice_data.copy()
        
        # Normalize invoice number (remove prefixes like "INV-" or "#")
        if 'number' in result:
            result['number'] = self._clean_invoice_number(result['number'])
            
        # Normalize dates
        for date_field in ['issue_date', 'due_date']:
            if date_field in result:
                result[date_field] = self._normalize_date(result[date_field])
                
        # Normalize PO number
        if 'po_number' in result:
            result['po_number'] = self._clean_invoice_number(result['po_number'])
            
        return result
    
    def _normalize_vendor_fields(self, vendor_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Normalize vendor-related fields.
        
        Args:
            vendor_data: Dictionary of vendor fields
            
        Returns:
            Normalized vendor fields
        """
        result = vendor_data.copy()
        
        # Normalize VAT number
        if 'vat_number' in result:
            result['vat_number'] = self._normalize_tax_number(result['vat_number'], 'vat')
            
        # Normalize ABN (Australian Business Number)
        if 'abn' in result:
            result['abn'] = self._normalize_tax_number(result['abn'], 'abn')
            
        # Normalize company number
        if 'company_number' in result:
            result['company_number'] = self._clean_value(result['company_number'])
            
        # Normalize address
        if 'address' in result:
            result['address'] = self._normalize_address(result['address'])
            
        # Clean vendor name
        if 'name' in result:
            result['name'] = self._clean_value(result['name'])
            
        return result
    
    def _normalize_amount_fields(self, amount_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Normalize amount-related fields.
        
        Args:
            amount_data: Dictionary of amount fields
            
        Returns:
            Normalized amount fields
        """
        result = amount_data.copy()
        
        # Normalize numeric fields
        for field in ['subtotal', 'total', 'vat', 'tax', 'gst']:
            if field in result:
                result[field] = self._normalize_number(result[field])
                
        # Normalize rate fields
        for field in ['vat_rate', 'tax_rate', 'gst_rate']:
            if field in result:
                result[field] = self._normalize_number(result[field])
                
        # Normalize currency
        if 'currency' in result:
            result['currency'] = self._normalize_currency(result['currency'])
            
        # Ensure tax field consistency
        if 'vat' in result and 'tax' not in result:
            result['tax'] = result['vat']
        elif 'tax' in result and 'vat' not in result:
            result['vat'] = result['tax']
        elif 'gst' in result and 'tax' not in result:
            result['tax'] = result['gst']
        
        # Ensure rate field consistency
        if 'vat_rate' in result and 'tax_rate' not in result:
            result['tax_rate'] = result['vat_rate']
        elif 'tax_rate' in result and 'vat_rate' not in result:
            result['vat_rate'] = result['tax_rate']
            
        return result
    
    def _normalize_payment_fields(self, payment_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Normalize payment-related fields.
        
        Args:
            payment_data: Dictionary of payment fields
            
        Returns:
            Normalized payment fields
        """
        result = payment_data.copy()
        
        # Normalize IBAN (remove spaces and format)
        if 'iban' in result:
            result['iban'] = self._normalize_iban(result['iban'])
            
        # Normalize BIC/SWIFT
        if 'bic' in result:
            result['bic'] = self._clean_value(result['bic']).upper()
            
        # Normalize sort code (UK)
        if 'sort_code' in result:
            result['sort_code'] = self._normalize_sort_code(result['sort_code'])
            
        # Normalize BSB (Australia)
        if 'bsb' in result:
            result['bsb'] = self._normalize_bsb(result['bsb'])
            
        # Normalize account number (remove spaces)
        if 'account_number' in result:
            result['account_number'] = re.sub(r'\s', '', result['account_number'])
            
        # Normalize payment terms
        if 'payment_terms' in result:
            result['payment_terms'] = self._clean_value(result['payment_terms'])
            
        return result
    
    def _normalize_line_item(self, line_item: Dict[str, Any]) -> Dict[str, Any]:
        """
        Normalize a single line item.
        
        Args:
            line_item: Dictionary representing a line item
            
        Returns:
            Normalized line item
        """
        result = line_item.copy()
        
        # Normalize description
        if 'description' in result:
            result['description'] = self._clean_value(result['description'])
            
        # Normalize numeric fields
        for field in ['quantity', 'unit_price', 'amount', 'tax_amount', 'gst_amount', 'vat_amount']:
            if field in result:
                result[field] = self._normalize_number(result[field])
                
        # Normalize rate fields
        for field in ['vat_rate', 'tax_rate', 'gst_rate']:
            if field in result:
                result[field] = self._normalize_number(result[field])
                
        # Ensure tax amount consistency
        if 'vat_amount' in result and 'tax_amount' not in result:
            result['tax_amount'] = result['vat_amount']
        elif 'tax_amount' in result and 'vat_amount' not in result:
            result['vat_amount'] = result['tax_amount']
        elif 'gst_amount' in result and 'tax_amount' not in result:
            result['tax_amount'] = result['gst_amount']
            
        return result
    
    def _normalize_date(self, date_str: Union[str, None]) -> Optional[str]:
        """
        Normalize date to standard format.
        
        Args:
            date_str: Date string in various formats
            
        Returns:
            Date in standard format or None if invalid
        """
        if not date_str:
            return None
            
        if isinstance(date_str, datetime):
            return date_str.strftime(self.output_date_format)
            
        # Already in target format
        if re.match(r'^\d{4}-\d{2}-\d{2}$', date_str):
            try:
                # Validate it's a legitimate date
                datetime.strptime(date_str, self.output_date_format)
                return date_str
            except ValueError:
                pass
        
        # Try all configured date formats
        for fmt in self.input_date_formats:
            try:
                date_obj = datetime.strptime(date_str, fmt)
                return date_obj.strftime(self.output_date_format)
            except (ValueError, TypeError):
                continue
                
        # Handle common date format with ordinals (1st, 2nd, 3rd, etc.)
        date_str = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)
        for fmt in self.input_date_formats:
            try:
                date_obj = datetime.strptime(date_str, fmt)
                return date_obj.strftime(self.output_date_format)
            except (ValueError, TypeError):
                continue
        
        # If all parsing attempts fail
        logger.warning(f"Could not normalize date: {date_str}")
        return date_str
    
    def _normalize_number(self, value: Union[str, int, float, None]) -> Optional[float]:
        """
        Normalize numeric value to float.
        
        Args:
            value: Numeric value in various formats
            
        Returns:
            Normalized float value or None if invalid
        """
        if value is None:
            return None
            
        if isinstance(value, (int, float)):
            return float(value)
            
        if not isinstance(value, str):
            logger.warning(f"Unexpected type for numeric value: {type(value)}")
            return None
            
        # Remove currency symbols and other non-numeric characters
        cleaned = self._remove_currency_symbols(value)
        
        # European format (1.234,56 -> 1234.56)
        if ',' in cleaned and '.' in cleaned:
            if cleaned.rindex(',') > cleaned.rindex('.'):
                # Format like "1.234,56"
                cleaned = cleaned.replace('.', '')
                cleaned = cleaned.replace(',', '.')
        elif ',' in cleaned and '.' not in cleaned:
            # Format like "1234,56"
            cleaned = cleaned.replace(',', '.')
            
        # Remove any remaining non-numeric characters except decimal point
        cleaned = re.sub(r'[^\d.]', '', cleaned)
        
        try:
            # Convert to float and handle precision
            result = float(cleaned)
            # Round to 2 decimal places for monetary values
            return round(result, 2)
        except (ValueError, TypeError):
            logger.warning(f"Could not normalize number: {value}")
            return None
    
    def _normalize_currency(self, currency: Union[str, None]) -> Optional[str]:
        """
        Normalize currency to standard 3-letter code.
        
        Args:
            currency: Currency string (e.g., $, USD, dollars)
            
        Returns:
            Normalized 3-letter currency code or None if invalid
        """
        if not currency:
            return None
            
        # Already a standard 3-letter code
        if re.match(r'^[A-Z]{3}$', currency):
            return currency
            
        # Handle currency symbols
        currency_upper = currency.upper().strip()
        for symbol, code in self.currency_symbols.items():
            if symbol in currency:
                return code
                
        # Try to match currency names
        currency_map = {
            'DOLLAR': 'USD',
            'DOLLARS': 'USD',
            'US DOLLAR': 'USD',
            'EURO': 'EUR',
            'EUROS': 'EUR',
            'POUND': 'GBP',
            'POUNDS': 'GBP',
            'STERLING': 'GBP',
            'YEN': 'JPY',
            'AUSTRALIAN DOLLAR': 'AUD',
            'CANADIAN DOLLAR': 'CAD',
            'FRANC': 'CHF',
            'KRONA': 'SEK'
        }
        
        for name, code in currency_map.items():
            if name in currency_upper:
                return code
                
        # Try using pycountry
        try:
            currency_obj = pycountry.currencies.lookup(currency_upper)
            if currency_obj:
                return currency_obj.alpha_3
        except (LookupError, AttributeError):
            pass
            
        # Return as-is if we can't normalize
        logger.warning(f"Could not normalize currency: {currency}")
        return currency
    
    def _normalize_tax_number(self, tax_number: Union[str, None], tax_type: str) -> Optional[str]:
        """
        Normalize tax identification number based on type and country.
        
        Args:
            tax_number: Tax number string
            tax_type: Type of tax ID ('vat', 'abn', etc.)
            
        Returns:
            Normalized tax number or None if invalid
        """
        if not tax_number:
            return None
            
        # Remove all non-alphanumeric characters
        clean_number = re.sub(r'[^a-zA-Z0-9]', '', tax_number)
        
        # VAT number normalization
        if tax_type == 'vat':
            # Try to detect country code from first 2 characters
            if len(clean_number) >= 2 and clean_number[:2].isalpha():
                country_code = clean_number[:2].upper()
                
                # Apply country-specific format if available
                if country_code in self.vat_number_formats:
                    pattern = self.vat_number_formats[country_code]
                    if not re.match(pattern, clean_number):
                        logger.warning(f"VAT number {clean_number} does not match expected format for {country_code}")
                        
                # Format based on country
                if country_code == 'GB':
                    if len(clean_number) == 11:  # GB + 9 digits
                        return f"GB{clean_number[2:]}"
                    elif len(clean_number) == 14:  # GB + 12 digits
                        return f"GB{clean_number[2:]}"
                elif country_code == 'BE':
                    if len(clean_number) >= 10:
                        if not clean_number[2] == '0':
                            clean_number = country_code + '0' + clean_number[2:]
                        return f"{clean_number[:2]}{clean_number[2:5]}.{clean_number[5:8]}.{clean_number[8:11]}"
            
            # If no country code detected, return cleaned number
            return clean_number
            
        # ABN normalization (Australian Business Number)
        elif tax_type == 'abn':
            if len(clean_number) == 11:
                return f"{clean_number[:2]} {clean_number[2:5]} {clean_number[5:8]} {clean_number[8:11]}"
            return clean_number
            
        # Default - return cleaned number
        return clean_number
    
    def _normalize_iban(self, iban: Union[str, None]) -> Optional[str]:
        """
        Normalize IBAN format.
        
        Args:
            iban: IBAN string
            
        Returns:
            Normalized IBAN or None if invalid
        """
        if not iban:
            return None
            
        # Remove spaces and convert to uppercase
        clean_iban = re.sub(r'\s', '', iban).upper()
        
        # Check if it's a valid IBAN (basic format check)
        if len(clean_iban) < 4 or not clean_iban[:2].isalpha():
            logger.warning(f"Invalid IBAN format: {iban}")
            return clean_iban
            
        # Format with spaces for better readability
        country_code = clean_iban[:2]
        
        if country_code == 'BE':
            if len(clean_iban) == 16:
                return f"{clean_iban[:4]} {clean_iban[4:8]} {clean_iban[8:12]} {clean_iban[12:16]}"
        elif country_code == 'GB':
            if len(clean_iban) == 22:
                return f"{clean_iban[:4]} {clean_iban[4:8]} {clean_iban[8:14]} {clean_iban[14:18]} {clean_iban[18:22]}"
            
        # Generic formatting with spaces every 4 characters
        formatted = ' '.join(clean_iban[i:i+4] for i in range(0, len(clean_iban), 4))
        return formatted
    
    def _normalize_sort_code(self, sort_code: Union[str, None]) -> Optional[str]:
        """
        Normalize UK sort code.
        
        Args:
            sort_code: Sort code string
            
        Returns:
            Normalized sort code (XX-XX-XX format) or None if invalid
        """
        if not sort_code:
            return None
            
        # Remove all non-numeric characters
        digits = re.sub(r'[^0-9]', '', sort_code)
        
        # Check length
        if len(digits) != 6:
            logger.warning(f"Invalid sort code length: {sort_code}")
            return sort_code
            
        # Format as XX-XX-XX
        return f"{digits[:2]}-{digits[2:4]}-{digits[4:6]}"
    
    def _normalize_bsb(self, bsb: Union[str, None]) -> Optional[str]:
        """
        Normalize Australian BSB number.
        
        Args:
            bsb: BSB string
            
        Returns:
            Normalized BSB (XXX-XXX format) or None if invalid
        """
        if not bsb:
            return None
            
        # Remove all non-numeric characters
        digits = re.sub(r'[^0-9]', '', bsb)
        
        # Check length
        if len(digits) != 6:
            logger.warning(f"Invalid BSB length: {bsb}")
            return bsb
            
        # Format as XXX-XXX
        return f"{digits[:3]}-{digits[3:6]}"
    
    def _normalize_address(self, address: Union[str, None]) -> Optional[str]:
        """
        Normalize address format.
        
        Args:
            address: Address string
            
        Returns:
            Normalized address string
        """
        if not address:
            return None
            
        # Clean up whitespace
        normalized = self._clean_value(address)
        
        # Replace multiple commas with a single comma
        normalized = re.sub(r',+', ',', normalized)
        
        # Ensure comma-space sequence
        normalized = re.sub(r',\s*', ', ', normalized)
        
        return normalized
    
    def _clean_invoice_number(self, invoice_number: Union[str, None]) -> Optional[str]:
        """
        Clean invoice number by removing common prefixes and symbols.
        
        Args:
            invoice_number: Invoice number string
            
        Returns:
            Cleaned invoice number string
        """
        if not invoice_number:
            return None
            
        # Convert to string if not already
        if not isinstance(invoice_number, str):
            invoice_number = str(invoice_number)
            
        # Remove common prefixes
        prefixes = ['INV', 'INVOICE', 'INV#', 'INVOICE#', 'INVOICE NO', 'INVOICE NUMBER', '#']
        cleaned = invoice_number.upper()
        
        for prefix in prefixes:
            if cleaned.startswith(prefix):
                cleaned = cleaned[len(prefix):].strip()
                
        # Remove special characters from start/end
        cleaned = cleaned.strip('-_:#. \t')
        
        # Convert back to original case
        return invoice_number[invoice_number.upper().index(cleaned):invoice_number.upper().index(cleaned) + len(cleaned)]
    
    def _clean_value(self, value: Union[str, None]) -> Optional[str]:
        """
        General purpose string cleaning.
        
        Args:
            value: String value to clean
            
        Returns:
            Cleaned string value
        """
        if not value:
            return None
            
        if not isinstance(value, str):
            return str(value)
            
        # Normalize whitespace
        cleaned = ' '.join(value.split())
        
        # Remove leading/trailing punctuation
        cleaned = cleaned.strip('.,;:-_() \t')
        
        return cleaned
    
    def _remove_currency_symbols(self, value: str) -> str:
        """
        Remove currency symbols from a string.
        
        Args:
            value: String value with possible currency symbols
            
        Returns:
            String with currency symbols removed
        """
        # Common currency symbols
        symbols = ['$', '€', '£', '¥', '₹', '₽', 'Fr.', 'kr', 'A$', 'C$']
        
        result = value
        for symbol in symbols:
            result = result.replace(symbol, '')
            
        return result.strip()
    
    def _resolve_field_conflicts(self, extraction_result: Dict[str, Any]) -> Dict[str, Any]:
        """
        Resolve any conflicts or redundancies in the extraction result.
        
        Args:
            extraction_result: Extraction result dictionary
            
        Returns:
            Extraction result with conflicts resolved
        """
        # Copy to avoid modifying the original
        result = extraction_result.copy()
        
        # Handle tax vs. vat vs. gst terminology
        if 'amounts' in result:
            # If we have both tax and vat, and they're different
            if 'tax' in result['amounts'] and 'vat' in result['amounts']:
                tax = result['amounts']['tax']
                vat = result['amounts']['vat']
                
                if tax != vat:
                    # Prefer the non-None value, or the higher value if both are set
                    if tax is None:
                        result['amounts']['tax'] = vat
                    elif vat is None:
                        result['amounts']['vat'] = tax
                    else:
                        # Both are set - use the higher value (assuming entity recognizes both terms)
                        if float(tax) > float(vat):
                            result['amounts']['vat'] = tax
                        else:
                            result['amounts']['tax'] = vat
            
            # If we have both tax and gst, and they're different
            if 'tax' in result['amounts'] and 'gst' in result['amounts']:
                tax = result['amounts']['tax']
                gst = result['amounts']['gst']
                
                if tax != gst:
                    # For Australian entities, prefer GST
                    if 'AU' in str(result.get('metadata', {}).get('entity', '')):
                        result['amounts']['tax'] = gst
                    else:
                        result['amounts']['gst'] = tax
        
        return result

    def standardize_field_names(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Convert non-standard field names to standard format using mappings.
        
        Args:
            data: Dictionary with possibly non-standard field names
            
        Returns:
            Dictionary with standardized field names
        """
        result = {}
        
        for key, value in data.items():
            # Check if this is a mapped field
            if key in self.field_name_mappings:
                # Get the standardized path
                std_path = self.field_name_mappings[key]
                
                # Split into parts for nested structure
                parts = std_path.split('.')
                
                # Navigate to the target location
                current = result
                for i, part in enumerate(parts[:-1]):
                    if part not in current:
                        current[part] = {}
                    current = current[part]
                
                # Set the value at the final location
                current[parts[-1]] = value
            else:
                # Keep unmapped fields as-is
                result[key] = value
                
        return result

Parameters

Name	Type	Default	Kind
`bases`	-	-

Parameter Details

bases: Parameter of type

Return Value

Returns unspecified type

Class Interface

Methods

`__init__(self, config)`

Purpose: Initialize format normalizer with configuration. Args: config: Dictionary containing normalization configuration

Parameters:

config: Type: Optional[Dict[str, Any]]

Returns: None

`normalize_extraction_result(self, extraction_result) -> Dict[str, Any]`

Purpose: Normalize all fields in an extraction result. Args: extraction_result: Dictionary containing extracted invoice data Returns: Normalized extraction result

Parameters:

extraction_result: Type: Dict[str, Any]

Returns: Returns Dict[str, Any]

`_normalize_invoice_fields(self, invoice_data) -> Dict[str, Any]`

Purpose: Normalize invoice-related fields. Args: invoice_data: Dictionary of invoice fields Returns: Normalized invoice fields

Parameters:

invoice_data: Type: Dict[str, Any]

Returns: Returns Dict[str, Any]

`_normalize_vendor_fields(self, vendor_data) -> Dict[str, Any]`

Purpose: Normalize vendor-related fields. Args: vendor_data: Dictionary of vendor fields Returns: Normalized vendor fields

Parameters:

vendor_data: Type: Dict[str, Any]

Returns: Returns Dict[str, Any]

`_normalize_amount_fields(self, amount_data) -> Dict[str, Any]`

Purpose: Normalize amount-related fields. Args: amount_data: Dictionary of amount fields Returns: Normalized amount fields

Parameters:

amount_data: Type: Dict[str, Any]

Returns: Returns Dict[str, Any]

`_normalize_payment_fields(self, payment_data) -> Dict[str, Any]`

Purpose: Normalize payment-related fields. Args: payment_data: Dictionary of payment fields Returns: Normalized payment fields

Parameters:

payment_data: Type: Dict[str, Any]

Returns: Returns Dict[str, Any]

`_normalize_line_item(self, line_item) -> Dict[str, Any]`

Purpose: Normalize a single line item. Args: line_item: Dictionary representing a line item Returns: Normalized line item

Parameters:

line_item: Type: Dict[str, Any]

Returns: Returns Dict[str, Any]

`_normalize_date(self, date_str) -> Optional[str]`

Purpose: Normalize date to standard format. Args: date_str: Date string in various formats Returns: Date in standard format or None if invalid

Parameters:

date_str: Type: Union[str, None]

Returns: Returns Optional[str]

`_normalize_number(self, value) -> Optional[float]`

Purpose: Normalize numeric value to float. Args: value: Numeric value in various formats Returns: Normalized float value or None if invalid

Parameters:

value: Type: Union[str, int, float, None]

Returns: Returns Optional[float]

`_normalize_currency(self, currency) -> Optional[str]`

Purpose: Normalize currency to standard 3-letter code. Args: currency: Currency string (e.g., $, USD, dollars) Returns: Normalized 3-letter currency code or None if invalid

Parameters:

currency: Type: Union[str, None]

Returns: Returns Optional[str]

`_normalize_tax_number(self, tax_number, tax_type) -> Optional[str]`

Purpose: Normalize tax identification number based on type and country. Args: tax_number: Tax number string tax_type: Type of tax ID ('vat', 'abn', etc.) Returns: Normalized tax number or None if invalid

Parameters:

tax_number: Type: Union[str, None]
tax_type: Type: str

Returns: Returns Optional[str]

`_normalize_iban(self, iban) -> Optional[str]`

Purpose: Normalize IBAN format. Args: iban: IBAN string Returns: Normalized IBAN or None if invalid

Parameters:

iban: Type: Union[str, None]

Returns: Returns Optional[str]

`_normalize_sort_code(self, sort_code) -> Optional[str]`

Purpose: Normalize UK sort code. Args: sort_code: Sort code string Returns: Normalized sort code (XX-XX-XX format) or None if invalid

Parameters:

sort_code: Type: Union[str, None]

Returns: Returns Optional[str]

`_normalize_bsb(self, bsb) -> Optional[str]`

Purpose: Normalize Australian BSB number. Args: bsb: BSB string Returns: Normalized BSB (XXX-XXX format) or None if invalid

Parameters:

bsb: Type: Union[str, None]

Returns: Returns Optional[str]

`_normalize_address(self, address) -> Optional[str]`

Purpose: Normalize address format. Args: address: Address string Returns: Normalized address string

Parameters:

address: Type: Union[str, None]

Returns: Returns Optional[str]

`_clean_invoice_number(self, invoice_number) -> Optional[str]`

Purpose: Clean invoice number by removing common prefixes and symbols. Args: invoice_number: Invoice number string Returns: Cleaned invoice number string

Parameters:

invoice_number: Type: Union[str, None]

Returns: Returns Optional[str]

`_clean_value(self, value) -> Optional[str]`

Purpose: General purpose string cleaning. Args: value: String value to clean Returns: Cleaned string value

Parameters:

value: Type: Union[str, None]

Returns: Returns Optional[str]

`_remove_currency_symbols(self, value) -> str`

Purpose: Remove currency symbols from a string. Args: value: String value with possible currency symbols Returns: String with currency symbols removed

Parameters:

value: Type: str

Returns: Returns str

`_resolve_field_conflicts(self, extraction_result) -> Dict[str, Any]`

Purpose: Resolve any conflicts or redundancies in the extraction result. Args: extraction_result: Extraction result dictionary Returns: Extraction result with conflicts resolved

Parameters:

extraction_result: Type: Dict[str, Any]

Returns: Returns Dict[str, Any]

`standardize_field_names(self, data) -> Dict[str, Any]`

Purpose: Convert non-standard field names to standard format using mappings. Args: data: Dictionary with possibly non-standard field names Returns: Dictionary with standardized field names

Parameters:

data: Type: Dict[str, Any]

Returns: Returns Dict[str, Any]

Required Imports

import re
import logging
from datetime import datetime
from typing import Dict
from typing import Any

Usage Example

# Example usage:
# normalizer = FormatNormalizer(config)
# result = normalizer.normalize_extraction_result(extraction_result)

Similar Components

AI-powered semantic similarity - components with related functionality:

function clean_text 49.7% similar

Cleans and normalizes text content by removing HTML tags, normalizing whitespace, and stripping markdown formatting elements.
From: /tf/active/vicechatdev/improved_convert_disclosures_to_table.py
class UKValidator 47.9% similar

UK-specific invoice data validator that extends BaseValidator to implement validation rules specific to UK invoices including VAT number format, UK addresses, VAT rates, and banking details.
From: /tf/active/vicechatdev/invoice_extraction/validators/uk_validator.py
class BaseValidator 46.7% similar

Abstract base class for validating extracted invoice data with entity-specific validation rules. Provides common validation functionality for required fields, field types, date consistency, and amount calculations.
From: /tf/active/vicechatdev/invoice_extraction/validators/base_validator.py
class BEExtractor 46.3% similar

Belgium-specific invoice data extractor that uses LLM (Large Language Model) to extract structured invoice data from Belgian invoices in multiple languages (English, French, Dutch).
From: /tf/active/vicechatdev/invoice_extraction/extractors/be_extractor.py
class BEValidator 46.2% similar

Belgium-specific invoice data validator that extends BaseValidator to implement Belgian invoice validation rules including VAT number format, address verification, IBAN validation, and legal requirements.
From: /tf/active/vicechatdev/invoice_extraction/validators/be_validator.py

← Back to Browse

Assistant

Hi! I can help improve this code. Tell me what you'd like to enhance (e.g., "add error handling", "optimize performance", "improve readability", "add type hints").

Code Comparison

Original Code

                            class FormatNormalizer:
    """
    Normalizes extracted data formats to ensure consistency.
    
    Handles:
    - Date format standardization
    - Number/currency normalization
    - VAT/tax number formatting
    - Field name standardization
    - Address formatting
    - Field value cleaning
    """
    
    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Initialize format normalizer with configuration.
        
        Args:
            config: Dictionary containing normalization configuration
        """
        self.config = config or {}
        
        # Date format configuration
        self.input_date_formats = self.config.get('input_date_formats', [
            '%d/%m/%Y', '%m/%d/%Y', '%Y-%m-%d', '%d-%m-%Y',
            '%d.%m.%Y', '%m.%d.%Y', '%B %d, %Y', '%d %B %Y',
            '%b %d, %Y', '%d %b %Y', '%d-%b-%Y', '%Y/%m/%d'
        ])
        self.output_date_format = self.config.get('output_date_format', '%Y-%m-%d')
        
        # Currency/number configuration
        self.decimal_separator = self.config.get('decimal_separator', '.')
        self.thousands_separator = self.config.get('thousands_separator', ',')
        self.currency_symbols = {
            '$': 'USD', '€': 'EUR', '£': 'GBP', '¥': 'JPY',
            'kr': 'SEK', 'Fr.': 'CHF', 'A$': 'AUD', 'CA$': 'CAD'
        }
        
        # Field name mappings for standardization
        self.field_name_mappings = {
            # Invoice fields
            'invoice_number': 'invoice.number',
            'invoice_no': 'invoice.number',
            'inv_number': 'invoice.number',
            'invoice_date': 'invoice.issue_date',
            'date': 'invoice.issue_date',
            'issue_date': 'invoice.issue_date',
            'due_date': 'invoice.due_date',
            'payment_due': 'invoice.due_date',
            'po_number': 'invoice.po_number',
            'purchase_order': 'invoice.po_number',
            
            # Vendor fields
            'supplier': 'vendor.name',
            'supplier_name': 'vendor.name',
            'vendor_name': 'vendor.name',
            'supplier_address': 'vendor.address',
            'vendor_address': 'vendor.address',
            'vat_number': 'vendor.vat_number',
            'vat_no': 'vendor.vat_number',
            'tax_number': 'vendor.vat_number',
            'abn': 'vendor.abn',
            'company_number': 'vendor.company_number',
            
            # Amount fields
            'total': 'amounts.total',
            'total_amount': 'amounts.total',
            'subtotal': 'amounts.subtotal',
            'net_amount': 'amounts.subtotal',
            'vat': 'amounts.vat',
            'vat_amount': 'amounts.vat',
            'tax': 'amounts.tax',
            'tax_amount': 'amounts.tax',
            'gst': 'amounts.gst',
            'gst_amount': 'amounts.gst',
            'currency': 'amounts.currency',
            'vat_rate': 'amounts.vat_rate',
            'tax_rate': 'amounts.tax_rate',
            
            # Payment fields
            'bank_account': 'payment.account_number',
            'account_number': 'payment.account_number',
            'sort_code': 'payment.sort_code',
            'iban': 'payment.iban',
            'bic': 'payment.bic',
            'swift': 'payment.bic',
            'bsb': 'payment.bsb',
        }
        
        # Country-specific VAT/tax number formats
        self.vat_number_formats = {
            'GB': r'^GB\d{9}$|^GB\d{12}$',                      # UK
            'BE': r'^BE0\d{9}$',                                # Belgium
            'AU': r'^\d{11}$',                                  # Australia (ABN)
            'DE': r'^DE\d{9}$',                                # Germany
            'FR': r'^FR[A-Z0-9]{2}\d{9}$',                     # France
            'IT': r'^IT\d{11}$',                               # Italy
            'ES': r'^ES[A-Z0-9]\d{8}$',                        # Spain
            'NL': r'^NL\d{9}B\d{2}$',                          # Netherlands
        }
    
    def normalize_extraction_result(self, extraction_result: Dict[str, Any]) -> Dict[str, Any]:
        """
        Normalize all fields in an extraction result.
        
        Args:
            extraction_result: Dictionary containing extracted invoice data
            
        Returns:
            Normalized extraction result
        """
        # Create a copy to avoid modifying the original
        result = extraction_result.copy()
        
        # Normalize invoice fields
        if 'invoice' in result:
            result['invoice'] = self._normalize_invoice_fields(result['invoice'])
            
        # Normalize vendor fields
        if 'vendor' in result:
            result['vendor'] = self._normalize_vendor_fields(result['vendor'])
            
        # Normalize amounts
        if 'amounts' in result:
            result['amounts'] = self._normalize_amount_fields(result['amounts'])
            
        # Normalize payment information
        if 'payment' in result:
            result['payment'] = self._normalize_payment_fields(result['payment'])
            
        # Normalize line items
        if 'line_items' in result and isinstance(result['line_items'], list):
            result['line_items'] = [
                self._normalize_line_item(item) 
                for item in result['line_items']
            ]
            
        # Resolve any field conflicts or redundancies
        result = self._resolve_field_conflicts(result)
        
        # Add normalization metadata
        if 'metadata' not in result:
            result['metadata'] = {}
        result['metadata']['normalized'] = True
        result['metadata']['normalization_date'] = datetime.now().strftime(self.output_date_format)
        
        return result
    
    def _normalize_invoice_fields(self, invoice_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Normalize invoice-related fields.
        
        Args:
            invoice_data: Dictionary of invoice fields
            
        Returns:
            Normalized invoice fields
        """
        result = invoice_data.copy()
        
        # Normalize invoice number (remove prefixes like "INV-" or "#")
        if 'number' in result:
            result['number'] = self._clean_invoice_number(result['number'])
            
        # Normalize dates
        for date_field in ['issue_date', 'due_date']:
            if date_field in result:
                result[date_field] = self._normalize_date(result[date_field])
                
        # Normalize PO number
        if 'po_number' in result:
            result['po_number'] = self._clean_invoice_number(result['po_number'])
            
        return result
    
    def _normalize_vendor_fields(self, vendor_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Normalize vendor-related fields.
        
        Args:
            vendor_data: Dictionary of vendor fields
            
        Returns:
            Normalized vendor fields
        """
        result = vendor_data.copy()
        
        # Normalize VAT number
        if 'vat_number' in result:
            result['vat_number'] = self._normalize_tax_number(result['vat_number'], 'vat')
            
        # Normalize ABN (Australian Business Number)
        if 'abn' in result:
            result['abn'] = self._normalize_tax_number(result['abn'], 'abn')
            
        # Normalize company number
        if 'company_number' in result:
            result['company_number'] = self._clean_value(result['company_number'])
            
        # Normalize address
        if 'address' in result:
            result['address'] = self._normalize_address(result['address'])
            
        # Clean vendor name
        if 'name' in result:
            result['name'] = self._clean_value(result['name'])
            
        return result
    
    def _normalize_amount_fields(self, amount_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Normalize amount-related fields.
        
        Args:
            amount_data: Dictionary of amount fields
            
        Returns:
            Normalized amount fields
        """
        result = amount_data.copy()
        
        # Normalize numeric fields
        for field in ['subtotal', 'total', 'vat', 'tax', 'gst']:
            if field in result:
                result[field] = self._normalize_number(result[field])
                
        # Normalize rate fields
        for field in ['vat_rate', 'tax_rate', 'gst_rate']:
            if field in result:
                result[field] = self._normalize_number(result[field])
                
        # Normalize currency
        if 'currency' in result:
            result['currency'] = self._normalize_currency(result['currency'])
            
        # Ensure tax field consistency
        if 'vat' in result and 'tax' not in result:
            result['tax'] = result['vat']
        elif 'tax' in result and 'vat' not in result:
            result['vat'] = result['tax']
        elif 'gst' in result and 'tax' not in result:
            result['tax'] = result['gst']
        
        # Ensure rate field consistency
        if 'vat_rate' in result and 'tax_rate' not in result:
            result['tax_rate'] = result['vat_rate']
        elif 'tax_rate' in result and 'vat_rate' not in result:
            result['vat_rate'] = result['tax_rate']
            
        return result
    
    def _normalize_payment_fields(self, payment_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Normalize payment-related fields.
        
        Args:
            payment_data: Dictionary of payment fields
            
        Returns:
            Normalized payment fields
        """
        result = payment_data.copy()
        
        # Normalize IBAN (remove spaces and format)
        if 'iban' in result:
            result['iban'] = self._normalize_iban(result['iban'])
            
        # Normalize BIC/SWIFT
        if 'bic' in result:
            result['bic'] = self._clean_value(result['bic']).upper()
            
        # Normalize sort code (UK)
        if 'sort_code' in result:
            result['sort_code'] = self._normalize_sort_code(result['sort_code'])
            
        # Normalize BSB (Australia)
        if 'bsb' in result:
            result['bsb'] = self._normalize_bsb(result['bsb'])
            
        # Normalize account number (remove spaces)
        if 'account_number' in result:
            result['account_number'] = re.sub(r'\s', '', result['account_number'])
            
        # Normalize payment terms
        if 'payment_terms' in result:
            result['payment_terms'] = self._clean_value(result['payment_terms'])
            
        return result
    
    def _normalize_line_item(self, line_item: Dict[str, Any]) -> Dict[str, Any]:
        """
        Normalize a single line item.
        
        Args:
            line_item: Dictionary representing a line item
            
        Returns:
            Normalized line item
        """
        result = line_item.copy()
        
        # Normalize description
        if 'description' in result:
            result['description'] = self._clean_value(result['description'])
            
        # Normalize numeric fields
        for field in ['quantity', 'unit_price', 'amount', 'tax_amount', 'gst_amount', 'vat_amount']:
            if field in result:
                result[field] = self._normalize_number(result[field])
                
        # Normalize rate fields
        for field in ['vat_rate', 'tax_rate', 'gst_rate']:
            if field in result:
                result[field] = self._normalize_number(result[field])
                
        # Ensure tax amount consistency
        if 'vat_amount' in result and 'tax_amount' not in result:
            result['tax_amount'] = result['vat_amount']
        elif 'tax_amount' in result and 'vat_amount' not in result:
            result['vat_amount'] = result['tax_amount']
        elif 'gst_amount' in result and 'tax_amount' not in result:
            result['tax_amount'] = result['gst_amount']
            
        return result
    
    def _normalize_date(self, date_str: Union[str, None]) -> Optional[str]:
        """
        Normalize date to standard format.
        
        Args:
            date_str: Date string in various formats
            
        Returns:
            Date in standard format or None if invalid
        """
        if not date_str:
            return None
            
        if isinstance(date_str, datetime):
            return date_str.strftime(self.output_date_format)
            
        # Already in target format
        if re.match(r'^\d{4}-\d{2}-\d{2}$', date_str):
            try:
                # Validate it's a legitimate date
                datetime.strptime(date_str, self.output_date_format)
                return date_str
            except ValueError:
                pass
        
        # Try all configured date formats
        for fmt in self.input_date_formats:
            try:
                date_obj = datetime.strptime(date_str, fmt)
                return date_obj.strftime(self.output_date_format)
            except (ValueError, TypeError):
                continue
                
        # Handle common date format with ordinals (1st, 2nd, 3rd, etc.)
        date_str = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)
        for fmt in self.input_date_formats:
            try:
                date_obj = datetime.strptime(date_str, fmt)
                return date_obj.strftime(self.output_date_format)
            except (ValueError, TypeError):
                continue
        
        # If all parsing attempts fail
        logger.warning(f"Could not normalize date: {date_str}")
        return date_str
    
    def _normalize_number(self, value: Union[str, int, float, None]) -> Optional[float]:
        """
        Normalize numeric value to float.
        
        Args:
            value: Numeric value in various formats
            
        Returns:
            Normalized float value or None if invalid
        """
        if value is None:
            return None
            
        if isinstance(value, (int, float)):
            return float(value)
            
        if not isinstance(value, str):
            logger.warning(f"Unexpected type for numeric value: {type(value)}")
            return None
            
        # Remove currency symbols and other non-numeric characters
        cleaned = self._remove_currency_symbols(value)
        
        # European format (1.234,56 -> 1234.56)
        if ',' in cleaned and '.' in cleaned:
            if cleaned.rindex(',') > cleaned.rindex('.'):
                # Format like "1.234,56"
                cleaned = cleaned.replace('.', '')
                cleaned = cleaned.replace(',', '.')
        elif ',' in cleaned and '.' not in cleaned:
            # Format like "1234,56"
            cleaned = cleaned.replace(',', '.')
            
        # Remove any remaining non-numeric characters except decimal point
        cleaned = re.sub(r'[^\d.]', '', cleaned)
        
        try:
            # Convert to float and handle precision
            result = float(cleaned)
            # Round to 2 decimal places for monetary values
            return round(result, 2)
        except (ValueError, TypeError):
            logger.warning(f"Could not normalize number: {value}")
            return None
    
    def _normalize_currency(self, currency: Union[str, None]) -> Optional[str]:
        """
        Normalize currency to standard 3-letter code.
        
        Args:
            currency: Currency string (e.g., $, USD, dollars)
            
        Returns:
            Normalized 3-letter currency code or None if invalid
        """
        if not currency:
            return None
            
        # Already a standard 3-letter code
        if re.match(r'^[A-Z]{3}$', currency):
            return currency
            
        # Handle currency symbols
        currency_upper = currency.upper().strip()
        for symbol, code in self.currency_symbols.items():
            if symbol in currency:
                return code
                
        # Try to match currency names
        currency_map = {
            'DOLLAR': 'USD',
            'DOLLARS': 'USD',
            'US DOLLAR': 'USD',
            'EURO': 'EUR',
            'EUROS': 'EUR',
            'POUND': 'GBP',
            'POUNDS': 'GBP',
            'STERLING': 'GBP',
            'YEN': 'JPY',
            'AUSTRALIAN DOLLAR': 'AUD',
            'CANADIAN DOLLAR': 'CAD',
            'FRANC': 'CHF',
            'KRONA': 'SEK'
        }
        
        for name, code in currency_map.items():
            if name in currency_upper:
                return code
                
        # Try using pycountry
        try:
            currency_obj = pycountry.currencies.lookup(currency_upper)
            if currency_obj:
                return currency_obj.alpha_3
        except (LookupError, AttributeError):
            pass
            
        # Return as-is if we can't normalize
        logger.warning(f"Could not normalize currency: {currency}")
        return currency
    
    def _normalize_tax_number(self, tax_number: Union[str, None], tax_type: str) -> Optional[str]:
        """
        Normalize tax identification number based on type and country.
        
        Args:
            tax_number: Tax number string
            tax_type: Type of tax ID ('vat', 'abn', etc.)
            
        Returns:
            Normalized tax number or None if invalid
        """
        if not tax_number:
            return None
            
        # Remove all non-alphanumeric characters
        clean_number = re.sub(r'[^a-zA-Z0-9]', '', tax_number)
        
        # VAT number normalization
        if tax_type == 'vat':
            # Try to detect country code from first 2 characters
            if len(clean_number) >= 2 and clean_number[:2].isalpha():
                country_code = clean_number[:2].upper()
                
                # Apply country-specific format if available
                if country_code in self.vat_number_formats:
                    pattern = self.vat_number_formats[country_code]
                    if not re.match(pattern, clean_number):
                        logger.warning(f"VAT number {clean_number} does not match expected format for {country_code}")
                        
                # Format based on country
                if country_code == 'GB':
                    if len(clean_number) == 11:  # GB + 9 digits
                        return f"GB{clean_number[2:]}"
                    elif len(clean_number) == 14:  # GB + 12 digits
                        return f"GB{clean_number[2:]}"
                elif country_code == 'BE':
                    if len(clean_number) >= 10:
                        if not clean_number[2] == '0':
                            clean_number = country_code + '0' + clean_number[2:]
                        return f"{clean_number[:2]}{clean_number[2:5]}.{clean_number[5:8]}.{clean_number[8:11]}"
            
            # If no country code detected, return cleaned number
            return clean_number
            
        # ABN normalization (Australian Business Number)
        elif tax_type == 'abn':
            if len(clean_number) == 11:
                return f"{clean_number[:2]} {clean_number[2:5]} {clean_number[5:8]} {clean_number[8:11]}"
            return clean_number
            
        # Default - return cleaned number
        return clean_number
    
    def _normalize_iban(self, iban: Union[str, None]) -> Optional[str]:
        """
        Normalize IBAN format.
        
        Args:
            iban: IBAN string
            
        Returns:
            Normalized IBAN or None if invalid
        """
        if not iban:
            return None
            
        # Remove spaces and convert to uppercase
        clean_iban = re.sub(r'\s', '', iban).upper()
        
        # Check if it's a valid IBAN (basic format check)
        if len(clean_iban) < 4 or not clean_iban[:2].isalpha():
            logger.warning(f"Invalid IBAN format: {iban}")
            return clean_iban
            
        # Format with spaces for better readability
        country_code = clean_iban[:2]
        
        if country_code == 'BE':
            if len(clean_iban) == 16:
                return f"{clean_iban[:4]} {clean_iban[4:8]} {clean_iban[8:12]} {clean_iban[12:16]}"
        elif country_code == 'GB':
            if len(clean_iban) == 22:
                return f"{clean_iban[:4]} {clean_iban[4:8]} {clean_iban[8:14]} {clean_iban[14:18]} {clean_iban[18:22]}"
            
        # Generic formatting with spaces every 4 characters
        formatted = ' '.join(clean_iban[i:i+4] for i in range(0, len(clean_iban), 4))
        return formatted
    
    def _normalize_sort_code(self, sort_code: Union[str, None]) -> Optional[str]:
        """
        Normalize UK sort code.
        
        Args:
            sort_code: Sort code string
            
        Returns:
            Normalized sort code (XX-XX-XX format) or None if invalid
        """
        if not sort_code:
            return None
            
        # Remove all non-numeric characters
        digits = re.sub(r'[^0-9]', '', sort_code)
        
        # Check length
        if len(digits) != 6:
            logger.warning(f"Invalid sort code length: {sort_code}")
            return sort_code
            
        # Format as XX-XX-XX
        return f"{digits[:2]}-{digits[2:4]}-{digits[4:6]}"
    
    def _normalize_bsb(self, bsb: Union[str, None]) -> Optional[str]:
        """
        Normalize Australian BSB number.
        
        Args:
            bsb: BSB string
            
        Returns:
            Normalized BSB (XXX-XXX format) or None if invalid
        """
        if not bsb:
            return None
            
        # Remove all non-numeric characters
        digits = re.sub(r'[^0-9]', '', bsb)
        
        # Check length
        if len(digits) != 6:
            logger.warning(f"Invalid BSB length: {bsb}")
            return bsb
            
        # Format as XXX-XXX
        return f"{digits[:3]}-{digits[3:6]}"
    
    def _normalize_address(self, address: Union[str, None]) -> Optional[str]:
        """
        Normalize address format.
        
        Args:
            address: Address string
            
        Returns:
            Normalized address string
        """
        if not address:
            return None
            
        # Clean up whitespace
        normalized = self._clean_value(address)
        
        # Replace multiple commas with a single comma
        normalized = re.sub(r',+', ',', normalized)
        
        # Ensure comma-space sequence
        normalized = re.sub(r',\s*', ', ', normalized)
        
        return normalized
    
    def _clean_invoice_number(self, invoice_number: Union[str, None]) -> Optional[str]:
        """
        Clean invoice number by removing common prefixes and symbols.
        
        Args:
            invoice_number: Invoice number string
            
        Returns:
            Cleaned invoice number string
        """
        if not invoice_number:
            return None
            
        # Convert to string if not already
        if not isinstance(invoice_number, str):
            invoice_number = str(invoice_number)
            
        # Remove common prefixes
        prefixes = ['INV', 'INVOICE', 'INV#', 'INVOICE#', 'INVOICE NO', 'INVOICE NUMBER', '#']
        cleaned = invoice_number.upper()
        
        for prefix in prefixes:
            if cleaned.startswith(prefix):
                cleaned = cleaned[len(prefix):].strip()
                
        # Remove special characters from start/end
        cleaned = cleaned.strip('-_:#. \t')
        
        # Convert back to original case
        return invoice_number[invoice_number.upper().index(cleaned):invoice_number.upper().index(cleaned) + len(cleaned)]
    
    def _clean_value(self, value: Union[str, None]) -> Optional[str]:
        """
        General purpose string cleaning.
        
        Args:
            value: String value to clean
            
        Returns:
            Cleaned string value
        """
        if not value:
            return None
            
        if not isinstance(value, str):
            return str(value)
            
        # Normalize whitespace
        cleaned = ' '.join(value.split())
        
        # Remove leading/trailing punctuation
        cleaned = cleaned.strip('.,;:-_() \t')
        
        return cleaned
    
    def _remove_currency_symbols(self, value: str) -> str:
        """
        Remove currency symbols from a string.
        
        Args:
            value: String value with possible currency symbols
            
        Returns:
            String with currency symbols removed
        """
        # Common currency symbols
        symbols = ['$', '€', '£', '¥', '₹', '₽', 'Fr.', 'kr', 'A$', 'C$']
        
        result = value
        for symbol in symbols:
            result = result.replace(symbol, '')
            
        return result.strip()
    
    def _resolve_field_conflicts(self, extraction_result: Dict[str, Any]) -> Dict[str, Any]:
        """
        Resolve any conflicts or redundancies in the extraction result.
        
        Args:
            extraction_result: Extraction result dictionary
            
        Returns:
            Extraction result with conflicts resolved
        """
        # Copy to avoid modifying the original
        result = extraction_result.copy()
        
        # Handle tax vs. vat vs. gst terminology
        if 'amounts' in result:
            # If we have both tax and vat, and they're different
            if 'tax' in result['amounts'] and 'vat' in result['amounts']:
                tax = result['amounts']['tax']
                vat = result['amounts']['vat']
                
                if tax != vat:
                    # Prefer the non-None value, or the higher value if both are set
                    if tax is None:
                        result['amounts']['tax'] = vat
                    elif vat is None:
                        result['amounts']['vat'] = tax
                    else:
                        # Both are set - use the higher value (assuming entity recognizes both terms)
                        if float(tax) > float(vat):
                            result['amounts']['vat'] = tax
                        else:
                            result['amounts']['tax'] = vat
            
            # If we have both tax and gst, and they're different
            if 'tax' in result['amounts'] and 'gst' in result['amounts']:
                tax = result['amounts']['tax']
                gst = result['amounts']['gst']
                
                if tax != gst:
                    # For Australian entities, prefer GST
                    if 'AU' in str(result.get('metadata', {}).get('entity', '')):
                        result['amounts']['tax'] = gst
                    else:
                        result['amounts']['gst'] = tax
        
        return result

    def standardize_field_names(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Convert non-standard field names to standard format using mappings.
        
        Args:
            data: Dictionary with possibly non-standard field names
            
        Returns:
            Dictionary with standardized field names
        """
        result = {}
        
        for key, value in data.items():
            # Check if this is a mapped field
            if key in self.field_name_mappings:
                # Get the standardized path
                std_path = self.field_name_mappings[key]
                
                # Split into parts for nested structure
                parts = std_path.split('.')
                
                # Navigate to the target location
                current = result
                for i, part in enumerate(parts[:-1]):
                    if part not in current:
                        current[part] = {}
                    current = current[part]
                
                # Set the value at the final location
                current[parts[-1]] = value
            else:
                # Keep unmapped fields as-is
                result[key] = value
                
        return result
                        

Improved Code

🔍 Code Extractor

class FormatNormalizer

Purpose

Source Code

Parameters

Parameter Details

Return Value

Class Interface

Methods

`__init__(self, config)`

`normalize_extraction_result(self, extraction_result) -> Dict[str, Any]`

`_normalize_invoice_fields(self, invoice_data) -> Dict[str, Any]`

`_normalize_vendor_fields(self, vendor_data) -> Dict[str, Any]`

`_normalize_amount_fields(self, amount_data) -> Dict[str, Any]`

`_normalize_payment_fields(self, payment_data) -> Dict[str, Any]`

`_normalize_line_item(self, line_item) -> Dict[str, Any]`

`_normalize_date(self, date_str) -> Optional[str]`

`_normalize_number(self, value) -> Optional[float]`

`_normalize_currency(self, currency) -> Optional[str]`

`_normalize_tax_number(self, tax_number, tax_type) -> Optional[str]`

`_normalize_iban(self, iban) -> Optional[str]`

`_normalize_sort_code(self, sort_code) -> Optional[str]`

`_normalize_bsb(self, bsb) -> Optional[str]`

`_normalize_address(self, address) -> Optional[str]`

`_clean_invoice_number(self, invoice_number) -> Optional[str]`

`_clean_value(self, value) -> Optional[str]`

`_remove_currency_symbols(self, value) -> str`

`_resolve_field_conflicts(self, extraction_result) -> Dict[str, Any]`

`standardize_field_names(self, data) -> Dict[str, Any]`

Required Imports

Usage Example

Tags

Similar Components

function clean_text 49.7% similar

class UKValidator 47.9% similar

class BaseValidator 46.7% similar

class BEExtractor 46.3% similar

class BEValidator 46.2% similar

class FormatNormalizer

Purpose

Source Code

Parameters

Parameter Details

Return Value

Class Interface

Methods

__init__(self, config)

normalize_extraction_result(self, extraction_result) -> Dict[str, Any]

_normalize_invoice_fields(self, invoice_data) -> Dict[str, Any]

_normalize_vendor_fields(self, vendor_data) -> Dict[str, Any]

_normalize_amount_fields(self, amount_data) -> Dict[str, Any]

_normalize_payment_fields(self, payment_data) -> Dict[str, Any]

_normalize_line_item(self, line_item) -> Dict[str, Any]

_normalize_date(self, date_str) -> Optional[str]

_normalize_number(self, value) -> Optional[float]

_normalize_currency(self, currency) -> Optional[str]

_normalize_tax_number(self, tax_number, tax_type) -> Optional[str]

_normalize_iban(self, iban) -> Optional[str]

_normalize_sort_code(self, sort_code) -> Optional[str]

_normalize_bsb(self, bsb) -> Optional[str]

_normalize_address(self, address) -> Optional[str]

_clean_invoice_number(self, invoice_number) -> Optional[str]

_clean_value(self, value) -> Optional[str]

_remove_currency_symbols(self, value) -> str

_resolve_field_conflicts(self, extraction_result) -> Dict[str, Any]

standardize_field_names(self, data) -> Dict[str, Any]

Required Imports

Usage Example

Tags

Similar Components

function clean_text 49.7% similar

class UKValidator 47.9% similar

class BaseValidator 46.7% similar

class BEExtractor 46.3% similar

class BEValidator 46.2% similar

✨ Improve Code: FormatNormalizer

Code Comparison

`__init__(self, config)`

`normalize_extraction_result(self, extraction_result) -> Dict[str, Any]`

`_normalize_invoice_fields(self, invoice_data) -> Dict[str, Any]`

`_normalize_vendor_fields(self, vendor_data) -> Dict[str, Any]`

`_normalize_amount_fields(self, amount_data) -> Dict[str, Any]`

`_normalize_payment_fields(self, payment_data) -> Dict[str, Any]`

`_normalize_line_item(self, line_item) -> Dict[str, Any]`

`_normalize_date(self, date_str) -> Optional[str]`

`_normalize_number(self, value) -> Optional[float]`

`_normalize_currency(self, currency) -> Optional[str]`

`_normalize_tax_number(self, tax_number, tax_type) -> Optional[str]`

`_normalize_iban(self, iban) -> Optional[str]`

`_normalize_sort_code(self, sort_code) -> Optional[str]`

`_normalize_bsb(self, bsb) -> Optional[str]`

`_normalize_address(self, address) -> Optional[str]`

`_clean_invoice_number(self, invoice_number) -> Optional[str]`

`_clean_value(self, value) -> Optional[str]`

`_remove_currency_symbols(self, value) -> str`

`_resolve_field_conflicts(self, extraction_result) -> Dict[str, Any]`

`standardize_field_names(self, data) -> Dict[str, Any]`