class AUValidator
Australia-specific invoice data validator that extends BaseValidator to implement validation rules for Australian invoices including ABN validation, GST calculations, and Australian tax invoice requirements.
/tf/active/vicechatdev/invoice_extraction/validators/au_validator.py
10 - 414
complex
Purpose
This class validates invoice data extracted from Australian invoices according to Australian Tax Office (ATO) requirements. It performs comprehensive validation including: ABN (Australian Business Number) checksum validation, Australian address verification with postcode and state checking, GST rate consistency (10% standard rate), BSB and account number format validation, currency verification (AUD), GST calculation accuracy, and compliance with Australian tax invoice mandatory requirements. The validator ensures invoices meet legal requirements for Australian tax invoices, particularly those over $1,000 which have additional requirements.
Source Code
class AUValidator(BaseValidator):
    """
    Australia-specific invoice data validator.

    Implements validation rules specific to Australian invoices:
    - ABN (Australian Business Number) validation
    - Australian address verification
    - GST rate consistency (10%)
    - BSB and account number validation
    - Australian tax invoice requirements

    All checks report problems by calling ``result.add_issue(field, message,
    severity)``; none of them return a value.
    """

    # Weights of the ABN checksum: after subtracting 1 from the first digit,
    # the weighted digit sum must be divisible by 89.
    _ABN_WEIGHTS = (10, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19)

    # Words that suggest an Australian address. 'AUS' must stand alone so that
    # it does not match inside words such as 'AUSTRIA'.
    _AU_ADDRESS_TERMS_RE = re.compile(
        r'\bAUS\b|AUSTRALIA|SYDNEY|MELBOURNE|BRISBANE|PERTH|ADELAIDE|HOBART'
        r'|CANBERRA|DARWIN'
    )

    # Explicit Australian currency markers: the code AUD (not embedded in a
    # longer word), the word AUSTRALIAN, or A$ / AU$ / AUS$ not preceded by
    # another letter (so 'CA$' is not taken for 'A$').
    _AU_CURRENCY_MARKER_RE = re.compile(
        r'(?<![A-Z])AUD(?![A-Z])|AUSTRALIAN|(?<![A-Z])A(?:US?)?\$'
    )

    def __init__(self, config=None):
        """
        Set up Australian required fields, the GST rate and format patterns.

        Args:
            config: Optional configuration passed to the base class; the
                'au_gst_rate' key overrides the default GST rate of 10.0
        """
        super().__init__(config)

        # Australian-specific required fields
        au_required = {
            'vendor.abn': 'critical',      # ABN is required for Australian tax invoices
            'amounts.gst': 'important',    # GST amount is important
            'invoice.number': 'critical',  # Tax invoice number is required
            'vendor.name': 'critical',     # Supplier identity is required
        }

        # Update required fields with Australian-specific ones
        self.required_fields.update(au_required)

        # Australian GST rate is fixed at 10%
        self.au_gst_rate = self.config.get('au_gst_rate', 10.0)

        # Australian ABN regex (11 digits typically formatted as XX XXX XXX XXX)
        self.au_abn_regex = r'^(\d{2}\s?){3}\d{2}$|^\d{11}$'

        # BSB regex (XXX-XXX format)
        self.au_bsb_regex = r'^\d{3}-\d{3}$'

        # Australian account number regex (typically 6-10 digits)
        self.au_account_number_regex = r'^\d{6,10}$'

        # Australian postcode regex (4 digits)
        self.au_postcode_regex = r'\b\d{4}\b'

        # Australian states abbreviations
        self.au_states = ['NSW', 'VIC', 'QLD', 'SA', 'WA', 'TAS', 'NT', 'ACT']

    @staticmethod
    def _au_to_float(value):
        """Return ``value`` as a float, or None when it cannot be converted."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return None

    @staticmethod
    def _au_parse_rate(value):
        """Return a percentage as a float, accepting a trailing '%' on strings.

        Returns None when the value cannot be converted.
        """
        if isinstance(value, str):
            value = value.strip().rstrip('%').strip()
        try:
            return float(value)
        except (TypeError, ValueError):
            return None

    @staticmethod
    def _au_is_gst_free(tax_status) -> bool:
        """Return True when the tax status text mentions 'free' or 'exempt'."""
        if not tax_status:
            return False
        lowered = str(tax_status).lower()
        return 'free' in lowered or 'exempt' in lowered

    @classmethod
    def _au_is_au_currency(cls, currency_text: str) -> bool:
        """Return True when upper-cased currency text is plausibly Australian.

        An explicit Australian marker always qualifies. A bare dollar sign or
        the word DOLLAR(S) qualifies only when no other letters are present,
        so that qualified foreign dollars such as 'US$' or 'NZ DOLLARS' are
        rejected.
        """
        if cls._AU_CURRENCY_MARKER_RE.search(currency_text):
            return True
        if '$' in currency_text or 'DOLLAR' in currency_text:
            leftover = re.sub(r'DOLLARS?', '', currency_text)
            return re.search(r'[A-Z]', leftover) is None
        return False

    def _entity_specific_validation(self, extraction_result: Dict[str, Any],
                                    result: ValidationResult) -> None:
        """
        Perform Australia-specific validation.

        Args:
            extraction_result: Dictionary of extracted invoice fields
            result: ValidationResult to add issues to
        """
        # 1. ABN validation
        self._validate_abn(extraction_result, result)

        # 2. Address verification (check for Australian postal code and state names)
        self._validate_au_address(extraction_result, result)

        # 3. GST rate validation (should be 10% in Australia)
        self._validate_gst_rate(extraction_result, result)

        # 4. BSB and account number validation
        self._validate_au_banking_details(extraction_result, result)

        # 5. Validate currency is AUD
        self._validate_au_currency(extraction_result, result)

        # 6. GST calculation validation
        self._validate_gst_calculation(extraction_result, result)

        # 7. Australian tax invoice requirements
        self._validate_tax_invoice_requirements(extraction_result, result)

    def _validate_abn(self, extraction_result: Dict[str, Any],
                      result: ValidationResult) -> None:
        """
        Validate Australian Business Number (ABN).

        A wrong digit count is reported as an error; a failed checksum is
        reported as a warning. A missing ABN is not reported here.

        Args:
            extraction_result: Dictionary of extracted invoice fields
            result: ValidationResult to add issues to
        """
        abn = self._get_nested_field(extraction_result, 'vendor.abn')
        if not abn:
            return

        # Keep only the digits; str() tolerates an ABN extracted as a number.
        clean_abn = re.sub(r'[^0-9]', '', str(abn))

        # ABN must be 11 digits
        if len(clean_abn) != 11:
            result.add_issue(
                'vendor.abn',
                f"Invalid ABN length: {len(clean_abn)}. ABN must be 11 digits.",
                'error'
            )
            return

        # Checksum: subtract 1 from the first digit, weight every digit and
        # require the sum to be divisible by 89.
        digits = [int(ch) for ch in clean_abn]
        digits[0] -= 1
        weighted_sum = sum(w * d for w, d in zip(self._ABN_WEIGHTS, digits))
        if weighted_sum % 89 != 0:
            result.add_issue(
                'vendor.abn',
                f"ABN checksum validation failed for: {abn}. This may not be a valid ABN.",
                'warning'
            )

    def _validate_au_address(self, extraction_result: Dict[str, Any],
                             result: ValidationResult) -> None:
        """
        Validate Australian address (checks for Australian postal code and state names).

        Args:
            extraction_result: Dictionary of extracted invoice fields
            result: ValidationResult to add issues to
        """
        address = self._get_nested_field(extraction_result, 'vendor.address')
        if not address:
            return

        address_text = str(address).upper()

        # Check if address contains an Australian postcode (4 digits)
        postcodes = re.findall(self.au_postcode_regex, address_text)
        if not postcodes:
            result.add_issue(
                'vendor.address',
                "No valid Australian postcode (4 digits) found in address",
                'warning'
            )

        # A state abbreviation counts when it is not part of a longer word,
        # so 'NSW,' and 'NSW 2000' both match but 'USA' does not match 'SA'.
        has_au_state = any(
            re.search(rf'(?<![A-Z]){re.escape(state)}(?![A-Z])', address_text)
            for state in self.au_states
        )

        # Check for Australian-specific terms
        has_au_term = self._AU_ADDRESS_TERMS_RE.search(address_text) is not None

        if not has_au_state and not has_au_term and not postcodes:
            result.add_issue(
                'vendor.address',
                "Address does not appear to be from Australia",
                'warning'
            )

    def _validate_gst_rate(self, extraction_result: Dict[str, Any],
                           result: ValidationResult) -> None:
        """
        Validate GST rate (should be 10% in Australia).

        'amounts.gst_rate' takes precedence over 'amounts.tax_rate'. A rate of
        0 or within 0.5 percentage points of the configured rate is accepted.

        Args:
            extraction_result: Dictionary of extracted invoice fields
            result: ValidationResult to add issues to
        """
        tax_rate = self._get_nested_field(extraction_result, 'amounts.tax_rate')
        gst_rate = self._get_nested_field(extraction_result, 'amounts.gst_rate')

        # Use GST rate if available, otherwise tax_rate
        rate = gst_rate if gst_rate is not None else tax_rate
        if rate is None:
            return

        field_name = 'amounts.gst_rate' if gst_rate is not None else 'amounts.tax_rate'
        rate_float = self._au_parse_rate(rate)
        if rate_float is None:
            result.add_issue(
                field_name,
                f"Invalid GST rate format: {rate}",
                'warning'
            )
            return

        # Accept 0% (GST-free) and the standard rate with a 0.5% tolerance
        is_expected = rate_float in [0.0, self.au_gst_rate]
        if not is_expected and abs(rate_float - self.au_gst_rate) > 0.5:
            result.add_issue(
                field_name,
                f"Unusual GST rate: {rate_float}%. Australian GST is {self.au_gst_rate}% " +
                "or 0% for GST-free supplies.",
                'warning'
            )

    def _validate_au_banking_details(self, extraction_result: Dict[str, Any],
                                     result: ValidationResult) -> None:
        """
        Validate Australian banking details (BSB and account number).

        Args:
            extraction_result: Dictionary of extracted invoice fields
            result: ValidationResult to add issues to
        """
        bsb = self._get_nested_field(extraction_result, 'payment.bsb')
        account_number = self._get_nested_field(extraction_result, 'payment.account_number')

        # BSB validation
        if bsb:
            # Keep digits and dashes only
            clean_bsb = re.sub(r'[^0-9-]', '', str(bsb))

            # If no dash, try to format it
            if '-' not in clean_bsb and len(clean_bsb) == 6:
                clean_bsb = f"{clean_bsb[0:3]}-{clean_bsb[3:6]}"

            if not re.match(self.au_bsb_regex, clean_bsb):
                result.add_issue(
                    'payment.bsb',
                    f"Invalid BSB format: {bsb}. Should be XXX-XXX.",
                    'warning'
                )
            elif clean_bsb.startswith('00'):
                # Known invalid BSBs start with 00
                result.add_issue(
                    'payment.bsb',
                    f"Invalid BSB: {bsb}. BSB should not start with 00.",
                    'warning'
                )

        # Account number validation
        if account_number:
            # Keep digits only
            clean_account = re.sub(r'[^0-9]', '', str(account_number))

            # Check length (Australian account numbers are typically 6-10 digits)
            if len(clean_account) < 6 or len(clean_account) > 10:
                result.add_issue(
                    'payment.account_number',
                    f"Unusual account number length: {len(clean_account)}. " +
                    "Australian account numbers are typically 6-10 digits.",
                    'warning'
                )

    def _validate_au_currency(self, extraction_result: Dict[str, Any],
                              result: ValidationResult) -> None:
        """
        Validate that the currency is AUD for Australian invoices.

        Args:
            extraction_result: Dictionary of extracted invoice fields
            result: ValidationResult to add issues to
        """
        currency = self._get_nested_field(extraction_result, 'amounts.currency')
        if not currency:
            return

        # Normalize currency code
        currency = str(currency).upper().strip()

        if not self._au_is_au_currency(currency):
            result.add_issue(
                'amounts.currency',
                f"Non-Australian currency detected: {currency}. Expected AUD for Australian invoice.",
                'warning'
            )

    def _validate_gst_calculation(self, extraction_result: Dict[str, Any],
                                  result: ValidationResult) -> None:
        """
        Validate that the GST calculation is consistent (10% of subtotal).

        The rate comes from 'amounts.gst_rate', then 'amounts.tax_rate', then
        the configured default. The check is skipped for GST-free invoices and
        when any of the values cannot be read as a number.

        Args:
            extraction_result: Dictionary of extracted invoice fields
            result: ValidationResult to add issues to
        """
        subtotal = self._get_nested_field(extraction_result, 'amounts.subtotal')

        # Check both GST and tax fields; use GST if available, otherwise tax
        gst = self._get_nested_field(extraction_result, 'amounts.gst')
        tax = self._get_nested_field(extraction_result, 'amounts.tax')
        tax_amount = gst if gst is not None else tax
        field_name = 'amounts.gst' if gst is not None else 'amounts.tax'

        # Try to get the GST/tax rate
        gst_rate = self._get_nested_field(extraction_result, 'amounts.gst_rate')
        tax_rate = self._get_nested_field(extraction_result, 'amounts.tax_rate')
        if gst_rate is not None:
            rate = gst_rate
        elif tax_rate is not None:
            rate = tax_rate
        else:
            rate = self.au_gst_rate

        # Get tax status to check if GST-free
        tax_status = self._get_nested_field(extraction_result, 'amounts.tax_status')
        if subtotal is None or tax_amount is None or self._au_is_gst_free(tax_status):
            return

        subtotal_float = self._au_to_float(subtotal)
        tax_float = self._au_to_float(tax_amount)
        rate_percent = self._au_parse_rate(rate)
        if subtotal_float is None or tax_float is None or rate_percent is None:
            # Non-numeric values are left to the type validation
            return

        # Calculate expected GST
        expected_tax = subtotal_float * rate_percent / 100

        # 2 cents or 1% of the subtotal; abs() keeps the tolerance meaningful
        # for negative subtotals such as credit notes.
        tolerance = max(0.02, abs(subtotal_float) * 0.01)
        if abs(expected_tax - tax_float) > tolerance:
            result.add_issue(
                field_name,
                f"GST amount ({tax_float}) doesn't match the calculated GST " +
                f"({expected_tax:.2f}) based on subtotal ({subtotal_float}) " +
                f"and rate ({rate_percent}%)",
                'warning'
            )

    def _validate_tax_invoice_requirements(self, extraction_result: Dict[str, Any],
                                           result: ValidationResult) -> None:
        """
        Validate Australian tax invoice requirements.

        Args:
            extraction_result: Dictionary of extracted invoice fields
            result: ValidationResult to add issues to
        """
        # 1. Check for "Tax Invoice" words (required by ATO)
        # We can't directly check this without the raw text, so this advisory
        # warning is added on every call.
        result.add_issue(
            'metadata',
            "Ensure document has the words 'Tax Invoice' prominently displayed (ATO requirement).",
            'warning'
        )

        # 2. Check for supplier identity (mandatory)
        vendor_name = self._get_nested_field(extraction_result, 'vendor.name')
        vendor_abn = self._get_nested_field(extraction_result, 'vendor.abn')

        if not vendor_name:
            result.add_issue(
                'vendor.name',
                "Supplier identity is mandatory for Australian tax invoices",
                'error'
            )

        if not vendor_abn:
            result.add_issue(
                'vendor.abn',
                "Supplier ABN is mandatory for Australian tax invoices",
                'error'
            )

        # 3. Check for tax invoice date
        issue_date = self._get_nested_field(extraction_result, 'invoice.issue_date')
        if not issue_date:
            result.add_issue(
                'invoice.issue_date',
                "Tax invoice date is mandatory for Australian tax invoices",
                'error'
            )

        # 4. Check for indication of GST status
        subtotal = self._get_nested_field(extraction_result, 'amounts.subtotal')
        gst = self._get_nested_field(extraction_result, 'amounts.gst')
        if gst is None:
            # Fall back on presence rather than truthiness: a GST of 0 is a
            # real value and is exactly the case this check looks for.
            gst = self._get_nested_field(extraction_result, 'amounts.tax')

        subtotal_value = self._au_to_float(subtotal)
        gst_value = self._au_to_float(gst)
        if (subtotal_value is not None and gst_value is not None
                and gst_value == 0 and subtotal_value > 0):
            # If there's no GST on a positive amount, we should see GST-free indication
            tax_status = self._get_nested_field(extraction_result, 'amounts.tax_status')
            if not self._au_is_gst_free(tax_status):
                result.add_issue(
                    'amounts.tax_status',
                    "For GST-free supplies, an explicit GST-free indication is required",
                    'warning'
                )

        # 5. Check total amount is present (a numeric zero counts as present)
        total = self._get_nested_field(extraction_result, 'amounts.total')
        total_missing = total is None or (isinstance(total, str) and not total.strip())
        if total_missing:
            result.add_issue(
                'amounts.total',
                "Total amount is mandatory for Australian tax invoices",
                'error'
            )

        # 6. Additional requirements for tax invoices of $1,000 or more
        total_value = None if total_missing else self._au_to_float(total)
        if total_value is not None and total_value >= 1000:
            recipient_identity = self._get_nested_field(extraction_result, 'recipient.name') or \
                self._get_nested_field(extraction_result, 'customer.name')

            if not recipient_identity:
                result.add_issue(
                    'recipient',
                    "For tax invoices over $1,000, recipient identity (name/address) is required",
                    'warning'
                )
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| bases | BaseValidator | - | |
Parameter Details
config: Optional configuration dictionary that can override default settings. Can include 'au_gst_rate' to set a custom GST rate (defaults to 10.0%). The config is passed to the parent BaseValidator class. Separately, the constructor adds the Australia-specific required fields (vendor.abn, amounts.gst, invoice.number and vendor.name) to the inherited required_fields mapping.
Return Value
The constructor returns an instance of AUValidator. The main validation method (_entity_specific_validation) does not return a value but modifies the ValidationResult object passed to it by adding validation issues. Each validation method adds issues to the result object with severity levels ('error', 'warning') and descriptive messages about validation failures.
Class Interface
Methods
__init__(self, config=None) -> None
Purpose: Initialize the AUValidator with Australian-specific validation rules and configuration
Parameters:
config: Optional dictionary containing configuration overrides, particularly 'au_gst_rate' for custom GST rate
Returns: None - initializes the validator instance
_entity_specific_validation(self, extraction_result: Dict[str, Any], result: ValidationResult) -> None
Purpose: Main validation entry point that orchestrates all Australia-specific validation checks
Parameters:
extraction_result: Dictionary containing extracted invoice fields with nested structure (e.g., vendor.abn, amounts.gst)
result: ValidationResult object to which validation issues are added
Returns: None - modifies the result object in-place by adding validation issues
_validate_abn(self, extraction_result: Dict[str, Any], result: ValidationResult) -> None
Purpose: Validate Australian Business Number format and checksum using the official ABN algorithm
Parameters:
extraction_result: Dictionary containing extracted invoice fields
result: ValidationResult object to add ABN validation issues to
Returns: None - adds validation issues to result object if ABN is invalid
_validate_au_address(self, extraction_result: Dict[str, Any], result: ValidationResult) -> None
Purpose: Validate that the vendor address appears to be Australian by checking for postcodes, state abbreviations, and city names
Parameters:
extraction_result: Dictionary containing extracted invoice fields
result: ValidationResult object to add address validation issues to
Returns: None - adds validation issues if address doesn't appear to be Australian
_validate_gst_rate(self, extraction_result: Dict[str, Any], result: ValidationResult) -> None
Purpose: Validate that the GST rate is either 10% (standard) or 0% (GST-free), with 0.5% tolerance
Parameters:
extraction_result: Dictionary containing extracted invoice fields
result: ValidationResult object to add GST rate validation issues to
Returns: None - adds validation issues if GST rate is unusual
_validate_au_banking_details(self, extraction_result: Dict[str, Any], result: ValidationResult) -> None
Purpose: Validate Australian banking details including BSB format (XXX-XXX) and account number length (6-10 digits)
Parameters:
extraction_result: Dictionary containing extracted invoice fields
result: ValidationResult object to add banking validation issues to
Returns: None - adds validation issues if BSB or account number format is invalid
_validate_au_currency(self, extraction_result: Dict[str, Any], result: ValidationResult) -> None
Purpose: Validate that the invoice currency is AUD or Australian dollar variants
Parameters:
extraction_result: Dictionary containing extracted invoice fields
result: ValidationResult object to add currency validation issues to
Returns: None - adds validation issues if currency is not Australian
_validate_gst_calculation(self, extraction_result: Dict[str, Any], result: ValidationResult) -> None
Purpose: Validate that the GST amount correctly equals the subtotal multiplied by the GST rate, with tolerance for rounding
Parameters:
extraction_result: Dictionary containing extracted invoice fields
result: ValidationResult object to add GST calculation validation issues to
Returns: None - adds validation issues if GST calculation doesn't match expected value
_validate_tax_invoice_requirements(self, extraction_result: Dict[str, Any], result: ValidationResult) -> None
Purpose: Validate compliance with Australian Tax Office requirements for tax invoices, including mandatory fields and special requirements for invoices over $1,000
Parameters:
extraction_result: Dictionary containing extracted invoice fields
result: ValidationResult object to add tax invoice requirement validation issues to
Returns: None - adds validation issues if tax invoice requirements are not met
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| required_fields | Dict[str, str] | Dictionary mapping field paths to their criticality level ('critical' or 'important'), inherited from BaseValidator and updated with Australian-specific required fields | instance |
| au_gst_rate | float | Australian GST rate percentage, defaults to 10.0% but can be overridden via config | instance |
| au_abn_regex | str | Regular expression pattern for validating ABN format (11 digits with optional spacing) | instance |
| au_bsb_regex | str | Regular expression pattern for validating BSB format (XXX-XXX) | instance |
| au_account_number_regex | str | Regular expression pattern for validating Australian account numbers (6-10 digits) | instance |
| au_postcode_regex | str | Regular expression pattern for matching Australian postcodes (4 digits) | instance |
| au_states | List[str] | List of valid Australian state and territory abbreviations: NSW, VIC, QLD, SA, WA, TAS, NT, ACT | instance |
| config | Dict[str, Any] | Configuration dictionary inherited from BaseValidator, contains validation settings and overrides | instance |
Dependencies
re, logging, typing, datetime
Required Imports
import re
import logging
from typing import Dict, Any
from datetime import datetime
from validators.base_validator import BaseValidator, ValidationResult
Usage Example
from validators.au_validator import AUValidator
from validators.base_validator import ValidationResult

# Create validator instance with default config
validator = AUValidator()

# Or with custom config
config = {'au_gst_rate': 10.0}
validator = AUValidator(config=config)

# Prepare extraction result (typically from OCR/extraction system)
extraction_result = {
    'vendor': {
        'name': 'ABC Company Pty Ltd',
        'abn': '51 824 753 556',
        'address': '123 Main St, Sydney NSW 2000'
    },
    'invoice': {
        'number': 'INV-001',
        'issue_date': '2024-01-15'
    },
    'amounts': {
        'subtotal': 1000.00,
        'gst': 100.00,
        'total': 1100.00,
        'currency': 'AUD',
        'gst_rate': 10.0
    },
    'payment': {
        'bsb': '123-456',
        'account_number': '12345678'
    }
}

# Create validation result object
result = ValidationResult()

# Perform validation
validator._entity_specific_validation(extraction_result, result)

# Check validation results.
# NOTE(review): the tax-invoice check always adds an advisory 'Tax Invoice'
# warning, so at least one issue is recorded; whether warnings affect
# `is_valid` depends on ValidationResult - confirm.
if result.is_valid:
    print('Invoice is valid')
else:
    for issue in result.issues:
        print(f'{issue.severity}: {issue.field} - {issue.message}')
Best Practices
- Always instantiate with appropriate config if default GST rate needs to be overridden
- The validator modifies the ValidationResult object in-place, so ensure you pass a fresh ValidationResult for each validation
- This validator is designed to be called by a parent validation framework that handles the overall validation lifecycle
- The _entity_specific_validation method is the main entry point and should be called after basic validation
- All validation methods are prefixed with underscore indicating they are internal/protected methods
- The validator performs multiple independent checks, so partial validation failures are expected and handled gracefully
- ABN validation includes both a length check and checksum validation - a wrong digit count is reported at 'error' level, a checksum failure at 'warning' level
- For invoices of $1,000 or more (the code checks total >= 1000), additional recipient information is required per ATO regulations
- The validator uses tolerance levels for numeric comparisons (e.g., 0.5% for GST rate, 2 cents or 1% for GST calculations)
- State management: The validator maintains regex patterns and configuration as instance attributes for reuse across multiple validations
Similar Components
AI-powered semantic similarity - components with related functionality:
- class TestAUValidator 86.1% similar
- class AUExtractor 77.1% similar
- class TestAUExtractor 76.1% similar
- class UKValidator 74.9% similar
- class BaseValidator 74.3% similar