class AUExtractor
Australia-specific invoice data extractor that uses LLM (Large Language Model) to extract structured invoice data from Australian tax invoices, handling ABN, ACN, GST, BSB numbers and Australian date formats.
/tf/active/vicechatdev/invoice_extraction/extractors/au_extractor.py
11 - 503
complex
Purpose
This class specializes in extracting invoice data from Australian tax invoices by leveraging LLM capabilities to understand Australian-specific formats and requirements. It handles Australian Business Numbers (ABN), Australian Company Numbers (ACN), Bank-State-Branch (BSB) numbers, GST (Goods and Services Tax) calculations at 10%, and Australian date formats (DD/MM/YYYY). The extractor uses a comprehensive LLM-based approach with fallback mechanisms to parse invoice metadata, vendor information, amounts, payment details, and line items from document text and tables.
Source Code
class AUExtractor(BaseExtractor):
"""Australia-specific invoice data extractor using pure LLM approach."""
def __init__(self, config=None):
    """Set up the extractor and its Australian-specific defaults.

    Args:
        config: Optional configuration dictionary. It is forwarded to the
            parent initializer; its 'llm' entry (if any) is used to build
            an LLMClient when the parent did not provide one.
    """
    super().__init__(config)

    # Only initialize LLM client if not already initialized by parent.
    # getattr() is used so that a parent which never defines the attribute
    # is handled the same way as one that leaves it set to None.
    if getattr(self, 'llm_client', None) is None:
        # Tolerate a missing or None config on the instance.
        llm_settings = (getattr(self, 'config', None) or {}).get('llm', {})
        self.llm_client = LLMClient(llm_settings)

    # Australian-specific configuration
    self.default_currency = 'AUD'
    # GST rate in Australia (a percentage; compared against tax_rate values
    # during post-processing)
    self.gst_rate = 10
def extract(self, document: Dict[str, Any], language: str) -> Dict[str, Any]:
    """
    Extract invoice data from the document with Australian-specific processing.

    Args:
        document: Processed document from DocumentProcessor. Read via
            ``.get``: 'pages' (each page may carry 'tables') plus whatever
            ``_get_full_document_text`` consumes.
        language: Detected language of the document (likely 'en')

    Returns:
        Dict containing extracted invoice fields, plus 'metadata'
        (language and extractor class name) and 'confidence'. When the
        document has no text, the empty result structure is returned.
    """
    logger.info("Extracting data from Australian invoice")

    # Get full text of the document
    full_text = self._get_full_document_text(document)
    if not full_text:
        logger.warning("No text content found in document")
        return self._empty_extraction_result(language)

    # Extract tables if present. `or []` (instead of a .get default) also
    # covers keys that exist but are explicitly set to None.
    tables = []
    for page in document.get('pages') or []:
        tables.extend(page.get('tables') or [])
    table_text = self._format_table_content(tables)

    # Extract all data using comprehensive LLM approach
    extraction_result = self._extract_all_invoice_data(full_text, table_text, language)

    # Add metadata
    extraction_result['metadata'] = {
        'language': language,
        'extraction_method': self.__class__.__name__
    }

    # Add confidence scores
    extraction_result['confidence'] = self.calculate_confidence(extraction_result)
    return extraction_result
def _get_full_document_text(self, document: Dict[str, Any]) -> str:
"""Extract full text from document."""
# If text is directly available in the document
if document.get('text'):
return document['text']
# Otherwise, collect text from all pages
full_text = []
for page in document.get('pages', []):
if page.get('text'):
full_text.append(page['text'])
return "\n\n".join(full_text)
def _format_table_content(self, tables: List[Dict[str, Any]]) -> str:
"""Format tables as text to provide additional structure to the LLM."""
if not tables:
return ""
table_texts = []
for i, table in enumerate(tables):
rows = []
current_row = []
current_row_number = 0
# Sort cells by row and column
cells = sorted(table.get('cells', []), key=lambda x: (x.get('row', 0), x.get('column', 0)))
for cell in cells:
row = cell.get('row', 0)
if row > current_row_number:
if current_row:
rows.append(" | ".join(current_row))
current_row = []
current_row_number = row
current_row.append(cell.get('text', '').strip())
if current_row:
rows.append(" | ".join(current_row))
table_texts.append(f"TABLE {i+1}:\n" + "\n".join(rows))
return "\n\n".join(table_texts)
def _empty_extraction_result(self, language: str) -> Dict[str, Any]:
"""Return an empty extraction result structure."""
return {
'invoice': {},
'vendor': {},
'amounts': {},
'payment': {},
'line_items': [],
'metadata': {
'language': language,
'extraction_method': self.__class__.__name__
},
'confidence': 0.0
}
def _extract_all_invoice_data(self, full_text: str, table_text: str, language: str) -> Dict[str, Any]:
"""
Extract all invoice data using a comprehensive LLM approach.
Args:
full_text: Full text of the document
table_text: Formatted table content if available
language: Detected language of the document
Returns:
Dictionary with all extracted invoice data
"""
# Create comprehensive extraction prompt
prompt = f"""# Australian Invoice Data Extraction
Analyze the following Australian tax invoice text and extract all required information according to Australian standards.
## Important Australian Invoice Characteristics:
- Tax invoices in Australia must include the words "Tax Invoice" if GST is charged
- ABN (Australian Business Number) format: XX XXX XXX XXX (11 digits)
- ACN (Australian Company Number): XXX XXX XXX (9 digits)
- BSB (Bank-State-Branch) format: XXX-XXX (6 digits)
- GST (Goods and Services Tax) rate is 10% in Australia
- Date formats: DD/MM/YYYY, DD-MM-YYYY (day first, unlike US format)
- Number formats: Standard with decimal point (1,234.56) for amounts
## Invoice Text:
{full_text}
## Tables Detected:
{table_text}
## Required Output:
Extract and return a valid JSON object with the following structure:
{{
"invoice": {{
"number": "extracted invoice number",
"issue_date": "issue date in YYYY-MM-DD format",
"due_date": "due date in YYYY-MM-DD format",
"po_number": "purchase order number if present",
"reference": "additional reference if present"
}},
"vendor": {{
"name": "vendor company name",
"abn": "Australian Business Number in XX XXX XXX XXX format",
"acn": "Australian Company Number if present",
"address": "complete vendor address",
"contact": "contact information"
}},
"amounts": {{
"subtotal": numeric value (before GST),
"gst": numeric value (GST amount),
"total": numeric value (including GST),
"currency": "currency code (default AUD)",
"tax_rate": numeric value (percentage, typically 10% in Australia),
"tax_status": "GST status (e.g., 'GST inclusive', 'GST free')"
}},
"payment": {{
"bank_name": "bank name",
"bsb": "BSB number in XXX-XXX format",
"account_number": "account number",
"account_name": "account name",
"payment_terms": "payment terms",
"reference": "payment reference"
}},
"line_items": [
{{
"description": "item description",
"quantity": numeric value,
"unit_price": numeric value,
"gst_amount": numeric value,
"amount": numeric value,
"gst_applicable": boolean
}}
...
]
}}
Convert all amounts to standard decimal format (1234.56).
Format dates as ISO format YYYY-MM-DD.
If information is not found, use null or empty string as appropriate.
"""
# Call LLM with comprehensive extraction prompt
response = self.llm_client.generate(prompt)
# Parse response
try:
extraction_result = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
return self._post_process_extraction(extraction_result)
except Exception as e:
logger.error(f"Failed to parse LLM extraction result: {e}")
# Attempt to extract partial results with a more structured approach
return self._fallback_extraction(full_text, table_text)
def _post_process_extraction(self, extraction_result: Dict[str, Any]) -> Dict[str, Any]:
"""Perform post-processing on the extracted data."""
result = {
'invoice': {},
'vendor': {},
'amounts': {},
'payment': {},
'line_items': []
}
# Copy extracted data
for section in ['invoice', 'vendor', 'amounts', 'payment']:
if section in extraction_result and isinstance(extraction_result[section], dict):
result[section] = extraction_result[section]
if 'line_items' in extraction_result and isinstance(extraction_result['line_items'], list):
result['line_items'] = extraction_result['line_items']
# Process dates to ensure consistent format
for date_field in ['issue_date', 'due_date']:
if result.get('invoice', {}).get(date_field):
try:
date_str = result['invoice'][date_field]
# Check if already in ISO format
if '-' in date_str and len(date_str) == 10:
parts = date_str.split('-')
if len(parts) == 3 and len(parts[0]) == 4:
# Already in YYYY-MM-DD format
continue
# Try to parse and standardize date
parsed_date = self._parse_date(date_str)
if parsed_date:
result['invoice'][date_field] = parsed_date
except Exception as e:
logger.warning(f"Failed to process date {date_field}: {e}")
# Ensure currency defaults to AUD
if 'amounts' in result and not result['amounts'].get('currency'):
result['amounts']['currency'] = self.default_currency
# Ensure tax_rate is set to standard GST if close
if result.get('amounts', {}).get('tax_rate') is not None:
tax_rate = result['amounts']['tax_rate']
try:
tax_rate = float(tax_rate)
# Check if close to standard Australian GST rate (10%)
if 9 <= tax_rate <= 11:
result['amounts']['tax_rate'] = self.gst_rate
except:
pass
# For Australian context, copy 'gst' to 'tax' for compatibility
if result.get('amounts', {}).get('gst') is not None:
result['amounts']['tax'] = result['amounts']['gst']
# Process line items
for item in result.get('line_items', []):
# Copy gst_amount to tax_amount for compatibility
if item.get('gst_amount') is not None:
item['tax_amount'] = item['gst_amount']
return result
def _fallback_extraction(self, full_text: str, table_text: str) -> Dict[str, Any]:
"""
Fallback method to extract invoice data in multiple smaller LLM calls.
Used when comprehensive extraction fails.
"""
logger.info("Using fallback extraction method")
result = {
'invoice': self._extract_invoice_metadata(full_text),
'vendor': self._extract_vendor_data(full_text),
'amounts': self._extract_amounts_and_tax(full_text),
'payment': self._extract_payment_data(full_text),
'line_items': self._extract_line_items(full_text, table_text)
}
return result
def _extract_invoice_metadata(self, text: str) -> Dict[str, Any]:
"""Extract invoice metadata using LLM."""
prompt = f"""Extract the following invoice metadata from this Australian tax invoice:
- number: The tax invoice number or reference
- issue_date: The date the invoice was issued (in YYYY-MM-DD format)
- due_date: The payment due date (in YYYY-MM-DD format)
- po_number: The purchase order number if present
- reference: Any additional reference numbers
Australian date formats are commonly DD/MM/YYYY or DD-MM-YYYY.
Convert all dates to YYYY-MM-DD format.
Invoice text:
{text[:3000]}
Return ONLY a valid JSON object with these fields.
"""
response = self.llm_client.generate(prompt)
try:
return json.loads(response.replace('```json','').replace('```','').replace('\n',''))
except:
logger.warning("Failed to extract invoice metadata")
return {}
def _extract_vendor_data(self, text: str) -> Dict[str, Any]:
"""Extract vendor data using LLM."""
prompt = f"""Extract the vendor information from this Australian tax invoice:
- name: The vendor/supplier company name
- abn: The Australian Business Number (format: XX XXX XXX XXX, 11 digits)
- acn: The Australian Company Number (9 digits) if present
- address: The complete vendor address
- contact: Email, phone or other contact information
Look for "ABN" to find the Australian Business Number.
Invoice text:
{text[:3000]}
Return ONLY a valid JSON object with these fields.
"""
response = self.llm_client.generate(prompt)
try:
return json.loads(response.replace('```json','').replace('```','').replace('\n',''))
except:
logger.warning("Failed to extract vendor data")
return {}
def _extract_amounts_and_tax(self, text: str) -> Dict[str, Any]:
"""Extract amount and GST information using LLM."""
prompt = f"""Extract the financial information from this Australian tax invoice:
- subtotal: The amount before GST (net amount)
- gst: The GST amount
- total: The total amount due including GST
- currency: The currency code (default AUD)
- tax_rate: The GST percentage (typically 10% in Australia)
- tax_status: GST status if specified (e.g., "GST inclusive", "GST free")
Return all amounts as numeric values (1234.56), not formatted strings.
Australian GST is typically 10% of the subtotal amount.
Invoice text:
{text}
Return ONLY a valid JSON object with these fields.
"""
response = self.llm_client.generate(prompt)
try:
result = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
# Ensure amounts are numeric
for field in ['subtotal', 'gst', 'total', 'tax_rate']:
if field in result and result[field] is not None:
try:
result[field] = float(result[field])
except:
result[field] = None
# Copy GST to tax for compatibility
if 'gst' in result:
result['tax'] = result['gst']
return result
except:
logger.warning("Failed to extract amounts")
return {}
def _extract_payment_data(self, text: str) -> Dict[str, Any]:
"""Extract payment information using LLM."""
prompt = f"""Extract the payment information from this Australian tax invoice:
- bank_name: The name of the bank
- bsb: The BSB number (format: XXX-XXX, 6 digits)
- account_number: The account number
- account_name: The account name or account holder
- payment_terms: Payment terms (e.g., "30 days")
- reference: Payment reference to include
Look for terms like "Direct Deposit", "EFT Details", "Banking Details".
Australian BSB numbers are 6 digits, usually formatted as XXX-XXX.
Invoice text:
{text}
Return ONLY a valid JSON object with these fields.
"""
response = self.llm_client.generate(prompt)
try:
return json.loads(response.replace('```json','').replace('```','').replace('\n',''))
except:
logger.warning("Failed to extract payment data")
return {}
def _extract_line_items(self, text: str, table_text: str) -> List[Dict[str, Any]]:
"""Extract line items using LLM."""
# Use table text if available, otherwise use full text
context = table_text if table_text else text
prompt = f"""Extract the line items from this Australian tax invoice.
Look for tables with descriptions, quantities, unit prices, and amounts.
Australian invoices typically include:
- Item description
- Quantity
- Unit price (excluding GST)
- GST amount for the line
- Total amount for the line (including GST)
- Whether GST is applicable
Return ONLY a valid JSON array of line items with these properties:
- description: Item description
- quantity: Numeric quantity
- unit_price: Numeric unit price (excluding GST)
- gst_amount: GST amount for this line item
- amount: Total amount for line item (including GST)
- gst_applicable: Boolean indicating if GST applies to this item
Return an empty array [] if no line items can be identified.
Invoice content:
{context}
"""
response = self.llm_client.generate(prompt)
try:
result = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
# Ensure numeric fields are properly formatted
for item in result:
for field in ['quantity', 'unit_price', 'gst_amount', 'amount']:
if field in item and item[field] is not None:
try:
item[field] = float(item[field])
except:
item[field] = None
# Copy gst_amount to tax_amount for compatibility
if 'gst_amount' in item:
item['tax_amount'] = item['gst_amount']
return result
except:
logger.warning("Failed to extract line items")
return []
def _parse_date(self, date_str: str) -> Optional[str]:
"""Parse a date string in various formats and return ISO format."""
if not date_str:
return None
date_str = date_str.strip()
# Common date formats in Australia
date_formats = [
'%d/%m/%Y', '%d-%m-%Y', '%d.%m.%Y', '%Y-%m-%d',
'%d/%m/%y', '%d-%m-%y', '%d.%m.%y',
'%Y/%m/%d', '%Y.%m.%d',
'%d %B %Y', '%d %b %Y', '%B %d, %Y'
]
# Try all formats
for fmt in date_formats:
try:
date_obj = datetime.datetime.strptime(date_str, fmt)
return date_obj.strftime('%Y-%m-%d')
except ValueError:
continue
# If standard formats fail, rely on LLM to parse the date
prompt = f"""Convert this date string: "{date_str}" to ISO format YYYY-MM-DD.
Remember that Australian dates typically use day first (DD/MM/YYYY), not month first like US dates.
Return ONLY the date in YYYY-MM-DD format, nothing else."""
try:
response = self.llm_client.generate(prompt)
date_match = response.strip()
# Validate format with simple check
if len(date_match) == 10 and date_match[4] == '-' and date_match[7] == '-':
return date_match
except:
pass
# If all parsing attempts fail
return None
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| bases | BaseExtractor | - | |
Parameter Details
config: Optional configuration dictionary that can contain LLM settings and other extractor parameters. It is passed to the parent BaseExtractor class. If llm_client is still None after the parent initializer has run, a new LLMClient is created from the 'llm' entry of the configuration (an empty dict if that entry is absent).
Return Value
Instantiation returns an AUExtractor object. The main extract() method returns a dictionary containing: 'invoice' (metadata like number, dates), 'vendor' (name, ABN, ACN, address), 'amounts' (subtotal, GST, total, currency), 'payment' (bank details, BSB, account info), 'line_items' (array of item details), 'metadata' (language, extraction method), and 'confidence' (extraction confidence score as float).
Class Interface
Methods
__init__(self, config=None)
Purpose: Initialize the AUExtractor with optional configuration, set up LLM client, and configure Australian-specific defaults
Parameters:
config: Optional dictionary containing configuration settings, particularly 'llm' key for LLMClient configuration
Returns: None - initializes instance
extract(self, document: Dict[str, Any], language: str) -> Dict[str, Any]
Purpose: Main extraction method that processes an Australian invoice document and returns structured data
Parameters:
document: Processed document dictionary from DocumentProcessor containing 'text' or 'pages' with text and optional tables
language: Detected language code of the document (typically 'en' for Australian invoices)
Returns: Dictionary with keys: 'invoice', 'vendor', 'amounts', 'payment', 'line_items', 'metadata', 'confidence'
_get_full_document_text(self, document: Dict[str, Any]) -> str
Purpose: Extract and concatenate all text content from the document structure
Parameters:
document: Document dictionary containing either direct 'text' field or 'pages' array with text
Returns: String containing full document text, pages separated by double newlines
_format_table_content(self, tables: List[Dict[str, Any]]) -> str
Purpose: Convert table structures into formatted text representation for LLM processing
Parameters:
tables: List of table dictionaries containing 'cells' with row, column, and text information
Returns: Formatted string representation of tables with pipe-separated cells and labeled table numbers
_empty_extraction_result(self, language: str) -> Dict[str, Any]
Purpose: Generate an empty result structure when extraction fails or no content is found
Parameters:
language: Language code to include in metadata
Returns: Dictionary with empty sections for invoice, vendor, amounts, payment, line_items, plus metadata and 0.0 confidence
_extract_all_invoice_data(self, full_text: str, table_text: str, language: str) -> Dict[str, Any]
Purpose: Perform comprehensive LLM-based extraction of all invoice data in a single call with detailed Australian-specific prompt
Parameters:
full_text: Complete document text content
table_text: Formatted table content if available
language: Document language code
Returns: Dictionary with all extracted invoice sections, falls back to _fallback_extraction on failure
_post_process_extraction(self, extraction_result: Dict[str, Any]) -> Dict[str, Any]
Purpose: Clean and standardize extracted data, normalize dates, ensure currency defaults, and add compatibility fields
Parameters:
extraction_result: Raw extraction result dictionary from LLM
Returns: Processed dictionary with standardized dates, currency defaults, GST/tax field duplication for compatibility
_fallback_extraction(self, full_text: str, table_text: str) -> Dict[str, Any]
Purpose: Alternative extraction method using multiple smaller LLM calls when comprehensive extraction fails
Parameters:
full_text: Complete document text
table_text: Formatted table content
Returns: Dictionary with extracted data from separate calls to extract invoice, vendor, amounts, payment, and line items
_extract_invoice_metadata(self, text: str) -> Dict[str, Any]
Purpose: Extract invoice-specific metadata (number, dates, PO number, reference) using targeted LLM prompt
Parameters:
text: Invoice text content (first 3000 characters used)
Returns: Dictionary with invoice metadata fields or empty dict on failure
_extract_vendor_data(self, text: str) -> Dict[str, Any]
Purpose: Extract vendor information including ABN, ACN, name, address, and contact details
Parameters:
text: Invoice text content (first 3000 characters used)
Returns: Dictionary with vendor fields or empty dict on failure
_extract_amounts_and_tax(self, text: str) -> Dict[str, Any]
Purpose: Extract financial amounts including subtotal, GST, total, currency, and tax rate
Parameters:
text: Full invoice text content
Returns: Dictionary with numeric amount fields, GST copied to tax field for compatibility, or empty dict on failure
_extract_payment_data(self, text: str) -> Dict[str, Any]
Purpose: Extract payment information including bank name, BSB, account number, and payment terms
Parameters:
text: Full invoice text content
Returns: Dictionary with payment fields or empty dict on failure
_extract_line_items(self, text: str, table_text: str) -> List[Dict[str, Any]]
Purpose: Extract invoice line items with descriptions, quantities, prices, and GST information
Parameters:
text: Full invoice text content
table_text: Formatted table content (preferred if available)
Returns: List of dictionaries containing line item details with numeric fields, GST copied to tax_amount, or empty list on failure
_parse_date(self, date_str: str) -> Optional[str]
Purpose: Parse various Australian date formats and convert to ISO format (YYYY-MM-DD), with LLM fallback for complex formats
Parameters:
date_str: Date string in various formats (DD/MM/YYYY, DD-MM-YYYY, etc.)
Returns: ISO format date string (YYYY-MM-DD) or None if parsing fails
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| llm_client | LLMClient | LLM client instance for making language model API calls, inherited from BaseExtractor or initialized in __init__ | instance |
| config | Dict | Configuration dictionary inherited from BaseExtractor containing settings for LLM and other components | instance |
| default_currency | str | Default currency code for Australian invoices, set to 'AUD' | instance |
| gst_rate | int | Standard GST (Goods and Services Tax) rate in Australia, set to 10 (percent) | instance |
Dependencies
logging, json, typing, datetime, extractors.base_extractor, utils.llm_client
Required Imports
import logging
import json
from typing import Dict, List, Any, Optional
import datetime
from extractors.base_extractor import BaseExtractor
from utils.llm_client import LLMClient
Usage Example
from extractors.au_extractor import AUExtractor
# Initialize with optional config
config = {
'llm': {
'api_key': 'your-api-key',
'model': 'gpt-4'
}
}
extractor = AUExtractor(config)
# Prepare document from DocumentProcessor
document = {
'text': 'Tax Invoice\nABN: 12 345 678 901\nInvoice #: INV-001\nDate: 15/03/2024\nGST: $100.00\nTotal: $1,100.00',
'pages': [
{
'text': 'page text',
'tables': [{'cells': [{'row': 0, 'column': 0, 'text': 'Description'}]}]
}
]
}
# Extract invoice data
result = extractor.extract(document, language='en')
# Access extracted data
print(result['invoice']['number']) # 'INV-001'
print(result['vendor']['abn']) # '12 345 678 901'
print(result['amounts']['gst']) # 100.0
print(result['amounts']['total']) # 1100.0
print(result['confidence']) # 0.85
Best Practices
- Always call extract() method after instantiation - it's the main entry point for data extraction
- Ensure document parameter has proper structure with 'text' or 'pages' containing text content
- The extractor expects Australian tax invoices - results may be suboptimal for other invoice types
- Confidence scores should be checked to validate extraction quality
- The class uses fallback extraction if comprehensive LLM extraction fails, providing robustness
- Date formats are automatically converted to ISO format (YYYY-MM-DD) from Australian formats
- GST amounts are automatically copied to 'tax' fields for compatibility with other systems
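- Monetary amounts are requested from the LLM as numeric values, not formatted strings; only the fallback extraction path explicitly coerces them to float, so validate amount types when the comprehensive path is used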
- The extractor handles both direct document text and table-based data extraction
- LLM client must be properly configured before use - check parent BaseExtractor initialization
- Method call order: instantiate -> extract() -> access result dictionary
- State is maintained in instance attributes (default_currency, gst_rate) but extract() is stateless per call
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- class TestAUExtractor (83.1% similar)
- class AUValidator (77.1% similar)
- class BaseExtractor (72.8% similar)
- class BEExtractor (72.5% similar)
- class TestAUValidator (70.3% similar)