AUExtractor - Code Extractor

class AUExtractor

Maturity: 52

Australia-specific invoice data extractor that uses LLM (Large Language Model) to extract structured invoice data from Australian tax invoices, handling ABN, ACN, GST, BSB numbers and Australian date formats.

File:
/tf/active/vicechatdev/invoice_extraction/extractors/au_extractor.py

Lines:
11 - 503

Complexity:
complex

Purpose

This class specializes in extracting invoice data from Australian tax invoices by leveraging LLM capabilities to understand Australian-specific formats and requirements. It handles Australian Business Numbers (ABN), Australian Company Numbers (ACN), Bank-State-Branch (BSB) numbers, GST (Goods and Services Tax) calculations at 10%, and Australian date formats (DD/MM/YYYY). The extractor uses a comprehensive LLM-based approach with fallback mechanisms to parse invoice metadata, vendor information, amounts, payment details, and line items from document text and tables.

Source Code

class AUExtractor(BaseExtractor):
    """Australia-specific invoice data extractor using pure LLM approach."""
    
    def __init__(self, config=None):
        """
        Set up the extractor and its Australia-specific defaults.

        Args:
            config: Optional configuration passed through to BaseExtractor.
                The 'llm' entry of self.config is used to build an LLMClient
                when the parent has not already provided one.
        """
        super().__init__(config)

        # Australian defaults: amounts are in AUD and GST is 10 (percent)
        self.default_currency, self.gst_rate = 'AUD', 10

        # Reuse the parent's LLM client when it exists; otherwise create our own
        if self.llm_client is None:
            llm_settings = self.config.get('llm', {})
            self.llm_client = LLMClient(llm_settings)
    
    def extract(self, document: Dict[str, Any], language: str) -> Dict[str, Any]:
        """
        Extract invoice data from the document with Australian-specific processing.
        
        Args:
            document: Processed document from DocumentProcessor
            language: Detected language of the document (likely 'en')
            
        Returns:
            Dict containing extracted invoice fields
        """
        logger.info("Extracting data from Australian invoice")
        
        # Get full text of the document
        full_text = self._get_full_document_text(document)
        if not full_text:
            logger.warning("No text content found in document")
            return self._empty_extraction_result(language)
        
        # Extract tables if present
        tables = []
        for page in document.get('pages', []):
            tables.extend(page.get('tables', []))
            
        table_text = self._format_table_content(tables)
        
        # Extract all data using comprehensive LLM approach
        extraction_result = self._extract_all_invoice_data(full_text, table_text, language)
        
        # Add metadata
        extraction_result['metadata'] = {
            'language': language,
            'extraction_method': self.__class__.__name__
        }
        
        # Add confidence scores
        extraction_result['confidence'] = self.calculate_confidence(extraction_result)
        
        return extraction_result
    
    def _get_full_document_text(self, document: Dict[str, Any]) -> str:
        """Extract full text from document."""
        # If text is directly available in the document
        if document.get('text'):
            return document['text']
        
        # Otherwise, collect text from all pages
        full_text = []
        for page in document.get('pages', []):
            if page.get('text'):
                full_text.append(page['text'])
        
        return "\n\n".join(full_text)
    
    def _format_table_content(self, tables: List[Dict[str, Any]]) -> str:
        """Format tables as text to provide additional structure to the LLM."""
        if not tables:
            return ""
            
        table_texts = []
        for i, table in enumerate(tables):
            rows = []
            current_row = []
            current_row_number = 0
            
            # Sort cells by row and column
            cells = sorted(table.get('cells', []), key=lambda x: (x.get('row', 0), x.get('column', 0)))
            
            for cell in cells:
                row = cell.get('row', 0)
                if row > current_row_number:
                    if current_row:
                        rows.append(" | ".join(current_row))
                    current_row = []
                    current_row_number = row
                
                current_row.append(cell.get('text', '').strip())
            
            if current_row:
                rows.append(" | ".join(current_row))
                
            table_texts.append(f"TABLE {i+1}:\n" + "\n".join(rows))
        
        return "\n\n".join(table_texts)
    
    def _empty_extraction_result(self, language: str) -> Dict[str, Any]:
        """Return an empty extraction result structure."""
        return {
            'invoice': {},
            'vendor': {},
            'amounts': {},
            'payment': {},
            'line_items': [],
            'metadata': {
                'language': language,
                'extraction_method': self.__class__.__name__
            },
            'confidence': 0.0
        }
    
    def _extract_all_invoice_data(self, full_text: str, table_text: str, language: str) -> Dict[str, Any]:
        """
        Extract all invoice data using a comprehensive LLM approach.
        
        Args:
            full_text: Full text of the document
            table_text: Formatted table content if available
            language: Detected language of the document
            
        Returns:
            Dictionary with all extracted invoice data
        """
        # Create comprehensive extraction prompt
        prompt = f"""# Australian Invoice Data Extraction

Analyze the following Australian tax invoice text and extract all required information according to Australian standards.

## Important Australian Invoice Characteristics:
- Tax invoices in Australia must include the words "Tax Invoice" if GST is charged
- ABN (Australian Business Number) format: XX XXX XXX XXX (11 digits)
- ACN (Australian Company Number): XXX XXX XXX (9 digits)
- BSB (Bank-State-Branch) format: XXX-XXX (6 digits)
- GST (Goods and Services Tax) rate is 10% in Australia
- Date formats: DD/MM/YYYY, DD-MM-YYYY (day first, unlike US format)
- Number formats: Standard with decimal point (1,234.56) for amounts

## Invoice Text:
{full_text}

## Tables Detected:
{table_text}

## Required Output:
Extract and return a valid JSON object with the following structure:

{{
  "invoice": {{
    "number": "extracted invoice number",
    "issue_date": "issue date in YYYY-MM-DD format",
    "due_date": "due date in YYYY-MM-DD format",
    "po_number": "purchase order number if present",
    "reference": "additional reference if present"
  }},
  "vendor": {{
    "name": "vendor company name",
    "abn": "Australian Business Number in XX XXX XXX XXX format",
    "acn": "Australian Company Number if present",
    "address": "complete vendor address",
    "contact": "contact information"
  }},
  "amounts": {{
    "subtotal": numeric value (before GST),
    "gst": numeric value (GST amount),
    "total": numeric value (including GST),
    "currency": "currency code (default AUD)",
    "tax_rate": numeric value (percentage, typically 10% in Australia),
    "tax_status": "GST status (e.g., 'GST inclusive', 'GST free')"
  }},
  "payment": {{
    "bank_name": "bank name",
    "bsb": "BSB number in XXX-XXX format",
    "account_number": "account number",
    "account_name": "account name",
    "payment_terms": "payment terms",
    "reference": "payment reference"
  }},
  "line_items": [
    {{
      "description": "item description",
      "quantity": numeric value,
      "unit_price": numeric value,
      "gst_amount": numeric value,
      "amount": numeric value,
      "gst_applicable": boolean
    }}
    ...
  ]
}}

Convert all amounts to standard decimal format (1234.56).
Format dates as ISO format YYYY-MM-DD.
If information is not found, use null or empty string as appropriate.
"""

        # Call LLM with comprehensive extraction prompt
        response = self.llm_client.generate(prompt)
        
        # Parse response
        try:
            extraction_result = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
            return self._post_process_extraction(extraction_result)
        except Exception as e:
            logger.error(f"Failed to parse LLM extraction result: {e}")
            
            # Attempt to extract partial results with a more structured approach
            return self._fallback_extraction(full_text, table_text)
    
    def _post_process_extraction(self, extraction_result: Dict[str, Any]) -> Dict[str, Any]:
        """Perform post-processing on the extracted data."""
        result = {
            'invoice': {},
            'vendor': {},
            'amounts': {},
            'payment': {},
            'line_items': []
        }
        
        # Copy extracted data
        for section in ['invoice', 'vendor', 'amounts', 'payment']:
            if section in extraction_result and isinstance(extraction_result[section], dict):
                result[section] = extraction_result[section]
        
        if 'line_items' in extraction_result and isinstance(extraction_result['line_items'], list):
            result['line_items'] = extraction_result['line_items']
        
        # Process dates to ensure consistent format
        for date_field in ['issue_date', 'due_date']:
            if result.get('invoice', {}).get(date_field):
                try:
                    date_str = result['invoice'][date_field]
                    # Check if already in ISO format
                    if '-' in date_str and len(date_str) == 10:
                        parts = date_str.split('-')
                        if len(parts) == 3 and len(parts[0]) == 4:
                            # Already in YYYY-MM-DD format
                            continue
                            
                    # Try to parse and standardize date
                    parsed_date = self._parse_date(date_str)
                    if parsed_date:
                        result['invoice'][date_field] = parsed_date
                except Exception as e:
                    logger.warning(f"Failed to process date {date_field}: {e}")
        
        # Ensure currency defaults to AUD
        if 'amounts' in result and not result['amounts'].get('currency'):
            result['amounts']['currency'] = self.default_currency
            
        # Ensure tax_rate is set to standard GST if close
        if result.get('amounts', {}).get('tax_rate') is not None:
            tax_rate = result['amounts']['tax_rate']
            try:
                tax_rate = float(tax_rate)
                # Check if close to standard Australian GST rate (10%)
                if 9 <= tax_rate <= 11:
                    result['amounts']['tax_rate'] = self.gst_rate
            except:
                pass
        
        # For Australian context, copy 'gst' to 'tax' for compatibility
        if result.get('amounts', {}).get('gst') is not None:
            result['amounts']['tax'] = result['amounts']['gst']
            
        # Process line items
        for item in result.get('line_items', []):
            # Copy gst_amount to tax_amount for compatibility
            if item.get('gst_amount') is not None:
                item['tax_amount'] = item['gst_amount']
                
        return result
    
    def _fallback_extraction(self, full_text: str, table_text: str) -> Dict[str, Any]:
        """
        Fallback method to extract invoice data in multiple smaller LLM calls.
        Used when comprehensive extraction fails.
        """
        logger.info("Using fallback extraction method")
        
        result = {
            'invoice': self._extract_invoice_metadata(full_text),
            'vendor': self._extract_vendor_data(full_text),
            'amounts': self._extract_amounts_and_tax(full_text),
            'payment': self._extract_payment_data(full_text),
            'line_items': self._extract_line_items(full_text, table_text)
        }
        
        return result
    
    def _extract_invoice_metadata(self, text: str) -> Dict[str, Any]:
        """Extract invoice metadata using LLM."""
        prompt = f"""Extract the following invoice metadata from this Australian tax invoice:
- number: The tax invoice number or reference
- issue_date: The date the invoice was issued (in YYYY-MM-DD format)
- due_date: The payment due date (in YYYY-MM-DD format)
- po_number: The purchase order number if present
- reference: Any additional reference numbers

Australian date formats are commonly DD/MM/YYYY or DD-MM-YYYY.
Convert all dates to YYYY-MM-DD format.

Invoice text:
{text[:3000]}

Return ONLY a valid JSON object with these fields.
"""
        response = self.llm_client.generate(prompt)
        try:
            return json.loads(response.replace('```json','').replace('```','').replace('\n',''))
        except:
            logger.warning("Failed to extract invoice metadata")
            return {}
    
    def _extract_vendor_data(self, text: str) -> Dict[str, Any]:
        """Extract vendor data using LLM."""
        prompt = f"""Extract the vendor information from this Australian tax invoice:
- name: The vendor/supplier company name
- abn: The Australian Business Number (format: XX XXX XXX XXX, 11 digits)
- acn: The Australian Company Number (9 digits) if present
- address: The complete vendor address
- contact: Email, phone or other contact information

Look for "ABN" to find the Australian Business Number.

Invoice text:
{text[:3000]}

Return ONLY a valid JSON object with these fields.
"""
        response = self.llm_client.generate(prompt)
        try:
            return json.loads(response.replace('```json','').replace('```','').replace('\n',''))
        except:
            logger.warning("Failed to extract vendor data")
            return {}
    
    def _extract_amounts_and_tax(self, text: str) -> Dict[str, Any]:
        """Extract amount and GST information using LLM."""
        prompt = f"""Extract the financial information from this Australian tax invoice:
- subtotal: The amount before GST (net amount)
- gst: The GST amount 
- total: The total amount due including GST
- currency: The currency code (default AUD)
- tax_rate: The GST percentage (typically 10% in Australia)
- tax_status: GST status if specified (e.g., "GST inclusive", "GST free")

Return all amounts as numeric values (1234.56), not formatted strings.
Australian GST is typically 10% of the subtotal amount.

Invoice text:
{text}

Return ONLY a valid JSON object with these fields.
"""
        response = self.llm_client.generate(prompt)
        try:
            result = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
            # Ensure amounts are numeric
            for field in ['subtotal', 'gst', 'total', 'tax_rate']:
                if field in result and result[field] is not None:
                    try:
                        result[field] = float(result[field])
                    except:
                        result[field] = None
                        
            # Copy GST to tax for compatibility
            if 'gst' in result:
                result['tax'] = result['gst']
                
            return result
        except:
            logger.warning("Failed to extract amounts")
            return {}
    
    def _extract_payment_data(self, text: str) -> Dict[str, Any]:
        """Extract payment information using LLM."""
        prompt = f"""Extract the payment information from this Australian tax invoice:
- bank_name: The name of the bank
- bsb: The BSB number (format: XXX-XXX, 6 digits)
- account_number: The account number
- account_name: The account name or account holder
- payment_terms: Payment terms (e.g., "30 days")
- reference: Payment reference to include

Look for terms like "Direct Deposit", "EFT Details", "Banking Details".
Australian BSB numbers are 6 digits, usually formatted as XXX-XXX.

Invoice text:
{text}

Return ONLY a valid JSON object with these fields.
"""
        response = self.llm_client.generate(prompt)
        try:
            return json.loads(response.replace('```json','').replace('```','').replace('\n',''))
        except:
            logger.warning("Failed to extract payment data")
            return {}
    
    def _extract_line_items(self, text: str, table_text: str) -> List[Dict[str, Any]]:
        """Extract line items using LLM."""
        # Use table text if available, otherwise use full text
        context = table_text if table_text else text
        
        prompt = f"""Extract the line items from this Australian tax invoice.
Look for tables with descriptions, quantities, unit prices, and amounts.

Australian invoices typically include:
- Item description
- Quantity
- Unit price (excluding GST)
- GST amount for the line
- Total amount for the line (including GST)
- Whether GST is applicable

Return ONLY a valid JSON array of line items with these properties:
- description: Item description
- quantity: Numeric quantity
- unit_price: Numeric unit price (excluding GST)
- gst_amount: GST amount for this line item
- amount: Total amount for line item (including GST)
- gst_applicable: Boolean indicating if GST applies to this item

Return an empty array [] if no line items can be identified.

Invoice content:
{context}
"""
        response = self.llm_client.generate(prompt)
        try:
            result = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
            # Ensure numeric fields are properly formatted
            for item in result:
                for field in ['quantity', 'unit_price', 'gst_amount', 'amount']:
                    if field in item and item[field] is not None:
                        try:
                            item[field] = float(item[field])
                        except:
                            item[field] = None
                
                # Copy gst_amount to tax_amount for compatibility
                if 'gst_amount' in item:
                    item['tax_amount'] = item['gst_amount']
                    
            return result
        except:
            logger.warning("Failed to extract line items")
            return []
    
    def _parse_date(self, date_str: str) -> Optional[str]:
        """Parse a date string in various formats and return ISO format."""
        if not date_str:
            return None
            
        date_str = date_str.strip()
        
        # Common date formats in Australia
        date_formats = [
            '%d/%m/%Y', '%d-%m-%Y', '%d.%m.%Y', '%Y-%m-%d',
            '%d/%m/%y', '%d-%m-%y', '%d.%m.%y',
            '%Y/%m/%d', '%Y.%m.%d',
            '%d %B %Y', '%d %b %Y', '%B %d, %Y'
        ]
        
        # Try all formats
        for fmt in date_formats:
            try:
                date_obj = datetime.datetime.strptime(date_str, fmt)
                return date_obj.strftime('%Y-%m-%d')
            except ValueError:
                continue
                
        # If standard formats fail, rely on LLM to parse the date
        prompt = f"""Convert this date string: "{date_str}" to ISO format YYYY-MM-DD.
Remember that Australian dates typically use day first (DD/MM/YYYY), not month first like US dates.
Return ONLY the date in YYYY-MM-DD format, nothing else."""

        try:
            response = self.llm_client.generate(prompt)
            date_match = response.strip()
            # Validate format with simple check
            if len(date_match) == 10 and date_match[4] == '-' and date_match[7] == '-':
                return date_match
        except:
            pass
                
        # If all parsing attempts fail
        return None

Parameters

Name	Type	Default	Kind
`bases`	BaseExtractor	-

Parameter Details

config: Optional configuration dictionary that can contain LLM settings and other extractor parameters. If not provided or if llm_client is not initialized by parent class, a new LLMClient will be created with config.get('llm', {}). The config is passed to the parent BaseExtractor class.

Return Value

Instantiation returns an AUExtractor object. The main extract() method returns a dictionary containing: 'invoice' (metadata like number, dates), 'vendor' (name, ABN, ACN, address), 'amounts' (subtotal, GST, total, currency), 'payment' (bank details, BSB, account info), 'line_items' (array of item details), 'metadata' (language, extraction method), and 'confidence' (extraction confidence score as float).

Class Interface

Methods

`__init__(self, config=None)`

Purpose: Initialize the AUExtractor with optional configuration, set up LLM client, and configure Australian-specific defaults

Parameters:

config: Optional dictionary containing configuration settings, particularly 'llm' key for LLMClient configuration

Returns: None - initializes instance

`extract(self, document: Dict[str, Any], language: str) -> Dict[str, Any]`

Purpose: Main extraction method that processes an Australian invoice document and returns structured data

Parameters:

document: Processed document dictionary from DocumentProcessor containing 'text' or 'pages' with text and optional tables
language: Detected language code of the document (typically 'en' for Australian invoices)

Returns: Dictionary with keys: 'invoice', 'vendor', 'amounts', 'payment', 'line_items', 'metadata', 'confidence'

`_get_full_document_text(self, document: Dict[str, Any]) -> str`

Purpose: Extract and concatenate all text content from the document structure

Parameters:

document: Document dictionary containing either direct 'text' field or 'pages' array with text

Returns: String containing full document text, pages separated by double newlines

`_format_table_content(self, tables: List[Dict[str, Any]]) -> str`

Purpose: Convert table structures into formatted text representation for LLM processing

Parameters:

tables: List of table dictionaries containing 'cells' with row, column, and text information

Returns: Formatted string representation of tables with pipe-separated cells and labeled table numbers

`_empty_extraction_result(self, language: str) -> Dict[str, Any]`

Purpose: Generate an empty result structure when extraction fails or no content is found

Parameters:

language: Language code to include in metadata

Returns: Dictionary with empty sections for invoice, vendor, amounts, payment, line_items, plus metadata and 0.0 confidence

`_extract_all_invoice_data(self, full_text: str, table_text: str, language: str) -> Dict[str, Any]`

Purpose: Perform comprehensive LLM-based extraction of all invoice data in a single call with detailed Australian-specific prompt

Parameters:

full_text: Complete document text content
table_text: Formatted table content if available
language: Document language code

Returns: Dictionary with all extracted invoice sections, falls back to _fallback_extraction on failure

`_post_process_extraction(self, extraction_result: Dict[str, Any]) -> Dict[str, Any]`

Purpose: Clean and standardize extracted data, normalize dates, ensure currency defaults, and add compatibility fields

Parameters:

extraction_result: Raw extraction result dictionary from LLM

Returns: Processed dictionary with standardized dates, currency defaults, GST/tax field duplication for compatibility

`_fallback_extraction(self, full_text: str, table_text: str) -> Dict[str, Any]`

Purpose: Alternative extraction method using multiple smaller LLM calls when comprehensive extraction fails

Parameters:

full_text: Complete document text
table_text: Formatted table content

Returns: Dictionary with extracted data from separate calls to extract invoice, vendor, amounts, payment, and line items

`_extract_invoice_metadata(self, text: str) -> Dict[str, Any]`

Purpose: Extract invoice-specific metadata (number, dates, PO number, reference) using targeted LLM prompt

Parameters:

text: Invoice text content (first 3000 characters used)

Returns: Dictionary with invoice metadata fields or empty dict on failure

`_extract_vendor_data(self, text: str) -> Dict[str, Any]`

Purpose: Extract vendor information including ABN, ACN, name, address, and contact details

Parameters:

text: Invoice text content (first 3000 characters used)

Returns: Dictionary with vendor fields or empty dict on failure

`_extract_amounts_and_tax(self, text: str) -> Dict[str, Any]`

Purpose: Extract financial amounts including subtotal, GST, total, currency, and tax rate

Parameters:

text: Full invoice text content

Returns: Dictionary with numeric amount fields, GST copied to tax field for compatibility, or empty dict on failure

`_extract_payment_data(self, text: str) -> Dict[str, Any]`

Purpose: Extract payment information including bank name, BSB, account number, and payment terms

Parameters:

text: Full invoice text content

Returns: Dictionary with payment fields or empty dict on failure

`_extract_line_items(self, text: str, table_text: str) -> List[Dict[str, Any]]`

Purpose: Extract invoice line items with descriptions, quantities, prices, and GST information

Parameters:

text: Full invoice text content
table_text: Formatted table content (preferred if available)

Returns: List of dictionaries containing line item details with numeric fields, GST copied to tax_amount, or empty list on failure

`_parse_date(self, date_str: str) -> Optional[str]`

Purpose: Parse various Australian date formats and convert to ISO format (YYYY-MM-DD), with LLM fallback for complex formats

Parameters:

date_str: Date string in various formats (DD/MM/YYYY, DD-MM-YYYY, etc.)

Returns: ISO format date string (YYYY-MM-DD) or None if parsing fails

Attributes

Name	Type	Description	Scope
`llm_client`	LLMClient	LLM client instance for making language model API calls, inherited from BaseExtractor or initialized in __init__	instance
`config`	Dict	Configuration dictionary inherited from BaseExtractor containing settings for LLM and other components	instance
`default_currency`	str	Default currency code for Australian invoices, set to 'AUD'	instance
`gst_rate`	int	Standard GST (Goods and Services Tax) rate in Australia, set to 10 (percent)	instance

Dependencies

logging
json
typing
datetime
extractors.base_extractor
utils.llm_client

Required Imports

import logging
import json
from typing import Dict, List, Any, Optional
import datetime
from extractors.base_extractor import BaseExtractor
from utils.llm_client import LLMClient

Usage Example

from extractors.au_extractor import AUExtractor

# Initialize with optional config
config = {
    'llm': {
        'api_key': 'your-api-key',
        'model': 'gpt-4'
    }
}
extractor = AUExtractor(config)

# Prepare document from DocumentProcessor
document = {
    'text': 'Tax Invoice\nABN: 12 345 678 901\nInvoice #: INV-001\nDate: 15/03/2024\nGST: $100.00\nTotal: $1,100.00',
    'pages': [
        {
            'text': 'page text',
            'tables': [{'cells': [{'row': 0, 'column': 0, 'text': 'Description'}]}]
        }
    ]
}

# Extract invoice data
result = extractor.extract(document, language='en')

# Access extracted data
print(result['invoice']['number'])  # 'INV-001'
print(result['vendor']['abn'])  # '12 345 678 901'
print(result['amounts']['gst'])  # 100.0
print(result['amounts']['total'])  # 1100.0
print(result['confidence'])  # 0.85

Best Practices

Always call extract() method after instantiation - it's the main entry point for data extraction
Ensure document parameter has proper structure with 'text' or 'pages' containing text content
The extractor expects Australian tax invoices - results may be suboptimal for other invoice types
Confidence scores should be checked to validate extraction quality
The class uses fallback extraction if comprehensive LLM extraction fails, providing robustness
Date formats are automatically converted to ISO format (YYYY-MM-DD) from Australian formats
GST amounts are automatically copied to 'tax' fields for compatibility with other systems
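Monetary amounts are requested from the LLM as numeric values rather than formatted strings; only the fallback extraction path explicitly coerces them to float, so check types before doing arithmetic on them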
The extractor handles both direct document text and table-based data extraction
LLM client must be properly configured before use - check parent BaseExtractor initialization
Method call order: instantiate -> extract() -> access result dictionary
State is maintained in instance attributes (default_currency, gst_rate) but extract() is stateless per call

Similar Components

AI-powered semantic similarity - components with related functionality:

class TestAUExtractor 83.1% similar

Unit test class for testing the AUExtractor class, which extracts data from Australian invoices including ABN, GST, and payment details.
From: /tf/active/vicechatdev/invoice_extraction/tests/test_extractors.py
class AUValidator 77.1% similar

Australia-specific invoice data validator that extends BaseValidator to implement validation rules for Australian invoices including ABN validation, GST calculations, and Australian tax invoice requirements.
From: /tf/active/vicechatdev/invoice_extraction/validators/au_validator.py
class BaseExtractor 72.8% similar

Abstract base class that defines the interface and shared functionality for entity-specific invoice data extractors (UK, BE, AU), providing a multi-stage extraction pipeline for invoice processing.
From: /tf/active/vicechatdev/invoice_extraction/extractors/base_extractor.py
class BEExtractor 72.5% similar

Belgium-specific invoice data extractor that uses LLM (Large Language Model) to extract structured invoice data from Belgian invoices in multiple languages (English, French, Dutch).
From: /tf/active/vicechatdev/invoice_extraction/extractors/be_extractor.py
class TestAUValidator 70.3% similar

Unit test class for validating the AUValidator class, which validates Australian invoice extraction results including ABN, GST, banking details, and tax invoice requirements.
From: /tf/active/vicechatdev/invoice_extraction/tests/test_validators.py

← Back to Browse

Assistant

Hi! I can help improve this code. Tell me what you'd like to enhance (e.g., "add error handling", "optimize performance", "improve readability", "add type hints").

Code Comparison

Original Code

                            class AUExtractor(BaseExtractor):
    """Australia-specific invoice data extractor using pure LLM approach."""
    
    def __init__(self, config=None):
        """
        Set up the extractor with its Australian defaults.

        Args:
            config: Optional configuration, handed unchanged to the parent
                initialiser. The 'llm' entry of self.config is used to build
                an LLM client when the parent did not provide one.
        """
        super().__init__(config)

        # Australian-specific defaults: currency code and GST rate (percent)
        self.default_currency = 'AUD'
        self.gst_rate = 10

        # Create an LLM client only if the parent left the slot empty
        if self.llm_client is None:
            llm_settings = self.config.get('llm', {})
            self.llm_client = LLMClient(llm_settings)
    
    def extract(self, document: Dict[str, Any], language: str) -> Dict[str, Any]:
        """
        Run the Australian extraction pipeline over a processed document.

        Args:
            document: Processed document; its 'text' entry and the per-page
                'text' / 'tables' entries under 'pages' are read
            language: Detected language of the document (likely 'en')

        Returns:
            Dict of extracted invoice fields, with 'metadata' and
            'confidence' entries added. When the document has no text an
            empty result structure is returned instead.
        """
        logger.info("Extracting data from Australian invoice")

        document_text = self._get_full_document_text(document)
        if not document_text:
            # Nothing to hand to the LLM, so skip extraction entirely
            logger.warning("No text content found in document")
            return self._empty_extraction_result(language)

        # Gather every table from every page, preserving page order
        all_tables = [
            table
            for page in document.get('pages', [])
            for table in page.get('tables', [])
        ]
        formatted_tables = self._format_table_content(all_tables)

        # Single comprehensive LLM pass (falls back internally on failure)
        result = self._extract_all_invoice_data(document_text, formatted_tables, language)

        result['metadata'] = {
            'language': language,
            'extraction_method': self.__class__.__name__
        }
        result['confidence'] = self.calculate_confidence(result)
        return result
    
    def _get_full_document_text(self, document: Dict[str, Any]) -> str:
        """Extract full text from document."""
        # If text is directly available in the document
        if document.get('text'):
            return document['text']
        
        # Otherwise, collect text from all pages
        full_text = []
        for page in document.get('pages', []):
            if page.get('text'):
                full_text.append(page['text'])
        
        return "\n\n".join(full_text)
    
    def _format_table_content(self, tables: List[Dict[str, Any]]) -> str:
        """Format tables as text to provide additional structure to the LLM."""
        if not tables:
            return ""
            
        table_texts = []
        for i, table in enumerate(tables):
            rows = []
            current_row = []
            current_row_number = 0
            
            # Sort cells by row and column
            cells = sorted(table.get('cells', []), key=lambda x: (x.get('row', 0), x.get('column', 0)))
            
            for cell in cells:
                row = cell.get('row', 0)
                if row > current_row_number:
                    if current_row:
                        rows.append(" | ".join(current_row))
                    current_row = []
                    current_row_number = row
                
                current_row.append(cell.get('text', '').strip())
            
            if current_row:
                rows.append(" | ".join(current_row))
                
            table_texts.append(f"TABLE {i+1}:\n" + "\n".join(rows))
        
        return "\n\n".join(table_texts)
    
    def _empty_extraction_result(self, language: str) -> Dict[str, Any]:
        """Return an empty extraction result structure."""
        return {
            'invoice': {},
            'vendor': {},
            'amounts': {},
            'payment': {},
            'line_items': [],
            'metadata': {
                'language': language,
                'extraction_method': self.__class__.__name__
            },
            'confidence': 0.0
        }
    
    def _extract_all_invoice_data(self, full_text: str, table_text: str, language: str) -> Dict[str, Any]:
        """
        Extract all invoice data using a comprehensive LLM approach.
        
        Args:
            full_text: Full text of the document
            table_text: Formatted table content if available
            language: Detected language of the document
            
        Returns:
            Dictionary with all extracted invoice data
        """
        # Create comprehensive extraction prompt
        prompt = f"""# Australian Invoice Data Extraction

Analyze the following Australian tax invoice text and extract all required information according to Australian standards.

## Important Australian Invoice Characteristics:
- Tax invoices in Australia must include the words "Tax Invoice" if GST is charged
- ABN (Australian Business Number) format: XX XXX XXX XXX (11 digits)
- ACN (Australian Company Number): XXX XXX XXX (9 digits)
- BSB (Bank-State-Branch) format: XXX-XXX (6 digits)
- GST (Goods and Services Tax) rate is 10% in Australia
- Date formats: DD/MM/YYYY, DD-MM-YYYY (day first, unlike US format)
- Number formats: Standard with decimal point (1,234.56) for amounts

## Invoice Text:
{full_text}

## Tables Detected:
{table_text}

## Required Output:
Extract and return a valid JSON object with the following structure:

{{
  "invoice": {{
    "number": "extracted invoice number",
    "issue_date": "issue date in YYYY-MM-DD format",
    "due_date": "due date in YYYY-MM-DD format",
    "po_number": "purchase order number if present",
    "reference": "additional reference if present"
  }},
  "vendor": {{
    "name": "vendor company name",
    "abn": "Australian Business Number in XX XXX XXX XXX format",
    "acn": "Australian Company Number if present",
    "address": "complete vendor address",
    "contact": "contact information"
  }},
  "amounts": {{
    "subtotal": numeric value (before GST),
    "gst": numeric value (GST amount),
    "total": numeric value (including GST),
    "currency": "currency code (default AUD)",
    "tax_rate": numeric value (percentage, typically 10% in Australia),
    "tax_status": "GST status (e.g., 'GST inclusive', 'GST free')"
  }},
  "payment": {{
    "bank_name": "bank name",
    "bsb": "BSB number in XXX-XXX format",
    "account_number": "account number",
    "account_name": "account name",
    "payment_terms": "payment terms",
    "reference": "payment reference"
  }},
  "line_items": [
    {{
      "description": "item description",
      "quantity": numeric value,
      "unit_price": numeric value,
      "gst_amount": numeric value,
      "amount": numeric value,
      "gst_applicable": boolean
    }}
    ...
  ]
}}

Convert all amounts to standard decimal format (1234.56).
Format dates as ISO format YYYY-MM-DD.
If information is not found, use null or empty string as appropriate.
"""

        # Call LLM with comprehensive extraction prompt
        response = self.llm_client.generate(prompt)
        
        # Parse response
        try:
            extraction_result = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
            return self._post_process_extraction(extraction_result)
        except Exception as e:
            logger.error(f"Failed to parse LLM extraction result: {e}")
            
            # Attempt to extract partial results with a more structured approach
            return self._fallback_extraction(full_text, table_text)
    
    def _post_process_extraction(self, extraction_result: Dict[str, Any]) -> Dict[str, Any]:
        """Perform post-processing on the extracted data."""
        result = {
            'invoice': {},
            'vendor': {},
            'amounts': {},
            'payment': {},
            'line_items': []
        }
        
        # Copy extracted data
        for section in ['invoice', 'vendor', 'amounts', 'payment']:
            if section in extraction_result and isinstance(extraction_result[section], dict):
                result[section] = extraction_result[section]
        
        if 'line_items' in extraction_result and isinstance(extraction_result['line_items'], list):
            result['line_items'] = extraction_result['line_items']
        
        # Process dates to ensure consistent format
        for date_field in ['issue_date', 'due_date']:
            if result.get('invoice', {}).get(date_field):
                try:
                    date_str = result['invoice'][date_field]
                    # Check if already in ISO format
                    if '-' in date_str and len(date_str) == 10:
                        parts = date_str.split('-')
                        if len(parts) == 3 and len(parts[0]) == 4:
                            # Already in YYYY-MM-DD format
                            continue
                            
                    # Try to parse and standardize date
                    parsed_date = self._parse_date(date_str)
                    if parsed_date:
                        result['invoice'][date_field] = parsed_date
                except Exception as e:
                    logger.warning(f"Failed to process date {date_field}: {e}")
        
        # Ensure currency defaults to AUD
        if 'amounts' in result and not result['amounts'].get('currency'):
            result['amounts']['currency'] = self.default_currency
            
        # Ensure tax_rate is set to standard GST if close
        if result.get('amounts', {}).get('tax_rate') is not None:
            tax_rate = result['amounts']['tax_rate']
            try:
                tax_rate = float(tax_rate)
                # Check if close to standard Australian GST rate (10%)
                if 9 <= tax_rate <= 11:
                    result['amounts']['tax_rate'] = self.gst_rate
            except:
                pass
        
        # For Australian context, copy 'gst' to 'tax' for compatibility
        if result.get('amounts', {}).get('gst') is not None:
            result['amounts']['tax'] = result['amounts']['gst']
            
        # Process line items
        for item in result.get('line_items', []):
            # Copy gst_amount to tax_amount for compatibility
            if item.get('gst_amount') is not None:
                item['tax_amount'] = item['gst_amount']
                
        return result
    
    def _fallback_extraction(self, full_text: str, table_text: str) -> Dict[str, Any]:
        """
        Fallback method to extract invoice data in multiple smaller LLM calls.
        Used when comprehensive extraction fails.

        Each section is requested from the LLM separately, then the combined
        result goes through _post_process_extraction so that fallback output
        receives the same normalisation as the comprehensive path (currency
        default, date standardisation, GST rate snapping, gst -> tax copies).

        Args:
            full_text: Full text of the document
            table_text: Formatted table content, used for line items

        Returns:
            Dict with 'invoice', 'vendor', 'amounts', 'payment' and
            'line_items' keys.
        """
        logger.info("Using fallback extraction method")

        result = {
            'invoice': self._extract_invoice_metadata(full_text),
            'vendor': self._extract_vendor_data(full_text),
            'amounts': self._extract_amounts_and_tax(full_text),
            'payment': self._extract_payment_data(full_text),
            'line_items': self._extract_line_items(full_text, table_text)
        }

        # This is already the last resort, so a post-processing failure
        # must not discard the data that was extracted.
        try:
            return self._post_process_extraction(result)
        except Exception as e:
            logger.warning(f"Failed to post-process fallback extraction: {e}")
            return result
    
    def _extract_invoice_metadata(self, text: str) -> Dict[str, Any]:
        """Extract invoice metadata using LLM."""
        prompt = f"""Extract the following invoice metadata from this Australian tax invoice:
- number: The tax invoice number or reference
- issue_date: The date the invoice was issued (in YYYY-MM-DD format)
- due_date: The payment due date (in YYYY-MM-DD format)
- po_number: The purchase order number if present
- reference: Any additional reference numbers

Australian date formats are commonly DD/MM/YYYY or DD-MM-YYYY.
Convert all dates to YYYY-MM-DD format.

Invoice text:
{text[:3000]}

Return ONLY a valid JSON object with these fields.
"""
        response = self.llm_client.generate(prompt)
        try:
            return json.loads(response.replace('```json','').replace('```','').replace('\n',''))
        except:
            logger.warning("Failed to extract invoice metadata")
            return {}
    
    def _extract_vendor_data(self, text: str) -> Dict[str, Any]:
        """Extract vendor data using LLM."""
        prompt = f"""Extract the vendor information from this Australian tax invoice:
- name: The vendor/supplier company name
- abn: The Australian Business Number (format: XX XXX XXX XXX, 11 digits)
- acn: The Australian Company Number (9 digits) if present
- address: The complete vendor address
- contact: Email, phone or other contact information

Look for "ABN" to find the Australian Business Number.

Invoice text:
{text[:3000]}

Return ONLY a valid JSON object with these fields.
"""
        response = self.llm_client.generate(prompt)
        try:
            return json.loads(response.replace('```json','').replace('```','').replace('\n',''))
        except:
            logger.warning("Failed to extract vendor data")
            return {}
    
    def _extract_amounts_and_tax(self, text: str) -> Dict[str, Any]:
        """Extract amount and GST information using LLM."""
        prompt = f"""Extract the financial information from this Australian tax invoice:
- subtotal: The amount before GST (net amount)
- gst: The GST amount 
- total: The total amount due including GST
- currency: The currency code (default AUD)
- tax_rate: The GST percentage (typically 10% in Australia)
- tax_status: GST status if specified (e.g., "GST inclusive", "GST free")

Return all amounts as numeric values (1234.56), not formatted strings.
Australian GST is typically 10% of the subtotal amount.

Invoice text:
{text}

Return ONLY a valid JSON object with these fields.
"""
        response = self.llm_client.generate(prompt)
        try:
            result = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
            # Ensure amounts are numeric
            for field in ['subtotal', 'gst', 'total', 'tax_rate']:
                if field in result and result[field] is not None:
                    try:
                        result[field] = float(result[field])
                    except:
                        result[field] = None
                        
            # Copy GST to tax for compatibility
            if 'gst' in result:
                result['tax'] = result['gst']
                
            return result
        except:
            logger.warning("Failed to extract amounts")
            return {}
    
    def _extract_payment_data(self, text: str) -> Dict[str, Any]:
        """Extract payment information using LLM."""
        prompt = f"""Extract the payment information from this Australian tax invoice:
- bank_name: The name of the bank
- bsb: The BSB number (format: XXX-XXX, 6 digits)
- account_number: The account number
- account_name: The account name or account holder
- payment_terms: Payment terms (e.g., "30 days")
- reference: Payment reference to include

Look for terms like "Direct Deposit", "EFT Details", "Banking Details".
Australian BSB numbers are 6 digits, usually formatted as XXX-XXX.

Invoice text:
{text}

Return ONLY a valid JSON object with these fields.
"""
        response = self.llm_client.generate(prompt)
        try:
            return json.loads(response.replace('```json','').replace('```','').replace('\n',''))
        except:
            logger.warning("Failed to extract payment data")
            return {}
    
    def _extract_line_items(self, text: str, table_text: str) -> List[Dict[str, Any]]:
        """Extract line items using LLM."""
        # Use table text if available, otherwise use full text
        context = table_text if table_text else text
        
        prompt = f"""Extract the line items from this Australian tax invoice.
Look for tables with descriptions, quantities, unit prices, and amounts.

Australian invoices typically include:
- Item description
- Quantity
- Unit price (excluding GST)
- GST amount for the line
- Total amount for the line (including GST)
- Whether GST is applicable

Return ONLY a valid JSON array of line items with these properties:
- description: Item description
- quantity: Numeric quantity
- unit_price: Numeric unit price (excluding GST)
- gst_amount: GST amount for this line item
- amount: Total amount for line item (including GST)
- gst_applicable: Boolean indicating if GST applies to this item

Return an empty array [] if no line items can be identified.

Invoice content:
{context}
"""
        response = self.llm_client.generate(prompt)
        try:
            result = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
            # Ensure numeric fields are properly formatted
            for item in result:
                for field in ['quantity', 'unit_price', 'gst_amount', 'amount']:
                    if field in item and item[field] is not None:
                        try:
                            item[field] = float(item[field])
                        except:
                            item[field] = None
                
                # Copy gst_amount to tax_amount for compatibility
                if 'gst_amount' in item:
                    item['tax_amount'] = item['gst_amount']
                    
            return result
        except:
            logger.warning("Failed to extract line items")
            return []
    
    def _parse_date(self, date_str: str) -> Optional[str]:
        """Parse a date string in various formats and return ISO format."""
        if not date_str:
            return None
            
        date_str = date_str.strip()
        
        # Common date formats in Australia
        date_formats = [
            '%d/%m/%Y', '%d-%m-%Y', '%d.%m.%Y', '%Y-%m-%d',
            '%d/%m/%y', '%d-%m-%y', '%d.%m.%y',
            '%Y/%m/%d', '%Y.%m.%d',
            '%d %B %Y', '%d %b %Y', '%B %d, %Y'
        ]
        
        # Try all formats
        for fmt in date_formats:
            try:
                date_obj = datetime.datetime.strptime(date_str, fmt)
                return date_obj.strftime('%Y-%m-%d')
            except ValueError:
                continue
                
        # If standard formats fail, rely on LLM to parse the date
        prompt = f"""Convert this date string: "{date_str}" to ISO format YYYY-MM-DD.
Remember that Australian dates typically use day first (DD/MM/YYYY), not month first like US dates.
Return ONLY the date in YYYY-MM-DD format, nothing else."""

        try:
            response = self.llm_client.generate(prompt)
            date_match = response.strip()
            # Validate format with simple check
            if len(date_match) == 10 and date_match[4] == '-' and date_match[7] == '-':
                return date_match
        except:
            pass
                
        # If all parsing attempts fail
        return None
                        

Improved Code

🔍 Code Extractor

class AUExtractor

Purpose

Source Code

Parameters

Parameter Details

Return Value

Class Interface

Methods

`__init__(self, config=None)`

`extract(self, document: Dict[str, Any], language: str) -> Dict[str, Any]`

`_get_full_document_text(self, document: Dict[str, Any]) -> str`

`_format_table_content(self, tables: List[Dict[str, Any]]) -> str`

`_empty_extraction_result(self, language: str) -> Dict[str, Any]`

`_extract_all_invoice_data(self, full_text: str, table_text: str, language: str) -> Dict[str, Any]`

`_post_process_extraction(self, extraction_result: Dict[str, Any]) -> Dict[str, Any]`

`_fallback_extraction(self, full_text: str, table_text: str) -> Dict[str, Any]`

`_extract_invoice_metadata(self, text: str) -> Dict[str, Any]`

`_extract_vendor_data(self, text: str) -> Dict[str, Any]`

`_extract_amounts_and_tax(self, text: str) -> Dict[str, Any]`

`_extract_payment_data(self, text: str) -> Dict[str, Any]`

`_extract_line_items(self, text: str, table_text: str) -> List[Dict[str, Any]]`

`_parse_date(self, date_str: str) -> Optional[str]`

Attributes

Dependencies

Required Imports

Usage Example

Best Practices

Tags

Similar Components

class TestAUExtractor 83.1% similar

class AUValidator 77.1% similar

class BaseExtractor 72.8% similar

class BEExtractor 72.5% similar

class TestAUValidator 70.3% similar

class AUExtractor

Purpose

Source Code

Parameters

Parameter Details

Return Value

Class Interface

Methods

__init__(self, config=None)

extract(self, document: Dict[str, Any], language: str) -> Dict[str, Any]

_get_full_document_text(self, document: Dict[str, Any]) -> str

_format_table_content(self, tables: List[Dict[str, Any]]) -> str

_empty_extraction_result(self, language: str) -> Dict[str, Any]

_extract_all_invoice_data(self, full_text: str, table_text: str, language: str) -> Dict[str, Any]

_post_process_extraction(self, extraction_result: Dict[str, Any]) -> Dict[str, Any]

_fallback_extraction(self, full_text: str, table_text: str) -> Dict[str, Any]

_extract_invoice_metadata(self, text: str) -> Dict[str, Any]

_extract_vendor_data(self, text: str) -> Dict[str, Any]

_extract_amounts_and_tax(self, text: str) -> Dict[str, Any]

_extract_payment_data(self, text: str) -> Dict[str, Any]

_extract_line_items(self, text: str, table_text: str) -> List[Dict[str, Any]]

_parse_date(self, date_str: str) -> Optional[str]

Attributes

Dependencies

Required Imports

Usage Example

Best Practices

Tags

Similar Components

class TestAUExtractor 83.1% similar

class AUValidator 77.1% similar

class BaseExtractor 72.8% similar

class BEExtractor 72.5% similar

class TestAUValidator 70.3% similar

✨ Improve Code: AUExtractor

Code Comparison

`__init__(self, config=None)`

`extract(self, document: Dict[str, Any], language: str) -> Dict[str, Any]`

`_get_full_document_text(self, document: Dict[str, Any]) -> str`

`_format_table_content(self, tables: List[Dict[str, Any]]) -> str`

`_empty_extraction_result(self, language: str) -> Dict[str, Any]`

`_extract_all_invoice_data(self, full_text: str, table_text: str, language: str) -> Dict[str, Any]`

`_post_process_extraction(self, extraction_result: Dict[str, Any]) -> Dict[str, Any]`

`_fallback_extraction(self, full_text: str, table_text: str) -> Dict[str, Any]`

`_extract_invoice_metadata(self, text: str) -> Dict[str, Any]`

`_extract_vendor_data(self, text: str) -> Dict[str, Any]`

`_extract_amounts_and_tax(self, text: str) -> Dict[str, Any]`

`_extract_payment_data(self, text: str) -> Dict[str, Any]`

`_extract_line_items(self, text: str, table_text: str) -> List[Dict[str, Any]]`

`_parse_date(self, date_str: str) -> Optional[str]`