class TestUKExtractor
Unit test class for testing the UKExtractor class, which extracts structured data from UK invoices including VAT numbers, dates, amounts, and line items.
/tf/active/vicechatdev/invoice_extraction/tests/test_extractors.py
135 - 302
moderate
Purpose
This test class validates the functionality of the UKExtractor class by testing invoice data extraction, VAT number formatting, and UK date parsing. It uses mocked LLM clients to simulate invoice processing without requiring actual API calls. The tests ensure that UK-specific formats (DD/MM/YYYY dates, GB VAT numbers, GBP currency) are correctly parsed and standardized.
Source Code
class TestUKExtractor(unittest.TestCase):
    """Exercise UKExtractor against a sample UK invoice using scripted LLM replies."""

    def setUp(self):
        """Prepare the extractor config, the default mock LLM client and a sample document."""
        llm_settings = {'provider': 'test', 'model': 'test-model'}
        self.config = {'confidence_threshold': 0.7, 'llm': llm_settings}

        # Canned payloads; each one is JSON-encoded and registered under a
        # response key of the mock client below.
        invoice_meta = {
            "number": "INV-12345",
            "issue_date": "15/01/2023",
            "due_date": "15/02/2023",
            "reference": "PO-6789",
        }
        vendor = {
            "name": "UK Test Vendor Ltd",
            "vat_number": "GB123456789",
            "address": "123 London Road, London, W1A 1AA, UK",
            "contact": "contact@uktestvendor.com",
        }
        amounts = {"subtotal": 500.00, "total": 600.00, "currency": "GBP"}
        tax = {"vat": 100.00, "vat_rate": 20, "vat_regime": "standard"}
        payment = {
            "sort_code": "12-34-56",
            "account_number": "12345678",
            "payment_terms": "30 days",
            "reference": "INV-12345",
        }
        # Two products that differ only in index, quantity and line amount.
        line_items = [
            {
                "description": f"Test Product {index}",
                "quantity": quantity,
                "unit_price": 100.00,
                "vat_rate": 20,
                "amount": amount,
            }
            for index, quantity, amount in ((1, 2, 200.00), (2, 3, 300.00))
        ]

        self.mock_llm = MockLLMClient({
            'invoice metadata': json.dumps(invoice_meta),
            'vendor data': json.dumps(vendor),
            'amounts': json.dumps(amounts),
            'tax data': json.dumps(tax),
            'payment data': json.dumps(payment),
            'line items': json.dumps(line_items),
        })

        # Sample UK invoice: the document-level text is the two page texts
        # joined by a single newline.
        header_page_text = (
            'Invoice #INV-12345\nIssue Date: 15/01/2023\nUK Test Vendor Ltd\n'
            'VAT: GB123456789\n123 London Road, London, W1A 1AA, UK'
        )
        totals_page_text = 'Subtotal: £500.00\nVAT (20%): £100.00\nTotal: £600.00'
        page_texts = (header_page_text, totals_page_text)
        self.uk_doc = {
            'text': '\n'.join(page_texts),
            'pages': [
                {'text': page_text, 'width': 800, 'height': 1000, 'tables': []}
                for page_text in page_texts
            ],
        }

    @patch('extractors.uk_extractor.LLMClient')
    def test_uk_extract(self, mock_llm_client):
        """Check the sections, selected fields and confidence of a full extraction."""
        # Instantiating the patched LLMClient yields the scripted client.
        mock_llm_client.return_value = self.mock_llm

        result = UKExtractor(self.config).extract(self.uk_doc, 'en')

        # Every top-level section must be present.
        for section in ('invoice', 'vendor', 'amounts', 'payment', 'line_items'):
            self.assertIn(section, result)

        # Spot-check individual fields as (section, field, expected value).
        expected_fields = (
            ('invoice', 'number', 'INV-12345'),
            ('vendor', 'vat_number', 'GB123456789'),
            ('amounts', 'total', 600.00),
            ('amounts', 'vat', 100.00),
            ('payment', 'sort_code', '12-34-56'),
        )
        for section, field, expected in expected_fields:
            self.assertEqual(result[section][field], expected)
        self.assertEqual(len(result['line_items']), 2)

        # A float confidence score must be reported.
        self.assertIn('confidence', result)
        self.assertIsInstance(result['confidence'], float)

    @patch('extractors.uk_extractor.LLMClient')
    def test_uk_vat_number_formatting(self, mock_llm_client):
        """Check that a lowercase, space-separated VAT number comes back as 'GB123456789'."""
        badly_formatted_vendor = {
            "name": "UK Test Vendor Ltd",
            "vat_number": "gb 123 456 789",  # lowercase prefix and embedded spaces
            "address": "123 London Road, London, W1A 1AA, UK",
        }
        scripted_client = MockLLMClient({
            'vendor data': json.dumps(badly_formatted_vendor),
            # Minimal replies for the remaining response keys.
            'invoice metadata': json.dumps({"number": "12345", "issue_date": "15/01/2023"}),
            'amounts': json.dumps({"subtotal": 100, "total": 120, "vat": 20, "currency": "GBP"}),
            'tax data': json.dumps({"vat": 20, "vat_rate": 20}),
            'line items': json.dumps([]),
        })
        mock_llm_client.return_value = scripted_client

        result = UKExtractor(self.config).extract(self.uk_doc, 'en')

        self.assertEqual(result['vendor']['vat_number'], 'GB123456789')

    @patch('extractors.uk_extractor.LLMClient')
    def test_uk_date_parsing(self, mock_llm_client):
        """Check that numeric and textual UK dates come back as YYYY-MM-DD strings."""
        uk_dated_metadata = {
            "number": "12345",
            "issue_date": "15/01/2023",  # DD/MM/YYYY
            "due_date": "15th February 2023",  # textual date
        }
        scripted_client = MockLLMClient({
            'invoice metadata': json.dumps(uk_dated_metadata),
            # Minimal replies for the remaining response keys.
            'vendor data': json.dumps({"name": "UK Test Vendor", "vat_number": "GB123456789"}),
            'amounts': json.dumps({"subtotal": 100, "total": 120, "vat": 20, "currency": "GBP"}),
            'tax data': json.dumps({"vat": 20, "vat_rate": 20}),
            'line items': json.dumps([]),
        })
        mock_llm_client.return_value = scripted_client

        result = UKExtractor(self.config).extract(self.uk_doc, 'en')

        invoice = result['invoice']
        self.assertEqual(invoice['issue_date'], '2023-01-15')
        self.assertEqual(invoice['due_date'], '2023-02-15')
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| bases | unittest.TestCase | - | |
Parameter Details
bases: Inherits from unittest.TestCase to provide testing framework functionality including assertions, test setup/teardown, and test discovery
Return Value
As a test class, it does not return values directly. Individual test methods use assertions to validate expected behavior. Test methods return None but raise AssertionError if tests fail.
Class Interface
Methods
setUp(self) -> None
Purpose: Initializes test environment before each test method runs, creating mock LLM client and sample UK invoice data
Returns: None - sets up instance attributes for use in test methods
test_uk_extract(self, mock_llm_client) -> None
Purpose: Tests complete extraction of data from UK invoice including invoice metadata, vendor info, amounts, payment details, and line items
Parameters:
mock_llm_client: Mocked LLMClient class injected by @patch decorator
Returns: None - uses assertions to validate extraction results
test_uk_vat_number_formatting(self, mock_llm_client) -> None
Purpose: Tests that malformatted UK VAT numbers (with spaces and lowercase) are properly normalized to standard GB format
Parameters:
mock_llm_client: Mocked LLMClient class injected by @patch decorator
Returns: None - asserts that VAT number is formatted as 'GB123456789'
test_uk_date_parsing(self, mock_llm_client) -> None
Purpose: Tests that UK date formats (DD/MM/YYYY and text formats) are correctly parsed to ISO format (YYYY-MM-DD)
Parameters:
mock_llm_client: Mocked LLMClient class injected by @patch decorator
Returns: None - asserts that dates are converted to ISO 8601 format
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| config | dict | Configuration dictionary containing confidence_threshold and llm settings (provider, model) for the extractor | instance |
| mock_llm | MockLLMClient | Mock LLM client instance with predefined responses for invoice metadata, vendor data, amounts, tax data, payment data, and line items | instance |
| uk_doc | dict | Sample UK invoice document structure with text content and pages array, representing a typical UK invoice with VAT, GBP currency, and UK address format | instance |
Dependencies
unittest, unittest.mock, json, logging, os, pathlib, datetime, extractors.base_extractor, extractors.uk_extractor, extractors.be_extractor, extractors.au_extractor
Required Imports
import unittest
from unittest.mock import patch
from unittest.mock import MagicMock
import os
import json
import logging
from pathlib import Path
import datetime
from extractors.base_extractor import BaseExtractor
from extractors.uk_extractor import UKExtractor
from extractors.be_extractor import BEExtractor
from extractors.au_extractor import AUExtractor
Usage Example
import unittest
from unittest.mock import patch
import json
from extractors.uk_extractor import UKExtractor

if __name__ == '__main__':
    # Run every test in the class through a test runner
    suite = unittest.TestLoader().loadTestsFromTestCase(TestUKExtractor)
    unittest.TextTestRunner(verbosity=2).run(suite)

    # Or run an individual test method directly, outside a runner.
    # The @patch decorator on the test method creates the LLMClient mock and
    # passes it in as mock_llm_client, so the method is called with no
    # arguments; passing a mock explicitly would supply one positional
    # argument too many and raise TypeError.
    test = TestUKExtractor('test_uk_extract')
    test.setUp()
    test.test_uk_extract()

# Or, from a shell, run all tests in the class:
#   python -m unittest test_module.TestUKExtractor
Best Practices
- The unittest runner calls setUp() automatically before each test method; call it yourself only when invoking a test method directly, outside a test runner
- Use the @patch decorator to mock external dependencies like LLMClient to avoid actual API calls
- Each test method should be independent and not rely on state from other tests
- Mock LLM responses should include all required fields to avoid KeyError exceptions
- Test data (self.uk_doc) should represent realistic UK invoice structure with proper formatting
- Verify both successful extraction and proper formatting/normalization of UK-specific data
- Use descriptive test method names that clearly indicate what is being tested
- Include assertions for confidence scores and data structure completeness
- Test edge cases like malformatted VAT numbers and various date formats
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- class TestUKValidator (83.2% similar)
- class TestAUExtractor (79.3% similar)
- class TestBEExtractor (76.4% similar)
- class UKExtractor (75.7% similar)
- class UKValidator (71.7% similar)