class EntityClassifier
Classifies which ViceBio entity (UK, Belgium, or Australia) an invoice is addressed to using rule-based pattern matching and LLM fallback.
/tf/active/vicechatdev/invoice_extraction/core/entity_classifier.py
7 - 199
moderate
Purpose
The EntityClassifier is responsible for determining the target ViceBio entity for invoices by analyzing document text for entity-specific patterns (VAT numbers, company names, addresses). It employs a two-stage approach: first attempting rule-based classification using regex patterns, then falling back to LLM-based classification if patterns are insufficient. This is critical for routing invoices to the correct legal entity for processing.
Source Code
class EntityClassifier:
    """Classifies which ViceBio entity the invoice is addressed to.

    Classification runs in two stages: regex patterns for each entity are
    scored against the document text first, and the LLM is only consulted
    when those scores do not produce a clear winner.
    """

    # Entity returned when the LLM answer cannot be interpreted.
    DEFAULT_ENTITY = 'UK'

    # The leading entity must score more than this multiple of the runner-up
    # (i.e. at least 50% more evidence) for the rule-based result to be used.
    WIN_MARGIN = 1.5

    # Prompt-size limits used when the document text is long.
    MAX_PROMPT_CHARS = 2000
    HEADER_BLOCK_LIMIT = 10   # only the first N blocks are inspected
    HEADER_MAX_Y = 300        # bbox[1] below this is treated as "top of page"

    # (pattern key, score weight, label used in debug logs).
    # VAT numbers are the strongest indicator, addresses the weakest.
    _EVIDENCE = (
        ('vat_patterns', 3, 'VAT'),
        ('name_patterns', 2, 'name'),
        ('address_patterns', 1, 'address'),
    )

    # Whole-word entity mentions looked for in an (upper-cased) LLM answer.
    # Order matters: the first entity whose pattern matches wins.
    # Word boundaries stop "BELIEVE"/"BECAUSE" from being read as BE or AU.
    _LLM_ENTITY_MENTIONS = (
        ('UK', re.compile(r'\bUK\b|\bUNITED\s+KINGDOM\b')),
        ('BE', re.compile(r'\bBE\b|\bBELGIUM\b|\bBELGIAN\b')),
        ('AU', re.compile(r'\bAU\b|\bAUSTRALIAN?\b')),
    )

    def __init__(self, config=None):
        """Set up the LLM client and the per-entity regex patterns.

        Args:
            config: Optional dict. ``config['llm']`` is passed to LLMClient.
                ``config['entities'][<code>]['identifiers']`` may map a
                pattern type (e.g. ``'vat_patterns'``) to a list of regex
                strings that replaces the default list for that entity.
                Entity codes without default patterns are ignored.
        """
        self.config = config or {}

        # Initialize LLM client
        self.llm_client = LLMClient(self.config.get('llm', {}))

        # Entity identifier patterns (VAT numbers, company names, etc.)
        self.entity_patterns = self._default_entity_patterns()

        # Load entity details from config if available
        self._apply_entity_overrides(self.config.get('entities'))

    @staticmethod
    def _default_entity_patterns():
        """Return a fresh copy of the built-in patterns for UK, BE and AU."""
        return {
            'UK': {
                'vat_patterns': [r'GB\d{9}', r'GB\d{12}'],
                'name_patterns': [
                    r'ViceBio\s+(?:Ltd|Limited|UK)',
                    r'ViceBio\s+(?:United\s+Kingdom)',
                ],
            },
            'BE': {
                'vat_patterns': [r'BE0\d{9}', r'BE\s+0\d{3}\.\d{3}\.\d{3}'],
                'name_patterns': [
                    r'ViceBio\s+(?:Belgium|België|Belgique)',
                    r'ViceBio\s+(?:BV|BVBA|SRL)',
                ],
            },
            'AU': {
                # ABN format: 11 digits, optionally grouped 2-3-3-3.
                # The lookarounds reject 11 digits that are merely part of a
                # longer digit run (IBANs, account numbers, GB + 12 digits),
                # which would otherwise count as strong AU evidence.
                'vat_patterns': [r'(?<!\d)\d{2}\s?\d{3}\s?\d{3}\s?\d{3}(?!\d)'],
                'name_patterns': [
                    r'ViceBio\s+(?:Australia|AU)',
                    r'ViceBio\s+(?:Pty|Proprietary|PTY\s+LTD)',
                ],
            },
        }

    def _apply_entity_overrides(self, entities):
        """Replace default pattern lists with those from ``config['entities']``.

        Args:
            entities: Mapping of entity code to a details dict, or None.
                Only ``details['identifiers']`` is read.
        """
        if not entities:
            return

        for entity, details in entities.items():
            if entity not in self.entity_patterns or not isinstance(details, dict):
                continue
            for id_type, patterns in (details.get('identifiers') or {}).items():
                # A bare string would otherwise be iterated one character at
                # a time when the patterns are matched.
                if isinstance(patterns, str):
                    patterns = [patterns]
                self.entity_patterns[entity][id_type] = list(patterns or [])

    def classify(self, document):
        """
        Determine which ViceBio entity the invoice is for.

        Args:
            document: Processed document from DocumentProcessor

        Returns:
            String indicating entity ('UK', 'BE', or 'AU')
        """
        logger.info("Classifying target entity for invoice")

        # First try rule-based classification using regex patterns
        entity = self._rule_based_classification(document)
        if entity:
            logger.info("Entity classified as %s using rule-based approach", entity)
            return entity

        # If rule-based approach fails, use LLM for classification
        entity = self._llm_classification(document)
        logger.info("Entity classified as %s using LLM", entity)
        return entity

    def _rule_based_classification(self, document):
        """Apply rule-based classification using regex patterns.

        Args:
            document: Dict whose 'text' entry holds the full document text.

        Returns:
            The entity code with the most evidence when it clearly beats the
            runner-up, otherwise None.
        """
        # A missing or None text is treated as an empty document.
        text = document.get('text') or ''

        # Scoring for each entity
        scores = dict.fromkeys(self.entity_patterns, 0)

        for entity, patterns in self.entity_patterns.items():
            for key, weight, label in self._EVIDENCE:
                for pattern in patterns.get(key) or ():
                    if re.search(pattern, text, re.IGNORECASE):
                        scores[entity] += weight
                        logger.debug("Found %s %s pattern: %s", entity, label, pattern)

        # sorted() is stable, so ties keep the entity_patterns order.
        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        if not ranked:
            return None

        best_entity, best_score = ranked[0]
        if best_score <= 0:
            # Not enough evidence from rule-based approach
            return None

        runner_up = ranked[1][1] if len(ranked) > 1 else 0
        if runner_up == 0 or best_score > runner_up * self.WIN_MARGIN:
            return best_entity

        # Evidence is split between entities; let the caller fall back.
        return None

    def _llm_classification(self, document):
        """Use LLM to classify the target entity.

        Args:
            document: Dict with 'text' and optionally 'blocks'.

        Returns:
            The parsed entity code, or DEFAULT_ENTITY when the LLM answer
            cannot be interpreted.
        """
        prompt = self._prepare_entity_classification_prompt(document)
        response = self.llm_client.generate(prompt)
        entity = self._parse_entity_from_llm_response(response)

        if not entity:
            logger.warning(
                "No entity parsed from LLM response; defaulting to %s",
                self.DEFAULT_ENTITY,
            )
            return self.DEFAULT_ENTITY
        return entity

    def _prepare_entity_classification_prompt(self, document):
        """Prepare a prompt for LLM entity classification.

        Text longer than MAX_PROMPT_CHARS is cut to that length, and the text
        of blocks near the top of the page is appended after it.

        Args:
            document: Dict with 'text' and optionally 'blocks' (dicts with
                'text' and 'bbox' entries).

        Returns:
            The prompt string.
        """
        text = document.get('text') or ''

        if len(text) > self.MAX_PROMPT_CHARS:
            trimmed_text = text[:self.MAX_PROMPT_CHARS]

            # Try to add key header blocks if we can find them
            header_blocks = []
            for block in (document.get('blocks') or [])[:self.HEADER_BLOCK_LIMIT]:
                bbox = block.get('bbox')
                if (
                    block.get('text')
                    and bbox
                    and len(bbox) > 1
                    and bbox[1] < self.HEADER_MAX_Y
                ):
                    header_blocks.append(block['text'])

            if header_blocks:
                trimmed_text += (
                    "\n\n[Additional header text:]\n" + "\n".join(header_blocks)
                )
        else:
            trimmed_text = text

        # Construct the prompt
        prompt = (
            "You are analyzing an invoice to determine which entity of ViceBio it is addressed to.\n"
            "The possible entities are:\n"
            "1. UK Ltd (mother company) - based in the United Kingdom\n"
            "2. Australian subsidiary - based in Australia\n"
            "3. Belgian VAT entity - based in Belgium\n"
            "Look for clues such as:\n"
            "- The recipient's address\n"
            "- VAT/Tax identification numbers (UK format: GB123456789, Belgium format: BE0123456789, Australia ABN: 12 345 678 901)\n"
            "- Billing entity names or references\n"
            "- Currency being used\n"
            "- Tax or VAT rates mentioned\n"
            "Invoice text:\n"
            f"{trimmed_text}\n"
            'Respond with only one of these exact strings: "UK", "BE", or "AU" for the entity this invoice is addressed to.\n'
            'If you cannot determine with confidence, respond with "UK" as the default entity.\n'
        )
        return prompt

    def _parse_entity_from_llm_response(self, response):
        """Parse the entity from the LLM response.

        Entity codes and country names are matched as whole words, so words
        that merely contain "BE", "AU" or "UK" are not taken as an answer.
        When several entities are mentioned, UK wins over BE, and BE over AU.

        Args:
            response: Text returned by the LLM; may be empty or None.

        Returns:
            'UK', 'BE' or 'AU', or None when no entity is mentioned.
        """
        if not response:
            return None

        clean_response = response.strip().upper()

        for entity, mention in self._LLM_ENTITY_MENTIONS:
            if mention.search(clean_response):
                return entity

        # No clear entity found
        logger.warning("Could not parse entity from LLM response: %s", response)
        return None
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| bases | - | - | |
Parameter Details
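config: Optional dictionary containing configuration settings. Can include an 'llm' key with LLM client configuration, and an 'entities' key with custom entity patterns. If None, uses default patterns for the UK, BE, and AU entities. Overrides are read from config['entities'][<entity code>]['identifiers'], a mapping of pattern type ('vat_patterns', 'name_patterns', 'address_patterns') to a list of regex strings; each supplied list replaces the default list of that type for that entity. Entries for entity codes other than 'UK', 'BE', and 'AU', and entries without an 'identifiers' key, are ignored.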
Return Value
Instantiation returns an EntityClassifier object. The main classify() method returns a string ('UK', 'BE', or 'AU') indicating which ViceBio entity the invoice is addressed to. If classification fails or is uncertain, defaults to 'UK'.
Class Interface
Methods
__init__(self, config=None)
Purpose: Initializes the EntityClassifier with configuration and sets up entity patterns and LLM client
Parameters:
config: Optional dictionary with 'llm' and 'entities' keys for configuration
Returns: None (constructor)
classify(self, document) -> str
Purpose: Main method to determine which ViceBio entity (UK, BE, AU) the invoice is addressed to
Parameters:
document: Processed document dictionary from DocumentProcessor containing 'text' and 'blocks' keys
Returns: String indicating entity: 'UK', 'BE', or 'AU'
_rule_based_classification(self, document) -> str or None
Purpose: Applies regex pattern matching to classify entity based on VAT numbers, company names, and addresses
Parameters:
document: Document dictionary with 'text' key containing full document text
Returns: Entity string ('UK', 'BE', 'AU') if confident match found, None otherwise
_llm_classification(self, document) -> str
Purpose: Uses LLM to classify entity when rule-based approach is inconclusive
Parameters:
document: Document dictionary with 'text' and 'blocks' keys
Returns: Entity string ('UK', 'BE', 'AU'), defaults to 'UK' if parsing fails
_prepare_entity_classification_prompt(self, document) -> str
Purpose: Constructs a prompt for LLM entity classification, truncating long documents intelligently
Parameters:
document: Document dictionary with 'text' and 'blocks' keys
Returns: Formatted prompt string for LLM with instructions and document text
_parse_entity_from_llm_response(self, response) -> str or None
Purpose: Parses LLM response to extract entity classification
Parameters:
response: String response from LLM
Returns: Entity string ('UK', 'BE', 'AU') if successfully parsed, None otherwise
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| config | dict | Configuration dictionary containing LLM settings and entity patterns | instance |
| llm_client | LLMClient | Instance of LLMClient for LLM-based classification fallback | instance |
| entity_patterns | dict | Dictionary mapping entity codes ('UK', 'BE', 'AU') to their identification patterns (vat_patterns, name_patterns, address_patterns). Each pattern is a list of regex strings. | instance |
Dependencies
- logging
- re
- utils.llm_client.LLMClient
Required Imports
import logging
import re
from utils.llm_client import LLMClient
Usage Example
# Basic usage with default configuration
classifier = EntityClassifier()
# Classify a document (from DocumentProcessor)
document = {
    'text': 'Invoice to ViceBio Ltd, VAT: GB123456789...',
    'blocks': [...]
}
entity = classifier.classify(document)
print(f"Invoice is for: {entity}")  # Output: Invoice is for: UK
# Usage with custom configuration
config = {
    'llm': {'model': 'gpt-4', 'api_key': 'your-key'},
    'entities': {
        'UK': {
            # Pattern overrides are only read from the 'identifiers' mapping;
            # keys placed directly under the entity code are ignored.
            'identifiers': {
                'vat_patterns': [r'GB\d{9}'],
                'name_patterns': [r'ViceBio UK']
            }
        }
    }
}
classifier = EntityClassifier(config)
entity = classifier.classify(document)
Best Practices
- Always pass a processed document from DocumentProcessor with 'text' and 'blocks' keys
- Configure entity patterns in config if default patterns don't match your entity identifiers
- The classifier uses a scoring system: VAT patterns score 3 points, name patterns 2 points, address patterns 1 point
- Rule-based classification requires a clear winner (50% more evidence than second place) to avoid ambiguity
- LLM classification is used as fallback when rule-based approach is inconclusive
- Default entity is 'UK' if all classification methods fail
- For long documents (>2000 chars), only first 2000 chars plus header blocks are sent to LLM to reduce costs
- Ensure LLMClient is properly initialized with valid API credentials before use
- The classifier is stateless between classify() calls, safe for reuse across multiple documents
- Check logs for classification details and pattern matches during debugging
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- class BaseExtractor 61.7% similar
- class InvoiceProcessor 61.4% similar
- class LanguageDetector 58.4% similar
- class BaseValidator 57.0% similar
- class BEExtractor 57.0% similar