EntityClassifier - Code Extractor

class EntityClassifier

Maturity: 51

Classifies which ViceBio entity (UK, Belgium, or Australia) an invoice is addressed to using rule-based pattern matching and LLM fallback.

File:
/tf/active/vicechatdev/invoice_extraction/core/entity_classifier.py

Lines:
7 - 199

Complexity:
moderate

Purpose

The EntityClassifier is responsible for determining the target ViceBio entity for invoices by analyzing document text for entity-specific patterns (VAT numbers, company names, addresses). It employs a two-stage approach: first attempting rule-based classification using regex patterns, then falling back to LLM-based classification if patterns are insufficient. This is critical for routing invoices to the correct legal entity for processing.

Source Code

class EntityClassifier:
    """Classify which ViceBio entity (UK, BE or AU) an invoice is addressed to.

    Classification runs in two stages. Regex patterns (VAT numbers, company
    names and optional address patterns) are scored per entity first; the LLM
    is only consulted when no entity wins that scoring by a clear margin.

    The module-level names ``logger``, ``re`` and ``LLMClient`` are expected
    to be provided by the surrounding file.
    """

    # Codes this classifier can return.
    ENTITY_CODES = ('UK', 'BE', 'AU')

    # Entity returned when the LLM answer cannot be interpreted.
    DEFAULT_ENTITY = 'UK'

    # (pattern-group key, points per matching pattern, label used in debug logs)
    EVIDENCE_KINDS = (
        ('vat_patterns', 3, 'VAT'),
        ('name_patterns', 2, 'name'),
        ('address_patterns', 1, 'address'),
    )

    # The best score must be strictly greater than WIN_MARGIN times the
    # runner-up's score for the rule-based result to be trusted.
    WIN_MARGIN = 1.5

    # Prompt truncation: documents longer than MAX_PROMPT_CHARS are cut, and
    # up to HEADER_BLOCK_LIMIT leading blocks whose bbox[1] is below
    # HEADER_MAX_Y are appended as extra header context.
    MAX_PROMPT_CHARS = 2000
    HEADER_BLOCK_LIMIT = 10
    HEADER_MAX_Y = 300

    # Built-in identifier patterns; copied per instance in __init__.
    # The trailing \b on name patterns stops short codes from matching word
    # prefixes (e.g. "UK" in "ViceBio Ukraine", "AU" in "ViceBio audit").
    DEFAULT_ENTITY_PATTERNS = {
        'UK': {
            'vat_patterns': [r'GB\d{9}', r'GB\d{12}'],
            'name_patterns': [
                r'ViceBio\s+(?:Ltd|Limited|UK)\b',
                r'ViceBio\s+(?:United\s+Kingdom)\b',
            ],
        },
        'BE': {
            'vat_patterns': [r'BE0\d{9}', r'BE\s+0\d{3}\.\d{3}\.\d{3}'],
            'name_patterns': [
                r'ViceBio\s+(?:Belgium|België|Belgique)\b',
                r'ViceBio\s+(?:BV|BVBA|SRL)\b',
            ],
        },
        'AU': {
            # ABN format: exactly 11 digits. The digit look-arounds keep the
            # pattern from firing inside longer digit runs (IBANs, 12-digit
            # GB VAT numbers, phone numbers).
            'vat_patterns': [r'(?<!\d)\d{2}\s?\d{3}\s?\d{3}\s?\d{3}(?!\d)'],
            'name_patterns': [
                r'ViceBio\s+(?:Australia|AU)\b',
                r'ViceBio\s+(?:Pty|Proprietary|PTY\s+LTD)\b',
            ],
        },
    }

    # Whole-word entity mentions in a free-form LLM answer. "BE" is matched
    # case-sensitively because lowercase "be" is an ordinary English word;
    # everything else is case-insensitive. Group names are the entity codes.
    _LLM_MENTION_PATTERN = (
        r'\b(?:'
        r'(?P<UK>(?i:UK|United\s+Kingdom))'
        r'|(?P<BE>BE|(?i:Belgi(?:um|an|ë|que)))'
        r'|(?P<AU>(?i:AU|Australian?))'
        r')\b'
    )

    # Wrapping characters removed before testing for a bare-code answer.
    _ANSWER_WRAPPERS = '"\'`*.,:;!()[] \t\r\n'

    def __init__(self, config=None):
        """Set up the LLM client and the per-entity identifier patterns.

        Args:
            config: Optional dict. ``config['llm']`` is passed to
                ``LLMClient``. ``config['entities'][<code>]['identifiers']``
                maps a pattern-group name (``vat_patterns``,
                ``name_patterns``, ``address_patterns``) to a list of regex
                strings that replaces the built-in list for that entity.
                Unknown entity codes are ignored.
        """
        self.config = config or {}

        # Initialize LLM client
        self.llm_client = LLMClient(self.config.get('llm', {}))

        # Per-instance copy so config overrides never leak into the class
        # defaults or into other instances.
        self.entity_patterns = {
            entity: {kind: list(patterns) for kind, patterns in groups.items()}
            for entity, groups in self.DEFAULT_ENTITY_PATTERNS.items()
        }

        # Apply overrides from config if available
        for entity, details in (self.config.get('entities') or {}).items():
            if entity not in self.entity_patterns or not details:
                continue
            if 'identifiers' not in details:
                continue
            for id_type, patterns in details['identifiers'].items():
                # A lone string would otherwise be iterated per character.
                if isinstance(patterns, str):
                    patterns = [patterns]
                self.entity_patterns[entity][id_type] = patterns

    def classify(self, document):
        """
        Determine which ViceBio entity the invoice is for.

        Args:
            document: Processed document from DocumentProcessor; a dict whose
                'text' (and optionally 'blocks') entries are read.

        Returns:
            String indicating entity ('UK', 'BE', or 'AU')
        """
        logger.info("Classifying target entity for invoice")

        # First try rule-based classification using regex patterns
        entity = self._rule_based_classification(document)
        if entity:
            logger.info("Entity classified as %s using rule-based approach", entity)
            return entity

        # If rule-based approach fails, use LLM for classification
        entity = self._llm_classification(document)
        logger.info("Entity classified as %s using LLM", entity)

        return entity

    def _rule_based_classification(self, document):
        """Score every entity's regex patterns against the document text.

        Each matching pattern adds its group's weight (see EVIDENCE_KINDS) to
        the entity's score.

        Returns:
            The best-scoring entity code when its score is positive and
            strictly greater than WIN_MARGIN times the runner-up's score;
            otherwise None.
        """
        # 'text' may be missing or explicitly None
        text = document.get('text') or ''

        scores = dict.fromkeys(self.entity_patterns, 0)

        for entity, patterns in self.entity_patterns.items():
            for group_key, weight, label in self.EVIDENCE_KINDS:
                for pattern in patterns.get(group_key) or []:
                    if re.search(pattern, text, re.IGNORECASE):
                        scores[entity] += weight
                        logger.debug("Found %s %s pattern: %s", entity, label, pattern)

        # sorted() is stable, so ties keep dictionary order
        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        if not ranked:
            return None

        best_entity, best_score = ranked[0]
        runner_up_score = ranked[1][1] if len(ranked) > 1 else 0

        # With a zero runner-up any positive score wins; a tie never does.
        if best_score > 0 and best_score > runner_up_score * self.WIN_MARGIN:
            return best_entity

        # Not enough evidence from rule-based approach
        return None

    def _llm_classification(self, document):
        """Ask the LLM for the entity; fall back to DEFAULT_ENTITY if unparseable."""
        prompt = self._prepare_entity_classification_prompt(document)
        response = self.llm_client.generate(prompt)
        entity = self._parse_entity_from_llm_response(response)
        return entity or self.DEFAULT_ENTITY

    def _prepare_entity_classification_prompt(self, document):
        """Build the LLM prompt from the document text.

        Text longer than MAX_PROMPT_CHARS is truncated. In that case the text
        of up to HEADER_BLOCK_LIMIT leading blocks whose ``bbox[1]`` is below
        HEADER_MAX_Y is appended, skipping blocks whose text is already
        contained in the truncated text.
        """
        text = document.get('text') or ''
        trimmed_text = text[:self.MAX_PROMPT_CHARS]

        if len(text) > self.MAX_PROMPT_CHARS:
            header_blocks = []
            leading_blocks = (document.get('blocks') or [])[:self.HEADER_BLOCK_LIMIT]
            for block in leading_blocks:
                block_text = block.get('text')
                bbox = block.get('bbox')
                if not (block_text and bbox and bbox[1] < self.HEADER_MAX_Y):
                    continue
                # Avoid sending the same header text to the LLM twice.
                if block_text in trimmed_text or block_text in header_blocks:
                    continue
                header_blocks.append(block_text)

            if header_blocks:
                trimmed_text += "\n\n[Additional header text:]\n" + "\n".join(header_blocks)

        # Construct the prompt
        prompt = f"""You are analyzing an invoice to determine which entity of ViceBio it is addressed to.

The possible entities are:
1. UK Ltd (mother company) - based in the United Kingdom
2. Australian subsidiary - based in Australia  
3. Belgian VAT entity - based in Belgium

Look for clues such as:
- The recipient's address
- VAT/Tax identification numbers (UK format: GB123456789, Belgium format: BE0123456789, Australia ABN: 12 345 678 901)
- Billing entity names or references
- Currency being used
- Tax or VAT rates mentioned

Invoice text:
{trimmed_text}

Respond with only one of these exact strings: "UK", "BE", or "AU" for the entity this invoice is addressed to.
If you cannot determine with confidence, respond with "UK" as the default entity.
"""
        return prompt

    def _parse_entity_from_llm_response(self, response):
        """Extract an entity code from the LLM's answer.

        A bare code (any case, optionally wrapped in quotes or punctuation)
        is accepted directly. Otherwise the earliest whole-word mention of an
        entity code or country name decides. Whole-word matching prevents
        false hits such as "BE" inside "because".

        Returns:
            'UK', 'BE' or 'AU', or None when the response is empty or
            mentions no entity.
        """
        if not response:
            return None

        # Direct answer, e.g. 'AU', ' be ' or '"UK".'
        bare_answer = response.strip().strip(self._ANSWER_WRAPPERS).upper()
        if bare_answer in self.ENTITY_CODES:
            return bare_answer

        # Earliest whole-word mention in a longer answer
        mention = re.search(self._LLM_MENTION_PATTERN, response)
        if mention:
            return mention.lastgroup

        # No clear entity found
        logger.warning("Could not parse entity from LLM response: %s", response)
        return None

Parameters

Name	Type	Default	Kind
`config`	dict or None	None	positional or keyword

Parameter Details

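config: Optional dictionary containing configuration settings. Can include an 'llm' key with LLM client configuration, and an 'entities' key with custom entity patterns. If None, uses default patterns for the UK, BE, and AU entities. Overrides are read from config['entities'][<entity code>]['identifiers'], a dictionary mapping a pattern group name ('vat_patterns', 'name_patterns', 'address_patterns') to a list of regex strings; each supplied list replaces the default list for that group. Entries for entity codes other than 'UK', 'BE', and 'AU', or without an 'identifiers' key, are ignored.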

Return Value

Instantiation returns an EntityClassifier object. The main classify() method returns a string ('UK', 'BE', or 'AU') indicating which ViceBio entity the invoice is addressed to. If classification fails or is uncertain, defaults to 'UK'.

Class Interface

Methods

`__init__(self, config=None)`

Purpose: Initializes the EntityClassifier with configuration and sets up entity patterns and LLM client

Parameters:

config: Optional dictionary with 'llm' and 'entities' keys for configuration

Returns: None (constructor)

`classify(self, document) -> str`

Purpose: Main method to determine which ViceBio entity (UK, BE, AU) the invoice is addressed to

Parameters:

document: Processed document dictionary from DocumentProcessor containing 'text' and 'blocks' keys

Returns: String indicating entity: 'UK', 'BE', or 'AU'

`_rule_based_classification(self, document) -> str or None`

Purpose: Applies regex pattern matching to classify entity based on VAT numbers, company names, and addresses

Parameters:

document: Document dictionary with 'text' key containing full document text

Returns: Entity string ('UK', 'BE', 'AU') if confident match found, None otherwise

`_llm_classification(self, document) -> str`

Purpose: Uses LLM to classify entity when rule-based approach is inconclusive

Parameters:

document: Document dictionary with 'text' and 'blocks' keys

Returns: Entity string ('UK', 'BE', 'AU'), defaults to 'UK' if parsing fails

`_prepare_entity_classification_prompt(self, document) -> str`

Purpose: Constructs a prompt for LLM entity classification, truncating long documents intelligently

Parameters:

document: Document dictionary with 'text' and 'blocks' keys

Returns: Formatted prompt string for LLM with instructions and document text

`_parse_entity_from_llm_response(self, response) -> str or None`

Purpose: Parses LLM response to extract entity classification

Parameters:

response: String response from LLM

Returns: Entity string ('UK', 'BE', 'AU') if successfully parsed, None otherwise

Attributes

Name	Type	Description	Scope
`config`	dict	Configuration dictionary containing LLM settings and entity patterns	instance
`llm_client`	LLMClient	Instance of LLMClient for LLM-based classification fallback	instance
`entity_patterns`	dict	Dictionary mapping entity codes ('UK', 'BE', 'AU') to their identification patterns (vat_patterns, name_patterns, address_patterns). Each pattern is a list of regex strings.	instance

Dependencies

logging
re
utils.llm_client.LLMClient

Required Imports

import logging
import re
from utils.llm_client import LLMClient

Usage Example

# Basic usage with default configuration
classifier = EntityClassifier()

# Classify a document (from DocumentProcessor)
document = {
    'text': 'Invoice to ViceBio Ltd, VAT: GB123456789...',
    'blocks': [...]
}
entity = classifier.classify(document)
print(f"Invoice is for: {entity}")  # Output: Invoice is for: UK

# Usage with custom configuration.
# Pattern overrides must be nested under the 'identifiers' key of each
# entity; keys placed directly under the entity code are not read by
# EntityClassifier.__init__.
config = {
    'llm': {'model': 'gpt-4', 'api_key': 'your-key'},
    'entities': {
        'UK': {
            'identifiers': {
                'vat_patterns': [r'GB\d{9}'],
                'name_patterns': [r'ViceBio UK']
            }
        }
    }
}
classifier = EntityClassifier(config)
entity = classifier.classify(document)

Best Practices

Always pass a processed document from DocumentProcessor with 'text' and 'blocks' keys
Configure entity patterns in config if default patterns don't match your entity identifiers
The classifier uses a scoring system: VAT patterns score 3 points, name patterns 2 points, address patterns 1 point
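Rule-based classification requires a clear winner to avoid ambiguity: the top score must be positive and strictly greater than 1.5 times the second-place score (any positive score wins when every other entity scores 0)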
LLM classification is used as fallback when rule-based approach is inconclusive
Default entity is 'UK' if all classification methods fail
For long documents (>2000 chars), only first 2000 chars plus header blocks are sent to LLM to reduce costs
Ensure LLMClient is properly initialized with valid API credentials before use
The classifier is stateless between classify() calls, safe for reuse across multiple documents
Check logs for classification details and pattern matches during debugging

Similar Components

AI-powered semantic similarity - components with related functionality:

class BaseExtractor 61.7% similar

Abstract base class that defines the interface and shared functionality for entity-specific invoice data extractors (UK, BE, AU), providing a multi-stage extraction pipeline for invoice processing.
From: /tf/active/vicechatdev/invoice_extraction/extractors/base_extractor.py
class InvoiceProcessor 61.4% similar

Main orchestrator class that coordinates the complete invoice processing pipeline from PDF extraction through validation to Excel generation.
From: /tf/active/vicechatdev/invoice_extraction/main.py
class LanguageDetector 58.4% similar

A language detection class that identifies whether invoice documents are written in English, French, or Dutch using both rule-based keyword matching and LLM-based detection.
From: /tf/active/vicechatdev/invoice_extraction/core/language_detector.py
class BaseValidator 57.0% similar

Abstract base class for validating extracted invoice data with entity-specific validation rules. Provides common validation functionality for required fields, field types, date consistency, and amount calculations.
From: /tf/active/vicechatdev/invoice_extraction/validators/base_validator.py
class BEExtractor 57.0% similar

Belgium-specific invoice data extractor that uses LLM (Large Language Model) to extract structured invoice data from Belgian invoices in multiple languages (English, French, Dutch).
From: /tf/active/vicechatdev/invoice_extraction/extractors/be_extractor.py

← Back to Browse

Assistant

Hi! I can help improve this code. Tell me what you'd like to enhance (e.g., "add error handling", "optimize performance", "improve readability", "add type hints").

Code Comparison

Original Code

                            class EntityClassifier:
    """Classifies which ViceBio entity the invoice is addressed to.

    Regex patterns are scored per entity first; the LLM is only consulted
    when that scoring produces no clear winner. Relies on the module-level
    names ``logger``, ``re`` and ``LLMClient`` from the surrounding file.
    """
    
    def __init__(self, config=None):
        """Store the config, create the LLM client and build entity patterns.

        Args:
            config: Optional dict. ``config['llm']`` is passed to LLMClient;
                ``config['entities'][<code>]['identifiers']`` maps a pattern
                group name to its replacement pattern list.
        """
        self.config = config or {}
        
        # Initialize LLM client
        self.llm_client = LLMClient(self.config.get('llm', {}))
        
        # Entity identifier patterns (VAT numbers, company names, etc.)
        # All patterns are applied with re.IGNORECASE by
        # _rule_based_classification.
        self.entity_patterns = {
            'UK': {
                'vat_patterns': [r'GB\d{9}', r'GB\d{12}'],
                'name_patterns': [
                    r'ViceBio\s+(?:Ltd|Limited|UK)',
                    r'ViceBio\s+(?:United\s+Kingdom)',
                ]
            },
            'BE': {
                'vat_patterns': [r'BE0\d{9}', r'BE\s+0\d{3}\.\d{3}\.\d{3}'],
                'name_patterns': [
                    r'ViceBio\s+(?:Belgium|België|Belgique)',
                    r'ViceBio\s+(?:BV|BVBA|SRL)'
                ]
            },
            'AU': {
                # NOTE(review): this pattern has no digit boundaries, so it
                # also matches inside any longer run of digits.
                'vat_patterns': [r'\d{2}\s?\d{3}\s?\d{3}\s?\d{3}'],  # ABN format
                'name_patterns': [
                    r'ViceBio\s+(?:Australia|AU)',
                    r'ViceBio\s+(?:Pty|Proprietary|PTY\s+LTD)'
                ]
            }
        }
        
        # Load entity details from config if available.
        # Only entities already present above are considered, and only the
        # 'identifiers' sub-dict is read; each entry replaces the pattern
        # list stored under the same key.
        if 'entities' in self.config:
            for entity, details in self.config['entities'].items():
                if entity in self.entity_patterns and 'identifiers' in details:
                    for id_type, patterns in details['identifiers'].items():
                        self.entity_patterns[entity][id_type] = patterns
    
    def classify(self, document):
        """
        Determine which ViceBio entity the invoice is for.
        
        Args:
            document: Processed document from DocumentProcessor
            
        Returns:
            String indicating entity ('UK', 'BE', or 'AU')
        """
        logger.info("Classifying target entity for invoice")
        
        # First try rule-based classification using regex patterns
        entity = self._rule_based_classification(document)
        if entity:
            logger.info(f"Entity classified as {entity} using rule-based approach")
            return entity
        
        # If rule-based approach fails, use LLM for classification
        entity = self._llm_classification(document)
        logger.info(f"Entity classified as {entity} using LLM")
        
        return entity
    
    def _rule_based_classification(self, document):
        """Apply rule-based classification using regex patterns.

        Each matching pattern adds to its entity's score: 3 for a VAT
        pattern, 2 for a name pattern, 1 for an address pattern.

        Returns:
            The top-scoring entity code when its score is positive and either
            every other entity scored 0 or the top score is strictly greater
            than 1.5 times the second-highest score; otherwise None.
        """
        # Extract the full text from the document
        text = document.get('text', '')
        
        # Scoring for each entity
        scores = {'UK': 0, 'BE': 0, 'AU': 0}
        
        # Check for entity patterns
        for entity, patterns in self.entity_patterns.items():
            # Check VAT number patterns - strong indicator
            for pattern in patterns.get('vat_patterns', []):
                if re.search(pattern, text, re.IGNORECASE):
                    scores[entity] += 3
                    logger.debug(f"Found {entity} VAT pattern: {pattern}")
            
            # Check name patterns
            for pattern in patterns.get('name_patterns', []):
                if re.search(pattern, text, re.IGNORECASE):
                    scores[entity] += 2
                    logger.debug(f"Found {entity} name pattern: {pattern}")
            
            # Check address patterns if defined
            for pattern in patterns.get('address_patterns', []):
                if re.search(pattern, text, re.IGNORECASE):
                    scores[entity] += 1
                    logger.debug(f"Found {entity} address pattern: {pattern}")
        
        # Check for clear winner (must have a score and be >50% higher than second place)
        # On a tie, max() returns the first entity in dictionary order.
        max_entity = max(scores, key=scores.get)
        max_score = scores[max_entity]
        
        if max_score > 0:
            # Sort scores to get second-highest
            sorted_scores = sorted([(e, s) for e, s in scores.items() if e != max_entity], 
                                   key=lambda x: x[1], reverse=True)
            
            # If no other entities found or clear winner
            if not sorted_scores or sorted_scores[0][1] == 0:
                return max_entity
            
            second_score = sorted_scores[0][1]
            if max_score > second_score * 1.5:  # Strictly more than 1.5x the runner-up
                return max_entity
        
        # Not enough evidence from rule-based approach
        return None
    
    def _llm_classification(self, document):
        """Use LLM to classify the target entity.

        Returns:
            The parsed entity code, or 'UK' when the response cannot be parsed.
        """
        # Prepare the prompt for LLM
        prompt = self._prepare_entity_classification_prompt(document)
        
        # Call LLM for classification
        response = self.llm_client.generate(prompt)
        
        # Parse the response
        entity = self._parse_entity_from_llm_response(response)
        
        # Use UK as default if parsing fails
        return entity or 'UK'
    
    def _prepare_entity_classification_prompt(self, document):
        """Prepare a prompt for LLM entity classification.

        Text longer than 2000 characters is cut to its first 2000 characters,
        and the text of qualifying blocks among the first 10 is appended.
        """
        # Extract text for the prompt
        text = document.get('text', '')
        
        # If document is too long, use first 2000 chars and key blocks
        if len(text) > 2000:
            trimmed_text = text[:2000]
            
            # Try to add key header blocks if we can find them
            header_blocks = []
            for block in document.get('blocks', [])[:10]:  # Check first 10 blocks
                # presumably bbox[1] is the block's top y coordinate; verify against DocumentProcessor
                if block.get('text') and block.get('bbox') and block['bbox'][1] < 300:  # Top of page
                    header_blocks.append(block.get('text', ''))
            
            # NOTE(review): header text is appended even when it already
            # appears within the first 2000 characters.
            if header_blocks:
                trimmed_text += "\n\n[Additional header text:]\n" + "\n".join(header_blocks)
        else:
            trimmed_text = text
        
        # Construct the prompt
        prompt = f"""You are analyzing an invoice to determine which entity of ViceBio it is addressed to.

The possible entities are:
1. UK Ltd (mother company) - based in the United Kingdom
2. Australian subsidiary - based in Australia  
3. Belgian VAT entity - based in Belgium

Look for clues such as:
- The recipient's address
- VAT/Tax identification numbers (UK format: GB123456789, Belgium format: BE0123456789, Australia ABN: 12 345 678 901)
- Billing entity names or references
- Currency being used
- Tax or VAT rates mentioned

Invoice text:
{trimmed_text}

Respond with only one of these exact strings: "UK", "BE", or "AU" for the entity this invoice is addressed to.
If you cannot determine with confidence, respond with "UK" as the default entity.
"""
        return prompt
    
    def _parse_entity_from_llm_response(self, response):
        """Parse the entity from the LLM response.

        Returns:
            'UK', 'BE' or 'AU', or None when the response is empty or
            contains none of the tested substrings.
        """
        if not response:
            return None
            
        # Clean the response and look for entity indicators
        clean_response = response.strip().upper()
        
        # Direct matches
        if clean_response in ['UK', 'BE', 'AU']:
            return clean_response
            
        # Look for entity mentions in the response.
        # NOTE(review): these are plain substring tests on the upper-cased
        # text, checked in UK, BE, AU order, so 'BE' also matches inside
        # words such as "BECAUSE" or "BELIEVE".
        if 'UK' in clean_response or 'UNITED KINGDOM' in clean_response:
            return 'UK'
        elif 'BE' in clean_response or 'BELGIUM' in clean_response or 'BELGIAN' in clean_response:
            return 'BE'
        elif 'AU' in clean_response or 'AUSTRALIA' in clean_response or 'AUSTRALIAN' in clean_response:
            return 'AU'
            
        # No clear entity found
        logger.warning(f"Could not parse entity from LLM response: {response}")
        return None
                        

Improved Code

🔍 Code Extractor

class EntityClassifier

Purpose

Source Code

Parameters

Parameter Details

Return Value

Class Interface

Methods

`__init__(self, config=None)`

`classify(self, document) -> str`

`_rule_based_classification(self, document) -> str or None`

`_llm_classification(self, document) -> str`

`_prepare_entity_classification_prompt(self, document) -> str`

`_parse_entity_from_llm_response(self, response) -> str or None`

Attributes

Dependencies

Required Imports

Usage Example

Best Practices

Tags

Similar Components

class BaseExtractor 61.7% similar

class InvoiceProcessor 61.4% similar

class LanguageDetector 58.4% similar

class BaseValidator 57.0% similar

class BEExtractor 57.0% similar

class EntityClassifier

Purpose

Source Code

Parameters

Parameter Details

Return Value

Class Interface

Methods

__init__(self, config=None)

classify(self, document) -> str

_rule_based_classification(self, document) -> str or None

_llm_classification(self, document) -> str

_prepare_entity_classification_prompt(self, document) -> str

_parse_entity_from_llm_response(self, response) -> str or None

Attributes

Dependencies

Required Imports

Usage Example

Best Practices

Tags

Similar Components

class BaseExtractor 61.7% similar

class InvoiceProcessor 61.4% similar

class LanguageDetector 58.4% similar

class BaseValidator 57.0% similar

class BEExtractor 57.0% similar

✨ Improve Code: EntityClassifier

Code Comparison

`__init__(self, config=None)`

`classify(self, document) -> str`

`_rule_based_classification(self, document) -> str or None`

`_llm_classification(self, document) -> str`

`_prepare_entity_classification_prompt(self, document) -> str`

`_parse_entity_from_llm_response(self, response) -> str or None`