class LanguageDetector
A language detection class that identifies whether invoice documents are written in English, French, or Dutch using both rule-based keyword matching and LLM-based detection.
/tf/active/vicechatdev/invoice_extraction/core/language_detector.py
10 - 236
moderate
Purpose
The LanguageDetector class provides robust language identification for invoice documents across three supported languages (English, French, Dutch). It employs a two-tier detection strategy: first attempting rule-based detection using keyword matching and character trigram analysis, then falling back to LLM-based detection if confidence is below a threshold. This approach balances speed and accuracy, making it suitable for processing invoice documents where language identification is critical for downstream processing.
Source Code
class LanguageDetector:
    """Detects the language of invoice documents.

    Detection is two-tiered: a rule-based pass (keyword matching combined
    with character-trigram similarity) runs first, and the LLM is consulted
    only when the rule-based confidence is below ``confidence_threshold``.
    """

    # Language codes for our target languages
    SUPPORTED_LANGUAGES = {
        'en': 'english',
        'fr': 'french',
        'nl': 'dutch'
    }

    # Language-specific keywords to aid identification.
    # These are the built-in defaults; per-instance additions from the
    # config are merged into a copy (see _merge_keywords), never into
    # these shared lists.
    LANGUAGE_KEYWORDS = {
        'en': [
            'invoice', 'receipt', 'payment', 'order', 'total', 'tax',
            'date', 'due', 'amount', 'quantity', 'price', 'discount',
            'item', 'description', 'reference', 'account', 'number',
            'customer', 'paid', 'balance', 'bill', 'purchase'
        ],
        'fr': [
            'facture', 'reçu', 'paiement', 'commande', 'total', 'taxe',
            'date', 'échéance', 'montant', 'quantité', 'prix', 'remise',
            'article', 'description', 'référence', 'compte', 'numéro',
            'client', 'payé', 'solde', 'addition', 'achat'
        ],
        'nl': [
            'factuur', 'bon', 'betaling', 'bestelling', 'totaal', 'belasting',
            'datum', 'vervaldatum', 'bedrag', 'hoeveelheid', 'prijs', 'korting',
            'item', 'beschrijving', 'referentie', 'rekening', 'nummer',
            'klant', 'betaald', 'saldo', 'rekening', 'aankoop'
        ]
    }

    def __init__(self, config=None):
        """
        Initialize the detector.

        Args:
            config: Optional dict. Keys read here: 'llm' (passed to
                LLMClient), 'default_language' (default 'en'),
                'language_confidence_threshold' (default 0.6) and
                'language_keywords' (dict of language code -> extra keywords;
                codes without a built-in keyword list are ignored).
        """
        self.config = config or {}
        self.llm_client = LLMClient(self.config.get('llm', {}))
        self.default_language = self.config.get('default_language', 'en')
        self.confidence_threshold = self.config.get('language_confidence_threshold', 0.6)

        # Shadow the class-level table with a per-instance copy so custom
        # keywords do not leak into other instances or accumulate each time
        # a detector is constructed.
        self.LANGUAGE_KEYWORDS = self._merge_keywords(
            self.config.get('language_keywords')
        )

    @classmethod
    def _merge_keywords(cls, extra_keywords=None):
        """
        Build a fresh keyword table from the class defaults plus extras.

        Args:
            extra_keywords: Optional dict of language code -> iterable of
                additional keywords. Codes that have no built-in keyword
                list are ignored.

        Returns:
            New dict of language code -> new list; the class-level lists
            are left untouched.
        """
        merged = {lang: list(words) for lang, words in cls.LANGUAGE_KEYWORDS.items()}
        for lang, words in (extra_keywords or {}).items():
            if lang in merged:
                merged[lang].extend(words)
        return merged

    def detect(self, document):
        """
        Detect the language of a document.

        Args:
            document: Processed document from DocumentProcessor; the text is
                read from its 'text' key.

        Returns:
            String language code ('en', 'fr', or 'nl'), or the configured
            default language when there is no usable text or the LLM answer
            cannot be parsed.
        """
        logger.info("Detecting document language")

        # Extract text for language detection. A missing document, missing
        # text or whitespace-only text cannot be classified, so skip both
        # detection tiers.
        text = (document or {}).get('text', '')
        if not text or not text.strip():
            logger.warning("No text available for language detection")
            return self.default_language

        # Try rule-based detection first
        lang, confidence = self._rule_based_detection(text)

        # If confident enough, return the result
        if confidence >= self.confidence_threshold:
            logger.info(
                "Language detected as %s (%s) with confidence %.2f",
                lang, self.SUPPORTED_LANGUAGES[lang], confidence,
            )
            return lang

        # Otherwise, try LLM-based detection
        logger.info("Rule-based detection not confident (%.2f), using LLM", confidence)
        lang = self._llm_detection(text)
        logger.info(
            "Final language detection: %s (%s)",
            lang, self.SUPPORTED_LANGUAGES.get(lang, 'unknown'),
        )
        return lang

    def _rule_based_detection(self, text):
        """
        Apply rule-based language detection using keyword matching and n-grams.

        Args:
            text: Document text to analyse.

        Returns:
            Tuple of (language_code, confidence_score)
        """
        # Normalize text for better matching
        text_lower = text.lower()

        # 1. Check for language-specific keywords.
        # Matching is by substring, so a keyword also matches inside longer
        # words. Keywords are de-duplicated (case-insensitively) first so a
        # repeated entry cannot count twice.
        keyword_scores = {}
        for lang, keywords in self.LANGUAGE_KEYWORDS.items():
            unique_keywords = {keyword.lower() for keyword in keywords}
            hits = sum(1 for keyword in unique_keywords if keyword in text_lower)
            keyword_scores[lang] = hits / len(unique_keywords) if unique_keywords else 0

        # 2. Apply character n-gram analysis (common in language detection)
        # Extract representative sample of text (first 1000 chars) and keep
        # only letters and whitespace.
        sample = ''.join(
            ch for ch in text_lower[:1000] if ch.isalpha() or ch.isspace()
        )

        # Count character trigrams
        trigrams = self._get_trigrams(sample)

        # Compare with language profiles (using approximate language profiles)
        trigram_scores = {
            'en': self._trigram_similarity(trigrams, self._get_english_profile()),
            'fr': self._trigram_similarity(trigrams, self._get_french_profile()),
            'nl': self._trigram_similarity(trigrams, self._get_dutch_profile())
        }

        # 3. Combine scores with weighting:
        # 60% keyword matches, 40% trigram similarity
        final_scores = {
            lang: 0.6 * keyword_scores.get(lang, 0) + 0.4 * trigram_scores.get(lang, 0)
            for lang in self.SUPPORTED_LANGUAGES
        }

        # Get the language with highest score
        best_lang = max(final_scores, key=final_scores.get)
        return best_lang, final_scores[best_lang]

    def _llm_detection(self, text):
        """
        Use LLM to detect document language.

        Args:
            text: Document text; only the first 1000 characters are sent.

        Returns:
            The language code parsed from the LLM response, or the configured
            default language when the response cannot be parsed.
        """
        # Prepare sample text (use beginning of document)
        sample = text[:1000].strip()

        # The fallback code named in the prompt follows the configured
        # default so the LLM and this class agree on what "unsure" maps to.
        prompt = (
            "Determine the language of the following invoice text.\n"
            "Focus only on the text language, not the content.\n"
            "The text could be in English, French, or Dutch.\n"
            "Text sample:\n"
            f"{sample}\n"
            'Respond with only one of these exact language codes: '
            '"en" (English), "fr" (French), or "nl" (Dutch).\n'
            "If you can't determine with confidence, or if it's a different "
            f'language, respond with "{self.default_language}" as the default.\n'
        )

        # Call LLM for language detection
        response = self.llm_client.generate(prompt)

        # Parse the response; fall back to the default when nothing usable
        # comes back.
        lang = self._parse_lang_from_llm_response(response)
        return lang or self.default_language

    def _parse_lang_from_llm_response(self, response):
        """
        Parse the language code from the LLM response.

        Args:
            response: Text returned by the LLM.

        Returns:
            'en', 'fr' or 'nl', or None when the response is empty or does
            not identify exactly one supported language.
        """
        if not response:
            return None

        # Clean the response
        clean_response = response.strip().lower()

        # Direct match for a language code. The prompt shows the codes in
        # double quotes, so tolerate answers wrapped in quotes or followed by
        # punctuation (e.g. '"fr"' or 'nl.').
        bare = clean_response.strip(' \t\r\n"\'`.,;:!?()[]{}')
        if bare in self.SUPPORTED_LANGUAGES:
            return bare

        # Look for language mentions; accept only an unambiguous answer.
        name_hints = {
            'en': ('english',),
            'fr': ('french', 'français'),
            'nl': ('dutch', 'nederlands'),
        }
        mentioned = [
            lang for lang, names in name_hints.items()
            if any(name in clean_response for name in names)
        ]
        if len(mentioned) == 1:
            return mentioned[0]

        # Last resort: a single supported code appearing as a standalone word.
        words = ''.join(ch if ch.isalpha() else ' ' for ch in clean_response).split()
        codes = {word for word in words if word in self.SUPPORTED_LANGUAGES}
        if len(codes) == 1:
            return codes.pop()

        # No clear language found
        logger.warning("Could not parse language from LLM response: %s", response)
        return None

    def _get_trigrams(self, text):
        """
        Extract character trigrams from text.

        Every whitespace character (space, newline, tab, ...) is mapped to
        '_' so that word-boundary trigrams such as 'de_' are produced
        regardless of how words are separated.

        Args:
            text: Text to split into overlapping 3-character windows.

        Returns:
            Counter mapping each trigram to its number of occurrences.
        """
        normalized = ''.join('_' if ch.isspace() else ch for ch in text)
        return Counter(normalized[i:i + 3] for i in range(len(normalized) - 2))

    def _trigram_similarity(self, trigrams, profile):
        """
        Calculate similarity between trigram counts and a language profile.

        The score is the Jaccard similarity between the text's 300 most
        common trigrams and the trigrams in the profile. Only the profile's
        keys are used; its weights do not influence the score.

        Args:
            trigrams: Counter of trigrams from the text.
            profile: Dict whose keys are the trigrams typical of a language.

        Returns:
            Float in [0.0, 1.0]; 0.0 when either input is empty.
        """
        if not trigrams or not profile:
            return 0.0

        # Get the top trigrams from the text
        text_trigrams = {trigram for trigram, _ in trigrams.most_common(300)}
        profile_trigrams = set(profile)

        # Calculate jaccard similarity with the profile
        union = text_trigrams | profile_trigrams
        if not union:
            return 0
        return len(text_trigrams & profile_trigrams) / len(union)

    # The following are simplified language profiles containing common character trigrams
    # In a production system, these would be more comprehensive and trained on large corpora

    def _get_english_profile(self):
        """Return common English character trigrams."""
        return {
            'the': 1.0, 'and': 0.9, 'ing': 0.85, 'ion': 0.82, 'ent': 0.78,
            'her': 0.76, 'for': 0.75, 'hat': 0.74, 'tha': 0.73, 'ere': 0.72,
            'ate': 0.7, 'his': 0.69, 'con': 0.68, 'res': 0.67, 'ver': 0.66,
            'all': 0.65, 'ons': 0.64, 'nce': 0.63, 'men': 0.62, 'ith': 0.61
        }

    def _get_french_profile(self):
        """Return common French character trigrams."""
        return {
            'les': 1.0, 'ent': 0.95, 'que': 0.92, 'des': 0.9, 'our': 0.88,
            'con': 0.87, 'est': 0.86, 'ant': 0.84, 'men': 0.82, 'ion': 0.81,
            'lle': 0.79, 'ait': 0.78, 'ans': 0.77, 'par': 0.76, 'tre': 0.75,
            'une': 0.74, 'tio': 0.73, 'sur': 0.72, 'res': 0.71, 'ous': 0.7
        }

    def _get_dutch_profile(self):
        """Return common Dutch character trigrams ('_' marks a word boundary)."""
        return {
            'een': 1.0, 'de_': 0.95, 'van': 0.92, 'en_': 0.9, 'ing': 0.88,
            'der': 0.87, 'het': 0.85, 'aar': 0.84, '_be': 0.82, 'ver': 0.8,
            'aan': 0.79, 'oor': 0.78, 'nde': 0.77, 'zij': 0.76, 'dat': 0.75,
            'gen': 0.74, 'ijd': 0.73, 'ten': 0.72, 'erk': 0.71, 'nie': 0.7
        }
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| bases | - | - | |
Parameter Details
config: Optional dictionary containing configuration settings. Supported keys include: 'llm' (dict for LLMClient configuration), 'default_language' (string, defaults to 'en'), 'language_confidence_threshold' (float, defaults to 0.6, determines when to use LLM fallback), and 'language_keywords' (dict mapping language codes to additional keyword lists to extend the built-in keyword sets).
Return Value
The constructor returns a LanguageDetector instance. The main detect() method returns a string language code ('en', 'fr', or 'nl') representing the detected language. Internal methods return tuples (language_code, confidence_score) for rule-based detection, or language codes for LLM detection.
Class Interface
Methods
__init__(self, config=None)
Purpose: Initialize the LanguageDetector with optional configuration settings
Parameters:
config: Optional dictionary with keys: 'llm' (LLMClient config), 'default_language' (str), 'language_confidence_threshold' (float), 'language_keywords' (dict)
Returns: None (constructor)
detect(self, document) -> str
Purpose: Detect the language of a document using rule-based and optionally LLM-based methods
Parameters:
document: Dictionary containing document data, must have a 'text' key with the document text as a string
Returns: String language code: 'en' (English), 'fr' (French), or 'nl' (Dutch)
_rule_based_detection(self, text) -> tuple
Purpose: Apply rule-based language detection using keyword matching and character trigram analysis
Parameters:
text: String containing the document text to analyze
Returns: Tuple of (language_code: str, confidence_score: float) where confidence is between 0.0 and 1.0
_llm_detection(self, text) -> str
Purpose: Use LLM to detect document language when rule-based detection has low confidence
Parameters:
text: String containing the document text to analyze (first 1000 characters are used)
Returns: String language code ('en', 'fr', or 'nl') or default_language if detection fails
_parse_lang_from_llm_response(self, response) -> str or None
Purpose: Parse the language code from the LLM's text response
Parameters:
response: String response from the LLM containing language identification
Returns: String language code ('en', 'fr', or 'nl') or None if parsing fails
_get_trigrams(self, text) -> Counter
Purpose: Extract character trigrams from text for language fingerprinting
Parameters:
text: String text to extract trigrams from (spaces are replaced with underscores)
Returns: Counter object mapping trigram strings to their occurrence counts
_trigram_similarity(self, trigrams, profile) -> float
Purpose: Calculate Jaccard similarity between document trigrams and a language profile
Parameters:
trigrams: Counter object of trigrams from the document
profile: Dictionary representing a language profile with trigram frequencies
Returns: Float similarity score between 0.0 and 1.0
_get_english_profile(self) -> dict
Purpose: Return common English character trigrams with frequency weights
Returns: Dictionary mapping English trigrams to normalized frequency scores
_get_french_profile(self) -> dict
Purpose: Return common French character trigrams with frequency weights
Returns: Dictionary mapping French trigrams to normalized frequency scores
_get_dutch_profile(self) -> dict
Purpose: Return common Dutch character trigrams with frequency weights
Returns: Dictionary mapping Dutch trigrams to normalized frequency scores
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| SUPPORTED_LANGUAGES | dict | Class variable mapping language codes ('en', 'fr', 'nl') to full language names | class |
| LANGUAGE_KEYWORDS | dict | Class variable containing lists of invoice-related keywords for each supported language, used for keyword-based detection | class |
| config | dict | Instance configuration dictionary passed during initialization | instance |
| llm_client | LLMClient | Instance of LLMClient used for LLM-based language detection fallback | instance |
| default_language | str | Default language code to return when detection fails or text is unavailable (defaults to 'en') | instance |
| confidence_threshold | float | Minimum confidence score (0.0-1.0) required for rule-based detection before falling back to LLM (defaults to 0.6) | instance |
Dependencies
logging, re, collections, string, pathlib, utils.llm_client
Required Imports
import logging
import re
from collections import Counter
import string
from pathlib import Path
from utils.llm_client import LLMClient
Usage Example
# Basic usage
detector = LanguageDetector()
document = {'text': 'Invoice Number: 12345\nTotal: $100.00'}
language = detector.detect(document)
print(f"Detected language: {language}") # Output: en
# With custom configuration
config = {
'default_language': 'fr',
'language_confidence_threshold': 0.7,
'llm': {'model': 'gpt-4', 'temperature': 0.0},
'language_keywords': {
'en': ['vat', 'gst'],
'fr': ['tva', 'hors taxes']
}
}
detector = LanguageDetector(config)
document = {'text': 'Facture numéro: 12345\nTotal: 100,00 €'}
language = detector.detect(document)
print(f"Detected language: {language}") # Output: fr
Best Practices
- Always provide a document dict with a 'text' key containing the document text to analyze
- Set an appropriate confidence_threshold based on your accuracy requirements (higher values fall back to the LLM more often; lower values accept the rule-based result more often)
- The rule-based detection is faster but may be less accurate for short texts or mixed-language documents
- LLM detection is used as a fallback and requires proper LLMClient configuration
- The class is stateless after initialization, so a single instance can be reused for multiple documents
- Custom keywords can be added via config to improve detection for domain-specific terminology
- The detector only supports English, French, and Dutch; other languages will default to the configured default_language
- For best results, ensure the document text contains at least a few sentences of content
- The trigram analysis uses the first 1000 characters of text for efficiency
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
-
class BEExtractor 64.4% similar
-
class EntityClassifier 58.4% similar
-
class AUExtractor 56.2% similar
-
class InvoiceProcessor 55.1% similar
-
class TestBEExtractor 52.1% similar