class UKExtractor
UK-specific invoice data extractor.
/tf/active/vicechatdev/invoice_extraction/extractors/uk_extractor.py
12 - 684
moderate
Purpose
UK-specific invoice data extractor.
Source Code
class UKExtractor(BaseExtractor):
"""UK-specific invoice data extractor."""
def __init__(self, config=None):
    """Set up the extractor and load the UK-specific settings.

    Args:
        config: Optional configuration passed straight to the parent
            initialiser; the ``llm`` and ``uk_extractor`` sections of the
            resulting ``self.config`` are read here.
    """
    super().__init__(config)

    # The parent may already have built a client; only create one if it did not.
    if self.llm_client is None:
        llm_settings = self.config.get('llm', {})
        self.llm_client = LLMClient(llm_settings)

    # Everything UK-specific lives under the 'uk_extractor' config section.
    uk_settings = self.config.get('uk_extractor', {})
    self.uk_config = uk_settings

    # VAT rate options used when matching a computed rate.
    self.vat_rates = uk_settings.get('vat_rates', [0, 5, 20])

    # Currency assumed when none is extracted.
    self.default_currency = uk_settings.get('default_currency', 'GBP')

    # Date formats tried by the date parser, day-first variants first.
    fallback_formats = [
        '%d/%m/%Y', '%d-%m-%Y', '%d.%m.%Y',  # UK formats (day first)
        '%Y-%m-%d', '%d %b %Y', '%d %B %Y',  # ISO and written month formats
    ]
    self.date_formats = uk_settings.get('date_formats', fallback_formats)
def _get_full_document_text(self, document: Dict[str, Any]) -> str:
    """
    Collect the text of a processed document, whichever layout it uses.

    Sources are tried in this order and the first one that yields text wins:
    1. ``document['text']``
    2. the ``text`` of every page, joined with a blank line
    3. the ``text`` of every top-level block, joined with a space
    4. the ``text`` of every block nested in a page, joined with a space

    Missing, ``None`` or empty texts are skipped in every source, so a block
    carrying ``'text': None`` no longer breaks the join.

    Args:
        document: The document object

    Returns:
        Combined text, or an empty string when no source holds any text
    """
    if not document:
        return ""

    def usable(value: Any) -> bool:
        # Only non-empty strings can be joined safely.
        return isinstance(value, str) and bool(value)

    direct_text = document.get('text')
    if usable(direct_text):
        return direct_text

    pages = [page for page in (document.get('pages') or []) if isinstance(page, dict)]

    page_text = "\n\n".join(page['text'] for page in pages if usable(page.get('text')))
    if page_text:
        return page_text

    block_text = " ".join(
        block['text']
        for block in (document.get('blocks') or [])
        if isinstance(block, dict) and usable(block.get('text'))
    )
    if block_text:
        return block_text

    # Last resort: blocks nested inside the pages.
    return " ".join(
        block['text']
        for page in pages
        for block in (page.get('blocks') or [])
        if isinstance(block, dict) and usable(block.get('text'))
    )
def extract(self, document: Dict[str, Any], language: str) -> Dict[str, Any]:
    """
    Run the staged extraction and stamp the result as a UK invoice.

    Args:
        document: Processed document from DocumentProcessor
        language: Detected language of the document

    Returns:
        Dict containing extracted invoice fields
    """
    logger.info("Extracting data with UK-specific LLM extractor")

    # The staged pipeline itself is defined on the base class.
    outcome = self.extract_staged(document, language)

    # Tag the result with the UK entity and country.
    metadata = outcome['metadata']
    metadata['entity'] = 'UK'
    metadata['country'] = 'United Kingdom'

    # Post-check the fields that matter most for UK invoices (mutates in place).
    self._verify_uk_specific_fields(outcome)
    return outcome
def extract_structure(self, document: Dict[str, Any]) -> Dict[str, Any]:
    """Return the LLM-derived layout, or the parent's basic structure if the LLM gave none."""
    logger.info("Extracting document structure with LLM")

    llm_layout = self._extract_structure_with_llm(document)
    if llm_layout:
        return llm_layout

    # An empty result means the LLM route failed; use the base-class structure.
    return super().extract_structure(document)
def extract_invoice_metadata(self, document: Dict[str, Any], structure: Dict[str, Any]) -> Dict[str, Any]:
    """Return invoice number, dates and references extracted by the LLM.

    ``structure`` is accepted for interface compatibility and is not used;
    the whole document text is handed to the LLM instead.
    """
    logger.info("Extracting UK invoice metadata with LLM")
    document_text = self._get_full_document_text(document)
    return self._extract_invoice_metadata_with_llm(document_text)
def extract_vendor_data(self, document: Dict[str, Any], structure: Dict[str, Any]) -> Dict[str, Any]:
    """Extract vendor information from UK invoices using LLM.

    The VAT number is returned as a stripped string carrying a ``GB`` prefix.
    Numeric values (a JSON number from the LLM) are converted to text first,
    and surrounding whitespace is removed before the prefix check so that
    " GB123" is not turned into "GB GB123".

    Args:
        document: Processed document
        structure: Document structure (unused; the full text is sent to the LLM)

    Returns:
        Dict of vendor fields as produced by the LLM step
    """
    logger.info("Extracting UK vendor data with LLM")

    full_text = self._get_full_document_text(document)
    result = self._extract_vendor_data_with_llm(full_text)
    if not isinstance(result, dict):
        # The LLM step is best-effort; never hand a non-dict to callers.
        result = {}

    vat_number = result.get('vat_number')
    if vat_number:
        vat_number = str(vat_number).strip()
        if vat_number and not vat_number.upper().startswith('GB'):
            vat_number = f"GB{vat_number}"
        result['vat_number'] = vat_number

    return result
def extract_amounts(self, document: Dict[str, Any], structure: Dict[str, Any]) -> Dict[str, Any]:
    """Return the LLM-extracted amounts, using the default currency if none was found.

    ``structure`` is accepted for interface compatibility and is not used.
    """
    logger.info("Extracting UK amount data with LLM")

    document_text = self._get_full_document_text(document)
    amounts = self._extract_amounts_with_llm(document_text)

    # A missing or empty currency is replaced by the configured default.
    amounts['currency'] = amounts.get('currency') or self.default_currency
    return amounts
def extract_tax_data(
    self,
    document: Dict[str, Any],
    structure: Dict[str, Any],
    amount_data: Dict[str, Any],
) -> Dict[str, Any]:
    """Return VAT amount and rate extracted by the LLM.

    The already-extracted ``amount_data`` is passed along as extra context;
    ``structure`` is accepted for interface compatibility and is not used.
    """
    logger.info("Extracting UK VAT data with LLM")
    document_text = self._get_full_document_text(document)
    return self._extract_tax_data_with_llm(document_text, amount_data)
def extract_line_items(self, document: Dict[str, Any], structure: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Return the invoice line items extracted by the LLM.

    Tables found on any page are handed to the LLM together with the full
    text; ``structure`` is accepted for interface compatibility and is not used.
    """
    logger.info("Extracting UK line items with LLM")

    document_text = self._get_full_document_text(document)

    # Flatten the tables of every page that has some.
    detected_tables = [
        table
        for page in document.get('pages', [])
        if page.get('tables')
        for table in page['tables']
    ]

    return self._extract_line_items_with_llm(document_text, detected_tables)
def extract_payment_data(self, document: Dict[str, Any], structure: Dict[str, Any]) -> Dict[str, Any]:
    """Extract payment information from UK invoices using LLM.

    A sort code that contains exactly six digits is rewritten as ``XX-XX-XX``,
    whatever separators (spaces, dots, dashes, none) or type (string or JSON
    number) the LLM returned. Any other value is left untouched.

    Args:
        document: Processed document
        structure: Document structure (unused; the full text is sent to the LLM)

    Returns:
        Dict of payment fields as produced by the LLM step
    """
    logger.info("Extracting UK payment data with LLM")

    full_text = self._get_full_document_text(document)
    result = self._extract_payment_data_with_llm(full_text)
    if not isinstance(result, dict):
        # The LLM step is best-effort; never hand a non-dict to callers.
        result = {}

    sort_code = result.get('sort_code')
    if sort_code:
        # str() first: a JSON number would otherwise break the string handling.
        digits = re.sub(r'[^0-9]', '', str(sort_code))
        if len(digits) == 6:
            result['sort_code'] = f"{digits[0:2]}-{digits[2:4]}-{digits[4:6]}"

    return result
def _verify_uk_specific_fields(self, extraction_result: Dict[str, Any]) -> None:
    """Verify and fix UK-specific fields in place.

    - The vendor VAT number is stripped and given a ``GB`` prefix when it
      lacks one (non-string values are converted to text first).
    - Confidence for the critical fields is raised to at least 0.9 when the
      field was found (score > 0). Scores already above 0.9 are kept, so this
      step never lowers a confidence.

    Args:
        extraction_result: Extraction result to adjust; modified in place
    """
    vendor = extraction_result.get('vendor')
    if isinstance(vendor, dict) and vendor.get('vat_number'):
        vat = str(vendor['vat_number']).strip()
        if vat and not vat.upper().startswith('GB'):
            vat = f"GB{vat}"
        vendor['vat_number'] = vat

    confidence = extraction_result.get('confidence')
    if isinstance(confidence, dict):
        for field in ('invoice_number', 'vendor_name', 'vendor_vat_number', 'amounts_total'):
            score = confidence.get(field)
            # Only boost; a score of 0 means "not found" and stays 0.
            if isinstance(score, (int, float)) and 0 < score < 0.9:
                confidence[field] = 0.9
def _extract_structure_with_llm(self, document: Dict[str, Any]) -> Dict[str, Any]:
    """Use LLM to identify document structure regions.

    The LLM is asked for percentage bounding boxes of the header, line-item,
    totals and payment regions; these are scaled to the size of the first page
    (1000 x 1000 when the page carries no ``width``/``height``).

    Args:
        document: Processed document; its text and first page are used

    Returns:
        Mapping of region name to its data with ``bbox`` in page units, plus
        ``is_structured: True``. An empty dict is returned when the document
        has no text, no pages, or the LLM answer yields no usable region, so
        that the caller can fall back to another strategy.
    """
    text = self._get_full_document_text(document)
    if not text or not text.strip():
        # Nothing for the model to analyse.
        return {}

    # Only the beginning of the document is sent to keep the prompt small.
    sample_text = text[:4000]

    # Optimized prompt for document structure analysis
    prompt = f"""You are an expert system analyzing UK invoices. Analyze the layout of this invoice text and identify the bounding coordinates for these key regions:
1. header: Contains company details, invoice number, dates
2. line_items: Contains the table of items/services with quantities and prices
3. totals: Contains subtotal, VAT, and total amounts
4. payment_info: Contains bank details and payment information
For each section, provide the approximate position as percentage values (x0, y0, x1, y1) where:
- x0, y0 is the top-left corner (0,0 being the top-left of document)
- x1, y1 is the bottom-right corner (100,100 being the bottom-right)
Be precise in your analysis - look for structural clues like section headings, table formats, and spacing.
Invoice text:
{sample_text}
Return ONLY a valid JSON object with the following structure, and do not precede your output with any other text, curly bracket should be the first character of your output:
{{
"header": {{
"bbox": [x0, y0, x1, y1]
}},
"line_items": {{
"bbox": [x0, y0, x1, y1]
}},
"totals": {{
"bbox": [x0, y0, x1, y1]
}},
"payment_info": {{
"bbox": [x0, y0, x1, y1]
}}
}}
"""
    response = self.llm_client.generate(prompt)
    # Diagnostics go through the logger, not stdout: the prompt embeds invoice text.
    logger.debug("LLM structure prompt: %s", prompt)
    logger.debug("LLM structure response: %s", response)

    structure: Dict[str, Any] = {}
    try:
        llm_structure = json.loads(response.replace('```json', '').replace('```', '').replace('\n', ''))
        if not isinstance(llm_structure, dict):
            raise ValueError("expected a JSON object")

        # Percentages can only be converted when a page is available.
        pages = document.get('pages')
        first_page = pages[0] if pages else None
        if first_page:
            width = first_page.get('width', 1000)
            height = first_page.get('height', 1000)

            for section, section_data in llm_structure.items():
                if not isinstance(section_data, dict):
                    continue
                bbox = section_data.get('bbox')
                if bbox:
                    well_formed = (
                        isinstance(bbox, (list, tuple))
                        and len(bbox) == 4
                        and all(isinstance(v, (int, float)) for v in bbox)
                    )
                    if not well_formed:
                        # Keeping a malformed box would mix units; drop the region.
                        logger.warning(f"Ignoring region '{section}' with malformed bbox: {bbox!r}")
                        continue
                    x0, y0, x1, y1 = bbox
                    section_data['bbox'] = [
                        width * x0 / 100,
                        height * y0 / 100,
                        width * x1 / 100,
                        height * y1 / 100,
                    ]
                structure[section] = section_data

            # Only claim a structured result when at least one region was kept.
            if structure:
                structure['is_structured'] = True
    except Exception as e:
        logger.warning(f"Failed to parse LLM structure response: {e}")

    return structure
def _extract_invoice_metadata_with_llm(self, text: str) -> Dict[str, Any]:
    """Use LLM to extract invoice metadata with optimized prompt.

    Args:
        text: Invoice text; only the first 6000 characters are sent

    Returns:
        Dict that always contains ``number``, ``issue_date``, ``due_date``,
        ``po_number`` and ``reference``. Fields the LLM omitted or returned as
        null are empty strings. Dates the LLM left in a UK format are
        converted to YYYY-MM-DD when ``_parse_uk_date`` recognises them. All
        fields are empty strings when the answer is not a JSON object.
    """
    defaults = {
        'number': '',
        'issue_date': '',
        'due_date': '',
        'po_number': '',
        'reference': ''
    }

    # Limit text size for prompt
    text = (text or '')[:6000]

    # Optimized prompt for invoice metadata extraction
    prompt = f"""You are an expert system extracting data from UK invoices. Extract the following invoice metadata precisely:
1. invoice_number: The invoice number or reference (alphanumeric identifier)
2. issue_date: The date the invoice was issued, convert to YYYY-MM-DD format
3. due_date: The date payment is due, convert to YYYY-MM-DD format
4. po_number: The purchase order number referenced (if any)
5. reference: Any additional reference number or code
Pay attention to typical UK invoice layouts. Look for clear labels like "Invoice #", "Invoice Date", "Due Date", etc.
For dates, convert any format (DD/MM/YYYY, DD-MM-YYYY, etc.) to YYYY-MM-DD consistently.
Invoice text:
{text}
Return ONLY a valid JSON object with exactly these fields, and do not precede your output with any other text, curly bracket should be the first character of your output:
{{
"number": "extracted invoice number",
"issue_date": "YYYY-MM-DD",
"due_date": "YYYY-MM-DD",
"po_number": "extracted PO number or empty string if not found",
"reference": "any reference number or empty string if not found"
}}
"""
    response = self.llm_client.generate(prompt)

    try:
        parsed = json.loads(response.replace('```json', '').replace('```', '').replace('\n', ''))
        if not isinstance(parsed, dict):
            raise ValueError("expected a JSON object")
    except Exception as e:
        logger.warning(f"Failed to parse LLM invoice metadata: {e}")
        return defaults

    result = dict(parsed)

    # The prompt's field list says "invoice_number" while the template says
    # "number"; accept either spelling from the model.
    if not result.get('number') and result.get('invoice_number'):
        result['number'] = result['invoice_number']

    # Guarantee every documented key, using '' for missing/null values.
    for key, empty in defaults.items():
        if result.get(key) is None:
            result[key] = empty

    # The model does not always honour the requested date format.
    for key in ('issue_date', 'due_date'):
        value = result[key]
        if isinstance(value, str) and value.strip():
            normalised = self._parse_uk_date(value)
            if normalised:
                result[key] = normalised

    return result
def _extract_vendor_data_with_llm(self, text: str) -> Dict[str, Any]:
    """Use LLM to extract vendor information with optimized prompt.

    Args:
        text: Invoice text; only the first 6000 characters are sent

    Returns:
        Dict that always contains ``name``, ``vat_number``, ``address``,
        ``company_number`` and ``contact``. Fields the LLM omitted or returned
        as null are empty strings; all fields are empty strings when the
        answer is not a JSON object.
    """
    defaults = {
        'name': '',
        'vat_number': '',
        'address': '',
        'company_number': '',
        'contact': ''
    }

    # Limit text size for prompt
    text = (text or '')[:6000]

    # Optimized prompt for vendor data extraction
    prompt = f"""You are an expert system extracting data from UK invoices. Extract the following vendor information precisely:
1. name: The legal name of the vendor/supplier company
2. vat_number: The UK VAT registration number (should start with GB followed by 9 digits, often in format GB 123 4567 89)
3. address: The complete postal address of the vendor including postcode
4. company_number: The UK company registration number (usually 8 digits, often labeled as "Company No" or "Registered No")
5. contact: Email, phone number or website for the vendor
Pay attention to:
- VAT numbers typically appear with labels like "VAT Reg No", "VAT Number" or "VAT"
- Company numbers typically appear near registration statements or after "Registered in England"
- The company name is typically at the top of the invoice or near logo
- UK postcodes have formats like "AB12 3CD" and are at the end of addresses
Invoice text:
{text}
Return ONLY a valid JSON object with exactly these fields, and do not precede your output with any other text, curly bracket should be the first character of your output:
{{
"name": "full vendor company name",
"vat_number": "full VAT number with GB prefix",
"address": "complete vendor address on one line",
"company_number": "company registration number",
"contact": "contact information"
}}
"""
    response = self.llm_client.generate(prompt)

    try:
        parsed = json.loads(response.replace('```json', '').replace('```', '').replace('\n', ''))
        if not isinstance(parsed, dict):
            # Callers use dict methods on the result; a list/str is a failure.
            raise ValueError("expected a JSON object")
    except Exception as e:
        logger.warning(f"Failed to parse LLM vendor data: {e}")
        return defaults

    result = dict(parsed)
    # Guarantee every documented key, using '' for missing/null values.
    for key, empty in defaults.items():
        if result.get(key) is None:
            result[key] = empty
    return result
def _extract_amounts_with_llm(self, text: str) -> Dict[str, Any]:
    """Use LLM to extract amount information with optimized prompt.

    Args:
        text: Invoice text; only the last 6000 characters are sent because
            totals usually sit at the end of the document

    Returns:
        Dict with ``subtotal`` and ``total`` as floats (``None`` when missing
        or not a recognisable number) and ``currency`` (the default currency
        when the LLM gave none). Each amount is converted on its own, so one
        unreadable value no longer discards the rest of the answer.
    """
    def to_number(value: Any) -> Any:
        # Convert an LLM-provided amount to float; None when it is not a number.
        if value is None or isinstance(value, bool):
            return None
        if isinstance(value, (int, float)):
            return float(value)
        if not isinstance(value, str):
            return None
        cleaned = re.sub(r'[£$€\s]', '', value)
        # Drop commas only when they are clearly thousands separators, so a
        # continental "1.234,56" is rejected rather than silently misread.
        if re.fullmatch(r'-?\d{1,3}(,\d{3})+(\.\d+)?', cleaned):
            cleaned = cleaned.replace(',', '')
        try:
            return float(cleaned)
        except ValueError:
            return None

    # Limit text size for prompt
    text = text or ''
    if len(text) > 6000:
        # For amounts, the end of the document is more relevant
        text = text[-6000:]

    # Optimized prompt for amounts extraction
    prompt = f"""You are an expert system extracting data from UK invoices. Extract the following financial information precisely:
1. subtotal: The amount before VAT/tax (also called net amount, goods/services total, or amount excluding VAT)
2. total: The total amount due/payable (also called gross amount, balance due, amount including VAT, or total due)
3. currency: The currency code (GBP, USD, EUR, etc.)
Important guidelines:
- Convert all amounts to decimal numbers (e.g., 1234.56, not £1,234.56)
- Look for symbols (£, $, €) to determine currency
- Look for explicit currency labels like "Currency: GBP" or "All amounts in GBP"
- The total is typically the largest amount and appears near the bottom
- Amounts often appear in a summary section with clear labels
- In UK, GBP is the default currency
Invoice text:
{text}
Return ONLY a valid JSON object with exactly these fields, and do not precede your output with any other text, curly bracket should be the first character of your output:
{{
"subtotal": numeric_value_without_currency_symbol,
"total": numeric_value_without_currency_symbol,
"currency": "three_letter_currency_code"
}}
"""
    response = self.llm_client.generate(prompt)

    try:
        parsed = json.loads(response.replace('```json', '').replace('```', '').replace('\n', ''))
        if not isinstance(parsed, dict):
            raise ValueError("expected a JSON object")
    except Exception as e:
        logger.warning(f"Failed to parse LLM amount data: {e}")
        return {
            'subtotal': None,
            'total': None,
            'currency': self.default_currency
        }

    result = dict(parsed)
    for field in ('subtotal', 'total'):
        result[field] = to_number(result.get(field))
    if not result.get('currency'):
        result['currency'] = self.default_currency
    return result
def _extract_tax_data_with_llm(self, text: str, amount_data: Dict[str, Any]) -> Dict[str, Any]:
    """Use LLM to extract VAT/tax information with optimized prompt.

    Args:
        text: Invoice text; only the last 6000 characters are sent
        amount_data: Previously extracted amounts (``subtotal``, ``total``,
            ``currency``); used as prompt context and to fill gaps

    Returns:
        Dict with ``vat`` and ``vat_rate`` (``None`` when unknown).
        - Each value is converted to float on its own, so one unreadable
          value does not discard the other.
        - A missing ``vat`` is computed as ``total - subtotal`` when both
          amounts are available, even if the LLM answer could not be parsed.
        - A missing ``vat_rate`` is set to the nearest configured rate when
          ``vat / subtotal`` lies within one percentage point of it.
    """
    def to_number(value: Any) -> Any:
        # Convert an LLM-provided number to float; None when it is not a number.
        if value is None or isinstance(value, bool):
            return None
        if isinstance(value, (int, float)):
            return float(value)
        if not isinstance(value, str):
            return None
        cleaned = re.sub(r'[£$€%\s]', '', value)
        if re.fullmatch(r'-?\d{1,3}(,\d{3})+(\.\d+)?', cleaned):
            cleaned = cleaned.replace(',', '')
        try:
            return float(cleaned)
        except ValueError:
            return None

    amount_data = amount_data or {}

    # Limit text size for prompt
    text = text or ''
    if len(text) > 6000:
        # For tax data, the end of the document is more relevant
        text = text[-6000:]

    # Add context about already extracted amounts
    amount_context = ""
    if amount_data.get('subtotal') is not None and amount_data.get('total') is not None:
        amount_context = f"\nAdditional context: Subtotal = {amount_data['subtotal']}, Total = {amount_data['total']}, Currency = {amount_data.get('currency', 'GBP')}"

    # Optimized prompt for VAT extraction
    prompt = f"""You are an expert system extracting data from UK invoices. Extract the following VAT/tax information precisely:
1. vat: The VAT amount (the tax amount added to the subtotal)
2. vat_rate: The VAT rate applied as a percentage (standard UK rates are 0%, 5%, or 20%)
Important guidelines:
- Convert amounts to decimal numbers without currency symbols
- VAT amounts are typically labeled as "VAT", "V.A.T.", or "Tax"
- The VAT rate is often shown as a percentage (e.g., "VAT @ 20%")
- VAT can be calculated as (Total - Subtotal) if not explicitly stated
- Common UK VAT rates are 0% (zero-rated), 5% (reduced), and 20% (standard)
- VAT amounts typically appear near the subtotal and total{amount_context}
Invoice text:
{text}
Return ONLY a valid JSON object with exactly these fields, and do not precede your output with any other text, curly bracket should be the first character of your output:
{{
"vat": numeric_vat_amount,
"vat_rate": numeric_vat_percentage_rate
}}
"""
    response = self.llm_client.generate(prompt)

    result: Dict[str, Any] = {'vat': None, 'vat_rate': None}
    try:
        parsed = json.loads(response.replace('```json', '').replace('```', '').replace('\n', ''))
        if not isinstance(parsed, dict):
            raise ValueError("expected a JSON object")
        result.update(parsed)
    except Exception as e:
        # Keep going: the amounts may still allow the VAT to be calculated.
        logger.warning(f"Failed to parse LLM tax data: {e}")

    for field in ('vat', 'vat_rate'):
        result[field] = to_number(result.get(field))

    subtotal = to_number(amount_data.get('subtotal'))
    total = to_number(amount_data.get('total'))

    # If VAT is not extracted but we have subtotal and total, calculate it
    if result['vat'] is None and subtotal and total:
        result['vat'] = round(total - subtotal, 2)

    # Derive the rate from the VAT amount and match it to a standard rate
    if (
        result['vat_rate'] is None
        and result['vat'] is not None
        and subtotal is not None
        and subtotal > 0
        and self.vat_rates
    ):
        rate = (result['vat'] / subtotal) * 100
        nearest_rate = min(self.vat_rates, key=lambda x: abs(x - rate))
        if abs(nearest_rate - rate) < 1.0:  # Within 1% of a standard rate
            result['vat_rate'] = nearest_rate

    return result
def _extract_line_items_with_llm(self, text: str, tables: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Use LLM to extract line items with optimized prompt.

    Args:
        text: Invoice text. At most 6000 characters are sent; for longer
            documents a window centred on the middle of the text is used.
        tables: Detected tables; the text of their ``cells`` is added to the
            prompt as extra context

    Returns:
        List of line-item dicts whose ``quantity``, ``unit_price``, ``amount``
        and ``vat_rate`` are floats or ``None``. Entries that are not JSON
        objects are dropped. An empty list is returned when the LLM answer is
        not a JSON array.
    """
    max_chars = 6000

    # Prepare table information
    table_descriptions = []
    for index, table in enumerate(tables or [], start=1):
        # 'or' guards: a cell without text must not break the join.
        cells_text = " | ".join(str(cell.get('text') or '') for cell in (table.get('cells') or []))
        table_descriptions.append(f"Table {index}: {cells_text}")
    table_context = ""
    if table_descriptions:
        table_context = "\n\nDetected tables:\n" + "\n".join(table_descriptions)

    # Limit text size for prompt
    text = text or ''
    if len(text) > max_chars:
        # For line items, the middle of the document is more relevant.
        # Taking a fixed-size centred window keeps the prompt bounded.
        start = (len(text) - max_chars) // 2
        text = text[start:start + max_chars]

    # Optimized prompt for line items extraction
    prompt = f"""You are an expert system extracting data from UK invoices. Extract the line items (products or services) from this invoice.
Pay close attention to the structure of the line items section, which typically appears as a table or list with columns such as:
- Description or Item/Service
- Quantity or Qty
- Unit Price or Rate
- VAT Rate or Tax
- Amount or Total
Important guidelines:
- Each line item represents a separate product or service being invoiced
- Extract ALL line items, even if there are many
- Convert all numeric values to plain numbers without currency symbols
- The "description" should include the full product/service name
- If the VAT/tax rate is specified per line item, include it as a percentage
- If any field is missing, use null for numeric fields or empty string for text fields{table_context}
Invoice text:
{text}
Return ONLY a valid JSON array of line items with exactly this structure, and do not precede your output with any other text, square bracket should be the first character of your output:
[
{{
"description": "full item description",
"quantity": numeric_quantity,
"unit_price": numeric_unit_price,
"vat_rate": numeric_vat_rate_percentage,
"amount": numeric_line_total
}},
...
]
If you cannot identify any line items, return an empty array [].
"""
    response = self.llm_client.generate(prompt)

    try:
        parsed = json.loads(response.replace('```json', '').replace('```', '').replace('\n', ''))
        if not isinstance(parsed, list):
            raise ValueError("expected a JSON array")
    except Exception as e:
        logger.warning(f"Failed to parse LLM line items: {e}")
        return []

    line_items = []
    for item in parsed:
        if not isinstance(item, dict):
            continue
        # Ensure numeric values; an unreadable number becomes None.
        for field in ('quantity', 'unit_price', 'amount', 'vat_rate'):
            if item.get(field) is not None:
                try:
                    item[field] = float(item[field])
                except (TypeError, ValueError):
                    item[field] = None
        line_items.append(item)
    return line_items
def _extract_payment_data_with_llm(self, text: str) -> Dict[str, Any]:
    """Use LLM to extract payment information with optimized prompt.

    Args:
        text: Invoice text; only the last 6000 characters are sent because
            bank details usually sit at the end of the document

    Returns:
        Dict that always contains ``bank_name``, ``account_number``,
        ``sort_code``, ``iban`` and ``payment_terms``. Fields the LLM omitted
        or returned as null are empty strings; all fields are empty strings
        when the answer is not a JSON object.
    """
    defaults = {
        'bank_name': '',
        'account_number': '',
        'sort_code': '',
        'iban': '',
        'payment_terms': ''
    }

    # Limit text size for prompt
    text = text or ''
    if len(text) > 6000:
        # For payment data, the end of the document is more relevant
        text = text[-6000:]

    # Optimized prompt for payment data extraction
    prompt = f"""You are an expert system extracting data from UK invoices. Extract the following payment information precisely:
1. bank_name: The name of the bank holding the account
2. account_number: The UK bank account number (typically 8 digits)
3. sort_code: The UK bank sort code (format: XX-XX-XX or XXXXXX)
4. iban: The International Bank Account Number (if present)
5. payment_terms: Payment terms (e.g., "30 days", "Net 15", "Payment due on receipt")
Important guidelines:
- Look for a dedicated "Payment Details" or "Banking Details" section
- Bank account details typically appear at the bottom of the invoice
- Sort codes are typically 6 digits, often formatted as XX-XX-XX
- Account numbers are typically 8 digits
- Payment terms often indicate the timeframe for payment (e.g., "30 days from invoice date")
- IBAN numbers for UK typically start with GB followed by 2 digits and then 18+ characters
Invoice text:
{text}
Return ONLY a valid JSON object with exactly these fields, and do not precede your output with any other text, curly bracket should be the first character of your output:
{{
"bank_name": "name of bank",
"account_number": "account number digits only",
"sort_code": "sort code digits with or without dashes",
"iban": "full IBAN if present",
"payment_terms": "payment terms text"
}}
"""
    response = self.llm_client.generate(prompt)

    try:
        parsed = json.loads(response.replace('```json', '').replace('```', '').replace('\n', ''))
        if not isinstance(parsed, dict):
            # Callers use dict methods on the result; a list/str is a failure.
            raise ValueError("expected a JSON object")
    except Exception as e:
        logger.warning(f"Failed to parse LLM payment data: {e}")
        return defaults

    result = dict(parsed)
    # Guarantee every documented key, using '' for missing/null values.
    for key, empty in defaults.items():
        if result.get(key) is None:
            result[key] = empty
    return result
def _parse_uk_date(self, date_str: str) -> Optional[str]:
    """Parse a date string in various UK formats and return ISO format.

    The configured ``self.date_formats`` are tried first on the stripped
    input. If none match, the string is relaxed (ordinal suffixes such as
    "1st"/"2ND" removed regardless of case, the word "of" and commas dropped,
    whitespace collapsed) and the configured formats plus a few written-month
    layouts are tried again.

    Args:
        date_str: Date text such as "31/01/2023" or "1st January, 2023"

    Returns:
        The date as ``YYYY-MM-DD``, or ``None`` when the input is empty, not
        a string, or matches no known format
    """
    if not date_str or not isinstance(date_str, str):
        return None

    def first_match(value: str, formats) -> Optional[str]:
        # Return the ISO date for the first format that parses ``value``.
        for fmt in formats:
            try:
                return datetime.datetime.strptime(value, fmt).strftime('%Y-%m-%d')
            except ValueError:
                continue
        return None

    cleaned = date_str.strip()
    if not cleaned:
        return None

    iso_date = first_match(cleaned, self.date_formats)
    if iso_date:
        return iso_date

    # Handle "1st January 2023", "2ND MARCH 2023", "3rd of April, 2023" ...
    relaxed = re.sub(r'(\d+)(st|nd|rd|th)\b', r'\1', cleaned, flags=re.IGNORECASE)
    relaxed = re.sub(r'\bof\b', ' ', relaxed, flags=re.IGNORECASE)
    relaxed = ' '.join(relaxed.replace(',', ' ').split())

    written_month_formats = ['%d %B %Y', '%d %b %Y', '%B %d %Y', '%b %d %Y']
    return first_match(relaxed, list(self.date_formats) + written_month_formats)
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| bases | BaseExtractor | - | - |
Parameter Details
bases: Parameter of type BaseExtractor
Return Value
Instantiating the class returns a `UKExtractor` instance.
Class Interface
Methods
__init__(self, config)
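Purpose: Initialise the extractor: create the LLM client when the parent class has not set one, then load the UK settings (VAT rates, default currency, accepted date formats) from the `uk_extractor` section of the config.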
Parameters:
config: Parameter
Returns: None
_get_full_document_text(self, document) -> str
Purpose: Extract the full text content from the document, handling different document structures. Args: document: The document object Returns: Combined text from all pages
Parameters:
document: Type: Dict[str, Any]
Returns: Returns str
extract(self, document, language) -> Dict[str, Any]
Purpose: Extract invoice data from the document using UK-specific logic. Args: document: Processed document from DocumentProcessor language: Detected language of the document Returns: Dict containing extracted invoice fields
Parameters:
document: Type: Dict[str, Any]; language: Type: str
Returns: Returns Dict[str, Any]
extract_structure(self, document) -> Dict[str, Any]
Purpose: Extract document structure with UK invoice layout awareness using LLM.
Parameters:
document: Type: Dict[str, Any]
Returns: Returns Dict[str, Any]
extract_invoice_metadata(self, document, structure) -> Dict[str, Any]
Purpose: Extract UK invoice metadata (invoice number, dates, references) using LLM.
Parameters:
document: Type: Dict[str, Any]; structure: Type: Dict[str, Any]
Returns: Returns Dict[str, Any]
extract_vendor_data(self, document, structure) -> Dict[str, Any]
Purpose: Extract vendor information from UK invoices using LLM.
Parameters:
document: Type: Dict[str, Any]; structure: Type: Dict[str, Any]
Returns: Returns Dict[str, Any]
extract_amounts(self, document, structure) -> Dict[str, Any]
Purpose: Extract amount information from UK invoices using LLM.
Parameters:
document: Type: Dict[str, Any]; structure: Type: Dict[str, Any]
Returns: Returns Dict[str, Any]
extract_tax_data(self, document, structure, amount_data) -> Dict[str, Any]
Purpose: Extract VAT information from UK invoices using LLM.
Parameters:
document: Type: Dict[str, Any]; structure: Type: Dict[str, Any]; amount_data: Type: Dict[str, Any]
Returns: Returns Dict[str, Any]
extract_line_items(self, document, structure) -> List[Dict[str, Any]]
Purpose: Extract line items from UK invoices using LLM.
Parameters:
document: Type: Dict[str, Any]; structure: Type: Dict[str, Any]
Returns: Returns List[Dict[str, Any]]
extract_payment_data(self, document, structure) -> Dict[str, Any]
Purpose: Extract payment information from UK invoices using LLM.
Parameters:
document: Type: Dict[str, Any]; structure: Type: Dict[str, Any]
Returns: Returns Dict[str, Any]
_verify_uk_specific_fields(self, extraction_result) -> None
Purpose: Verify and fix UK-specific fields.
Parameters:
extraction_result: Type: Dict[str, Any]
Returns: Returns None
_extract_structure_with_llm(self, document) -> Dict[str, Any]
Purpose: Use LLM to identify document structure regions.
Parameters:
document: Type: Dict[str, Any]
Returns: Returns Dict[str, Any]
_extract_invoice_metadata_with_llm(self, text) -> Dict[str, Any]
Purpose: Use LLM to extract invoice metadata with optimized prompt.
Parameters:
text: Type: str
Returns: Returns Dict[str, Any]
_extract_vendor_data_with_llm(self, text) -> Dict[str, Any]
Purpose: Use LLM to extract vendor information with optimized prompt.
Parameters:
text: Type: str
Returns: Returns Dict[str, Any]
_extract_amounts_with_llm(self, text) -> Dict[str, Any]
Purpose: Use LLM to extract amount information with optimized prompt.
Parameters:
text: Type: str
Returns: Returns Dict[str, Any]
_extract_tax_data_with_llm(self, text, amount_data) -> Dict[str, Any]
Purpose: Use LLM to extract VAT/tax information with optimized prompt.
Parameters:
text: Type: str; amount_data: Type: Dict[str, Any]
Returns: Returns Dict[str, Any]
_extract_line_items_with_llm(self, text, tables) -> List[Dict[str, Any]]
Purpose: Use LLM to extract line items with optimized prompt.
Parameters:
text: Type: str; tables: Type: List[Dict[str, Any]]
Returns: Returns List[Dict[str, Any]]
_extract_payment_data_with_llm(self, text) -> Dict[str, Any]
Purpose: Use LLM to extract payment information with optimized prompt.
Parameters:
text: Type: str
Returns: Returns Dict[str, Any]
_parse_uk_date(self, date_str) -> Optional[str]
Purpose: Parse a date string in various UK formats and return ISO format.
Parameters:
date_str: Type: str
Returns: Returns Optional[str]
Required Imports
import re
import logging
import datetime
from typing import Dict
from typing import List
Usage Example
# Example usage:
# extractor = UKExtractor(config)
# result = extractor.extract(document, language)
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- class TestUKExtractor — 75.7% similar
- class BaseExtractor — 74.5% similar
- class UKValidator — 70.0% similar
- class AUExtractor — 65.3% similar
- class TestUKValidator — 63.7% similar