UKExtractor - Code Extractor

class UKExtractor

Maturity: 27

UK-specific invoice data extractor.

File:
/tf/active/vicechatdev/invoice_extraction/extractors/uk_extractor.py

Lines:
12 - 684

Complexity:
moderate

Purpose

UK-specific invoice data extractor.

Source Code

class UKExtractor(BaseExtractor):
    """UK-specific invoice data extractor."""
    
    def __init__(self, config=None):
        """
        Initialise the extractor and load its UK-specific settings.

        Args:
            config: Optional configuration forwarded to the base class constructor
        """
        super().__init__(config)

        # The parent may already have created a client; build one only when it has not
        if self.llm_client is None:
            llm_settings = self.config.get('llm', {})
            self.llm_client = LLMClient(llm_settings)

        # Everything below is read from the 'uk_extractor' section of the config
        self.uk_config = self.config.get('uk_extractor', {})
        uk_settings = self.uk_config

        # VAT rate options
        self.vat_rates = uk_settings.get('vat_rates', [0, 5, 20])

        # Currency applied when extraction does not find one
        self.default_currency = uk_settings.get('default_currency', 'GBP')

        # Day-first formats, followed by ISO and written-month formats
        default_formats = [
            '%d/%m/%Y', '%d-%m-%Y', '%d.%m.%Y',
            '%Y-%m-%d', '%d %b %Y', '%d %B %Y',
        ]
        self.date_formats = uk_settings.get('date_formats', default_formats)

    def _get_full_document_text(self, document: Dict[str, Any]) -> str:
        """
        Extract the full text content from the document, handling different document structures.
        
        Args:
            document: The document object
            
        Returns:
            Combined text from all pages
        """
        full_text = ""
        
        # If text is directly in the document
        if document.get('text'):
            full_text = document['text']
        # If text is embedded in pages
        elif document.get('pages'):
            page_texts = []
            for page in document['pages']:
                if page.get('text'):
                    page_texts.append(page['text'])
            full_text = "\n\n".join(page_texts)
        
        # If still no text, try extracting from blocks
        if not full_text and document.get('blocks'):
            block_texts = [block.get('text', '') for block in document['blocks']]
            full_text = " ".join(block_texts)
        
        # Finally check for blocks in pages
        if not full_text and document.get('pages'):
            block_texts = []
            for page in document['pages']:
                if page.get('blocks'):
                    for block in page['blocks']:
                        if block.get('text'):
                            block_texts.append(block['text'])
            full_text = " ".join(block_texts)
        
        return full_text

    def extract(self, document: Dict[str, Any], language: str) -> Dict[str, Any]:
        """
        Run the staged extraction and tag the result as a UK invoice.

        Args:
            document: Processed document from DocumentProcessor
            language: Detected language of the document

        Returns:
            Dict containing extracted invoice fields
        """
        logger.info("Extracting data with UK-specific LLM extractor")

        # The staged pipeline itself lives in the base class
        result = self.extract_staged(document, language)

        # Stamp the UK identity onto the result metadata
        metadata = result['metadata']
        metadata['entity'] = 'UK'
        metadata['country'] = 'United Kingdom'

        # Post-process the fields that matter most for UK invoices
        self._verify_uk_specific_fields(result)
        return result
    
    def extract_structure(self, document: Dict[str, Any]) -> Dict[str, Any]:
        """Identify the document regions with the LLM, falling back to the base class."""
        logger.info("Extracting document structure with LLM")

        llm_structure = self._extract_structure_with_llm(document)
        if llm_structure:
            return llm_structure

        # The LLM produced nothing usable: use the parent implementation instead
        return super().extract_structure(document)
    
    def extract_invoice_metadata(self, document: Dict[str, Any], structure: Dict[str, Any]) -> Dict[str, Any]:
        """Extract invoice number, dates and references from the whole document text via the LLM."""
        logger.info("Extracting UK invoice metadata with LLM")

        # The whole document text is used as context; `structure` is not consulted
        document_text = self._get_full_document_text(document)
        return self._extract_invoice_metadata_with_llm(document_text)
    
    def extract_vendor_data(self, document: Dict[str, Any], structure: Dict[str, Any]) -> Dict[str, Any]:
        """
        Extract vendor information from UK invoices using LLM.

        Args:
            document: The document object
            structure: Document structure (not consulted; the full text is used)

        Returns:
            Vendor fields as returned by the LLM, with a 'GB' prefix added to
            the VAT number when it is present and lacks one
        """
        logger.info("Extracting UK vendor data with LLM")

        # Get full document text for context
        full_text = self._get_full_document_text(document)

        # Use LLM extraction
        result = self._extract_vendor_data_with_llm(full_text)

        # The value comes from LLM-produced JSON, so it may be a number rather
        # than a string and may carry stray whitespace; normalise before
        # checking the prefix so neither case crashes or ends up after "GB"
        raw_vat = result.get('vat_number')
        if raw_vat:
            vat = str(raw_vat).strip()
            if vat and not vat.upper().startswith('GB'):
                result['vat_number'] = f"GB{vat}"

        return result
    
    def extract_amounts(self, document: Dict[str, Any], structure: Dict[str, Any]) -> Dict[str, Any]:
        """Extract subtotal, total and currency via the LLM, defaulting the currency when absent."""
        logger.info("Extracting UK amount data with LLM")

        # The whole document text is used as context; `structure` is not consulted
        document_text = self._get_full_document_text(document)
        amounts = self._extract_amounts_with_llm(document_text)

        # Fall back to the configured default when no currency came back
        currency = amounts.get('currency')
        if not currency:
            amounts['currency'] = self.default_currency

        return amounts
    
    def extract_tax_data(self, document: Dict[str, Any], structure: Dict[str, Any], 
                         amount_data: Dict[str, Any]) -> Dict[str, Any]:
        """Extract the VAT amount and rate via the LLM, passing the extracted amounts as context."""
        logger.info("Extracting UK VAT data with LLM")

        # The whole document text is used as context; `structure` is not consulted
        document_text = self._get_full_document_text(document)
        return self._extract_tax_data_with_llm(document_text, amount_data)
    
    def extract_line_items(self, document: Dict[str, Any], structure: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Extract line items via the LLM from the document text and any page tables."""
        logger.info("Extracting UK line items with LLM")

        document_text = self._get_full_document_text(document)

        # Flatten the tables of every page that has some into a single list
        page_tables = [
            table
            for page in document.get('pages', [])
            if page.get('tables')
            for table in page['tables']
        ]

        return self._extract_line_items_with_llm(document_text, page_tables)
    
    def extract_payment_data(self, document: Dict[str, Any], structure: Dict[str, Any]) -> Dict[str, Any]:
        """
        Extract payment information from UK invoices using LLM.

        Args:
            document: The document object
            structure: Document structure (not consulted; the full text is used)

        Returns:
            Payment fields as returned by the LLM; a six-character sort code
            without dashes is reformatted as XX-XX-XX
        """
        logger.info("Extracting UK payment data with LLM")

        # Get full document text for context
        full_text = self._get_full_document_text(document)

        # Use LLM extraction
        result = self._extract_payment_data_with_llm(full_text)

        # The value comes from LLM-produced JSON and may be a number, so it is
        # converted to text before the membership test (which would otherwise
        # raise TypeError on a non-string)
        raw_sort_code = result.get('sort_code')
        if raw_sort_code:
            sort_code = str(raw_sort_code)
            if '-' not in sort_code:
                compact = sort_code.replace(' ', '')
                # Anything that is not exactly six characters is left untouched
                if len(compact) == 6:
                    result['sort_code'] = f"{compact[0:2]}-{compact[2:4]}-{compact[4:6]}"

        return result
    
    def _verify_uk_specific_fields(self, extraction_result: Dict[str, Any]) -> None:
        """Verify and fix UK-specific fields."""
        # Ensure VAT number has GB prefix
        if 'vendor' in extraction_result and extraction_result['vendor'].get('vat_number'):
            vat = extraction_result['vendor']['vat_number']
            if vat and not vat.upper().startswith('GB'):
                extraction_result['vendor']['vat_number'] = f"GB{vat}"
        
        # Set confidence to high for critical fields if present
        if 'confidence' in extraction_result:
            for field in ['invoice_number', 'vendor_name', 'vendor_vat_number', 'amounts_total']:
                if field in extraction_result['confidence'] and extraction_result['confidence'][field] > 0:
                    extraction_result['confidence'][field] = 0.9
    
    def _extract_structure_with_llm(self, document: Dict[str, Any]) -> Dict[str, Any]:
        """Use LLM to identify document structure regions."""
        # Get document text
        text = self._get_full_document_text(document)
        if len(text) > 4000:
            sample_text = text[:4000]
        else:
            sample_text = text
            
        # Optimized prompt for document structure analysis
        prompt = f"""You are an expert system analyzing UK invoices. Analyze the layout of this invoice text and identify the bounding coordinates for these key regions:

1. header: Contains company details, invoice number, dates
2. line_items: Contains the table of items/services with quantities and prices
3. totals: Contains subtotal, VAT, and total amounts
4. payment_info: Contains bank details and payment information

For each section, provide the approximate position as percentage values (x0, y0, x1, y1) where:
- x0, y0 is the top-left corner (0,0 being the top-left of document)
- x1, y1 is the bottom-right corner (100,100 being the bottom-right)

Be precise in your analysis - look for structural clues like section headings, table formats, and spacing.

Invoice text:
{sample_text}

Return ONLY a valid JSON object with the following structure, and do not precede your output with any other text, curly bracket should be the first character of your output:
{{
"header": {{
    "bbox": [x0, y0, x1, y1]
}},
"line_items": {{
    "bbox": [x0, y0, x1, y1]
}},
"totals": {{
    "bbox": [x0, y0, x1, y1]
}},
"payment_info": {{
    "bbox": [x0, y0, x1, y1]
}}
}}
"""

        # Call LLM
        response = self.llm_client.generate(prompt)
        print("prompt", prompt)
        print("response", response)
        
        # Parse the response
        structure = {}
        try:
            llm_structure = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
            
            # Convert percentage values to actual coordinates
            first_page = document['pages'][0] if document.get('pages') else None
            if first_page:
                width = first_page.get('width', 1000)
                height = first_page.get('height', 1000)
                
                for section, section_data in llm_structure.items():
                    if section_data.get('bbox'):
                        x0, y0, x1, y1 = section_data['bbox']
                        section_data['bbox'] = [
                            width * x0 / 100,
                            height * y0 / 100,
                            width * x1 / 100,
                            height * y1 / 100
                        ]
                        structure[section] = section_data
                
                # Mark as structured
                structure['is_structured'] = True
        except Exception as e:
            logger.warning(f"Failed to parse LLM structure response: {e}")
        
        return structure

    def _extract_invoice_metadata_with_llm(self, text: str) -> Dict[str, Any]:
        """Use LLM to extract invoice metadata with optimized prompt."""
        # Limit text size for prompt
        if len(text) > 6000:
            text = text[:6000]
            
        # Optimized prompt for invoice metadata extraction
        prompt = f"""You are an expert system extracting data from UK invoices. Extract the following invoice metadata precisely:

1. invoice_number: The invoice number or reference (alphanumeric identifier)
2. issue_date: The date the invoice was issued, convert to YYYY-MM-DD format
3. due_date: The date payment is due, convert to YYYY-MM-DD format
4. po_number: The purchase order number referenced (if any)
5. reference: Any additional reference number or code

Pay attention to typical UK invoice layouts. Look for clear labels like "Invoice #", "Invoice Date", "Due Date", etc. 
For dates, convert any format (DD/MM/YYYY, DD-MM-YYYY, etc.) to YYYY-MM-DD consistently.

Invoice text:
{text}

Return ONLY a valid JSON object with exactly these fields, and do not precede your output with any other text, curly bracket should be the first character of your output:
{{
"number": "extracted invoice number",
"issue_date": "YYYY-MM-DD",
"due_date": "YYYY-MM-DD",
"po_number": "extracted PO number or empty string if not found",
"reference": "any reference number or empty string if not found"
}}
"""

        # Call LLM
        response = self.llm_client.generate(prompt)
        
        # Parse response
        try:
            result = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
            return result
        except Exception as e:
            logger.warning(f"Failed to parse LLM invoice metadata: {e}")
            return {
                'number': '',
                'issue_date': '',
                'due_date': '',
                'po_number': '',
                'reference': ''
            }

    def _extract_vendor_data_with_llm(self, text: str) -> Dict[str, Any]:
        """Use LLM to extract vendor information with optimized prompt."""
        # Limit text size for prompt
        if len(text) > 6000:
            text = text[:6000]
            
        # Optimized prompt for vendor data extraction
        prompt = f"""You are an expert system extracting data from UK invoices. Extract the following vendor information precisely:

1. name: The legal name of the vendor/supplier company
2. vat_number: The UK VAT registration number (should start with GB followed by 9 digits, often in format GB 123 4567 89)
3. address: The complete postal address of the vendor including postcode
4. company_number: The UK company registration number (usually 8 digits, often labeled as "Company No" or "Registered No")
5. contact: Email, phone number or website for the vendor

Pay attention to:
- VAT numbers typically appear with labels like "VAT Reg No", "VAT Number" or "VAT"
- Company numbers typically appear near registration statements or after "Registered in England"
- The company name is typically at the top of the invoice or near logo
- UK postcodes have formats like "AB12 3CD" and are at the end of addresses

Invoice text:
{text}

Return ONLY a valid JSON object with exactly these fields, and do not precede your output with any other text, curly bracket should be the first character of your output:
{{
"name": "full vendor company name",
"vat_number": "full VAT number with GB prefix",
"address": "complete vendor address on one line",
"company_number": "company registration number",
"contact": "contact information"
}}
"""

        # Call LLM
        response = self.llm_client.generate(prompt)
        #print("prompt", prompt)
        #print("response", response) 
        
        # Parse response
        try:
            result = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
            return result
        except Exception as e:
            logger.warning(f"Failed to parse LLM vendor data: {e}")
            return {
                'name': '',
                'vat_number': '',
                'address': '',
                'company_number': '',
                'contact': ''
            }

    def _extract_amounts_with_llm(self, text: str) -> Dict[str, Any]:
        """Use LLM to extract amount information with optimized prompt."""
        # Limit text size for prompt
        if len(text) > 6000:
            # For amounts, the end of the document is more relevant
            text = text[-6000:]
            
        # Optimized prompt for amounts extraction
        prompt = f"""You are an expert system extracting data from UK invoices. Extract the following financial information precisely:

1. subtotal: The amount before VAT/tax (also called net amount, goods/services total, or amount excluding VAT)
2. total: The total amount due/payable (also called gross amount, balance due, amount including VAT, or total due)
3. currency: The currency code (GBP, USD, EUR, etc.)

Important guidelines:
- Convert all amounts to decimal numbers (e.g., 1234.56, not £1,234.56)
- Look for symbols (£, $, €) to determine currency
- Look for explicit currency labels like "Currency: GBP" or "All amounts in GBP"
- The total is typically the largest amount and appears near the bottom
- Amounts often appear in a summary section with clear labels
- In UK, GBP is the default currency

Invoice text:
{text}

Return ONLY a valid JSON object with exactly these fields, and do not precede your output with any other text, curly bracket should be the first character of your output:
{{
"subtotal": numeric_value_without_currency_symbol,
"total": numeric_value_without_currency_symbol,
"currency": "three_letter_currency_code"
}}
"""

        # Call LLM
        response = self.llm_client.generate(prompt)
        
        # Parse response
        try:
            result = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
            
            # Ensure numeric values
            for field in ['subtotal', 'total']:
                if field in result and result[field] is not None:
                    result[field] = float(result[field])
            
            return result
        except Exception as e:
            logger.warning(f"Failed to parse LLM amount data: {e}")
            return {
                'subtotal': None,
                'total': None,
                'currency': self.default_currency
            }

    def _extract_tax_data_with_llm(self, text: str, amount_data: Dict[str, Any]) -> Dict[str, Any]:
        """Use LLM to extract VAT/tax information with optimized prompt."""
        # Limit text size for prompt
        if len(text) > 6000:
            # For tax data, the end of the document is more relevant
            text = text[-6000:]
            
        # Add context about already extracted amounts
        amount_context = ""
        if amount_data.get('subtotal') is not None and amount_data.get('total') is not None:
            amount_context = f"\nAdditional context: Subtotal = {amount_data['subtotal']}, Total = {amount_data['total']}, Currency = {amount_data.get('currency', 'GBP')}"
            
        # Optimized prompt for VAT extraction
        prompt = f"""You are an expert system extracting data from UK invoices. Extract the following VAT/tax information precisely:

1. vat: The VAT amount (the tax amount added to the subtotal)
2. vat_rate: The VAT rate applied as a percentage (standard UK rates are 0%, 5%, or 20%)

Important guidelines:
- Convert amounts to decimal numbers without currency symbols
- VAT amounts are typically labeled as "VAT", "V.A.T.", or "Tax"
- The VAT rate is often shown as a percentage (e.g., "VAT @ 20%")
- VAT can be calculated as (Total - Subtotal) if not explicitly stated
- Common UK VAT rates are 0% (zero-rated), 5% (reduced), and 20% (standard)
- VAT amounts typically appear near the subtotal and total{amount_context}

Invoice text:
{text}

Return ONLY a valid JSON object with exactly these fields, and do not precede your output with any other text, curly bracket should be the first character of your output:
{{
"vat": numeric_vat_amount,
"vat_rate": numeric_vat_percentage_rate
}}
"""

        # Call LLM
        response = self.llm_client.generate(prompt)
        
        # Parse response
        try:
            result = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
            
            # Ensure numeric values
            for field in ['vat', 'vat_rate']:
                if field in result and result[field] is not None:
                    result[field] = float(result[field])
            
            # If VAT is not extracted but we have subtotal and total, calculate it
            if result.get('vat') is None and amount_data.get('subtotal') and amount_data.get('total'):
                result['vat'] = round(amount_data['total'] - amount_data['subtotal'], 2)
                
                # Try to determine rate from the calculated VAT
                if result.get('vat_rate') is None and amount_data.get('subtotal') > 0:
                    rate = (result['vat'] / amount_data['subtotal']) * 100
                    # Match to nearest standard rate
                    nearest_rate = min(self.vat_rates, key=lambda x: abs(x - rate))
                    if abs(nearest_rate - rate) < 1.0:  # Within 1% of a standard rate
                        result['vat_rate'] = nearest_rate
            
            return result
        except Exception as e:
            logger.warning(f"Failed to parse LLM tax data: {e}")
            return {
                'vat': None,
                'vat_rate': None
            }

    def _extract_line_items_with_llm(self, text: str, tables: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Use LLM to extract line items with optimized prompt."""
        # Prepare table information
        table_descriptions = []
        if tables:
            for i, table in enumerate(tables):
                cells_text = " | ".join(cell.get('text', '') for cell in table.get('cells', []))
                table_descriptions.append(f"Table {i+1}: {cells_text}")
            
        table_context = ""
        if table_descriptions:
            table_context = "\n\nDetected tables:\n" + "\n".join(table_descriptions)
        
        # Limit text size for prompt
        if len(text) > 6000:
            # For line items, the middle of the document is more relevant
            middle_start = len(text) // 4
            middle_end = 3 * len(text) // 4
            text = text[middle_start:middle_end]
            
        # Optimized prompt for line items extraction
        prompt = f"""You are an expert system extracting data from UK invoices. Extract the line items (products or services) from this invoice.

Pay close attention to the structure of the line items section, which typically appears as a table or list with columns such as:
- Description or Item/Service
- Quantity or Qty
- Unit Price or Rate
- VAT Rate or Tax
- Amount or Total

Important guidelines:
- Each line item represents a separate product or service being invoiced
- Extract ALL line items, even if there are many
- Convert all numeric values to plain numbers without currency symbols
- The "description" should include the full product/service name
- If the VAT/tax rate is specified per line item, include it as a percentage
- If any field is missing, use null for numeric fields or empty string for text fields{table_context}

Invoice text:
{text}

Return ONLY a valid JSON array of line items with exactly this structure, and do not precede your output with any other text, curly bracket should be the first character of your output:
[
    {{
        "description": "full item description",
        "quantity": numeric_quantity,
        "unit_price": numeric_unit_price,
        "vat_rate": numeric_vat_rate_percentage,
        "amount": numeric_line_total
    }},
    ...
]

If you cannot identify any line items, return an empty array [].
"""

        # Call LLM
        response = self.llm_client.generate(prompt)
        
        # Parse response
        try:
            result = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
            
            # Ensure numeric values in all line items
            for item in result:
                for field in ['quantity', 'unit_price', 'amount', 'vat_rate']:
                    if field in item and item[field] is not None:
                        try:
                            item[field] = float(item[field])
                        except:
                            item[field] = None
            
            return result
        except Exception as e:
            logger.warning(f"Failed to parse LLM line items: {e}")
            return []

    def _extract_payment_data_with_llm(self, text: str) -> Dict[str, Any]:
        """Use LLM to extract payment information with optimized prompt."""
        # Limit text size for prompt
        if len(text) > 6000:
            # For payment data, the end of the document is more relevant
            text = text[-6000:]
            
        # Optimized prompt for payment data extraction
        prompt = f"""You are an expert system extracting data from UK invoices. Extract the following payment information precisely:

1. bank_name: The name of the bank holding the account
2. account_number: The UK bank account number (typically 8 digits)
3. sort_code: The UK bank sort code (format: XX-XX-XX or XXXXXX)
4. iban: The International Bank Account Number (if present)
5. payment_terms: Payment terms (e.g., "30 days", "Net 15", "Payment due on receipt")

Important guidelines:
- Look for a dedicated "Payment Details" or "Banking Details" section
- Bank account details typically appear at the bottom of the invoice
- Sort codes are typically 6 digits, often formatted as XX-XX-XX
- Account numbers are typically 8 digits
- Payment terms often indicate the timeframe for payment (e.g., "30 days from invoice date")
- IBAN numbers for UK typically start with GB followed by 2 digits and then 18+ characters

Invoice text:
{text}

Return ONLY a valid JSON object with exactly these fields, and do not precede your output with any other text, curly bracket should be the first character of your output:
{{
"bank_name": "name of bank",
"account_number": "account number digits only",
"sort_code": "sort code digits with or without dashes",
"iban": "full IBAN if present",
"payment_terms": "payment terms text"
}}
"""

        # Call LLM
        response = self.llm_client.generate(prompt)
        
        # Parse response
        try:
            result = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
            return result
        except Exception as e:
            logger.warning(f"Failed to parse LLM payment data: {e}")
            return {
                'bank_name': '',
                'account_number': '',
                'sort_code': '',
                'iban': '',
                'payment_terms': ''
            }
    
    def _parse_uk_date(self, date_str: str) -> Optional[str]:
        """Parse a date string in various UK formats and return ISO format."""
        if not date_str:
            return None
            
        # Clean the date string
        date_str = date_str.strip()
        
        # Try all configured date formats
        for fmt in self.date_formats:
            try:
                date_obj = datetime.datetime.strptime(date_str, fmt)
                return date_obj.strftime('%Y-%m-%d')
            except ValueError:
                continue
                
        # If standard formats fail, try some common variations
        try:
            # Handle "1st January 2023" type formats
            date_str = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)
            
            # Try again with standard formats
            for fmt in ['%d %B %Y', '%d %b %Y', '%B %d %Y', '%b %d %Y']:
                try:
                    date_obj = datetime.datetime.strptime(date_str, fmt)
                    return date_obj.strftime('%Y-%m-%d')
                except ValueError:
                    continue
        except Exception as e:
            logger.warning(f"Error in date parsing: {e}")
                
        # If all parsing attempts fail
        return None

Parameters

Name	Type	Default	Kind
`bases`	BaseExtractor	-

Parameter Details

bases: Parameter of type BaseExtractor

Return Value

Returns unspecified type

Class Interface

Methods

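`__init__(self, config=None)`

Purpose: Constructor. Calls the base-class constructor, creates an LLM client if the base class has not already set one, and loads the UK-specific settings (VAT rates, default currency, date formats) from the `uk_extractor` section of the configuration.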

Parameters:

config: Parameter

Returns: None

`_get_full_document_text(self, document) -> str`

Purpose: Extract the full text content from the document, handling different document structures. Args: document: The document object Returns: Combined text from all pages

Parameters:

document: Type: Dict[str, Any]

Returns: Returns str

`extract(self, document, language) -> Dict[str, Any]`

Purpose: Extract invoice data from the document using UK-specific logic. Args: document: Processed document from DocumentProcessor language: Detected language of the document Returns: Dict containing extracted invoice fields

Parameters:

document: Type: Dict[str, Any]
language: Type: str

Returns: Returns Dict[str, Any]

`extract_structure(self, document) -> Dict[str, Any]`

Purpose: Extract document structure with UK invoice layout awareness using LLM.

Parameters:

document: Type: Dict[str, Any]

Returns: Returns Dict[str, Any]

`extract_invoice_metadata(self, document, structure) -> Dict[str, Any]`

Purpose: Extract UK invoice metadata (invoice number, dates, references) using LLM.

Parameters:

document: Type: Dict[str, Any]
structure: Type: Dict[str, Any]

Returns: Returns Dict[str, Any]

`extract_vendor_data(self, document, structure) -> Dict[str, Any]`

Purpose: Extract vendor information from UK invoices using LLM.

Parameters:

document: Type: Dict[str, Any]
structure: Type: Dict[str, Any]

Returns: Returns Dict[str, Any]

`extract_amounts(self, document, structure) -> Dict[str, Any]`

Purpose: Extract amount information from UK invoices using LLM.

Parameters:

document: Type: Dict[str, Any]
structure: Type: Dict[str, Any]

Returns: Returns Dict[str, Any]

`extract_tax_data(self, document, structure, amount_data) -> Dict[str, Any]`

Purpose: Extract VAT information from UK invoices using LLM.

Parameters:

document: Type: Dict[str, Any]
structure: Type: Dict[str, Any]
amount_data: Type: Dict[str, Any]

Returns: Returns Dict[str, Any]

`extract_line_items(self, document, structure) -> List[Dict[str, Any]]`

Purpose: Extract line items from UK invoices using LLM.

Parameters:

document: Type: Dict[str, Any]
structure: Type: Dict[str, Any]

Returns: Returns List[Dict[str, Any]]

`extract_payment_data(self, document, structure) -> Dict[str, Any]`

Purpose: Extract payment information from UK invoices using LLM.

Parameters:

document: Type: Dict[str, Any]
structure: Type: Dict[str, Any]

Returns: Returns Dict[str, Any]

`_verify_uk_specific_fields(self, extraction_result) -> None`

Purpose: Verify and fix UK-specific fields.

Parameters:

extraction_result: Type: Dict[str, Any]

Returns: Returns None

`_extract_structure_with_llm(self, document) -> Dict[str, Any]`

Purpose: Use LLM to identify document structure regions.

Parameters:

document: Type: Dict[str, Any]

Returns: Returns Dict[str, Any]

`_extract_invoice_metadata_with_llm(self, text) -> Dict[str, Any]`

Purpose: Use LLM to extract invoice metadata with optimized prompt.

Parameters:

text: Type: str

Returns: Returns Dict[str, Any]

`_extract_vendor_data_with_llm(self, text) -> Dict[str, Any]`

Purpose: Use LLM to extract vendor information with optimized prompt.

Parameters:

text: Type: str

Returns: Returns Dict[str, Any]

`_extract_amounts_with_llm(self, text) -> Dict[str, Any]`

Purpose: Use LLM to extract amount information with optimized prompt.

Parameters:

text: Type: str

Returns: Returns Dict[str, Any]

`_extract_tax_data_with_llm(self, text, amount_data) -> Dict[str, Any]`

Purpose: Use LLM to extract VAT/tax information with optimized prompt.

Parameters:

text: Type: str
amount_data: Type: Dict[str, Any]

Returns: Returns Dict[str, Any]

`_extract_line_items_with_llm(self, text, tables) -> List[Dict[str, Any]]`

Purpose: Use LLM to extract line items with optimized prompt.

Parameters:

text: Type: str
tables: Type: List[Dict[str, Any]]

Returns: Returns List[Dict[str, Any]]

`_extract_payment_data_with_llm(self, text) -> Dict[str, Any]`

Purpose: Use LLM to extract payment information with optimized prompt.

Parameters:

text: Type: str

Returns: Returns Dict[str, Any]

`_parse_uk_date(self, date_str) -> Optional[str]`

Purpose: Parse a date string in various UK formats and return ISO format.

Parameters:

date_str: Type: str

Returns: Returns Optional[str]

Required Imports

import re
import logging
import datetime
from typing import Dict
from typing import List

Usage Example

# Example usage:
# extractor = UKExtractor(config)
# result = extractor.extract(document, language)

Similar Components

AI-powered semantic similarity - components with related functionality:

class TestUKExtractor 75.7% similar

Unit test class for testing the UKExtractor class, which extracts structured data from UK invoices including VAT numbers, dates, amounts, and line items.
From: /tf/active/vicechatdev/invoice_extraction/tests/test_extractors.py
class BaseExtractor 74.5% similar

Abstract base class that defines the interface and shared functionality for entity-specific invoice data extractors (UK, BE, AU), providing a multi-stage extraction pipeline for invoice processing.
From: /tf/active/vicechatdev/invoice_extraction/extractors/base_extractor.py
class UKValidator 70.0% similar

UK-specific invoice data validator that extends BaseValidator to implement validation rules specific to UK invoices including VAT number format, UK addresses, VAT rates, and banking details.
From: /tf/active/vicechatdev/invoice_extraction/validators/uk_validator.py
class AUExtractor 65.3% similar

Australia-specific invoice data extractor that uses LLM (Large Language Model) to extract structured invoice data from Australian tax invoices, handling ABN, ACN, GST, BSB numbers and Australian date formats.
From: /tf/active/vicechatdev/invoice_extraction/extractors/au_extractor.py
class TestUKValidator 63.7% similar

Unit test class for validating the UKValidator class functionality, testing UK-specific invoice validation rules including VAT numbers, addresses, banking details, and currency.
From: /tf/active/vicechatdev/invoice_extraction/tests/test_validators.py

← Back to Browse

Assistant

Hi! I can help improve this code. Tell me what you'd like to enhance (e.g., "add error handling", "optimize performance", "improve readability", "add type hints").

Code Comparison

Original Code

                            class UKExtractor(BaseExtractor):
    """UK-specific invoice data extractor."""
    
    def __init__(self, config=None):
        """
        Set up the extractor and read the UK-specific settings from the config.

        Args:
            config: Optional configuration, passed through to the base extractor
        """
        super().__init__(config)

        # Create an LLM client only when the parent left it unset
        if self.llm_client is None:
            llm_settings = self.config.get('llm', {})
            self.llm_client = LLMClient(llm_settings)

        # Everything below is read from the 'uk_extractor' config section
        uk_settings = self.config.get('uk_extractor', {})
        self.uk_config = uk_settings

        # Day-first UK formats, then ISO and written-month formats
        fallback_formats = [
            '%d/%m/%Y', '%d-%m-%Y', '%d.%m.%Y',
            '%Y-%m-%d', '%d %b %Y', '%d %B %Y',
        ]

        self.vat_rates = uk_settings.get('vat_rates', [0, 5, 20])
        self.default_currency = uk_settings.get('default_currency', 'GBP')
        self.date_formats = uk_settings.get('date_formats', fallback_formats)

    def _get_full_document_text(self, document: Dict[str, Any]) -> str:
        """
        Extract the full text content from the document, handling different document structures.
        
        Args:
            document: The document object
            
        Returns:
            Combined text from all pages
        """
        full_text = ""
        
        # If text is directly in the document
        if document.get('text'):
            full_text = document['text']
        # If text is embedded in pages
        elif document.get('pages'):
            page_texts = []
            for page in document['pages']:
                if page.get('text'):
                    page_texts.append(page['text'])
            full_text = "\n\n".join(page_texts)
        
        # If still no text, try extracting from blocks
        if not full_text and document.get('blocks'):
            block_texts = [block.get('text', '') for block in document['blocks']]
            full_text = " ".join(block_texts)
        
        # Finally check for blocks in pages
        if not full_text and document.get('pages'):
            block_texts = []
            for page in document['pages']:
                if page.get('blocks'):
                    for block in page['blocks']:
                        if block.get('text'):
                            block_texts.append(block['text'])
            full_text = " ".join(block_texts)
        
        return full_text

    def extract(self, document: Dict[str, Any], language: str) -> Dict[str, Any]:
        """
        Run the staged extraction and tag the result as a UK invoice.

        Args:
            document: Processed document from DocumentProcessor
            language: Detected language of the document

        Returns:
            Dict containing extracted invoice fields
        """
        logger.info("Extracting data with UK-specific LLM extractor")

        # Staged extraction process defined in the base class
        outcome = self.extract_staged(document, language)

        # Stamp the UK identity onto the result's metadata
        meta = outcome['metadata']
        meta['entity'] = 'UK'
        meta['country'] = 'United Kingdom'

        # Post-process the UK-critical fields in place
        self._verify_uk_specific_fields(outcome)
        return outcome
    
    def extract_structure(self, document: Dict[str, Any]) -> Dict[str, Any]:
        """Identify the invoice layout regions via the LLM, falling back to the parent implementation."""
        logger.info("Extracting document structure with LLM")

        llm_regions = self._extract_structure_with_llm(document)
        if llm_regions:
            return llm_regions

        # Empty LLM result: use the basic structure from the parent class
        return super().extract_structure(document)
    
    def extract_invoice_metadata(self, document: Dict[str, Any], structure: Dict[str, Any]) -> Dict[str, Any]:
        """Return the invoice number, dates and references found by the LLM in the document text."""
        logger.info("Extracting UK invoice metadata with LLM")

        # The whole document text is handed to the LLM; `structure` is not consulted
        return self._extract_invoice_metadata_with_llm(
            self._get_full_document_text(document)
        )
    
    def extract_vendor_data(self, document: Dict[str, Any], structure: Dict[str, Any]) -> Dict[str, Any]:
        """Return the vendor details found by the LLM, with the VAT number carrying a GB prefix."""
        logger.info("Extracting UK vendor data with LLM")

        # The whole document text is handed to the LLM; `structure` is not consulted
        vendor = self._extract_vendor_data_with_llm(
            self._get_full_document_text(document)
        )

        # Prepend "GB" when a VAT number was found without it
        vat_number = vendor.get('vat_number')
        if vat_number:
            if not vat_number.upper().startswith('GB'):
                vendor['vat_number'] = f"GB{vat_number}"

        return vendor
    
    def extract_amounts(self, document: Dict[str, Any], structure: Dict[str, Any]) -> Dict[str, Any]:
        """Return subtotal, total and currency found by the LLM, defaulting the currency when absent."""
        logger.info("Extracting UK amount data with LLM")

        # The whole document text is handed to the LLM; `structure` is not consulted
        amounts = self._extract_amounts_with_llm(
            self._get_full_document_text(document)
        )

        if amounts.get('currency'):
            return amounts

        # No currency reported: fall back to the configured default
        amounts['currency'] = self.default_currency
        return amounts
    
    def extract_tax_data(self, document: Dict[str, Any], structure: Dict[str, Any],
                         amount_data: Dict[str, Any]) -> Dict[str, Any]:
        """Return the VAT amount and rate found by the LLM, using the extracted amounts as context."""
        logger.info("Extracting UK VAT data with LLM")

        # Full document text plus the already-extracted amounts go to the LLM
        document_text = self._get_full_document_text(document)
        return self._extract_tax_data_with_llm(document_text, amount_data)
    
    def extract_line_items(self, document: Dict[str, Any], structure: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Return the invoice line items found by the LLM from the document text and page tables."""
        logger.info("Extracting UK line items with LLM")

        document_text = self._get_full_document_text(document)

        # Flatten the tables of every page that has any into one list
        page_tables = [
            table
            for page in document.get('pages', [])
            if page.get('tables')
            for table in page['tables']
        ]

        return self._extract_line_items_with_llm(document_text, page_tables)
    
    def extract_payment_data(self, document: Dict[str, Any], structure: Dict[str, Any]) -> Dict[str, Any]:
        """Return the bank and payment details found by the LLM, with the sort code dashed as XX-XX-XX."""
        logger.info("Extracting UK payment data with LLM")

        # The whole document text is handed to the LLM; `structure` is not consulted
        payment = self._extract_payment_data_with_llm(
            self._get_full_document_text(document)
        )

        # Nothing to reformat when the sort code is missing or already dashed
        sort_code = payment.get('sort_code')
        if not sort_code or '-' in sort_code:
            return payment

        # Only a six-character code (after removing spaces) is rewritten
        compact = sort_code.replace(' ', '')
        if len(compact) == 6:
            payment['sort_code'] = f"{compact[0:2]}-{compact[2:4]}-{compact[4:6]}"

        return payment
    
    def _verify_uk_specific_fields(self, extraction_result: Dict[str, Any]) -> None:
        """Verify and fix UK-specific fields."""
        # Ensure VAT number has GB prefix
        if 'vendor' in extraction_result and extraction_result['vendor'].get('vat_number'):
            vat = extraction_result['vendor']['vat_number']
            if vat and not vat.upper().startswith('GB'):
                extraction_result['vendor']['vat_number'] = f"GB{vat}"
        
        # Set confidence to high for critical fields if present
        if 'confidence' in extraction_result:
            for field in ['invoice_number', 'vendor_name', 'vendor_vat_number', 'amounts_total']:
                if field in extraction_result['confidence'] and extraction_result['confidence'][field] > 0:
                    extraction_result['confidence'][field] = 0.9
    
    def _extract_structure_with_llm(self, document: Dict[str, Any]) -> Dict[str, Any]:
        """Use LLM to identify document structure regions."""
        # Get document text
        text = self._get_full_document_text(document)
        if len(text) > 4000:
            sample_text = text[:4000]
        else:
            sample_text = text
            
        # Optimized prompt for document structure analysis
        prompt = f"""You are an expert system analyzing UK invoices. Analyze the layout of this invoice text and identify the bounding coordinates for these key regions:

1. header: Contains company details, invoice number, dates
2. line_items: Contains the table of items/services with quantities and prices
3. totals: Contains subtotal, VAT, and total amounts
4. payment_info: Contains bank details and payment information

For each section, provide the approximate position as percentage values (x0, y0, x1, y1) where:
- x0, y0 is the top-left corner (0,0 being the top-left of document)
- x1, y1 is the bottom-right corner (100,100 being the bottom-right)

Be precise in your analysis - look for structural clues like section headings, table formats, and spacing.

Invoice text:
{sample_text}

Return ONLY a valid JSON object with the following structure, and do not precede your output with any other text, curly bracket should be the first character of your output:
{{
"header": {{
    "bbox": [x0, y0, x1, y1]
}},
"line_items": {{
    "bbox": [x0, y0, x1, y1]
}},
"totals": {{
    "bbox": [x0, y0, x1, y1]
}},
"payment_info": {{
    "bbox": [x0, y0, x1, y1]
}}
}}
"""

        # Call LLM
        response = self.llm_client.generate(prompt)
        print("prompt", prompt)
        print("response", response)
        
        # Parse the response
        structure = {}
        try:
            llm_structure = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
            
            # Convert percentage values to actual coordinates
            first_page = document['pages'][0] if document.get('pages') else None
            if first_page:
                width = first_page.get('width', 1000)
                height = first_page.get('height', 1000)
                
                for section, section_data in llm_structure.items():
                    if section_data.get('bbox'):
                        x0, y0, x1, y1 = section_data['bbox']
                        section_data['bbox'] = [
                            width * x0 / 100,
                            height * y0 / 100,
                            width * x1 / 100,
                            height * y1 / 100
                        ]
                        structure[section] = section_data
                
                # Mark as structured
                structure['is_structured'] = True
        except Exception as e:
            logger.warning(f"Failed to parse LLM structure response: {e}")
        
        return structure

    def _extract_invoice_metadata_with_llm(self, text: str) -> Dict[str, Any]:
        """Use LLM to extract invoice metadata with optimized prompt."""
        # Limit text size for prompt
        if len(text) > 6000:
            text = text[:6000]
            
        # Optimized prompt for invoice metadata extraction
        prompt = f"""You are an expert system extracting data from UK invoices. Extract the following invoice metadata precisely:

1. invoice_number: The invoice number or reference (alphanumeric identifier)
2. issue_date: The date the invoice was issued, convert to YYYY-MM-DD format
3. due_date: The date payment is due, convert to YYYY-MM-DD format
4. po_number: The purchase order number referenced (if any)
5. reference: Any additional reference number or code

Pay attention to typical UK invoice layouts. Look for clear labels like "Invoice #", "Invoice Date", "Due Date", etc. 
For dates, convert any format (DD/MM/YYYY, DD-MM-YYYY, etc.) to YYYY-MM-DD consistently.

Invoice text:
{text}

Return ONLY a valid JSON object with exactly these fields, and do not precede your output with any other text, curly bracket should be the first character of your output:
{{
"number": "extracted invoice number",
"issue_date": "YYYY-MM-DD",
"due_date": "YYYY-MM-DD",
"po_number": "extracted PO number or empty string if not found",
"reference": "any reference number or empty string if not found"
}}
"""

        # Call LLM
        response = self.llm_client.generate(prompt)
        
        # Parse response
        try:
            result = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
            return result
        except Exception as e:
            logger.warning(f"Failed to parse LLM invoice metadata: {e}")
            return {
                'number': '',
                'issue_date': '',
                'due_date': '',
                'po_number': '',
                'reference': ''
            }

    def _extract_vendor_data_with_llm(self, text: str) -> Dict[str, Any]:
        """Use LLM to extract vendor information with optimized prompt."""
        # Limit text size for prompt
        if len(text) > 6000:
            text = text[:6000]
            
        # Optimized prompt for vendor data extraction
        prompt = f"""You are an expert system extracting data from UK invoices. Extract the following vendor information precisely:

1. name: The legal name of the vendor/supplier company
2. vat_number: The UK VAT registration number (should start with GB followed by 9 digits, often in format GB 123 4567 89)
3. address: The complete postal address of the vendor including postcode
4. company_number: The UK company registration number (usually 8 digits, often labeled as "Company No" or "Registered No")
5. contact: Email, phone number or website for the vendor

Pay attention to:
- VAT numbers typically appear with labels like "VAT Reg No", "VAT Number" or "VAT"
- Company numbers typically appear near registration statements or after "Registered in England"
- The company name is typically at the top of the invoice or near logo
- UK postcodes have formats like "AB12 3CD" and are at the end of addresses

Invoice text:
{text}

Return ONLY a valid JSON object with exactly these fields, and do not precede your output with any other text, curly bracket should be the first character of your output:
{{
"name": "full vendor company name",
"vat_number": "full VAT number with GB prefix",
"address": "complete vendor address on one line",
"company_number": "company registration number",
"contact": "contact information"
}}
"""

        # Call LLM
        response = self.llm_client.generate(prompt)
        #print("prompt", prompt)
        #print("response", response) 
        
        # Parse response
        try:
            result = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
            return result
        except Exception as e:
            logger.warning(f"Failed to parse LLM vendor data: {e}")
            return {
                'name': '',
                'vat_number': '',
                'address': '',
                'company_number': '',
                'contact': ''
            }

    def _extract_amounts_with_llm(self, text: str) -> Dict[str, Any]:
        """Use LLM to extract amount information with optimized prompt."""
        # Limit text size for prompt
        if len(text) > 6000:
            # For amounts, the end of the document is more relevant
            text = text[-6000:]
            
        # Optimized prompt for amounts extraction
        prompt = f"""You are an expert system extracting data from UK invoices. Extract the following financial information precisely:

1. subtotal: The amount before VAT/tax (also called net amount, goods/services total, or amount excluding VAT)
2. total: The total amount due/payable (also called gross amount, balance due, amount including VAT, or total due)
3. currency: The currency code (GBP, USD, EUR, etc.)

Important guidelines:
- Convert all amounts to decimal numbers (e.g., 1234.56, not £1,234.56)
- Look for symbols (£, $, €) to determine currency
- Look for explicit currency labels like "Currency: GBP" or "All amounts in GBP"
- The total is typically the largest amount and appears near the bottom
- Amounts often appear in a summary section with clear labels
- In UK, GBP is the default currency

Invoice text:
{text}

Return ONLY a valid JSON object with exactly these fields, and do not precede your output with any other text, curly bracket should be the first character of your output:
{{
"subtotal": numeric_value_without_currency_symbol,
"total": numeric_value_without_currency_symbol,
"currency": "three_letter_currency_code"
}}
"""

        # Call LLM
        response = self.llm_client.generate(prompt)
        
        # Parse response
        try:
            result = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
            
            # Ensure numeric values
            for field in ['subtotal', 'total']:
                if field in result and result[field] is not None:
                    result[field] = float(result[field])
            
            return result
        except Exception as e:
            logger.warning(f"Failed to parse LLM amount data: {e}")
            return {
                'subtotal': None,
                'total': None,
                'currency': self.default_currency
            }

    def _extract_tax_data_with_llm(self, text: str, amount_data: Dict[str, Any]) -> Dict[str, Any]:
        """Use LLM to extract VAT/tax information with optimized prompt."""
        # Limit text size for prompt
        if len(text) > 6000:
            # For tax data, the end of the document is more relevant
            text = text[-6000:]
            
        # Add context about already extracted amounts
        amount_context = ""
        if amount_data.get('subtotal') is not None and amount_data.get('total') is not None:
            amount_context = f"\nAdditional context: Subtotal = {amount_data['subtotal']}, Total = {amount_data['total']}, Currency = {amount_data.get('currency', 'GBP')}"
            
        # Optimized prompt for VAT extraction
        prompt = f"""You are an expert system extracting data from UK invoices. Extract the following VAT/tax information precisely:

1. vat: The VAT amount (the tax amount added to the subtotal)
2. vat_rate: The VAT rate applied as a percentage (standard UK rates are 0%, 5%, or 20%)

Important guidelines:
- Convert amounts to decimal numbers without currency symbols
- VAT amounts are typically labeled as "VAT", "V.A.T.", or "Tax"
- The VAT rate is often shown as a percentage (e.g., "VAT @ 20%")
- VAT can be calculated as (Total - Subtotal) if not explicitly stated
- Common UK VAT rates are 0% (zero-rated), 5% (reduced), and 20% (standard)
- VAT amounts typically appear near the subtotal and total{amount_context}

Invoice text:
{text}

Return ONLY a valid JSON object with exactly these fields, and do not precede your output with any other text, curly bracket should be the first character of your output:
{{
"vat": numeric_vat_amount,
"vat_rate": numeric_vat_percentage_rate
}}
"""

        # Call LLM
        response = self.llm_client.generate(prompt)
        
        # Parse response
        try:
            result = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
            
            # Ensure numeric values
            for field in ['vat', 'vat_rate']:
                if field in result and result[field] is not None:
                    result[field] = float(result[field])
            
            # If VAT is not extracted but we have subtotal and total, calculate it
            if result.get('vat') is None and amount_data.get('subtotal') and amount_data.get('total'):
                result['vat'] = round(amount_data['total'] - amount_data['subtotal'], 2)
                
                # Try to determine rate from the calculated VAT
                if result.get('vat_rate') is None and amount_data.get('subtotal') > 0:
                    rate = (result['vat'] / amount_data['subtotal']) * 100
                    # Match to nearest standard rate
                    nearest_rate = min(self.vat_rates, key=lambda x: abs(x - rate))
                    if abs(nearest_rate - rate) < 1.0:  # Within 1% of a standard rate
                        result['vat_rate'] = nearest_rate
            
            return result
        except Exception as e:
            logger.warning(f"Failed to parse LLM tax data: {e}")
            return {
                'vat': None,
                'vat_rate': None
            }

    def _extract_line_items_with_llm(self, text: str, tables: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Use LLM to extract line items with optimized prompt."""
        # Prepare table information
        table_descriptions = []
        if tables:
            for i, table in enumerate(tables):
                cells_text = " | ".join(cell.get('text', '') for cell in table.get('cells', []))
                table_descriptions.append(f"Table {i+1}: {cells_text}")
            
        table_context = ""
        if table_descriptions:
            table_context = "\n\nDetected tables:\n" + "\n".join(table_descriptions)
        
        # Limit text size for prompt
        if len(text) > 6000:
            # For line items, the middle of the document is more relevant
            middle_start = len(text) // 4
            middle_end = 3 * len(text) // 4
            text = text[middle_start:middle_end]
            
        # Optimized prompt for line items extraction
        prompt = f"""You are an expert system extracting data from UK invoices. Extract the line items (products or services) from this invoice.

Pay close attention to the structure of the line items section, which typically appears as a table or list with columns such as:
- Description or Item/Service
- Quantity or Qty
- Unit Price or Rate
- VAT Rate or Tax
- Amount or Total

Important guidelines:
- Each line item represents a separate product or service being invoiced
- Extract ALL line items, even if there are many
- Convert all numeric values to plain numbers without currency symbols
- The "description" should include the full product/service name
- If the VAT/tax rate is specified per line item, include it as a percentage
- If any field is missing, use null for numeric fields or empty string for text fields{table_context}

Invoice text:
{text}

Return ONLY a valid JSON array of line items with exactly this structure, and do not precede your output with any other text, curly bracket should be the first character of your output:
[
    {{
        "description": "full item description",
        "quantity": numeric_quantity,
        "unit_price": numeric_unit_price,
        "vat_rate": numeric_vat_rate_percentage,
        "amount": numeric_line_total
    }},
    ...
]

If you cannot identify any line items, return an empty array [].
"""

        # Call LLM
        response = self.llm_client.generate(prompt)
        
        # Parse response
        try:
            result = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
            
            # Ensure numeric values in all line items
            for item in result:
                for field in ['quantity', 'unit_price', 'amount', 'vat_rate']:
                    if field in item and item[field] is not None:
                        try:
                            item[field] = float(item[field])
                        except:
                            item[field] = None
            
            return result
        except Exception as e:
            logger.warning(f"Failed to parse LLM line items: {e}")
            return []

    def _extract_payment_data_with_llm(self, text: str) -> Dict[str, Any]:
        """Use LLM to extract payment information with optimized prompt."""
        # Limit text size for prompt
        if len(text) > 6000:
            # For payment data, the end of the document is more relevant
            text = text[-6000:]
            
        # Optimized prompt for payment data extraction
        prompt = f"""You are an expert system extracting data from UK invoices. Extract the following payment information precisely:

1. bank_name: The name of the bank holding the account
2. account_number: The UK bank account number (typically 8 digits)
3. sort_code: The UK bank sort code (format: XX-XX-XX or XXXXXX)
4. iban: The International Bank Account Number (if present)
5. payment_terms: Payment terms (e.g., "30 days", "Net 15", "Payment due on receipt")

Important guidelines:
- Look for a dedicated "Payment Details" or "Banking Details" section
- Bank account details typically appear at the bottom of the invoice
- Sort codes are typically 6 digits, often formatted as XX-XX-XX
- Account numbers are typically 8 digits
- Payment terms often indicate the timeframe for payment (e.g., "30 days from invoice date")
- IBAN numbers for UK typically start with GB followed by 2 digits and then 18+ characters

Invoice text:
{text}

Return ONLY a valid JSON object with exactly these fields, and do not precede your output with any other text, curly bracket should be the first character of your output:
{{
"bank_name": "name of bank",
"account_number": "account number digits only",
"sort_code": "sort code digits with or without dashes",
"iban": "full IBAN if present",
"payment_terms": "payment terms text"
}}
"""

        # Call LLM
        response = self.llm_client.generate(prompt)
        
        # Parse response
        try:
            result = json.loads(response.replace('```json','').replace('```','').replace('\n',''))
            return result
        except Exception as e:
            logger.warning(f"Failed to parse LLM payment data: {e}")
            return {
                'bank_name': '',
                'account_number': '',
                'sort_code': '',
                'iban': '',
                'payment_terms': ''
            }
    
    def _parse_uk_date(self, date_str: str) -> Optional[str]:
        """Parse a date string in various UK formats and return ISO format."""
        if not date_str:
            return None
            
        # Clean the date string
        date_str = date_str.strip()
        
        # Try all configured date formats
        for fmt in self.date_formats:
            try:
                date_obj = datetime.datetime.strptime(date_str, fmt)
                return date_obj.strftime('%Y-%m-%d')
            except ValueError:
                continue
                
        # If standard formats fail, try some common variations
        try:
            # Handle "1st January 2023" type formats
            date_str = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', date_str)
            
            # Try again with standard formats
            for fmt in ['%d %B %Y', '%d %b %Y', '%B %d %Y', '%b %d %Y']:
                try:
                    date_obj = datetime.datetime.strptime(date_str, fmt)
                    return date_obj.strftime('%Y-%m-%d')
                except ValueError:
                    continue
        except Exception as e:
            logger.warning(f"Error in date parsing: {e}")
                
        # If all parsing attempts fail
        return None
                        

Improved Code

🔍 Code Extractor

class UKExtractor

Purpose

Source Code

Parameters

Parameter Details

Return Value

Class Interface

Methods

`__init__(self, config)`

`_get_full_document_text(self, document) -> str`

`extract(self, document, language) -> Dict[str, Any]`

`extract_structure(self, document) -> Dict[str, Any]`

`extract_invoice_metadata(self, document, structure) -> Dict[str, Any]`

`extract_vendor_data(self, document, structure) -> Dict[str, Any]`

`extract_amounts(self, document, structure) -> Dict[str, Any]`

`extract_tax_data(self, document, structure, amount_data) -> Dict[str, Any]`

`extract_line_items(self, document, structure) -> List[Dict[str, Any]]`

`extract_payment_data(self, document, structure) -> Dict[str, Any]`

`_verify_uk_specific_fields(self, extraction_result) -> None`

`_extract_structure_with_llm(self, document) -> Dict[str, Any]`

`_extract_invoice_metadata_with_llm(self, text) -> Dict[str, Any]`

`_extract_vendor_data_with_llm(self, text) -> Dict[str, Any]`

`_extract_amounts_with_llm(self, text) -> Dict[str, Any]`

`_extract_tax_data_with_llm(self, text, amount_data) -> Dict[str, Any]`

`_extract_line_items_with_llm(self, text, tables) -> List[Dict[str, Any]]`

`_extract_payment_data_with_llm(self, text) -> Dict[str, Any]`

`_parse_uk_date(self, date_str) -> Optional[str]`

Required Imports

Usage Example

Tags

Similar Components

class TestUKExtractor 75.7% similar

class BaseExtractor 74.5% similar

class UKValidator 70.0% similar

class AUExtractor 65.3% similar

class TestUKValidator 63.7% similar

class UKExtractor

Purpose

Source Code

Parameters

Parameter Details

Return Value

Class Interface

Methods

__init__(self, config)

_get_full_document_text(self, document) -> str

extract(self, document, language) -> Dict[str, Any]

extract_structure(self, document) -> Dict[str, Any]

extract_invoice_metadata(self, document, structure) -> Dict[str, Any]

extract_vendor_data(self, document, structure) -> Dict[str, Any]

extract_amounts(self, document, structure) -> Dict[str, Any]

extract_tax_data(self, document, structure, amount_data) -> Dict[str, Any]

extract_line_items(self, document, structure) -> List[Dict[str, Any]]

extract_payment_data(self, document, structure) -> Dict[str, Any]

_verify_uk_specific_fields(self, extraction_result) -> None

_extract_structure_with_llm(self, document) -> Dict[str, Any]

_extract_invoice_metadata_with_llm(self, text) -> Dict[str, Any]

_extract_vendor_data_with_llm(self, text) -> Dict[str, Any]

_extract_amounts_with_llm(self, text) -> Dict[str, Any]

_extract_tax_data_with_llm(self, text, amount_data) -> Dict[str, Any]

_extract_line_items_with_llm(self, text, tables) -> List[Dict[str, Any]]

_extract_payment_data_with_llm(self, text) -> Dict[str, Any]

_parse_uk_date(self, date_str) -> Optional[str]

Required Imports

Usage Example

Tags

Similar Components

class TestUKExtractor 75.7% similar

class BaseExtractor 74.5% similar

class UKValidator 70.0% similar

class AUExtractor 65.3% similar

class TestUKValidator 63.7% similar

✨ Improve Code: UKExtractor

Code Comparison

`__init__(self, config)`

`_get_full_document_text(self, document) -> str`

`extract(self, document, language) -> Dict[str, Any]`

`extract_structure(self, document) -> Dict[str, Any]`

`extract_invoice_metadata(self, document, structure) -> Dict[str, Any]`

`extract_vendor_data(self, document, structure) -> Dict[str, Any]`

`extract_amounts(self, document, structure) -> Dict[str, Any]`

`extract_tax_data(self, document, structure, amount_data) -> Dict[str, Any]`

`extract_line_items(self, document, structure) -> List[Dict[str, Any]]`

`extract_payment_data(self, document, structure) -> Dict[str, Any]`

`_verify_uk_specific_fields(self, extraction_result) -> None`

`_extract_structure_with_llm(self, document) -> Dict[str, Any]`

`_extract_invoice_metadata_with_llm(self, text) -> Dict[str, Any]`

`_extract_vendor_data_with_llm(self, text) -> Dict[str, Any]`

`_extract_amounts_with_llm(self, text) -> Dict[str, Any]`

`_extract_tax_data_with_llm(self, text, amount_data) -> Dict[str, Any]`

`_extract_line_items_with_llm(self, text, tables) -> List[Dict[str, Any]]`

`_extract_payment_data_with_llm(self, text) -> Dict[str, Any]`

`_parse_uk_date(self, date_str) -> Optional[str]`