class ExcelGenerator
Generates structured Excel files from extracted invoice data.
/tf/active/vicechatdev/invoice_extraction/core/excel_generator.py
11 - 502
moderate
Purpose
Generates structured Excel files from extracted invoice data.
Source Code
class ExcelGenerator:
"""Generates structured Excel files from extracted invoice data."""
def __init__(self, config=None):
self.config = config or {}
# Configure output directory
self.output_dir = self.config.get('excel_output_dir', 'inbound/output')
Path(self.output_dir).mkdir(parents=True, exist_ok=True)
# Excel styling options
self.apply_styling = self.config.get('excel_styling', True)
# Template path (if using templates)
self.template_path = self.config.get('excel_template')
# Date format for filenames and cells
self.date_format = self.config.get('excel_date_format', '%Y-%m-%d')
self.datetime_format = self.config.get('excel_datetime_format', '%Y-%m-%d %H:%M:%S')
# Invoice tracking table configuration
self.tracking_enabled = self.config.get('tracking_enabled', True)
self.tracking_file = self.config.get('tracking_file', os.path.join(self.output_dir, 'invoice_tracking.xlsx'))
def generate(self, extraction_result, entity):
"""
Generate an Excel file from extracted invoice data.
Args:
extraction_result: Dictionary of extracted invoice data
entity: Entity code ('UK', 'BE', or 'AU')
Returns:
Path to the generated Excel file
"""
logger.info(f"Generating Excel output for {entity} invoice")
# Generate filename based on invoice number, date, and entity
filename = self._generate_filename(extraction_result, entity)
output_path = os.path.join(self.output_dir, filename)
# Select the appropriate method based on configuration
if self.template_path and os.path.exists(self.template_path):
self._generate_from_template(extraction_result, entity, output_path)
else:
self._generate_standard_excel(extraction_result, entity, output_path)
# Update central tracking table if enabled
if self.tracking_enabled:
self._update_tracking_table(extraction_result, entity, output_path)
logger.info(f"Excel file generated: {output_path}")
return output_path
def _generate_filename(self, extraction_result, entity):
"""Generate a standardized filename for the Excel output."""
# Extract invoice details for the filename
invoice_number = extraction_result.get('invoice', {}).get('number', 'unknown')
# Clean the invoice number for a valid filename
invoice_number = ''.join(c if c.isalnum() else '_' for c in invoice_number)
# Get invoice date or use current date
invoice_date = extraction_result.get('invoice', {}).get('issue_date')
if invoice_date:
# Try to parse date if it's a string
if isinstance(invoice_date, str):
try:
# Try different date formats
for fmt in ['%Y-%m-%d', '%d/%m/%Y', '%m/%d/%Y', '%d-%m-%Y', '%m-%d-%Y']:
try:
date_obj = datetime.strptime(invoice_date, fmt)
date_str = date_obj.strftime('%Y%m%d')
break
except ValueError:
continue
else:
# If no format works, use today
date_str = datetime.now().strftime('%Y%m%d')
except Exception:
date_str = datetime.now().strftime('%Y%m%d')
else:
date_str = datetime.now().strftime('%Y%m%d')
else:
date_str = datetime.now().strftime('%Y%m%d')
# Get vendor name
vendor_name = extraction_result.get('vendor', {}).get('name', 'unknown')
vendor_name = ''.join(c if c.isalnum() else '_' for c in vendor_name)[:20] # Truncate and clean
# Construct the filename with entity, date, vendor and invoice number
filename = f"{entity}_{date_str}_{vendor_name}_{invoice_number}.xlsx"
return filename
def _generate_standard_excel(self, extraction_result, entity, output_path):
"""Generate a standard Excel file with multiple sheets for invoice data."""
# Create a new Excel workbook
writer = pd.ExcelWriter(output_path, engine='openpyxl')
# Sheet 1: Summary information
self._create_summary_sheet(extraction_result, entity, writer)
# Sheet 2: Line items (if available)
self._create_line_items_sheet(extraction_result, writer)
# Sheet 3: Payment information
self._create_payment_sheet(extraction_result, entity, writer)
# Sheet 4: Raw data (for debugging/verification)
self._create_raw_data_sheet(extraction_result, writer)
# Save the workbook
writer.close()
# Apply additional styling if needed
if self.apply_styling:
self._apply_styling(output_path)
def _create_summary_sheet(self, extraction_result, entity, writer):
"""Create the summary sheet with key invoice information."""
# Extract summary data
invoice = extraction_result.get('invoice', {})
vendor = extraction_result.get('vendor', {})
amounts = extraction_result.get('amounts', {})
# Create a summary dataframe
summary_data = {
'Field': [
'Entity', 'Vendor Name', 'Vendor VAT/Tax Number',
'Invoice Number', 'Issue Date', 'Due Date',
'Subtotal', 'VAT/Tax Amount', 'Total Amount',
'Currency'
],
'Value': [
entity,
vendor.get('name', ''),
vendor.get('vat_number', ''),
invoice.get('number', ''),
invoice.get('issue_date', ''),
invoice.get('due_date', ''),
amounts.get('subtotal', ''),
amounts.get('vat', ''),
amounts.get('total', ''),
amounts.get('currency', '')
]
}
df_summary = pd.DataFrame(summary_data)
df_summary.to_excel(writer, sheet_name='Summary', index=False)
# Auto-adjust column width (done after save)
worksheet = writer.sheets['Summary']
for column_cells in worksheet.columns:
length = max(len(str(cell.value) or '') for cell in column_cells)
worksheet.column_dimensions[column_cells[0].column_letter].width = length + 5
def _create_line_items_sheet(self, extraction_result, writer):
"""Create sheet with line items if available."""
line_items = extraction_result.get('line_items', [])
if line_items:
# Check what fields exist in the line items
all_keys = set()
for item in line_items:
all_keys.update(item.keys())
# Create data with consistent columns
rows = []
for item in line_items:
row = {key: item.get(key, '') for key in all_keys}
rows.append(row)
# Create dataframe and write to Excel
df_items = pd.DataFrame(rows)
df_items.to_excel(writer, sheet_name='Line Items', index=False)
# Auto-adjust column width
worksheet = writer.sheets['Line Items']
for column_cells in worksheet.columns:
length = max(len(str(cell.value) or '') for cell in column_cells)
worksheet.column_dimensions[column_cells[0].column_letter].width = length + 2
else:
# Create empty sheet
pd.DataFrame().to_excel(writer, sheet_name='Line Items')
def _create_payment_sheet(self, extraction_result, entity, writer):
"""Create sheet with payment information."""
# Extract payment details
payment = extraction_result.get('payment', {})
invoice = extraction_result.get('invoice', {})
vendor = extraction_result.get('vendor', {})
amounts = extraction_result.get('amounts', {})
# Format for payment approval/processing
payment_data = {
'Field': [
'Entity', 'Payment To', 'Invoice Reference',
'Bank Name', 'Account Number', 'Sort Code/SWIFT',
'Payment Amount', 'Currency', 'Due Date',
'Payment Status', 'Approved By', 'Payment Date'
],
'Value': [
entity,
vendor.get('name', ''),
invoice.get('number', ''),
payment.get('bank_name', ''),
payment.get('account_number', ''),
payment.get('sort_code', ''),
amounts.get('total', ''),
amounts.get('currency', ''),
invoice.get('due_date', ''),
'Pending', # Default status
'', # To be filled by approver
'' # To be filled when paid
]
}
df_payment = pd.DataFrame(payment_data)
df_payment.to_excel(writer, sheet_name='Payment', index=False)
# Auto-adjust column width
worksheet = writer.sheets['Payment']
for column_cells in worksheet.columns:
length = max(len(str(cell.value) or '') for cell in column_cells)
worksheet.column_dimensions[column_cells[0].column_letter].width = length + 5
def _create_raw_data_sheet(self, extraction_result, writer):
"""Create sheet with raw extraction data for verification."""
# Flatten the extraction result for Excel
flattened_data = self._flatten_dict(extraction_result)
raw_data = {
'Field': list(flattened_data.keys()),
'Value': list(flattened_data.values()),
'Confidence': [extraction_result.get('confidence', {}).get(key, '') for key in flattened_data.keys()]
}
df_raw = pd.DataFrame(raw_data)
df_raw.to_excel(writer, sheet_name='Raw Data', index=False)
# Auto-adjust column width
worksheet = writer.sheets['Raw Data']
for column_cells in worksheet.columns:
length = min(max(len(str(cell.value) or '') for cell in column_cells), 100) # Cap at 100 chars
worksheet.column_dimensions[column_cells[0].column_letter].width = length + 2
def _flatten_dict(self, d, parent_key='', sep='_'):
"""Flatten a nested dictionary for easier Excel representation."""
items = []
for k, v in d.items():
new_key = f"{parent_key}{sep}{k}" if parent_key else k
if isinstance(v, dict):
items.extend(self._flatten_dict(v, new_key, sep=sep).items())
elif isinstance(v, list):
# Skip lists of dictionaries (handled in separate sheets)
if v and isinstance(v[0], dict):
items.append((new_key, f"[List with {len(v)} items]"))
else:
items.append((new_key, str(v)))
else:
items.append((new_key, v))
return dict(items)
def _generate_from_template(self, extraction_result, entity, output_path):
"""Generate Excel file using a template."""
# Load the template
template_wb = openpyxl.load_workbook(self.template_path)
# Find placeholders and replace with data
for sheet_name in template_wb.sheetnames:
sheet = template_wb[sheet_name]
self._fill_template_sheet(sheet, extraction_result, entity)
# Save the filled template
template_wb.save(output_path)
def _fill_template_sheet(self, sheet, extraction_result, entity):
"""Fill a template sheet with extraction data."""
# Find cells with placeholders (format: {{field_name}})
for row in sheet.iter_rows():
for cell in row:
if cell.value and isinstance(cell.value, str) and '{{' in cell.value and '}}' in cell.value:
# Extract field name from placeholder
placeholder = cell.value
field_name = placeholder[placeholder.find('{{')+2:placeholder.find('}}')]
# Get the value from extraction result
flattened_data = self._flatten_dict(extraction_result)
value = flattened_data.get(field_name, '')
# Replace the placeholder with the actual value
if value is not None:
cell.value = placeholder.replace('{{' + field_name + '}}', str(value))
def _apply_styling(self, output_path):
"""Apply styling to the generated Excel file."""
wb = openpyxl.load_workbook(output_path)
# Define styles
header_font = Font(bold=True, size=12, color='FFFFFF')
header_fill = PatternFill(start_color='4F81BD', end_color='4F81BD', fill_type='solid')
thin_border = Border(
left=Side(style='thin'), right=Side(style='thin'),
top=Side(style='thin'), bottom=Side(style='thin')
)
centered_alignment = Alignment(horizontal='center')
# Style each sheet
for sheet_name in wb.sheetnames:
ws = wb[sheet_name]
# Style headers
for cell in ws[1]:
cell.font = header_font
cell.fill = header_fill
cell.border = thin_border
cell.alignment = centered_alignment
# Add borders to all data cells
for row in ws.iter_rows(min_row=2):
for cell in row:
cell.border = thin_border
# Auto-filter the header row
ws.auto_filter.ref = ws.dimensions
# Freeze the header row
ws.freeze_panes = 'A2'
# Save the styled workbook
wb.save(output_path)
def _update_tracking_table(self, extraction_result, entity, output_path):
"""
Update the central invoice tracking table with information from this invoice.
Args:
extraction_result: Dictionary of extracted invoice data
entity: Entity code ('UK', 'BE', or 'AU')
output_path: Path to the generated Excel file for this invoice
"""
logger.info(f"Updating invoice tracking table")
# Extract key data for tracking
invoice = extraction_result.get('invoice', {})
vendor = extraction_result.get('vendor', {})
amounts = extraction_result.get('amounts', {})
payment = extraction_result.get('payment', {})
line_items = extraction_result.get('line_items', [])
# Create a row for the tracking table
tracking_data = {
'ProcessingDate': datetime.now().strftime(self.datetime_format),
'Entity': entity,
'InvoiceNumber': invoice.get('number', ''),
'IssueDate': invoice.get('issue_date', ''),
'DueDate': invoice.get('due_date', ''),
'VendorName': vendor.get('name', ''),
'VendorVATNumber': vendor.get('vat_number', ''),
'VendorAddress': vendor.get('address', ''),
'VendorContact': vendor.get('contact', ''),
'Subtotal': amounts.get('subtotal', ''),
'TaxAmount': amounts.get('vat', amounts.get('tax', '')),
'TotalAmount': amounts.get('total', ''),
'Currency': amounts.get('currency', ''),
'BankName': payment.get('bank_name', ''),
'AccountNumber': payment.get('account_number', ''),
'SortCodeOrIBAN': payment.get('sort_code', payment.get('iban', '')),
'PaymentReference': payment.get('reference', ''),
'PaymentStatus': 'Pending',
'LineItemCount': len(line_items),
'ExcelFilePath': os.path.basename(output_path),
'ConfidenceScore': extraction_result.get('confidence', {}).get('overall', 0.0)
}
# Load existing tracking table or create a new one
if os.path.exists(self.tracking_file):
try:
existing_df = pd.read_excel(self.tracking_file)
# Check if this invoice already exists in the tracking table
invoice_exists = False
if 'InvoiceNumber' in existing_df.columns and 'VendorName' in existing_df.columns:
invoice_mask = (existing_df['InvoiceNumber'] == invoice.get('number', '')) & \
(existing_df['VendorName'] == vendor.get('name', ''))
invoice_exists = invoice_mask.any()
if invoice_exists:
# Update existing entry
for col, value in tracking_data.items():
if col in existing_df.columns:
existing_df.loc[invoice_mask, col] = value
else:
# Add new entry
new_row = pd.DataFrame([tracking_data])
existing_df = pd.concat([existing_df, new_row], ignore_index=True)
# Save updated tracking table
self._save_tracking_table(existing_df)
except Exception as e:
logger.error(f"Error updating existing tracking table: {e}")
# Create a new tracking table if there was an error reading the existing one
self._create_new_tracking_table(tracking_data)
else:
# Create a new tracking table
self._create_new_tracking_table(tracking_data)
def _create_new_tracking_table(self, first_row_data):
"""Create a new invoice tracking table with the first invoice data."""
df = pd.DataFrame([first_row_data])
self._save_tracking_table(df)
def _save_tracking_table(self, df):
"""Save the tracking table dataframe to Excel with proper formatting."""
# Ensure the directory exists
os.makedirs(os.path.dirname(self.tracking_file), exist_ok=True)
# Sort by processing date, most recent first
if 'ProcessingDate' in df.columns:
df = df.sort_values(by='ProcessingDate', ascending=False)
# Save to Excel
with pd.ExcelWriter(self.tracking_file, engine='openpyxl') as writer:
df.to_excel(writer, sheet_name='Invoice Tracking', index=False)
# Auto-adjust column widths
worksheet = writer.sheets['Invoice Tracking']
for idx, col in enumerate(df.columns):
# Set column width based on max length in column
max_len = max(
df[col].astype(str).map(len).max(), # max length of values
len(str(col)) # length of column name
) + 2 # adding a little extra space
worksheet.column_dimensions[openpyxl.utils.get_column_letter(idx+1)].width = max_len
# Apply styling if needed
if self.apply_styling:
self._apply_tracking_table_styling()
def _apply_tracking_table_styling(self):
"""Apply styling to the invoice tracking table."""
try:
# Load the workbook
wb = openpyxl.load_workbook(self.tracking_file)
ws = wb['Invoice Tracking']
# Define styles
header_font = Font(bold=True, size=12, color='FFFFFF')
header_fill = PatternFill(start_color='4F81BD', end_color='4F81BD', fill_type='solid')
thin_border = Border(
left=Side(style='thin'), right=Side(style='thin'),
top=Side(style='thin'), bottom=Side(style='thin')
)
alternate_row_fill = PatternFill(start_color='F2F2F2', end_color='F2F2F2', fill_type='solid')
# Style header row
for cell in ws[1]:
cell.font = header_font
cell.fill = header_fill
cell.border = thin_border
cell.alignment = Alignment(horizontal='center', vertical='center', wrap_text=True)
# Style data rows with alternating colors and borders
for row_idx, row in enumerate(ws.iter_rows(min_row=2)):
for cell in row:
cell.border = thin_border
if row_idx % 2 == 1: # Alternate rows
cell.fill = alternate_row_fill
# Add auto-filter
ws.auto_filter.ref = ws.dimensions
# Freeze header row
ws.freeze_panes = 'A2'
# Save the styled workbook
wb.save(self.tracking_file)
except Exception as e:
logger.error(f"Error applying styling to tracking table: {e}")
def get_tracking_table(self):
"""Get the current invoice tracking table as a pandas DataFrame."""
if os.path.exists(self.tracking_file):
try:
return pd.read_excel(self.tracking_file)
except Exception as e:
logger.error(f"Error reading tracking table: {e}")
return pd.DataFrame()
else:
return pd.DataFrame()
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
bases |
- | - |
Parameter Details
bases: Parameter of type
Return Value
Returns unspecified type
Class Interface
Methods
__init__(self, config)
Purpose: Internal method: init
Parameters:
config: Parameter
Returns: None
generate(self, extraction_result, entity)
Purpose: Generate an Excel file from extracted invoice data. Args: extraction_result: Dictionary of extracted invoice data entity: Entity code ('UK', 'BE', or 'AU') Returns: Path to the generated Excel file
Parameters:
extraction_result: Parameterentity: Parameter
Returns: See docstring for return details
_generate_filename(self, extraction_result, entity)
Purpose: Generate a standardized filename for the Excel output.
Parameters:
extraction_result: Parameterentity: Parameter
Returns: None
_generate_standard_excel(self, extraction_result, entity, output_path)
Purpose: Generate a standard Excel file with multiple sheets for invoice data.
Parameters:
extraction_result: Parameterentity: Parameteroutput_path: Parameter
Returns: None
_create_summary_sheet(self, extraction_result, entity, writer)
Purpose: Create the summary sheet with key invoice information.
Parameters:
extraction_result: Parameterentity: Parameterwriter: Parameter
Returns: None
_create_line_items_sheet(self, extraction_result, writer)
Purpose: Create sheet with line items if available.
Parameters:
extraction_result: Parameterwriter: Parameter
Returns: None
_create_payment_sheet(self, extraction_result, entity, writer)
Purpose: Create sheet with payment information.
Parameters:
extraction_result: Parameterentity: Parameterwriter: Parameter
Returns: None
_create_raw_data_sheet(self, extraction_result, writer)
Purpose: Create sheet with raw extraction data for verification.
Parameters:
extraction_result: Parameterwriter: Parameter
Returns: None
_flatten_dict(self, d, parent_key, sep)
Purpose: Flatten a nested dictionary for easier Excel representation.
Parameters:
d: Parameterparent_key: Parametersep: Parameter
Returns: None
_generate_from_template(self, extraction_result, entity, output_path)
Purpose: Generate Excel file using a template.
Parameters:
extraction_result: Parameterentity: Parameteroutput_path: Parameter
Returns: None
_fill_template_sheet(self, sheet, extraction_result, entity)
Purpose: Fill a template sheet with extraction data.
Parameters:
sheet: Parameterextraction_result: Parameterentity: Parameter
Returns: None
_apply_styling(self, output_path)
Purpose: Apply styling to the generated Excel file.
Parameters:
output_path: Parameter
Returns: None
_update_tracking_table(self, extraction_result, entity, output_path)
Purpose: Update the central invoice tracking table with information from this invoice. Args: extraction_result: Dictionary of extracted invoice data entity: Entity code ('UK', 'BE', or 'AU') output_path: Path to the generated Excel file for this invoice
Parameters:
extraction_result: Parameterentity: Parameteroutput_path: Parameter
Returns: None
_create_new_tracking_table(self, first_row_data)
Purpose: Create a new invoice tracking table with the first invoice data.
Parameters:
first_row_data: Parameter
Returns: None
_save_tracking_table(self, df)
Purpose: Save the tracking table dataframe to Excel with proper formatting.
Parameters:
df: Parameter
Returns: None
_apply_tracking_table_styling(self)
Purpose: Apply styling to the invoice tracking table.
Returns: None
get_tracking_table(self)
Purpose: Get the current invoice tracking table as a pandas DataFrame.
Returns: None
Required Imports
import os
import logging
from pathlib import Path
from datetime import datetime
import pandas as pd
Usage Example
# Example usage:
# result = ExcelGenerator(bases)
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
-
class UKExtractor 58.6% similar
-
class BaseExtractor 53.3% similar
-
class PDFGenerator 51.7% similar
-
function create_excel_report 50.6% similar
-
function test_excel_output 49.8% similar