function read_excel_file
Reads Excel files and returns either metadata for all sheets or detailed data for a specific sheet, including format validation, European decimal conversion, and rich metadata extraction.
/tf/active/vicechatdev/vice_ai/smartstat_service.py
867 - 995
complex
Purpose
This function serves as a comprehensive Excel file reader that provides two modes of operation: (1) metadata overview of all sheets in a workbook for exploration, and (2) detailed data extraction for a specific sheet including dataframe content, format validation, contextual information from information sheets, and Excel metadata like comments, document properties, and merged cells. It handles European decimal format conversion and validates sheet structure to identify data sheets versus information sheets.
Source Code
def read_excel_file(file_path: str, sheet_name: Optional[str] = None) -> Dict[str, Any]:
"""
Read Excel file with sheet format validation
Args:
file_path: Path to Excel file
sheet_name: If provided, return only this sheet's data. If None, return all sheet info
Returns:
If sheet_name is None: Dict with 'sheets' containing metadata about all sheets
If sheet_name provided: Dict with 'dataframe' and sheet metadata
"""
try:
# Read Excel file
excel_file = pd.ExcelFile(file_path)
if sheet_name is None:
# Return metadata about all sheets WITH format validation
sheets_info = []
for sheet in excel_file.sheet_names:
df_preview = pd.read_excel(file_path, sheet_name=sheet, nrows=5)
df_full = pd.read_excel(file_path, sheet_name=sheet)
# Apply European decimal conversion to both preview and full data
df_preview = convert_european_decimals(df_preview)
df_full = convert_european_decimals(df_full)
# Validate sheet format
validation = validate_sheet_format(df_full, sheet)
sheet_info = {
'name': sheet,
'rows': len(df_full),
'columns': len(df_full.columns),
'column_names': df_full.columns.tolist(),
'preview': df_preview.head(3).fillna('').to_dict('records'),
'format_validation': validation
}
sheets_info.append(sheet_info)
logger.info(f"Excel file has {len(sheets_info)} sheets: {[s['name'] for s in sheets_info]}")
return {
'sheets': sheets_info,
'total_sheets': len(sheets_info)
}
else:
# Return specific sheet data
df = pd.read_excel(file_path, sheet_name=sheet_name)
# Validate sheet format
validation = validate_sheet_format(df, sheet_name)
# Extract rich Excel metadata (comments, document properties, etc.)
excel_metadata = extract_excel_metadata(file_path, sheet_name)
# Apply European decimal conversion (Excel files can also have regional format issues)
df = convert_european_decimals(df)
logger.info(f"Successfully loaded sheet '{sheet_name}' with {len(df)} rows and {len(df.columns)} columns")
# If it's an information sheet, extract enhanced context
sheet_context = None
if validation['sheet_type'] == 'information_sheet':
# Extract content from dataframe
sheet_context = extract_sheet_context(df, sheet_name)
# Enhance context with Excel metadata
metadata_sections = []
# Add document properties if available
doc_props = excel_metadata.get('document_properties', {})
if any(doc_props.values()):
metadata_sections.append("\n## Document Metadata\n")
if doc_props.get('title'):
metadata_sections.append(f"**Title**: {doc_props['title']}")
if doc_props.get('author'):
metadata_sections.append(f"**Author**: {doc_props['author']}")
if doc_props.get('subject'):
metadata_sections.append(f"**Subject**: {doc_props['subject']}")
if doc_props.get('description'):
metadata_sections.append(f"**Description**: {doc_props['description']}")
if doc_props.get('keywords'):
metadata_sections.append(f"**Keywords**: {doc_props['keywords']}")
if doc_props.get('comments'):
metadata_sections.append(f"**Comments**: {doc_props['comments']}")
# Add cell comments if any
comments = excel_metadata.get('comments', [])
if comments:
metadata_sections.append("\n## Cell Comments & Notes\n")
for comment in comments[:20]: # Limit to first 20 comments
cell_ref = comment.get('cell', '')
cell_val = comment.get('value', '')
comment_text = comment.get('comment', '')
author = comment.get('author', 'Unknown')
metadata_sections.append(f"- **{cell_ref}** ({cell_val}): {comment_text} *[{author}]*")
# Add merged cell info if significant
merged_cells = excel_metadata.get('merged_cells', [])
if merged_cells:
# Filter for merged cells with actual content (likely headers/titles)
significant_merged = [m for m in merged_cells if m.get('value', '').strip()]
if significant_merged:
metadata_sections.append("\n## Headers & Titles (from merged cells)\n")
for merged in significant_merged[:10]:
metadata_sections.append(f"- **{merged['range']}**: {merged['value']}")
# Combine dataframe context with metadata
if metadata_sections:
sheet_context = sheet_context + '\n' + '\n'.join(metadata_sections)
logger.info(f"Sheet '{sheet_name}' identified as information sheet - "
f"extracted {len(sheet_context)} chars with metadata enhancement")
return {
'dataframe': df,
'sheet_name': sheet_name,
'rows': len(df),
'columns': len(df.columns),
'column_names': df.columns.tolist(),
'format_validation': validation,
'context': sheet_context,
'excel_metadata': excel_metadata # Include full metadata in response
}
except Exception as e:
logger.error(f"Error reading Excel file {file_path}: {e}")
raise
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
file_path |
str | - | positional_or_keyword |
sheet_name |
Optional[str] | None | positional_or_keyword |
Parameter Details
file_path: String path to the Excel file (.xlsx or .xls) to be read. Must be a valid file path accessible by the application.
sheet_name: Optional string specifying the name of a specific sheet to read. If None, returns metadata for all sheets in the workbook. If provided, returns detailed data for only that sheet including the dataframe, validation results, and extracted context.
Return Value
Type: Dict[str, Any]
Returns a dictionary with different structures based on sheet_name parameter. If sheet_name is None: {'sheets': list of sheet metadata dicts, 'total_sheets': int}. Each sheet metadata contains name, rows, columns, column_names, preview (first 3 rows), and format_validation. If sheet_name is provided: {'dataframe': pandas DataFrame, 'sheet_name': str, 'rows': int, 'columns': int, 'column_names': list, 'format_validation': dict, 'context': str or None (extracted text for information sheets), 'excel_metadata': dict (comments, document properties, merged cells)}.
Dependencies
pandasopenpyxllogging
Required Imports
import pandas as pd
import openpyxl
import logging
from typing import Dict, Any, Optional
Usage Example
import pandas as pd
from typing import Dict, Any, Optional
# Get metadata for all sheets
result = read_excel_file('data.xlsx')
print(f"Total sheets: {result['total_sheets']}")
for sheet in result['sheets']:
print(f"Sheet: {sheet['name']}, Rows: {sheet['rows']}, Valid: {sheet['format_validation']['is_valid']}")
# Read specific sheet with full data
sheet_data = read_excel_file('data.xlsx', sheet_name='Sales Data')
df = sheet_data['dataframe']
print(f"Loaded {sheet_data['rows']} rows")
print(f"Columns: {sheet_data['column_names']}")
if sheet_data['context']:
print(f"Information sheet context: {sheet_data['context'][:200]}...")
if sheet_data['excel_metadata']['comments']:
print(f"Found {len(sheet_data['excel_metadata']['comments'])} cell comments")
Best Practices
- Always check the 'format_validation' field in the returned dictionary to ensure the sheet structure is valid before processing
- Handle exceptions when calling this function as it raises exceptions for file reading errors
- For large Excel files, consider using sheet_name parameter to avoid loading all sheets into memory
- The 'context' field is only populated for information sheets (non-tabular data), check 'format_validation.sheet_type' to determine sheet type
- Preview data in the all-sheets mode is limited to 3 rows to minimize memory usage
- Cell comments are limited to first 20, and merged cells to first 10 significant ones to prevent excessive metadata
- Ensure all helper functions (convert_european_decimals, validate_sheet_format, extract_excel_metadata, extract_sheet_context) are properly defined before using this function
- The function uses fillna('') on preview data to handle missing values in the preview records
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
-
function extract_excel_metadata 77.2% similar
-
function extract_metadata_pdf 58.0% similar
-
function validate_sheet_format 57.7% similar
-
function extract_metadata_docx 54.0% similar
-
function extract_sheet_context 50.3% similar