class DatabaseSchema
A dataclass that represents comprehensive database schema information, including table structures, columns, relationships, and categorizations for SQL database introspection and query generation.
/tf/active/vicechatdev/full_smartstat/sql_query_generator.py
18 - 307
complex
Purpose
DatabaseSchema serves as a central repository for database metadata, enabling validation of table and column references, categorization of tables by business function, and providing schema-aware query assistance. It supports both legacy and comprehensive schema formats, loads relationship data, and offers methods to fix invalid column references, suggest alternatives, and identify problematic tables. This class is essential for building intelligent SQL query generators and database documentation tools.
Source Code
@dataclass
class DatabaseSchema:
    """Database schema information for SQL introspection and query generation.

    Central repository for database metadata: table structures, per-table
    columns, relationships, and business-function categorizations. Supports
    both a legacy JSON layout and a comprehensive layout (detected in
    :meth:`from_json`), and offers helpers to validate/fix column references
    and flag problematic tables.

    Construct instances via :meth:`from_json` rather than directly.
    """

    # Name of the database this schema describes.
    database_name: str
    # Human-readable summary of the database contents.
    description: str
    # Category name -> list of table names (business-function grouping).
    table_categories: Dict[str, Any]
    # Every table name present in the database.
    complete_table_list: List[str]
    # NOTE(review): the legacy loader stores a dict here, but the
    # comprehensive loader stores a *list* of relationship dicts -- callers
    # must treat this field as dict-or-list at runtime.
    key_relationships: Dict[str, Any]
    # Server info, totals, table_info (row counts) and columns_by_table
    # (detailed column metadata keyed by table name).
    system_architecture: Dict[str, Any]

    @classmethod
    def from_json(cls, json_path: str) -> 'DatabaseSchema':
        """Load database schema from JSON file - enhanced to handle comprehensive schema format.

        Args:
            json_path: Path to the schema JSON file.

        Returns:
            A populated DatabaseSchema instance.

        Raises:
            Exception: Re-raises any error from reading/parsing the file
                after logging it.
        """
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # The comprehensive format carries these three top-level keys;
            # anything else falls back to the legacy layout.
            if 'database_name' in data and 'total_tables' in data and 'tables' in data:
                return cls._from_comprehensive_schema(data, json_path)

            # Legacy format
            return cls(
                database_name=data.get('database_info', {}).get('name', 'Unknown'),
                description=data.get('database_info', {}).get('description', ''),
                table_categories=data.get('table_categories', {}),
                complete_table_list=data.get('complete_table_list', []),
                key_relationships=data.get('key_relationships', {}),
                system_architecture=data.get('system_architecture', {})
            )
        except Exception as e:
            logger.error(f"Error loading database schema from {json_path}: {str(e)}")
            raise

    @classmethod
    def _from_comprehensive_schema(cls, data: Dict[str, Any], json_path: str) -> 'DatabaseSchema':
        """Load from comprehensive database schema format with detailed table/column info.

        Args:
            data: Parsed JSON with 'database_name', 'tables',
                'columns_by_table' (and related) keys.
            json_path: Original schema path; used to locate the optional
                companion relationships file.

        Returns:
            DatabaseSchema populated from the comprehensive format.
        """
        # Extract table information
        tables = data.get('tables', [])
        table_list = []
        table_info = {}
        for table in tables:
            if isinstance(table, dict) and 'TABLE_NAME' in table:
                table_name = table['TABLE_NAME']
                table_list.append(table_name)
                table_info[table_name] = {
                    'schema': table.get('TABLE_SCHEMA', 'dbo'),
                    'type': table.get('TABLE_TYPE', 'BASE TABLE'),
                    'row_count': table.get('ROW_COUNT', 0)
                }

        # Extract column information by table
        columns_by_table = data.get('columns_by_table', {})

        # Create table categories based on naming patterns and business logic
        table_categories = cls._create_table_categories(table_list)

        # System architecture from comprehensive data
        system_architecture = {
            'database_name': data.get('database_name', 'Unknown'),
            'server': data.get('server', 'Unknown'),
            'total_tables': data.get('total_tables', len(table_list)),
            'total_columns': data.get('total_columns', 0),
            'total_rows': data.get('total_rows', 0),
            'generated_on': data.get('generated_on', ''),
            'table_info': table_info,
            'columns_by_table': columns_by_table
        }

        # Try to load relationship information from companion file
        # (e.g. database_schema.json -> database_relationships.json).
        relationships = []
        relationship_file = json_path.replace('database_schema', 'database_relationships')
        try:
            if os.path.exists(relationship_file):
                with open(relationship_file, 'r', encoding='utf-8') as f:
                    rel_data = json.load(f)
                    relationships = rel_data.get('relationships', [])
                    logger.info(f"Loaded {len(relationships)} relationships from {relationship_file}")
        except Exception as e:
            # Relationships are optional: log and continue without them.
            logger.warning(f"Could not load relationships from {relationship_file}: {str(e)}")

        return cls(
            database_name=data.get('database_name', 'Unknown'),
            description=f"Comprehensive database schema with {len(table_list)} tables",
            table_categories=table_categories,
            complete_table_list=table_list,
            key_relationships=relationships,  # list here (see field note above)
            system_architecture=system_architecture
        )

    @classmethod
    def _create_table_categories(cls, table_list: List[str]) -> Dict[str, List[str]]:
        """Create table categories based on naming patterns and business logic.

        First matching keyword group wins; uncategorized tables default to
        'System & Configuration'.
        """
        categories = {
            'Core Laboratory Operations': [],
            'Testing Results': [],
            'Customer & Workflow Management': [],
            'Specialized Testing': [],
            'System & Configuration': [],
            'Audit & Logging': [],
            'Legacy & Archive': []
        }
        for table_name in table_list:
            table_lower = table_name.lower()
            # Core laboratory operations
            if any(keyword in table_lower for keyword in ['request', 'sample', 'analysis']):
                if table_name not in categories['Core Laboratory Operations']:
                    categories['Core Laboratory Operations'].append(table_name)
            # Testing results
            elif any(keyword in table_lower for keyword in ['result', 'test', 'titer']):
                categories['Testing Results'].append(table_name)
            # Specialized testing
            elif any(keyword in table_lower for keyword in ['bact', 'pcr', 'serolog', 'hygienogram']):
                categories['Specialized Testing'].append(table_name)
            # Customer and workflow
            elif any(keyword in table_lower for keyword in ['company', 'customer', 'dierenarts', 'veterinarian', 'flock']):
                categories['Customer & Workflow Management'].append(table_name)
            # System and configuration
            elif any(keyword in table_lower for keyword in ['config', 'user', 'role', 'permission', 'menu', 'form']):
                categories['System & Configuration'].append(table_name)
            # Audit and logging
            elif any(keyword in table_lower for keyword in ['audit', 'log', 'trail', 'version']):
                categories['Audit & Logging'].append(table_name)
            # Legacy and archive
            elif any(keyword in table_lower for keyword in ['deleted', 'old', 'temp', 'staging']):
                categories['Legacy & Archive'].append(table_name)
            # Default category for uncategorized tables
            else:
                categories['System & Configuration'].append(table_name)
        return categories

    def validate_column_reference(self, table_name: str, column_name: str) -> Dict[str, Any]:
        """Validate if a column exists in a table and suggest alternatives.

        Args:
            table_name: Table to check (exact, case-sensitive match).
            column_name: Column to validate.

        Returns:
            Dict with keys 'exists', 'table_exists', 'suggested_columns'
            (up to 5 similar names), and 'exact_column_name' (str or None).
        """
        result = {
            'exists': False,
            'table_exists': False,
            'suggested_columns': [],
            'exact_column_name': None
        }
        # Check if we have comprehensive schema information
        if hasattr(self, 'system_architecture') and self.system_architecture:
            columns_by_table = self.system_architecture.get('columns_by_table', {})
            if table_name in columns_by_table:
                result['table_exists'] = True
                table_columns = columns_by_table[table_name]
                # Get actual column names
                actual_columns = [col.get('COLUMN_NAME', '') for col in table_columns if isinstance(col, dict)]
                # Check exact match
                if column_name in actual_columns:
                    result['exists'] = True
                    result['exact_column_name'] = column_name
                else:
                    # Find similar columns (case-insensitive partial matches)
                    column_lower = column_name.lower()
                    suggestions = []
                    for actual_col in actual_columns:
                        actual_lower = actual_col.lower()
                        # Look for partial matches or similar patterns
                        # (substring either way, or shared underscore-parts).
                        if (column_lower in actual_lower or
                                actual_lower in column_lower or
                                any(part in actual_lower for part in column_lower.split('_')) or
                                any(part in column_lower for part in actual_lower.split('_'))):
                            suggestions.append(actual_col)
                    result['suggested_columns'] = suggestions[:5]  # Top 5 suggestions
        return result

    def get_available_columns(self, table_name: str) -> List[str]:
        """Get all available columns for a table.

        Falls back to a case-insensitive table-name match; returns an empty
        list when the table is unknown or an error occurs.
        """
        try:
            if hasattr(self, 'system_architecture') and self.system_architecture:
                columns_by_table = self.system_architecture.get('columns_by_table', {})
                # Debug logging
                logger.info(f"Looking for table '{table_name}' in schema with {len(columns_by_table)} tables available")
                if table_name in columns_by_table:
                    table_columns = columns_by_table[table_name]
                    if isinstance(table_columns, list):
                        columns = [col.get('COLUMN_NAME', '') for col in table_columns if isinstance(col, dict) and col.get('COLUMN_NAME')]
                        logger.info(f"Found {len(columns)} columns for table '{table_name}': {columns[:5]}...")
                        return columns
                    else:
                        logger.warning(f"Table '{table_name}' columns data is not a list: {type(table_columns)}")
                else:
                    # Check for case-insensitive match
                    matching_tables = [t for t in columns_by_table.keys() if t.lower() == table_name.lower()]
                    if matching_tables:
                        actual_table_name = matching_tables[0]
                        logger.info(f"Found case-insensitive match: '{table_name}' -> '{actual_table_name}'")
                        table_columns = columns_by_table[actual_table_name]
                        if isinstance(table_columns, list):
                            columns = [col.get('COLUMN_NAME', '') for col in table_columns if isinstance(col, dict) and col.get('COLUMN_NAME')]
                            logger.info(f"Found {len(columns)} columns for table '{actual_table_name}': {columns[:5]}...")
                            return columns
                    logger.warning(f"Table '{table_name}' not found in schema. Sample available: {list(columns_by_table.keys())[:5]}")
            else:
                logger.warning(f"No system_architecture found in schema for table '{table_name}'")
        except Exception as e:
            logger.error(f"Error retrieving columns for table '{table_name}': {str(e)}")
        return []

    def fix_column_reference(self, table_name: str, column_name: str) -> str:
        """Fix column reference using comprehensive schema and column mapping.

        Resolution order: exact match -> COLUMN_MAPPING known fixes ->
        best validation suggestion -> original name (logged warning).
        """
        # First check if column exists as-is
        validation = self.validate_column_reference(table_name, column_name)
        if validation['exists']:
            return column_name

        # Check column mapping for known fixes
        if table_name in COLUMN_MAPPING and column_name in COLUMN_MAPPING[table_name]:
            fixed_column = COLUMN_MAPPING[table_name][column_name]
            logger.info(f"Fixed column reference: {table_name}.{column_name} -> {table_name}.{fixed_column}")
            return fixed_column

        # Try suggestions from validation
        if validation['suggested_columns']:
            suggested = validation['suggested_columns'][0]  # Best suggestion
            logger.info(f"Suggested column reference: {table_name}.{column_name} -> {table_name}.{suggested}")
            return suggested

        # Fallback: return original (will likely cause SQL error but logged)
        logger.warning(f"Could not fix column reference: {table_name}.{column_name} - column may not exist")
        return column_name

    def get_recommended_joins(self) -> Dict[str, Any]:
        """Get recommended JOIN patterns based on schema analysis."""
        return RECOMMENDED_JOINS

    def is_table_problematic(self, table_name: str) -> Dict[str, Any]:
        """Check if a table is known to be problematic.

        Returns:
            {'is_problematic': True, 'details': ...} when listed in
            PROBLEMATIC_TABLES, else {'is_problematic': False}.
        """
        if table_name in PROBLEMATIC_TABLES:
            return {
                'is_problematic': True,
                'details': PROBLEMATIC_TABLES[table_name]
            }
        return {'is_problematic': False}

    def get_best_result_tables(self) -> List[str]:
        """Get list of high-value result tables for laboratory data."""
        return RECOMMENDED_RESULT_TABLES

    @staticmethod
    def _categorize_tables(table_list: List[str], table_info: Dict[str, Any]) -> Dict[str, Any]:
        """Categorize tables based on naming patterns and content.

        Uses keyword matching plus row counts (tables under 1000 rows are
        treated as lookup tables unless a result keyword matched first).
        """
        categories = {
            'core_data': [],
            'results': [],
            'lookup': [],
            'system': [],
            'audit': []
        }
        for table_name in table_list:
            table_lower = table_name.lower()
            info = table_info.get(table_name, {})
            row_count = info.get('row_count', 0)
            # Categorize based on naming patterns and row counts
            if any(keyword in table_lower for keyword in ['result', 'test', 'analysis']):
                categories['results'].append(table_name)
            elif any(keyword in table_lower for keyword in ['type', 'status', 'lookup', 'reference']) or row_count < 1000:
                categories['lookup'].append(table_name)
            elif any(keyword in table_lower for keyword in ['log', 'audit', 'history']):
                categories['audit'].append(table_name)
            elif any(keyword in table_lower for keyword in ['user', 'role', 'permission', 'account']):
                categories['system'].append(table_name)
            else:
                categories['core_data'].append(table_name)
        return categories
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| bases | - | - | - |
Parameter Details
database_name: The name of the database this schema represents
description: A human-readable description of the database and its contents
table_categories: Dictionary mapping category names (e.g., 'Core Laboratory Operations', 'Testing Results') to lists of table names that belong to each category
complete_table_list: List of all table names present in the database
key_relationships: Dictionary or list containing foreign key relationships and table associations
system_architecture: Dictionary containing comprehensive metadata including server info, total tables/columns/rows, table_info with row counts, and columns_by_table mapping
Return Value
Instantiation returns a DatabaseSchema object with all schema metadata populated. Key methods return: from_json() returns DatabaseSchema instance; validate_column_reference() returns dict with 'exists', 'table_exists', 'suggested_columns', 'exact_column_name'; get_available_columns() returns list of column name strings; fix_column_reference() returns corrected column name string; get_recommended_joins() returns dict of JOIN patterns; is_table_problematic() returns dict with 'is_problematic' boolean and 'details'; get_best_result_tables() returns list of recommended table names.
Class Interface
Methods
from_json(cls, json_path: str) -> 'DatabaseSchema'
Purpose: Class method to load database schema from a JSON file, automatically detecting and handling both legacy and comprehensive schema formats
Parameters:
json_path: Path to the JSON file containing database schema information
Returns: A fully populated DatabaseSchema instance with all metadata loaded from the JSON file
_from_comprehensive_schema(cls, data: Dict[str, Any], json_path: str) -> 'DatabaseSchema'
Purpose: Private class method to parse comprehensive schema format with detailed table and column information, and optionally load relationships from companion file
Parameters:
data: Dictionary containing comprehensive schema data with 'database_name', 'tables', 'columns_by_table' keys
json_path: Path to the schema JSON file, used to locate companion relationship file
Returns: DatabaseSchema instance populated from comprehensive format data
_create_table_categories(cls, table_list: List[str]) -> Dict[str, List[str]]
Purpose: Private class method to automatically categorize tables into business-functional groups based on naming patterns and keywords
Parameters:
table_list: List of all table names to categorize
Returns: Dictionary mapping category names to lists of table names, with categories like 'Core Laboratory Operations', 'Testing Results', 'Customer & Workflow Management', etc.
validate_column_reference(self, table_name: str, column_name: str) -> Dict[str, Any]
Purpose: Validates whether a column exists in a specified table and provides suggestions for similar columns if not found
Parameters:
table_name: Name of the table to check
column_name: Name of the column to validate
Returns: Dictionary with keys: 'exists' (bool), 'table_exists' (bool), 'suggested_columns' (list of up to 5 similar column names), 'exact_column_name' (str or None)
get_available_columns(self, table_name: str) -> List[str]
Purpose: Retrieves all column names available for a specified table, with case-insensitive fallback matching
Parameters:
table_name: Name of the table to get columns for
Returns: List of column name strings for the table, or empty list if table not found or error occurs
fix_column_reference(self, table_name: str, column_name: str) -> str
Purpose: Attempts to fix an invalid column reference by checking validation, column mappings, and suggestions in order
Parameters:
table_name: Name of the table containing the column
column_name: Name of the column to fix
Returns: Corrected column name string, or original column name if no fix found (with warning logged)
get_recommended_joins(self) -> Dict[str, Any]
Purpose: Returns recommended JOIN patterns based on schema analysis from the RECOMMENDED_JOINS constant
Returns: Dictionary containing recommended JOIN patterns and relationships between tables
is_table_problematic(self, table_name: str) -> Dict[str, Any]
Purpose: Checks if a table is known to have issues (empty, deprecated, etc.) based on PROBLEMATIC_TABLES constant
Parameters:
table_name: Name of the table to check
Returns: Dictionary with 'is_problematic' (bool) and optional 'details' key containing problem description
get_best_result_tables(self) -> List[str]
Purpose: Returns a curated list of high-value result tables for laboratory data queries from RECOMMENDED_RESULT_TABLES constant
Returns: List of recommended table names for accessing laboratory results
_categorize_tables(table_list: List[str], table_info: Dict[str, Any]) -> Dict[str, Any]
static
Purpose: Static method to categorize tables into functional groups based on naming patterns and row counts
Parameters:
table_list: List of all table names to categorize
table_info: Dictionary mapping table names to metadata including row_count
Returns: Dictionary with categories 'core_data', 'results', 'lookup', 'system', 'audit' mapping to lists of table names
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
database_name |
str | The name of the database this schema represents | instance |
description |
str | Human-readable description of the database and its purpose | instance |
table_categories |
Dict[str, Any] | Dictionary mapping category names to lists of tables, organizing tables by business function (e.g., 'Core Laboratory Operations', 'Testing Results') | instance |
complete_table_list |
List[str] | Complete list of all table names present in the database | instance |
key_relationships |
Dict[str, Any] | Dictionary or list containing foreign key relationships and associations between tables, loaded from companion relationship file if available | instance |
system_architecture |
Dict[str, Any] | Comprehensive metadata dictionary containing database_name, server, total_tables, total_columns, total_rows, generated_on timestamp, table_info (with row counts), and columns_by_table (detailed column metadata) | instance |
Dependencies
json, logging, os, typing, dataclasses, pathlib, column_mapping, datetime, urllib.parse, re, importlib.util
Required Imports
import json
import logging
import os
from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass
from pathlib import Path
from column_mapping import COLUMN_MAPPING, RECOMMENDED_JOINS, PROBLEMATIC_TABLES, RECOMMENDED_RESULT_TABLES
Usage Example
# Load schema from JSON file
schema = DatabaseSchema.from_json('path/to/database_schema.json')
# Check if a column exists and get suggestions
validation = schema.validate_column_reference('tblRequest', 'RequestID')
if validation['exists']:
print(f"Column exists: {validation['exact_column_name']}")
else:
print(f"Suggestions: {validation['suggested_columns']}")
# Get all columns for a table
columns = schema.get_available_columns('tblRequest')
print(f"Available columns: {columns}")
# Fix an invalid column reference
fixed_column = schema.fix_column_reference('tblRequest', 'req_id')
print(f"Fixed column name: {fixed_column}")
# Check if table is problematic
problematic = schema.is_table_problematic('tblOldData')
if problematic['is_problematic']:
print(f"Warning: {problematic['details']}")
# Get recommended result tables
result_tables = schema.get_best_result_tables()
print(f"Best tables for results: {result_tables}")
# Access schema attributes
print(f"Database: {schema.database_name}")
print(f"Total tables: {len(schema.complete_table_list)}")
for category, tables in schema.table_categories.items():
print(f"{category}: {len(tables)} tables")
Best Practices
- Always use the from_json() class method to instantiate DatabaseSchema rather than direct construction
- The schema supports both legacy and comprehensive formats - ensure your JSON files match one of these formats
- Use validate_column_reference() before fix_column_reference() to understand what corrections will be made
- The system_architecture attribute contains the most detailed metadata including columns_by_table - check this exists before accessing
- Column name matching is case-sensitive by default but get_available_columns() includes case-insensitive fallback logic
- Relationship data is loaded from a companion file (database_relationships.json) if available - this is optional but recommended
- Table categorization is automatic based on naming patterns - review _create_table_categories() logic to understand categorization rules
- The class depends on external constants (COLUMN_MAPPING, RECOMMENDED_JOINS, etc.) from column_mapping module - ensure this module is properly configured
- Use get_available_columns() to retrieve actual column names before generating SQL queries
- Check is_table_problematic() before querying tables to avoid known issues with empty or deprecated tables
- The schema is immutable after loading - reload from JSON if database structure changes
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
-
class DatabaseSchema_v1 85.8% similar
-
class DatabaseInfo 78.7% similar
-
class TableInfo 71.0% similar
-
class DataSource 59.8% similar
-
class SqlGenerationResult 58.9% similar