DatabaseSchema - Code Extractor

class DatabaseSchema

Maturity: 47

A dataclass that represents comprehensive database schema information, including table structures, columns, relationships, and categorizations for SQL database introspection and query generation.

File:
/tf/active/vicechatdev/full_smartstat/sql_query_generator.py

Lines:
18 - 307

Complexity:
complex

Purpose

DatabaseSchema serves as a central repository for database metadata, enabling validation of table and column references, categorization of tables by business function, and providing schema-aware query assistance. It supports both legacy and comprehensive schema formats, loads relationship data, and offers methods to fix invalid column references, suggest alternatives, and identify problematic tables. This class is essential for building intelligent SQL query generators and database documentation tools.

Source Code

class DatabaseSchema:
    """Database schema information"""
    database_name: str
    description: str
    table_categories: Dict[str, Any]
    complete_table_list: List[str]
    key_relationships: Dict[str, Any]
    system_architecture: Dict[str, Any]
    
    @classmethod
    def from_json(cls, json_path: str) -> 'DatabaseSchema':
        """Load database schema from JSON file - enhanced to handle comprehensive schema format"""
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            
            # Check if this is the new comprehensive schema format
            if 'database_name' in data and 'total_tables' in data and 'tables' in data:
                return cls._from_comprehensive_schema(data, json_path)
            else:
                # Legacy format
                return cls(
                    database_name=data.get('database_info', {}).get('name', 'Unknown'),
                    description=data.get('database_info', {}).get('description', ''),
                    table_categories=data.get('table_categories', {}),
                    complete_table_list=data.get('complete_table_list', []),
                    key_relationships=data.get('key_relationships', {}),
                    system_architecture=data.get('system_architecture', {})
                )
        except Exception as e:
            logger.error(f"Error loading database schema from {json_path}: {str(e)}")
            raise
    
    @classmethod
    def _from_comprehensive_schema(cls, data: Dict[str, Any], json_path: str) -> 'DatabaseSchema':
        """Load from comprehensive database schema format with detailed table/column info"""
        
        # Extract table information
        tables = data.get('tables', [])
        table_list = []
        table_info = {}
        
        for table in tables:
            if isinstance(table, dict) and 'TABLE_NAME' in table:
                table_name = table['TABLE_NAME']
                table_list.append(table_name)
                table_info[table_name] = {
                    'schema': table.get('TABLE_SCHEMA', 'dbo'),
                    'type': table.get('TABLE_TYPE', 'BASE TABLE'),
                    'row_count': table.get('ROW_COUNT', 0)
                }
        
        # Extract column information by table  
        columns_by_table = data.get('columns_by_table', {})
        
        # Create table categories based on naming patterns and business logic
        table_categories = cls._create_table_categories(table_list)
        
        # System architecture from comprehensive data
        system_architecture = {
            'database_name': data.get('database_name', 'Unknown'),
            'server': data.get('server', 'Unknown'), 
            'total_tables': data.get('total_tables', len(table_list)),
            'total_columns': data.get('total_columns', 0),
            'total_rows': data.get('total_rows', 0),
            'generated_on': data.get('generated_on', ''),
            'table_info': table_info,
            'columns_by_table': columns_by_table
        }
        
        # Try to load relationship information from companion file
        relationships = []
        relationship_file = json_path.replace('database_schema', 'database_relationships')
        try:
            if os.path.exists(relationship_file):
                with open(relationship_file, 'r', encoding='utf-8') as f:
                    rel_data = json.load(f)
                    relationships = rel_data.get('relationships', [])
                    logger.info(f"Loaded {len(relationships)} relationships from {relationship_file}")
        except Exception as e:
            logger.warning(f"Could not load relationships from {relationship_file}: {str(e)}")
        
        return cls(
            database_name=data.get('database_name', 'Unknown'),
            description=f"Comprehensive database schema with {len(table_list)} tables",
            table_categories=table_categories,
            complete_table_list=table_list,
            key_relationships=relationships,
            system_architecture=system_architecture
        )
    
    @classmethod
    def _create_table_categories(cls, table_list: List[str]) -> Dict[str, List[str]]:
        """Create table categories based on naming patterns and business logic"""
        categories = {
            'Core Laboratory Operations': [],
            'Testing Results': [],
            'Customer & Workflow Management': [], 
            'Specialized Testing': [],
            'System & Configuration': [],
            'Audit & Logging': [],
            'Legacy & Archive': []
        }
        
        for table_name in table_list:
            table_lower = table_name.lower()
            
            # Core laboratory operations
            if any(keyword in table_lower for keyword in ['request', 'sample', 'analysis']):
                if table_name not in categories['Core Laboratory Operations']:
                    categories['Core Laboratory Operations'].append(table_name)
            
            # Testing results
            elif any(keyword in table_lower for keyword in ['result', 'test', 'titer']):
                categories['Testing Results'].append(table_name)
            
            # Specialized testing
            elif any(keyword in table_lower for keyword in ['bact', 'pcr', 'serolog', 'hygienogram']):
                categories['Specialized Testing'].append(table_name)
            
            # Customer and workflow 
            elif any(keyword in table_lower for keyword in ['company', 'customer', 'dierenarts', 'veterinarian', 'flock']):
                categories['Customer & Workflow Management'].append(table_name)
            
            # System and configuration
            elif any(keyword in table_lower for keyword in ['config', 'user', 'role', 'permission', 'menu', 'form']):
                categories['System & Configuration'].append(table_name)
            
            # Audit and logging
            elif any(keyword in table_lower for keyword in ['audit', 'log', 'trail', 'version']):
                categories['Audit & Logging'].append(table_name)
            
            # Legacy and archive
            elif any(keyword in table_lower for keyword in ['deleted', 'old', 'temp', 'staging']):
                categories['Legacy & Archive'].append(table_name)
            
            # Default category for uncategorized tables
            else:
                categories['System & Configuration'].append(table_name)
        
        return categories
    
    def validate_column_reference(self, table_name: str, column_name: str) -> Dict[str, Any]:
        """Validate if a column exists in a table and suggest alternatives"""
        result = {
            'exists': False,
            'table_exists': False,
            'suggested_columns': [],
            'exact_column_name': None
        }
        
        # Check if we have comprehensive schema information
        if hasattr(self, 'system_architecture') and self.system_architecture:
            columns_by_table = self.system_architecture.get('columns_by_table', {})
            
            if table_name in columns_by_table:
                result['table_exists'] = True
                table_columns = columns_by_table[table_name]
                
                # Get actual column names
                actual_columns = [col.get('COLUMN_NAME', '') for col in table_columns if isinstance(col, dict)]
                
                # Check exact match
                if column_name in actual_columns:
                    result['exists'] = True
                    result['exact_column_name'] = column_name
                else:
                    # Find similar columns (case-insensitive partial matches)
                    column_lower = column_name.lower()
                    suggestions = []
                    
                    for actual_col in actual_columns:
                        actual_lower = actual_col.lower()
                        # Look for partial matches or similar patterns
                        if (column_lower in actual_lower or 
                            actual_lower in column_lower or
                            any(part in actual_lower for part in column_lower.split('_')) or
                            any(part in column_lower for part in actual_lower.split('_'))):
                            suggestions.append(actual_col)
                    
                    result['suggested_columns'] = suggestions[:5]  # Top 5 suggestions
        
        return result
    
    def get_available_columns(self, table_name: str) -> List[str]:
        """Get all available columns for a table"""
        try:
            if hasattr(self, 'system_architecture') and self.system_architecture:
                columns_by_table = self.system_architecture.get('columns_by_table', {})
                
                # Debug logging
                logger.info(f"Looking for table '{table_name}' in schema with {len(columns_by_table)} tables available")
                
                if table_name in columns_by_table:
                    table_columns = columns_by_table[table_name]
                    if isinstance(table_columns, list):
                        columns = [col.get('COLUMN_NAME', '') for col in table_columns if isinstance(col, dict) and col.get('COLUMN_NAME')]
                        logger.info(f"Found {len(columns)} columns for table '{table_name}': {columns[:5]}...")
                        return columns
                    else:
                        logger.warning(f"Table '{table_name}' columns data is not a list: {type(table_columns)}")
                else:
                    # Check for case-insensitive match
                    matching_tables = [t for t in columns_by_table.keys() if t.lower() == table_name.lower()]
                    if matching_tables:
                        actual_table_name = matching_tables[0]
                        logger.info(f"Found case-insensitive match: '{table_name}' -> '{actual_table_name}'")
                        table_columns = columns_by_table[actual_table_name]
                        if isinstance(table_columns, list):
                            columns = [col.get('COLUMN_NAME', '') for col in table_columns if isinstance(col, dict) and col.get('COLUMN_NAME')]
                            logger.info(f"Found {len(columns)} columns for table '{actual_table_name}': {columns[:5]}...")
                            return columns
                    
                    logger.warning(f"Table '{table_name}' not found in schema. Sample available: {list(columns_by_table.keys())[:5]}")
            else:
                logger.warning(f"No system_architecture found in schema for table '{table_name}'")
        except Exception as e:
            logger.error(f"Error retrieving columns for table '{table_name}': {str(e)}")
        
        return []
    
    def fix_column_reference(self, table_name: str, column_name: str) -> str:
        """Fix column reference using comprehensive schema and column mapping"""
        # First check if column exists as-is
        validation = self.validate_column_reference(table_name, column_name)
        if validation['exists']:
            return column_name
        
        # Check column mapping for known fixes
        if table_name in COLUMN_MAPPING and column_name in COLUMN_MAPPING[table_name]:
            fixed_column = COLUMN_MAPPING[table_name][column_name]
            logger.info(f"Fixed column reference: {table_name}.{column_name} -> {table_name}.{fixed_column}")
            return fixed_column
        
        # Try suggestions from validation
        if validation['suggested_columns']:
            suggested = validation['suggested_columns'][0]  # Best suggestion
            logger.info(f"Suggested column reference: {table_name}.{column_name} -> {table_name}.{suggested}")
            return suggested
        
        # Fallback: return original (will likely cause SQL error but logged)
        logger.warning(f"Could not fix column reference: {table_name}.{column_name} - column may not exist")
        return column_name
    
    def get_recommended_joins(self) -> Dict[str, Any]:
        """Get recommended JOIN patterns based on schema analysis"""
        return RECOMMENDED_JOINS
    
    def is_table_problematic(self, table_name: str) -> Dict[str, Any]:
        """Check if a table is known to be problematic"""
        if table_name in PROBLEMATIC_TABLES:
            return {
                'is_problematic': True,
                'details': PROBLEMATIC_TABLES[table_name]
            }
        return {'is_problematic': False}
    
    def get_best_result_tables(self) -> List[str]:
        """Get list of high-value result tables for laboratory data"""
        return RECOMMENDED_RESULT_TABLES
    
    @staticmethod
    def _categorize_tables(table_list: List[str], table_info: Dict[str, Any]) -> Dict[str, Any]:
        """Categorize tables based on naming patterns and content"""
        categories = {
            'core_data': [],
            'results': [],
            'lookup': [],
            'system': [],
            'audit': []
        }
        
        for table_name in table_list:
            table_lower = table_name.lower()
            info = table_info.get(table_name, {})
            row_count = info.get('row_count', 0)
            
            # Categorize based on naming patterns and row counts
            if any(keyword in table_lower for keyword in ['result', 'test', 'analysis']):
                categories['results'].append(table_name)
            elif any(keyword in table_lower for keyword in ['type', 'status', 'lookup', 'reference']) or row_count < 1000:
                categories['lookup'].append(table_name)
            elif any(keyword in table_lower for keyword in ['log', 'audit', 'history']):
                categories['audit'].append(table_name)
            elif any(keyword in table_lower for keyword in ['user', 'role', 'permission', 'account']):
                categories['system'].append(table_name)
            else:
                categories['core_data'].append(table_name)
        
        return categories

Parameters

Name	Type	Default	Kind
`bases`	-	-

Parameter Details

database_name: The name of the database this schema represents

description: A human-readable description of the database and its contents

table_categories: Dictionary mapping category names (e.g., 'Core Laboratory Operations', 'Testing Results') to lists of table names that belong to each category

complete_table_list: List of all table names present in the database

key_relationships: Dictionary or list containing foreign key relationships and table associations

system_architecture: Dictionary containing comprehensive metadata including server info, total tables/columns/rows, table_info with row counts, and columns_by_table mapping

Return Value

Instantiation returns a DatabaseSchema object with all schema metadata populated. Key methods return: from_json() returns DatabaseSchema instance; validate_column_reference() returns dict with 'exists', 'table_exists', 'suggested_columns', 'exact_column_name'; get_available_columns() returns list of column name strings; fix_column_reference() returns corrected column name string; get_recommended_joins() returns dict of JOIN patterns; is_table_problematic() returns dict with 'is_problematic' boolean and 'details'; get_best_result_tables() returns list of recommended table names.

Class Interface

Methods

`from_json(cls, json_path: str) -> 'DatabaseSchema'`

Purpose: Class method to load database schema from a JSON file, automatically detecting and handling both legacy and comprehensive schema formats

Parameters:

json_path: Path to the JSON file containing database schema information

Returns: A fully populated DatabaseSchema instance with all metadata loaded from the JSON file

`_from_comprehensive_schema(cls, data: Dict[str, Any], json_path: str) -> 'DatabaseSchema'`

Purpose: Private class method to parse comprehensive schema format with detailed table and column information, and optionally load relationships from companion file

Parameters:

data: Dictionary containing comprehensive schema data with 'database_name', 'tables', 'columns_by_table' keys
json_path: Path to the schema JSON file, used to locate companion relationship file

Returns: DatabaseSchema instance populated from comprehensive format data

`_create_table_categories(cls, table_list: List[str]) -> Dict[str, List[str]]`

Purpose: Private class method to automatically categorize tables into business-functional groups based on naming patterns and keywords

Parameters:

table_list: List of all table names to categorize

Returns: Dictionary mapping category names to lists of table names, with categories like 'Core Laboratory Operations', 'Testing Results', 'Customer & Workflow Management', etc.

`validate_column_reference(self, table_name: str, column_name: str) -> Dict[str, Any]`

Purpose: Validates whether a column exists in a specified table and provides suggestions for similar columns if not found

Parameters:

table_name: Name of the table to check
column_name: Name of the column to validate

Returns: Dictionary with keys: 'exists' (bool), 'table_exists' (bool), 'suggested_columns' (list of up to 5 similar column names), 'exact_column_name' (str or None)

`get_available_columns(self, table_name: str) -> List[str]`

Purpose: Retrieves all column names available for a specified table, with case-insensitive fallback matching

Parameters:

table_name: Name of the table to get columns for

Returns: List of column name strings for the table, or empty list if table not found or error occurs

`fix_column_reference(self, table_name: str, column_name: str) -> str`

Purpose: Attempts to fix an invalid column reference by checking validation, column mappings, and suggestions in order

Parameters:

table_name: Name of the table containing the column
column_name: Name of the column to fix

Returns: Corrected column name string, or original column name if no fix found (with warning logged)

`get_recommended_joins(self) -> Dict[str, Any]`

Purpose: Returns recommended JOIN patterns based on schema analysis from the RECOMMENDED_JOINS constant

Returns: Dictionary containing recommended JOIN patterns and relationships between tables

`is_table_problematic(self, table_name: str) -> Dict[str, Any]`

Purpose: Checks if a table is known to have issues (empty, deprecated, etc.) based on PROBLEMATIC_TABLES constant

Parameters:

table_name: Name of the table to check

Returns: Dictionary with 'is_problematic' (bool) and optional 'details' key containing problem description

`get_best_result_tables(self) -> List[str]`

Purpose: Returns a curated list of high-value result tables for laboratory data queries from RECOMMENDED_RESULT_TABLES constant

Returns: List of recommended table names for accessing laboratory results

`_categorize_tables(table_list: List[str], table_info: Dict[str, Any]) -> Dict[str, Any]` static

Purpose: Static method to categorize tables into functional groups based on naming patterns and row counts

Parameters:

table_list: List of all table names to categorize
table_info: Dictionary mapping table names to metadata including row_count

Returns: Dictionary with categories 'core_data', 'results', 'lookup', 'system', 'audit' mapping to lists of table names

Attributes

Name	Type	Description	Scope
`database_name`	str	The name of the database this schema represents	instance
`description`	str	Human-readable description of the database and its purpose	instance
`table_categories`	Dict[str, Any]	Dictionary mapping category names to lists of tables, organizing tables by business function (e.g., 'Core Laboratory Operations', 'Testing Results')	instance
`complete_table_list`	List[str]	Complete list of all table names present in the database	instance
`key_relationships`	Dict[str, Any]	Dictionary or list containing foreign key relationships and associations between tables, loaded from companion relationship file if available	instance
`system_architecture`	Dict[str, Any]	Comprehensive metadata dictionary containing database_name, server, total_tables, total_columns, total_rows, generated_on timestamp, table_info (with row counts), and columns_by_table (detailed column metadata)	instance

Dependencies

json
logging
os
typing
dataclasses
pathlib
column_mapping
datetime
urllib.parse
re
importlib.util

Required Imports

import json
import logging
import os
from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass
from pathlib import Path
from column_mapping import COLUMN_MAPPING, RECOMMENDED_JOINS, PROBLEMATIC_TABLES, RECOMMENDED_RESULT_TABLES

Usage Example

# Load schema from JSON file
schema = DatabaseSchema.from_json('path/to/database_schema.json')

# Check if a column exists and get suggestions
validation = schema.validate_column_reference('tblRequest', 'RequestID')
if validation['exists']:
    print(f"Column exists: {validation['exact_column_name']}")
else:
    print(f"Suggestions: {validation['suggested_columns']}")

# Get all columns for a table
columns = schema.get_available_columns('tblRequest')
print(f"Available columns: {columns}")

# Fix an invalid column reference
fixed_column = schema.fix_column_reference('tblRequest', 'req_id')
print(f"Fixed column name: {fixed_column}")

# Check if table is problematic
problematic = schema.is_table_problematic('tblOldData')
if problematic['is_problematic']:
    print(f"Warning: {problematic['details']}")

# Get recommended result tables
result_tables = schema.get_best_result_tables()
print(f"Best tables for results: {result_tables}")

# Access schema attributes
print(f"Database: {schema.database_name}")
print(f"Total tables: {len(schema.complete_table_list)}")
for category, tables in schema.table_categories.items():
    print(f"{category}: {len(tables)} tables")

Best Practices

Always use the from_json() class method to instantiate DatabaseSchema rather than direct construction
The schema supports both legacy and comprehensive formats - ensure your JSON files match one of these formats
Use validate_column_reference() before fix_column_reference() to understand what corrections will be made
The system_architecture attribute contains the most detailed metadata including columns_by_table - check this exists before accessing
Column name matching is case-sensitive by default but get_available_columns() includes case-insensitive fallback logic
Relationship data is loaded from a companion file (database_relationships.json) if available - this is optional but recommended
Table categorization is automatic based on naming patterns - review _create_table_categories() logic to understand categorization rules
The class depends on external constants (COLUMN_MAPPING, RECOMMENDED_JOINS, etc.) from column_mapping module - ensure this module is properly configured
Use get_available_columns() to retrieve actual column names before generating SQL queries
Check is_table_problematic() before querying tables to avoid known issues with empty or deprecated tables
The schema is immutable after loading - reload from JSON if database structure changes

Similar Components

AI-powered semantic similarity - components with related functionality:

class DatabaseSchema_v1 85.8% similar

A dataclass that represents database schema information, including table categories, relationships, and system architecture. Provides functionality to load schema from JSON files.
From: /tf/active/vicechatdev/smartstat/sql_query_generator.py
class DatabaseInfo 78.7% similar

A dataclass that encapsulates complete database schema information including tables, columns, relationships, and metadata for a specific database instance.
From: /tf/active/vicechatdev/full_smartstat/dynamic_schema_discovery.py
class TableInfo 71.0% similar

A dataclass that encapsulates comprehensive metadata about a database table, including schema information, columns, keys, and data quality metrics.
From: /tf/active/vicechatdev/full_smartstat/dynamic_schema_discovery.py
class DataSource 59.8% similar

A dataclass that represents configuration for various data sources, supporting file-based, SQL database, and query-based data access patterns.
From: /tf/active/vicechatdev/vice_ai/models.py
class SqlGenerationResult 58.9% similar

A dataclass that encapsulates the results of an SQL query generation operation, including the generated query, explanation, confidence score, and metadata about database objects used.
From: /tf/active/vicechatdev/full_smartstat/two_pass_sql_workflow.py

← Back to Browse

Assistant

Hi! I can help improve this code. Tell me what you'd like to enhance (e.g., "add error handling", "optimize performance", "improve readability", "add type hints").

Code Comparison

Original Code

                            class DatabaseSchema:
    """Database schema information"""
    database_name: str
    description: str
    table_categories: Dict[str, Any]
    complete_table_list: List[str]
    key_relationships: Dict[str, Any]
    system_architecture: Dict[str, Any]
    
    @classmethod
    def from_json(cls, json_path: str) -> 'DatabaseSchema':
        """Load database schema from JSON file - enhanced to handle comprehensive schema format"""
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            
            # Check if this is the new comprehensive schema format
            if 'database_name' in data and 'total_tables' in data and 'tables' in data:
                return cls._from_comprehensive_schema(data, json_path)
            else:
                # Legacy format
                return cls(
                    database_name=data.get('database_info', {}).get('name', 'Unknown'),
                    description=data.get('database_info', {}).get('description', ''),
                    table_categories=data.get('table_categories', {}),
                    complete_table_list=data.get('complete_table_list', []),
                    key_relationships=data.get('key_relationships', {}),
                    system_architecture=data.get('system_architecture', {})
                )
        except Exception as e:
            logger.error(f"Error loading database schema from {json_path}: {str(e)}")
            raise
    
    @classmethod
    def _from_comprehensive_schema(cls, data: Dict[str, Any], json_path: str) -> 'DatabaseSchema':
        """Load from comprehensive database schema format with detailed table/column info"""
        
        # Extract table information
        tables = data.get('tables', [])
        table_list = []
        table_info = {}
        
        for table in tables:
            if isinstance(table, dict) and 'TABLE_NAME' in table:
                table_name = table['TABLE_NAME']
                table_list.append(table_name)
                table_info[table_name] = {
                    'schema': table.get('TABLE_SCHEMA', 'dbo'),
                    'type': table.get('TABLE_TYPE', 'BASE TABLE'),
                    'row_count': table.get('ROW_COUNT', 0)
                }
        
        # Extract column information by table  
        columns_by_table = data.get('columns_by_table', {})
        
        # Create table categories based on naming patterns and business logic
        table_categories = cls._create_table_categories(table_list)
        
        # System architecture from comprehensive data
        system_architecture = {
            'database_name': data.get('database_name', 'Unknown'),
            'server': data.get('server', 'Unknown'), 
            'total_tables': data.get('total_tables', len(table_list)),
            'total_columns': data.get('total_columns', 0),
            'total_rows': data.get('total_rows', 0),
            'generated_on': data.get('generated_on', ''),
            'table_info': table_info,
            'columns_by_table': columns_by_table
        }
        
        # Try to load relationship information from companion file
        relationships = []
        relationship_file = json_path.replace('database_schema', 'database_relationships')
        try:
            if os.path.exists(relationship_file):
                with open(relationship_file, 'r', encoding='utf-8') as f:
                    rel_data = json.load(f)
                    relationships = rel_data.get('relationships', [])
                    logger.info(f"Loaded {len(relationships)} relationships from {relationship_file}")
        except Exception as e:
            logger.warning(f"Could not load relationships from {relationship_file}: {str(e)}")
        
        return cls(
            database_name=data.get('database_name', 'Unknown'),
            description=f"Comprehensive database schema with {len(table_list)} tables",
            table_categories=table_categories,
            complete_table_list=table_list,
            key_relationships=relationships,
            system_architecture=system_architecture
        )
    
    @classmethod
    def _create_table_categories(cls, table_list: List[str]) -> Dict[str, List[str]]:
        """Create table categories based on naming patterns and business logic"""
        categories = {
            'Core Laboratory Operations': [],
            'Testing Results': [],
            'Customer & Workflow Management': [], 
            'Specialized Testing': [],
            'System & Configuration': [],
            'Audit & Logging': [],
            'Legacy & Archive': []
        }
        
        for table_name in table_list:
            table_lower = table_name.lower()
            
            # Core laboratory operations
            if any(keyword in table_lower for keyword in ['request', 'sample', 'analysis']):
                if table_name not in categories['Core Laboratory Operations']:
                    categories['Core Laboratory Operations'].append(table_name)
            
            # Testing results
            elif any(keyword in table_lower for keyword in ['result', 'test', 'titer']):
                categories['Testing Results'].append(table_name)
            
            # Specialized testing
            elif any(keyword in table_lower for keyword in ['bact', 'pcr', 'serolog', 'hygienogram']):
                categories['Specialized Testing'].append(table_name)
            
            # Customer and workflow 
            elif any(keyword in table_lower for keyword in ['company', 'customer', 'dierenarts', 'veterinarian', 'flock']):
                categories['Customer & Workflow Management'].append(table_name)
            
            # System and configuration
            elif any(keyword in table_lower for keyword in ['config', 'user', 'role', 'permission', 'menu', 'form']):
                categories['System & Configuration'].append(table_name)
            
            # Audit and logging
            elif any(keyword in table_lower for keyword in ['audit', 'log', 'trail', 'version']):
                categories['Audit & Logging'].append(table_name)
            
            # Legacy and archive
            elif any(keyword in table_lower for keyword in ['deleted', 'old', 'temp', 'staging']):
                categories['Legacy & Archive'].append(table_name)
            
            # Default category for uncategorized tables
            else:
                categories['System & Configuration'].append(table_name)
        
        return categories
    
    def validate_column_reference(self, table_name: str, column_name: str) -> Dict[str, Any]:
        """Validate if a column exists in a table and suggest alternatives"""
        result = {
            'exists': False,
            'table_exists': False,
            'suggested_columns': [],
            'exact_column_name': None
        }
        
        # Check if we have comprehensive schema information
        if hasattr(self, 'system_architecture') and self.system_architecture:
            columns_by_table = self.system_architecture.get('columns_by_table', {})
            
            if table_name in columns_by_table:
                result['table_exists'] = True
                table_columns = columns_by_table[table_name]
                
                # Get actual column names
                actual_columns = [col.get('COLUMN_NAME', '') for col in table_columns if isinstance(col, dict)]
                
                # Check exact match
                if column_name in actual_columns:
                    result['exists'] = True
                    result['exact_column_name'] = column_name
                else:
                    # Find similar columns (case-insensitive partial matches)
                    column_lower = column_name.lower()
                    suggestions = []
                    
                    for actual_col in actual_columns:
                        actual_lower = actual_col.lower()
                        # Look for partial matches or similar patterns
                        if (column_lower in actual_lower or 
                            actual_lower in column_lower or
                            any(part in actual_lower for part in column_lower.split('_')) or
                            any(part in column_lower for part in actual_lower.split('_'))):
                            suggestions.append(actual_col)
                    
                    result['suggested_columns'] = suggestions[:5]  # Top 5 suggestions
        
        return result
    
    def get_available_columns(self, table_name: str) -> List[str]:
        """Get all available columns for a table"""
        try:
            if hasattr(self, 'system_architecture') and self.system_architecture:
                columns_by_table = self.system_architecture.get('columns_by_table', {})
                
                # Debug logging
                logger.info(f"Looking for table '{table_name}' in schema with {len(columns_by_table)} tables available")
                
                if table_name in columns_by_table:
                    table_columns = columns_by_table[table_name]
                    if isinstance(table_columns, list):
                        columns = [col.get('COLUMN_NAME', '') for col in table_columns if isinstance(col, dict) and col.get('COLUMN_NAME')]
                        logger.info(f"Found {len(columns)} columns for table '{table_name}': {columns[:5]}...")
                        return columns
                    else:
                        logger.warning(f"Table '{table_name}' columns data is not a list: {type(table_columns)}")
                else:
                    # Check for case-insensitive match
                    matching_tables = [t for t in columns_by_table.keys() if t.lower() == table_name.lower()]
                    if matching_tables:
                        actual_table_name = matching_tables[0]
                        logger.info(f"Found case-insensitive match: '{table_name}' -> '{actual_table_name}'")
                        table_columns = columns_by_table[actual_table_name]
                        if isinstance(table_columns, list):
                            columns = [col.get('COLUMN_NAME', '') for col in table_columns if isinstance(col, dict) and col.get('COLUMN_NAME')]
                            logger.info(f"Found {len(columns)} columns for table '{actual_table_name}': {columns[:5]}...")
                            return columns
                    
                    logger.warning(f"Table '{table_name}' not found in schema. Sample available: {list(columns_by_table.keys())[:5]}")
            else:
                logger.warning(f"No system_architecture found in schema for table '{table_name}'")
        except Exception as e:
            logger.error(f"Error retrieving columns for table '{table_name}': {str(e)}")
        
        return []
    
    def fix_column_reference(self, table_name: str, column_name: str) -> str:
        """Fix column reference using comprehensive schema and column mapping"""
        # First check if column exists as-is
        validation = self.validate_column_reference(table_name, column_name)
        if validation['exists']:
            return column_name
        
        # Check column mapping for known fixes
        if table_name in COLUMN_MAPPING and column_name in COLUMN_MAPPING[table_name]:
            fixed_column = COLUMN_MAPPING[table_name][column_name]
            logger.info(f"Fixed column reference: {table_name}.{column_name} -> {table_name}.{fixed_column}")
            return fixed_column
        
        # Try suggestions from validation
        if validation['suggested_columns']:
            suggested = validation['suggested_columns'][0]  # Best suggestion
            logger.info(f"Suggested column reference: {table_name}.{column_name} -> {table_name}.{suggested}")
            return suggested
        
        # Fallback: return original (will likely cause SQL error but logged)
        logger.warning(f"Could not fix column reference: {table_name}.{column_name} - column may not exist")
        return column_name
    
    def get_recommended_joins(self) -> Dict[str, Any]:
        """Get recommended JOIN patterns based on schema analysis"""
        return RECOMMENDED_JOINS
    
    def is_table_problematic(self, table_name: str) -> Dict[str, Any]:
        """Check if a table is known to be problematic"""
        if table_name in PROBLEMATIC_TABLES:
            return {
                'is_problematic': True,
                'details': PROBLEMATIC_TABLES[table_name]
            }
        return {'is_problematic': False}
    
    def get_best_result_tables(self) -> List[str]:
        """Get list of high-value result tables for laboratory data"""
        return RECOMMENDED_RESULT_TABLES
    
    @staticmethod
    def _categorize_tables(table_list: List[str], table_info: Dict[str, Any]) -> Dict[str, Any]:
        """Categorize tables based on naming patterns and content"""
        categories = {
            'core_data': [],
            'results': [],
            'lookup': [],
            'system': [],
            'audit': []
        }
        
        for table_name in table_list:
            table_lower = table_name.lower()
            info = table_info.get(table_name, {})
            row_count = info.get('row_count', 0)
            
            # Categorize based on naming patterns and row counts
            if any(keyword in table_lower for keyword in ['result', 'test', 'analysis']):
                categories['results'].append(table_name)
            elif any(keyword in table_lower for keyword in ['type', 'status', 'lookup', 'reference']) or row_count < 1000:
                categories['lookup'].append(table_name)
            elif any(keyword in table_lower for keyword in ['log', 'audit', 'history']):
                categories['audit'].append(table_name)
            elif any(keyword in table_lower for keyword in ['user', 'role', 'permission', 'account']):
                categories['system'].append(table_name)
            else:
                categories['core_data'].append(table_name)
        
        return categories
                        

Improved Code

🔍 Code Extractor

class DatabaseSchema

Purpose

Source Code

Parameters

Parameter Details

Return Value

Class Interface

Methods

`from_json(cls, json_path: str) -> 'DatabaseSchema'`

`_from_comprehensive_schema(cls, data: Dict[str, Any], json_path: str) -> 'DatabaseSchema'`

`_create_table_categories(cls, table_list: List[str]) -> Dict[str, List[str]]`

`validate_column_reference(self, table_name: str, column_name: str) -> Dict[str, Any]`

`get_available_columns(self, table_name: str) -> List[str]`

`fix_column_reference(self, table_name: str, column_name: str) -> str`

`get_recommended_joins(self) -> Dict[str, Any]`

`is_table_problematic(self, table_name: str) -> Dict[str, Any]`

`get_best_result_tables(self) -> List[str]`

`_categorize_tables(table_list: List[str], table_info: Dict[str, Any]) -> Dict[str, Any]` static

Attributes

Dependencies

Required Imports

Usage Example

Best Practices

Tags

Similar Components

class DatabaseSchema_v1 85.8% similar

class DatabaseInfo 78.7% similar

class TableInfo 71.0% similar

class DataSource 59.8% similar

class SqlGenerationResult 58.9% similar

class DatabaseSchema

Purpose

Source Code

Parameters

Parameter Details

Return Value

Class Interface

Methods

from_json(cls, json_path: str) -> 'DatabaseSchema'

_from_comprehensive_schema(cls, data: Dict[str, Any], json_path: str) -> 'DatabaseSchema'

_create_table_categories(cls, table_list: List[str]) -> Dict[str, List[str]]

validate_column_reference(self, table_name: str, column_name: str) -> Dict[str, Any]

get_available_columns(self, table_name: str) -> List[str]

fix_column_reference(self, table_name: str, column_name: str) -> str

get_recommended_joins(self) -> Dict[str, Any]

is_table_problematic(self, table_name: str) -> Dict[str, Any]

get_best_result_tables(self) -> List[str]

_categorize_tables(table_list: List[str], table_info: Dict[str, Any]) -> Dict[str, Any] static

Attributes

Dependencies

Required Imports

Usage Example

Best Practices

Tags

Similar Components

class DatabaseSchema_v1 85.8% similar

class DatabaseInfo 78.7% similar

class TableInfo 71.0% similar

class DataSource 59.8% similar

class SqlGenerationResult 58.9% similar

✨ Improve Code: DatabaseSchema

Code Comparison

`from_json(cls, json_path: str) -> 'DatabaseSchema'`

`_from_comprehensive_schema(cls, data: Dict[str, Any], json_path: str) -> 'DatabaseSchema'`

`_create_table_categories(cls, table_list: List[str]) -> Dict[str, List[str]]`

`validate_column_reference(self, table_name: str, column_name: str) -> Dict[str, Any]`

`get_available_columns(self, table_name: str) -> List[str]`

`fix_column_reference(self, table_name: str, column_name: str) -> str`

`get_recommended_joins(self) -> Dict[str, Any]`

`is_table_problematic(self, table_name: str) -> Dict[str, Any]`

`get_best_result_tables(self) -> List[str]`

`_categorize_tables(table_list: List[str], table_info: Dict[str, Any]) -> Dict[str, Any]` static