🔍 Code Extractor

class SQLQueryGenerator_v1

Maturity: 26

Generates SQL queries based on user requests and database schema

File:
/tf/active/vicechatdev/smartstat/sql_query_generator.py
Lines:
82 - 505
Complexity:
moderate

Purpose

Generates SQL queries based on user requests and database schema

Source Code

class SQLQueryGenerator:
    """Generates SQL queries based on user requests and database schema"""
    
    def __init__(self, schema: DatabaseSchema, statistical_agent=None):
        self.schema = schema
        self.statistical_agent = statistical_agent
        self.is_poulpharm = schema.database_name.lower() == 'poulpharm'
        self._setup_poulpharm_optimizations() if self.is_poulpharm else None
        
    def _setup_poulpharm_optimizations(self):
        """Setup Poulpharm LIMS specific query optimizations"""
        # Common query patterns for Poulpharm LIMS
        self.common_patterns = {
            'recent_requests': {
                'keywords': ['recent', 'latest', 'new', 'current'],
                'base_query': 'Requests WHERE DateCreated >= DATEADD(month, -3, GETDATE())'
            },
            'bacteriology': {
                'keywords': ['bacteria', 'antibiogram', 'sensitivity', 'resistance', 'germ'],
                'tables': ['tblBactResult', 'tblAntibiogram', 'tblBactResultLine']
            },
            'pcr_analysis': {
                'keywords': ['pcr', 'molecular', 'genetic'],
                'tables': ['tblPCRResult', 'tblPCRKit', 'tblPCRImport']
            },
            'customer_analysis': {
                'keywords': ['customer', 'company', 'veterinarian', 'client'],
                'tables': ['Companies', 'tblDierenarts', 'Requests']
            },
            'sample_workflow': {
                'keywords': ['sample', 'workflow', 'processing', 'status'],
                'tables': ['Samples', 'SampleAnalyseGroups', 'AnalysisGroups']
            }
        }
        
    def generate_sql_query(self, user_query: str, max_rows: int = 10000) -> Tuple[str, Dict[str, Any]]:
        """
        Generate SQL query based on user analysis request
        Returns: (sql_query, metadata)
        """
        try:
            # Prepare context for the LLM
            context = self._prepare_database_context()
            
            # Create prompt for SQL generation
            prompt = self._create_sql_generation_prompt(user_query, context, max_rows)
            
            # Use statistical agent to generate SQL if available
            if self.statistical_agent:
                response = self._generate_with_llm(prompt)
                sql_query, explanation = self._parse_llm_response(response)
            else:
                # Fallback: basic SQL generation
                sql_query, explanation = self._generate_basic_sql(user_query, max_rows)
            
            metadata = {
                'user_query': user_query,
                'database_name': self.schema.database_name,
                'explanation': explanation,
                'generated_at': datetime.now().isoformat(),
                'max_rows': max_rows
            }
            
            return sql_query, metadata
            
        except Exception as e:
            logger.error(f"Error generating SQL query: {str(e)}")
            raise
    
    def _prepare_database_context(self) -> str:
        """Prepare database schema context for LLM"""
        context_parts = []
        
        # Database overview
        context_parts.append(f"DATABASE: {self.schema.database_name}")
        context_parts.append(f"DESCRIPTION: {self.schema.description}")
        
        # Poulpharm-specific context
        if self.is_poulpharm:
            context_parts.append("\nPOULPHARM LIMS SPECIFICS:")
            context_parts.append("- This is a veterinary laboratory information management system")
            context_parts.append("- Main workflow: Requests -> Samples -> SampleAnalyseGroups -> Results")
            context_parts.append("- Key date filtering: Use DateCreated for requests, DateSampling for samples")
            context_parts.append("- Multilingual support: Most tables have Name_nl, Name_fr, Name_en columns")
            context_parts.append("- Always consider TechValidated and BioValidated for result quality")
        
        context_parts.append("")
        
        # Key table categories with detailed information
        context_parts.append("KEY TABLE CATEGORIES:")
        for category_name, category_info in self.schema.table_categories.items():
            context_parts.append(f"\n{category_name}:")
            context_parts.append(f"  Description: {category_info.get('description', '')}")
            
            if 'tables' in category_info:
                for table in category_info['tables']:
                    context_parts.append(f"  - {table['name']}: {table.get('description', '')}")
                    if 'columns' in table:
                        # Truncate very long column lists
                        columns = table['columns']
                        if len(columns) > 200:
                            columns = columns[:200] + "..."
                        context_parts.append(f"    Columns: {columns}")
                    if 'relationships' in table:
                        context_parts.append(f"    Relationships: {table['relationships']}")
            elif 'key_tables' in category_info:
                for table_desc in category_info['key_tables']:
                    context_parts.append(f"  - {table_desc}")
        
        # Key relationships
        context_parts.append("\nKEY RELATIONSHIPS:")
        for rel_name, rel_info in self.schema.key_relationships.items():
            context_parts.append(f"{rel_name}: {rel_info.get('description', '')}")
            if 'flow' in rel_info:
                context_parts.append(f"  Flow: {rel_info['flow']}")
            if 'relationships' in rel_info:
                for rel in rel_info['relationships']:
                    context_parts.append(f"  - {rel}")
        
        return "\n".join(context_parts)
    
    def _create_sql_generation_prompt(self, user_query: str, db_context: str, max_rows: int) -> str:
        """Create prompt for SQL generation"""
        prompt = f"""You are an expert SQL analyst for veterinary laboratory data. Generate an appropriate SQL query based on the user's analysis request and the database schema provided.

DATABASE SCHEMA CONTEXT:
{db_context}

USER ANALYSIS REQUEST:
{user_query}

REQUIREMENTS:
1. Generate a SQL Server compatible query (T-SQL syntax)
2. Limit results to maximum {max_rows} rows using TOP clause
3. Include appropriate JOINs based on the relationships described
4. Select relevant columns for the requested analysis
5. Add appropriate WHERE clauses for data quality (e.g., exclude NULL values where necessary)
6. Use meaningful aliases for complex expressions
7. Consider the veterinary laboratory context (samples, tests, results, customers, etc.)

POULPHARM LIMS BEST PRACTICES:
- For recent data: Use DateCreated >= DATEADD(month, -N, GETDATE())
- For sample analysis: JOIN Requests -> Samples -> SampleAnalyseGroups -> Results
- For customer info: Use sCustomer from Requests or Name from Companies
- For bacteriology: Include tblBactResult and tblAntibiogram tables
- For validation status: Check TechValidated=1 AND BioValidated=1 for final results
- For multilingual: Prefer Name_en columns for English output
- Always include RequestNr and SampleNr for traceability

OUTPUT FORMAT:
```sql
-- Explanation: [Brief explanation of what the query does and why]
[SQL Query]
```

Generate the SQL query now:"""
        
        return prompt
    
    def _generate_with_llm(self, prompt: str) -> str:
        """Generate SQL using the statistical agent's LLM"""
        try:
            # Add pattern-specific guidance for Poulpharm
            enhanced_prompt = prompt
            if self.is_poulpharm:
                pattern_hints = self._get_poulpharm_pattern_hints(prompt)
                if pattern_hints:
                    enhanced_prompt += f"\n\nSPECIFIC HINTS FOR THIS QUERY:\n{pattern_hints}"
            
            # Use the statistical agent to generate SQL
            response = self.statistical_agent.query_llm(enhanced_prompt)
            return response
        except Exception as e:
            logger.error(f"Error generating SQL with LLM: {str(e)}")
            raise
    
    def _get_poulpharm_pattern_hints(self, prompt: str) -> str:
        """Get Poulpharm-specific hints based on detected patterns"""
        hints = []
        prompt_lower = prompt.lower()
        
        for pattern_name, pattern_info in self.common_patterns.items():
            if any(keyword in prompt_lower for keyword in pattern_info['keywords']):
                if pattern_name == 'recent_requests':
                    hints.append("- Start with Requests table and filter by DateCreated for recent data")
                elif pattern_name == 'bacteriology':
                    hints.append("- Include tblBactResult for bacterial isolates and tblAntibiogram for sensitivity")
                    hints.append("- Join through Samples: Requests -> Samples -> tblBactResult")
                elif pattern_name == 'customer_analysis':
                    hints.append("- Use Companies table for customer details, sCustomer field in Requests for quick lookup")
                elif pattern_name == 'sample_workflow':
                    hints.append("- Follow workflow: Requests -> Samples -> SampleAnalyseGroups -> AnalysisGroups")
        
        return "\n".join(hints)
    
    def _parse_llm_response(self, response: str) -> Tuple[str, str]:
        """Parse LLM response to extract SQL query and explanation"""
        try:
            # Look for SQL code block
            import re
            
            # Find explanation
            explanation_match = re.search(r'-- Explanation: (.+)', response)
            explanation = explanation_match.group(1).strip() if explanation_match else "Generated SQL query"
            
            # Find SQL query in code block
            sql_match = re.search(r'```sql\s*(?:-- Explanation: .+\n)?(.*?)```', response, re.DOTALL)
            if sql_match:
                sql_query = sql_match.group(1).strip()
                # Remove the explanation line if it's still in the SQL
                sql_query = re.sub(r'^-- Explanation: .+\n', '', sql_query, flags=re.MULTILINE)
            else:
                # Fallback: look for any SQL-like content
                lines = response.split('\n')
                sql_lines = []
                in_sql = False
                for line in lines:
                    if 'SELECT' in line.upper() or in_sql:
                        in_sql = True
                        sql_lines.append(line)
                        if line.strip().endswith(';') or (len(sql_lines) > 20):
                            break
                sql_query = '\n'.join(sql_lines).strip()
            
            if not sql_query:
                raise ValueError("Could not extract SQL query from LLM response")
            
            return sql_query, explanation
            
        except Exception as e:
            logger.error(f"Error parsing LLM response: {str(e)}")
            # Return the full response as fallback
            return response.strip(), "Generated SQL query"
    
    def _generate_basic_sql(self, user_query: str, max_rows: int) -> Tuple[str, str]:
        """Fallback basic SQL generation without LLM"""
        # Enhanced keyword-based SQL generation for Poulpharm
        query_lower = user_query.lower()
        
        # Start with basic query structure
        sql_parts = []
        explanation = "Basic SQL query based on keywords in user request"
        
        # Enhanced pattern detection for Poulpharm LIMS
        if self.is_poulpharm:
            return self._generate_poulpharm_specific_sql(query_lower, max_rows)
        
        # Generic fallback for other databases
        
        # Determine main tables based on keywords
        if any(word in query_lower for word in ['request', 'sample', 'test', 'result']):
            # Main laboratory workflow
            sql_parts.append(f"SELECT TOP {max_rows}")
            sql_parts.append("    r.RequestNr, r.DateCreated, r.sCustomer,")
            sql_parts.append("    s.SampleNr, s.Identification, s.DateSampling,")
            sql_parts.append("    ag.Name_en as AnalysisGroup,")
            sql_parts.append("    res.DateCreated as ResultDate, res.TechValidated, res.BioValidated")
            sql_parts.append("FROM Requests r")
            sql_parts.append("INNER JOIN Samples s ON s.Sample_Request = r.Id")
            sql_parts.append("INNER JOIN SampleAnalyseGroups sag ON sag.SampleAnalyseGroup_Sample = s.Id")
            sql_parts.append("INNER JOIN AnalysisGroups ag ON ag.Id = sag.RequestAnalyseGroup_AnalysisGroup")
            sql_parts.append("LEFT JOIN Results res ON res.Result_Sample = s.Id")
            sql_parts.append("WHERE r.DateCreated >= DATEADD(month, -6, GETDATE())")
            sql_parts.append("ORDER BY r.DateCreated DESC")
            explanation = "Query retrieving recent laboratory requests, samples, and results"
            
        elif any(word in query_lower for word in ['customer', 'company', 'veterinarian']):
            # Customer/company information
            sql_parts.append(f"SELECT TOP {max_rows}")
            sql_parts.append("    c.Name, c.City, c.Country,")
            sql_parts.append("    COUNT(r.Id) as TotalRequests")
            sql_parts.append("FROM Companies c")
            sql_parts.append("LEFT JOIN Requests r ON r.Customer_ID = c.Id")
            sql_parts.append("GROUP BY c.Name, c.City, c.Country")
            sql_parts.append("ORDER BY TotalRequests DESC")
            explanation = "Query retrieving customer/company information with request counts"
            
        elif any(word in query_lower for word in ['bacteria', 'antibiogram', 'sensitivity']):
            # Bacteriology data
            sql_parts.append(f"SELECT TOP {max_rows}")
            sql_parts.append("    r.RequestNr, s.SampleNr,")
            sql_parts.append("    br.Germ, br.DateCreated,")
            sql_parts.append("    ab.Antibioticum, ab.Breekpunt, ab.Result")
            sql_parts.append("FROM Requests r")
            sql_parts.append("INNER JOIN Samples s ON s.Sample_Request = r.Id")
            sql_parts.append("INNER JOIN tblBactResult br ON br.Sample_ID = s.Id")
            sql_parts.append("LEFT JOIN tblAntibiogram ab ON ab.BactResult_ID = br.Id")
            sql_parts.append("WHERE br.DateCreated >= DATEADD(month, -3, GETDATE())")
            sql_parts.append("ORDER BY br.DateCreated DESC")
            explanation = "Query retrieving bacteriology results and antibiogram data"
            
        else:
            # Default: basic request information
            sql_parts.append(f"SELECT TOP {max_rows}")
            sql_parts.append("    RequestNr, DateCreated, sCustomer,")
            sql_parts.append("    TestCode, StartNumber")
            sql_parts.append("FROM Requests")
            sql_parts.append("WHERE DateCreated >= DATEADD(month, -3, GETDATE())")
            sql_parts.append("ORDER BY DateCreated DESC")
            explanation = "Default query retrieving recent laboratory requests"
        
        sql_query = "\n".join(sql_parts)
        return sql_query, explanation
    
    def _generate_poulpharm_specific_sql(self, query_lower: str, max_rows: int) -> Tuple[str, str]:
        """Generate Poulpharm LIMS specific SQL based on common patterns"""
        sql_parts = []
        explanation = ""
        
        # Bacteriology analysis
        if any(word in query_lower for word in ['bacteria', 'antibiogram', 'sensitivity', 'resistance', 'germ']):
            sql_parts.extend([
                f"SELECT TOP {max_rows}",
                "    r.RequestNr, r.DateCreated, r.sCustomer,",
                "    s.SampleNr, s.Identification,",
                "    br.Germ, br.DateCreated as BactDate,",
                "    ab.Antibioticum, ab.Breekpunt, ab.Result as Sensitivity",
                "FROM Requests r",
                "INNER JOIN Samples s ON s.Sample_Request = r.Id",
                "INNER JOIN tblBactResult br ON br.Sample_ID = s.Id",
                "LEFT JOIN tblAntibiogram ab ON ab.BactResult_ID = br.Id",
                "WHERE r.DateCreated >= DATEADD(month, -6, GETDATE())",
                "  AND br.Germ IS NOT NULL",
                "ORDER BY r.DateCreated DESC, br.DateCreated DESC"
            ])
            explanation = "Bacteriology results with antibiogram sensitivity data"
            
        # Customer/company analysis
        elif any(word in query_lower for word in ['customer', 'company', 'veterinarian', 'client']):
            sql_parts.extend([
                f"SELECT TOP {max_rows}",
                "    c.Name as CompanyName, c.City, c.Country,",
                "    v.Dia_Naam as VeterinarianName, v.Dia_Email,",
                "    COUNT(r.Id) as TotalRequests,",
                "    MAX(r.DateCreated) as LastRequestDate",
                "FROM Companies c",
                "LEFT JOIN Requests r ON r.Customer_ID = c.Id",
                "LEFT JOIN tblDierenarts v ON v.DierenartsID = r.Veterinarian_ID",
                "WHERE r.DateCreated >= DATEADD(year, -1, GETDATE())",
                "GROUP BY c.Name, c.City, c.Country, v.Dia_Naam, v.Dia_Email",
                "HAVING COUNT(r.Id) > 0",
                "ORDER BY TotalRequests DESC"
            ])
            explanation = "Customer analysis with request statistics and veterinarian information"
            
        # PCR analysis
        elif any(word in query_lower for word in ['pcr', 'molecular', 'genetic']):
            sql_parts.extend([
                f"SELECT TOP {max_rows}",
                "    r.RequestNr, r.DateCreated, r.sCustomer,",
                "    s.SampleNr, s.Identification,",
                "    pr.DateCreated as PCRDate,",
                "    pk.Name_en as PCRKit,",
                "    pr.Result as PCRResult",
                "FROM Requests r",
                "INNER JOIN Samples s ON s.Sample_Request = r.Id",
                "INNER JOIN tblPCRResult pr ON pr.Sample_ID = s.Id",
                "LEFT JOIN tblPCRKit pk ON pk.Id = pr.PCRKit_ID",
                "WHERE r.DateCreated >= DATEADD(month, -6, GETDATE())",
                "ORDER BY r.DateCreated DESC"
            ])
            explanation = "PCR test results with kit information"
            
        # Sample workflow analysis
        elif any(word in query_lower for word in ['sample', 'workflow', 'processing', 'status', 'analysis']):
            sql_parts.extend([
                f"SELECT TOP {max_rows}",
                "    r.RequestNr, r.DateCreated, r.sCustomer,",
                "    s.SampleNr, s.Identification, s.DateSampling,",
                "    st.Name_en as SampleType,",
                "    ag.Name_en as AnalysisGroup,",
                "    sag.IsAnalyseDone, sag.IsAnalyseExported,",
                "    COUNT(res.Id) as ResultCount",
                "FROM Requests r",
                "INNER JOIN Samples s ON s.Sample_Request = r.Id",
                "LEFT JOIN SampleTypes st ON st.Id = s.Sample_SampleType",
                "INNER JOIN SampleAnalyseGroups sag ON sag.SampleAnalyseGroup_Sample = s.Id",
                "INNER JOIN AnalysisGroups ag ON ag.Id = sag.RequestAnalyseGroup_AnalysisGroup",
                "LEFT JOIN Results res ON res.Result_Sample = s.Id",
                "WHERE r.DateCreated >= DATEADD(month, -3, GETDATE())",
                "GROUP BY r.RequestNr, r.DateCreated, r.sCustomer, s.SampleNr, s.Identification,",
                "         s.DateSampling, st.Name_en, ag.Name_en, sag.IsAnalyseDone, sag.IsAnalyseExported",
                "ORDER BY r.DateCreated DESC"
            ])
            explanation = "Sample workflow status with analysis progress"
            
        # Quality/validation analysis
        elif any(word in query_lower for word in ['quality', 'validation', 'validated', 'approved']):
            sql_parts.extend([
                f"SELECT TOP {max_rows}",
                "    r.RequestNr, r.DateCreated, r.sCustomer,",
                "    s.SampleNr, s.Identification,",
                "    res.DateCreated as ResultDate,",
                "    res.TechValidated, res.BioValidated,",
                "    res.TechValidatedBy, res.BioValidatedBy,",
                "    res.DateTechValidated, res.DateBioValidated",
                "FROM Requests r",
                "INNER JOIN Samples s ON s.Sample_Request = r.Id",
                "INNER JOIN Results res ON res.Result_Sample = s.Id",
                "WHERE r.DateCreated >= DATEADD(month, -3, GETDATE())",
                "  AND (res.TechValidated = 1 OR res.BioValidated = 1)",
                "ORDER BY res.DateCreated DESC"
            ])
            explanation = "Results with validation status and validator information"
            
        # Default Poulpharm query - recent requests with basic info
        else:
            sql_parts.extend([
                f"SELECT TOP {max_rows}",
                "    r.RequestNr, r.DateCreated, r.sCustomer, r.sVeterinarian,",
                "    r.TestCode, r.StartNumber,",
                "    COUNT(s.Id) as SampleCount,",
                "    COUNT(DISTINCT sag.Id) as AnalysisGroupCount",
                "FROM Requests r",
                "LEFT JOIN Samples s ON s.Sample_Request = r.Id",
                "LEFT JOIN SampleAnalyseGroups sag ON sag.SampleAnalyseGroup_Sample = s.Id",
                "WHERE r.DateCreated >= DATEADD(month, -3, GETDATE())",
                "GROUP BY r.RequestNr, r.DateCreated, r.sCustomer, r.sVeterinarian, r.TestCode, r.StartNumber",
                "ORDER BY r.DateCreated DESC"
            ])
            explanation = "Recent laboratory requests with sample and analysis counts"
        
        sql_query = "\n".join(sql_parts)
        return sql_query, explanation

Parameters

Name Type Default Kind
bases - -

Parameter Details

bases: Parameter of type

Return Value

Returns unspecified type

Class Interface

Methods

__init__(self, schema, statistical_agent)

Purpose: Internal method: init

Parameters:

  • schema: Type: DatabaseSchema
  • statistical_agent: Parameter

Returns: None

_setup_poulpharm_optimizations(self)

Purpose: Setup Poulpharm LIMS specific query optimizations

Returns: None

generate_sql_query(self, user_query, max_rows) -> Tuple[str, Dict[str, Any]]

Purpose: Generate SQL query based on user analysis request Returns: (sql_query, metadata)

Parameters:

  • user_query: Type: str
  • max_rows: Type: int

Returns: Returns Tuple[str, Dict[str, Any]]

_prepare_database_context(self) -> str

Purpose: Prepare database schema context for LLM

Returns: Returns str

_create_sql_generation_prompt(self, user_query, db_context, max_rows) -> str

Purpose: Create prompt for SQL generation

Parameters:

  • user_query: Type: str
  • db_context: Type: str
  • max_rows: Type: int

Returns: Returns str

_generate_with_llm(self, prompt) -> str

Purpose: Generate SQL using the statistical agent's LLM

Parameters:

  • prompt: Type: str

Returns: Returns str

_get_poulpharm_pattern_hints(self, prompt) -> str

Purpose: Get Poulpharm-specific hints based on detected patterns

Parameters:

  • prompt: Type: str

Returns: Returns str

_parse_llm_response(self, response) -> Tuple[str, str]

Purpose: Parse LLM response to extract SQL query and explanation

Parameters:

  • response: Type: str

Returns: Returns Tuple[str, str]

_generate_basic_sql(self, user_query, max_rows) -> Tuple[str, str]

Purpose: Fallback basic SQL generation without LLM

Parameters:

  • user_query: Type: str
  • max_rows: Type: int

Returns: Returns Tuple[str, str]

_generate_poulpharm_specific_sql(self, query_lower, max_rows) -> Tuple[str, str]

Purpose: Generate Poulpharm LIMS specific SQL based on common patterns

Parameters:

  • query_lower: Type: str
  • max_rows: Type: int

Returns: Returns Tuple[str, str]

Required Imports

import json
import logging
from typing import Dict
from typing import List
from typing import Optional

Usage Example

# Example usage:
# result = SQLQueryGenerator(bases)

Similar Components

AI-powered semantic similarity - components with related functionality:

  • class SQLQueryGenerator 97.8% similar

    Generates SQL queries based on user requests and database schema

    From: /tf/active/vicechatdev/full_smartstat/sql_query_generator.py
  • class SqlGenerationResult 59.2% similar

    A dataclass that encapsulates the results of an SQL query generation operation, including the generated query, explanation, confidence score, and metadata about database objects used.

    From: /tf/active/vicechatdev/full_smartstat/two_pass_sql_workflow.py
  • function main_v62 57.8% similar

    Demonstrates a SmartStat SQL workflow by loading a database schema, initializing a SQL query generator, and generating SQL queries from natural language requests with detailed output and metadata.

    From: /tf/active/vicechatdev/full_smartstat/demo_sql_workflow.py
  • class QueryIteration 52.0% similar

    A dataclass representing a single iteration in an iterative SQL query generation and evaluation workflow, capturing the query, its results, assessment, and improvement suggestions.

    From: /tf/active/vicechatdev/full_smartstat/enhanced_sql_workflow.py
  • function main_v63 51.9% similar

    Demonstrates the SmartStat SQL Workflow by loading a database schema, initializing a SQL query generator, and generating SQL queries from natural language requests for various laboratory data analysis scenarios.

    From: /tf/active/vicechatdev/smartstat/demo_sql_workflow.py
← Back to Browse