class SQLQueryGenerator_v1
Generates SQL queries based on user requests and database schema
/tf/active/vicechatdev/smartstat/sql_query_generator.py
82 - 505
moderate
Purpose
Generates SQL queries based on user requests and database schema
Source Code
class SQLQueryGenerator:
"""Generates SQL queries based on user requests and database schema"""
def __init__(self, schema: DatabaseSchema, statistical_agent=None):
self.schema = schema
self.statistical_agent = statistical_agent
self.is_poulpharm = schema.database_name.lower() == 'poulpharm'
self._setup_poulpharm_optimizations() if self.is_poulpharm else None
def _setup_poulpharm_optimizations(self):
"""Setup Poulpharm LIMS specific query optimizations"""
# Common query patterns for Poulpharm LIMS
self.common_patterns = {
'recent_requests': {
'keywords': ['recent', 'latest', 'new', 'current'],
'base_query': 'Requests WHERE DateCreated >= DATEADD(month, -3, GETDATE())'
},
'bacteriology': {
'keywords': ['bacteria', 'antibiogram', 'sensitivity', 'resistance', 'germ'],
'tables': ['tblBactResult', 'tblAntibiogram', 'tblBactResultLine']
},
'pcr_analysis': {
'keywords': ['pcr', 'molecular', 'genetic'],
'tables': ['tblPCRResult', 'tblPCRKit', 'tblPCRImport']
},
'customer_analysis': {
'keywords': ['customer', 'company', 'veterinarian', 'client'],
'tables': ['Companies', 'tblDierenarts', 'Requests']
},
'sample_workflow': {
'keywords': ['sample', 'workflow', 'processing', 'status'],
'tables': ['Samples', 'SampleAnalyseGroups', 'AnalysisGroups']
}
}
def generate_sql_query(self, user_query: str, max_rows: int = 10000) -> Tuple[str, Dict[str, Any]]:
"""
Generate SQL query based on user analysis request
Returns: (sql_query, metadata)
"""
try:
# Prepare context for the LLM
context = self._prepare_database_context()
# Create prompt for SQL generation
prompt = self._create_sql_generation_prompt(user_query, context, max_rows)
# Use statistical agent to generate SQL if available
if self.statistical_agent:
response = self._generate_with_llm(prompt)
sql_query, explanation = self._parse_llm_response(response)
else:
# Fallback: basic SQL generation
sql_query, explanation = self._generate_basic_sql(user_query, max_rows)
metadata = {
'user_query': user_query,
'database_name': self.schema.database_name,
'explanation': explanation,
'generated_at': datetime.now().isoformat(),
'max_rows': max_rows
}
return sql_query, metadata
except Exception as e:
logger.error(f"Error generating SQL query: {str(e)}")
raise
def _prepare_database_context(self) -> str:
"""Prepare database schema context for LLM"""
context_parts = []
# Database overview
context_parts.append(f"DATABASE: {self.schema.database_name}")
context_parts.append(f"DESCRIPTION: {self.schema.description}")
# Poulpharm-specific context
if self.is_poulpharm:
context_parts.append("\nPOULPHARM LIMS SPECIFICS:")
context_parts.append("- This is a veterinary laboratory information management system")
context_parts.append("- Main workflow: Requests -> Samples -> SampleAnalyseGroups -> Results")
context_parts.append("- Key date filtering: Use DateCreated for requests, DateSampling for samples")
context_parts.append("- Multilingual support: Most tables have Name_nl, Name_fr, Name_en columns")
context_parts.append("- Always consider TechValidated and BioValidated for result quality")
context_parts.append("")
# Key table categories with detailed information
context_parts.append("KEY TABLE CATEGORIES:")
for category_name, category_info in self.schema.table_categories.items():
context_parts.append(f"\n{category_name}:")
context_parts.append(f" Description: {category_info.get('description', '')}")
if 'tables' in category_info:
for table in category_info['tables']:
context_parts.append(f" - {table['name']}: {table.get('description', '')}")
if 'columns' in table:
# Truncate very long column lists
columns = table['columns']
if len(columns) > 200:
columns = columns[:200] + "..."
context_parts.append(f" Columns: {columns}")
if 'relationships' in table:
context_parts.append(f" Relationships: {table['relationships']}")
elif 'key_tables' in category_info:
for table_desc in category_info['key_tables']:
context_parts.append(f" - {table_desc}")
# Key relationships
context_parts.append("\nKEY RELATIONSHIPS:")
for rel_name, rel_info in self.schema.key_relationships.items():
context_parts.append(f"{rel_name}: {rel_info.get('description', '')}")
if 'flow' in rel_info:
context_parts.append(f" Flow: {rel_info['flow']}")
if 'relationships' in rel_info:
for rel in rel_info['relationships']:
context_parts.append(f" - {rel}")
return "\n".join(context_parts)
def _create_sql_generation_prompt(self, user_query: str, db_context: str, max_rows: int) -> str:
"""Create prompt for SQL generation"""
prompt = f"""You are an expert SQL analyst for veterinary laboratory data. Generate an appropriate SQL query based on the user's analysis request and the database schema provided.
DATABASE SCHEMA CONTEXT:
{db_context}
USER ANALYSIS REQUEST:
{user_query}
REQUIREMENTS:
1. Generate a SQL Server compatible query (T-SQL syntax)
2. Limit results to maximum {max_rows} rows using TOP clause
3. Include appropriate JOINs based on the relationships described
4. Select relevant columns for the requested analysis
5. Add appropriate WHERE clauses for data quality (e.g., exclude NULL values where necessary)
6. Use meaningful aliases for complex expressions
7. Consider the veterinary laboratory context (samples, tests, results, customers, etc.)
POULPHARM LIMS BEST PRACTICES:
- For recent data: Use DateCreated >= DATEADD(month, -N, GETDATE())
- For sample analysis: JOIN Requests -> Samples -> SampleAnalyseGroups -> Results
- For customer info: Use sCustomer from Requests or Name from Companies
- For bacteriology: Include tblBactResult and tblAntibiogram tables
- For validation status: Check TechValidated=1 AND BioValidated=1 for final results
- For multilingual: Prefer Name_en columns for English output
- Always include RequestNr and SampleNr for traceability
OUTPUT FORMAT:
```sql
-- Explanation: [Brief explanation of what the query does and why]
[SQL Query]
```
Generate the SQL query now:"""
return prompt
def _generate_with_llm(self, prompt: str) -> str:
"""Generate SQL using the statistical agent's LLM"""
try:
# Add pattern-specific guidance for Poulpharm
enhanced_prompt = prompt
if self.is_poulpharm:
pattern_hints = self._get_poulpharm_pattern_hints(prompt)
if pattern_hints:
enhanced_prompt += f"\n\nSPECIFIC HINTS FOR THIS QUERY:\n{pattern_hints}"
# Use the statistical agent to generate SQL
response = self.statistical_agent.query_llm(enhanced_prompt)
return response
except Exception as e:
logger.error(f"Error generating SQL with LLM: {str(e)}")
raise
def _get_poulpharm_pattern_hints(self, prompt: str) -> str:
"""Get Poulpharm-specific hints based on detected patterns"""
hints = []
prompt_lower = prompt.lower()
for pattern_name, pattern_info in self.common_patterns.items():
if any(keyword in prompt_lower for keyword in pattern_info['keywords']):
if pattern_name == 'recent_requests':
hints.append("- Start with Requests table and filter by DateCreated for recent data")
elif pattern_name == 'bacteriology':
hints.append("- Include tblBactResult for bacterial isolates and tblAntibiogram for sensitivity")
hints.append("- Join through Samples: Requests -> Samples -> tblBactResult")
elif pattern_name == 'customer_analysis':
hints.append("- Use Companies table for customer details, sCustomer field in Requests for quick lookup")
elif pattern_name == 'sample_workflow':
hints.append("- Follow workflow: Requests -> Samples -> SampleAnalyseGroups -> AnalysisGroups")
return "\n".join(hints)
def _parse_llm_response(self, response: str) -> Tuple[str, str]:
"""Parse LLM response to extract SQL query and explanation"""
try:
# Look for SQL code block
import re
# Find explanation
explanation_match = re.search(r'-- Explanation: (.+)', response)
explanation = explanation_match.group(1).strip() if explanation_match else "Generated SQL query"
# Find SQL query in code block
sql_match = re.search(r'```sql\s*(?:-- Explanation: .+\n)?(.*?)```', response, re.DOTALL)
if sql_match:
sql_query = sql_match.group(1).strip()
# Remove the explanation line if it's still in the SQL
sql_query = re.sub(r'^-- Explanation: .+\n', '', sql_query, flags=re.MULTILINE)
else:
# Fallback: look for any SQL-like content
lines = response.split('\n')
sql_lines = []
in_sql = False
for line in lines:
if 'SELECT' in line.upper() or in_sql:
in_sql = True
sql_lines.append(line)
if line.strip().endswith(';') or (len(sql_lines) > 20):
break
sql_query = '\n'.join(sql_lines).strip()
if not sql_query:
raise ValueError("Could not extract SQL query from LLM response")
return sql_query, explanation
except Exception as e:
logger.error(f"Error parsing LLM response: {str(e)}")
# Return the full response as fallback
return response.strip(), "Generated SQL query"
def _generate_basic_sql(self, user_query: str, max_rows: int) -> Tuple[str, str]:
"""Fallback basic SQL generation without LLM"""
# Enhanced keyword-based SQL generation for Poulpharm
query_lower = user_query.lower()
# Start with basic query structure
sql_parts = []
explanation = "Basic SQL query based on keywords in user request"
# Enhanced pattern detection for Poulpharm LIMS
if self.is_poulpharm:
return self._generate_poulpharm_specific_sql(query_lower, max_rows)
# Generic fallback for other databases
# Determine main tables based on keywords
if any(word in query_lower for word in ['request', 'sample', 'test', 'result']):
# Main laboratory workflow
sql_parts.append(f"SELECT TOP {max_rows}")
sql_parts.append(" r.RequestNr, r.DateCreated, r.sCustomer,")
sql_parts.append(" s.SampleNr, s.Identification, s.DateSampling,")
sql_parts.append(" ag.Name_en as AnalysisGroup,")
sql_parts.append(" res.DateCreated as ResultDate, res.TechValidated, res.BioValidated")
sql_parts.append("FROM Requests r")
sql_parts.append("INNER JOIN Samples s ON s.Sample_Request = r.Id")
sql_parts.append("INNER JOIN SampleAnalyseGroups sag ON sag.SampleAnalyseGroup_Sample = s.Id")
sql_parts.append("INNER JOIN AnalysisGroups ag ON ag.Id = sag.RequestAnalyseGroup_AnalysisGroup")
sql_parts.append("LEFT JOIN Results res ON res.Result_Sample = s.Id")
sql_parts.append("WHERE r.DateCreated >= DATEADD(month, -6, GETDATE())")
sql_parts.append("ORDER BY r.DateCreated DESC")
explanation = "Query retrieving recent laboratory requests, samples, and results"
elif any(word in query_lower for word in ['customer', 'company', 'veterinarian']):
# Customer/company information
sql_parts.append(f"SELECT TOP {max_rows}")
sql_parts.append(" c.Name, c.City, c.Country,")
sql_parts.append(" COUNT(r.Id) as TotalRequests")
sql_parts.append("FROM Companies c")
sql_parts.append("LEFT JOIN Requests r ON r.Customer_ID = c.Id")
sql_parts.append("GROUP BY c.Name, c.City, c.Country")
sql_parts.append("ORDER BY TotalRequests DESC")
explanation = "Query retrieving customer/company information with request counts"
elif any(word in query_lower for word in ['bacteria', 'antibiogram', 'sensitivity']):
# Bacteriology data
sql_parts.append(f"SELECT TOP {max_rows}")
sql_parts.append(" r.RequestNr, s.SampleNr,")
sql_parts.append(" br.Germ, br.DateCreated,")
sql_parts.append(" ab.Antibioticum, ab.Breekpunt, ab.Result")
sql_parts.append("FROM Requests r")
sql_parts.append("INNER JOIN Samples s ON s.Sample_Request = r.Id")
sql_parts.append("INNER JOIN tblBactResult br ON br.Sample_ID = s.Id")
sql_parts.append("LEFT JOIN tblAntibiogram ab ON ab.BactResult_ID = br.Id")
sql_parts.append("WHERE br.DateCreated >= DATEADD(month, -3, GETDATE())")
sql_parts.append("ORDER BY br.DateCreated DESC")
explanation = "Query retrieving bacteriology results and antibiogram data"
else:
# Default: basic request information
sql_parts.append(f"SELECT TOP {max_rows}")
sql_parts.append(" RequestNr, DateCreated, sCustomer,")
sql_parts.append(" TestCode, StartNumber")
sql_parts.append("FROM Requests")
sql_parts.append("WHERE DateCreated >= DATEADD(month, -3, GETDATE())")
sql_parts.append("ORDER BY DateCreated DESC")
explanation = "Default query retrieving recent laboratory requests"
sql_query = "\n".join(sql_parts)
return sql_query, explanation
def _generate_poulpharm_specific_sql(self, query_lower: str, max_rows: int) -> Tuple[str, str]:
"""Generate Poulpharm LIMS specific SQL based on common patterns"""
sql_parts = []
explanation = ""
# Bacteriology analysis
if any(word in query_lower for word in ['bacteria', 'antibiogram', 'sensitivity', 'resistance', 'germ']):
sql_parts.extend([
f"SELECT TOP {max_rows}",
" r.RequestNr, r.DateCreated, r.sCustomer,",
" s.SampleNr, s.Identification,",
" br.Germ, br.DateCreated as BactDate,",
" ab.Antibioticum, ab.Breekpunt, ab.Result as Sensitivity",
"FROM Requests r",
"INNER JOIN Samples s ON s.Sample_Request = r.Id",
"INNER JOIN tblBactResult br ON br.Sample_ID = s.Id",
"LEFT JOIN tblAntibiogram ab ON ab.BactResult_ID = br.Id",
"WHERE r.DateCreated >= DATEADD(month, -6, GETDATE())",
" AND br.Germ IS NOT NULL",
"ORDER BY r.DateCreated DESC, br.DateCreated DESC"
])
explanation = "Bacteriology results with antibiogram sensitivity data"
# Customer/company analysis
elif any(word in query_lower for word in ['customer', 'company', 'veterinarian', 'client']):
sql_parts.extend([
f"SELECT TOP {max_rows}",
" c.Name as CompanyName, c.City, c.Country,",
" v.Dia_Naam as VeterinarianName, v.Dia_Email,",
" COUNT(r.Id) as TotalRequests,",
" MAX(r.DateCreated) as LastRequestDate",
"FROM Companies c",
"LEFT JOIN Requests r ON r.Customer_ID = c.Id",
"LEFT JOIN tblDierenarts v ON v.DierenartsID = r.Veterinarian_ID",
"WHERE r.DateCreated >= DATEADD(year, -1, GETDATE())",
"GROUP BY c.Name, c.City, c.Country, v.Dia_Naam, v.Dia_Email",
"HAVING COUNT(r.Id) > 0",
"ORDER BY TotalRequests DESC"
])
explanation = "Customer analysis with request statistics and veterinarian information"
# PCR analysis
elif any(word in query_lower for word in ['pcr', 'molecular', 'genetic']):
sql_parts.extend([
f"SELECT TOP {max_rows}",
" r.RequestNr, r.DateCreated, r.sCustomer,",
" s.SampleNr, s.Identification,",
" pr.DateCreated as PCRDate,",
" pk.Name_en as PCRKit,",
" pr.Result as PCRResult",
"FROM Requests r",
"INNER JOIN Samples s ON s.Sample_Request = r.Id",
"INNER JOIN tblPCRResult pr ON pr.Sample_ID = s.Id",
"LEFT JOIN tblPCRKit pk ON pk.Id = pr.PCRKit_ID",
"WHERE r.DateCreated >= DATEADD(month, -6, GETDATE())",
"ORDER BY r.DateCreated DESC"
])
explanation = "PCR test results with kit information"
# Sample workflow analysis
elif any(word in query_lower for word in ['sample', 'workflow', 'processing', 'status', 'analysis']):
sql_parts.extend([
f"SELECT TOP {max_rows}",
" r.RequestNr, r.DateCreated, r.sCustomer,",
" s.SampleNr, s.Identification, s.DateSampling,",
" st.Name_en as SampleType,",
" ag.Name_en as AnalysisGroup,",
" sag.IsAnalyseDone, sag.IsAnalyseExported,",
" COUNT(res.Id) as ResultCount",
"FROM Requests r",
"INNER JOIN Samples s ON s.Sample_Request = r.Id",
"LEFT JOIN SampleTypes st ON st.Id = s.Sample_SampleType",
"INNER JOIN SampleAnalyseGroups sag ON sag.SampleAnalyseGroup_Sample = s.Id",
"INNER JOIN AnalysisGroups ag ON ag.Id = sag.RequestAnalyseGroup_AnalysisGroup",
"LEFT JOIN Results res ON res.Result_Sample = s.Id",
"WHERE r.DateCreated >= DATEADD(month, -3, GETDATE())",
"GROUP BY r.RequestNr, r.DateCreated, r.sCustomer, s.SampleNr, s.Identification,",
" s.DateSampling, st.Name_en, ag.Name_en, sag.IsAnalyseDone, sag.IsAnalyseExported",
"ORDER BY r.DateCreated DESC"
])
explanation = "Sample workflow status with analysis progress"
# Quality/validation analysis
elif any(word in query_lower for word in ['quality', 'validation', 'validated', 'approved']):
sql_parts.extend([
f"SELECT TOP {max_rows}",
" r.RequestNr, r.DateCreated, r.sCustomer,",
" s.SampleNr, s.Identification,",
" res.DateCreated as ResultDate,",
" res.TechValidated, res.BioValidated,",
" res.TechValidatedBy, res.BioValidatedBy,",
" res.DateTechValidated, res.DateBioValidated",
"FROM Requests r",
"INNER JOIN Samples s ON s.Sample_Request = r.Id",
"INNER JOIN Results res ON res.Result_Sample = s.Id",
"WHERE r.DateCreated >= DATEADD(month, -3, GETDATE())",
" AND (res.TechValidated = 1 OR res.BioValidated = 1)",
"ORDER BY res.DateCreated DESC"
])
explanation = "Results with validation status and validator information"
# Default Poulpharm query - recent requests with basic info
else:
sql_parts.extend([
f"SELECT TOP {max_rows}",
" r.RequestNr, r.DateCreated, r.sCustomer, r.sVeterinarian,",
" r.TestCode, r.StartNumber,",
" COUNT(s.Id) as SampleCount,",
" COUNT(DISTINCT sag.Id) as AnalysisGroupCount",
"FROM Requests r",
"LEFT JOIN Samples s ON s.Sample_Request = r.Id",
"LEFT JOIN SampleAnalyseGroups sag ON sag.SampleAnalyseGroup_Sample = s.Id",
"WHERE r.DateCreated >= DATEADD(month, -3, GETDATE())",
"GROUP BY r.RequestNr, r.DateCreated, r.sCustomer, r.sVeterinarian, r.TestCode, r.StartNumber",
"ORDER BY r.DateCreated DESC"
])
explanation = "Recent laboratory requests with sample and analysis counts"
sql_query = "\n".join(sql_parts)
return sql_query, explanation
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
bases |
- | - |
Parameter Details
bases: Parameter of type
Return Value
Returns unspecified type
Class Interface
Methods
__init__(self, schema, statistical_agent)
Purpose: Internal method: init
Parameters:
schema: Type: DatabaseSchemastatistical_agent: Parameter
Returns: None
_setup_poulpharm_optimizations(self)
Purpose: Setup Poulpharm LIMS specific query optimizations
Returns: None
generate_sql_query(self, user_query, max_rows) -> Tuple[str, Dict[str, Any]]
Purpose: Generate SQL query based on user analysis request Returns: (sql_query, metadata)
Parameters:
user_query: Type: strmax_rows: Type: int
Returns: Returns Tuple[str, Dict[str, Any]]
_prepare_database_context(self) -> str
Purpose: Prepare database schema context for LLM
Returns: Returns str
_create_sql_generation_prompt(self, user_query, db_context, max_rows) -> str
Purpose: Create prompt for SQL generation
Parameters:
user_query: Type: strdb_context: Type: strmax_rows: Type: int
Returns: Returns str
_generate_with_llm(self, prompt) -> str
Purpose: Generate SQL using the statistical agent's LLM
Parameters:
prompt: Type: str
Returns: Returns str
_get_poulpharm_pattern_hints(self, prompt) -> str
Purpose: Get Poulpharm-specific hints based on detected patterns
Parameters:
prompt: Type: str
Returns: Returns str
_parse_llm_response(self, response) -> Tuple[str, str]
Purpose: Parse LLM response to extract SQL query and explanation
Parameters:
response: Type: str
Returns: Returns Tuple[str, str]
_generate_basic_sql(self, user_query, max_rows) -> Tuple[str, str]
Purpose: Fallback basic SQL generation without LLM
Parameters:
user_query: Type: strmax_rows: Type: int
Returns: Returns Tuple[str, str]
_generate_poulpharm_specific_sql(self, query_lower, max_rows) -> Tuple[str, str]
Purpose: Generate Poulpharm LIMS specific SQL based on common patterns
Parameters:
query_lower: Type: strmax_rows: Type: int
Returns: Returns Tuple[str, str]
Required Imports
import json
import logging
from typing import Dict
from typing import List
from typing import Optional
Usage Example
# Example usage:
# result = SQLQueryGenerator(bases)
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
-
class SQLQueryGenerator 97.8% similar
-
class SqlGenerationResult 59.2% similar
-
function main_v62 57.8% similar
-
class QueryIteration 52.0% similar
-
function main_v63 51.9% similar