class GPT5Validator
A comprehensive testing and validation class for OpenAI GPT models, with special support for GPT-5 family models using the Responses API.
File: /tf/active/vicechatdev/docchat/test_gpt5_readiness.py
Lines: 32-255
Complexity: moderate
Purpose
GPT5Validator provides a complete test suite for validating OpenAI language models across four dimensions: basic response generation (non-empty output), large-context token handling, answer consistency, and latency. It automatically detects GPT-5 models and routes them to the appropriate API (the Responses API for GPT-5, the Chat Completions API for all other models). The class stores per-model test results and provides summary reporting.
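For illustration, the routing decision can be inspected directly; a minimal sketch assuming OPENAI_API_KEY is set and using example model names:
validator = GPT5Validator()
# GPT-5 family names are routed to the Responses API,
# everything else to the Chat Completions API
print(validator._is_gpt5('gpt-5-mini'))  # True  -> Responses API
print(validator._is_gpt5('gpt-4o'))      # False -> Chat Completions API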
Source Code
class GPT5Validator:
    def __init__(self):
        self.client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
        self.results = {}
        # Token encoder for approximate token counts across models
        try:
            self.encoder = tiktoken.get_encoding("cl100k_base")
        except Exception:
            self.encoder = None

    # --- Helpers for Responses API (GPT-5 family) ---
    def _is_gpt5(self, model: str) -> bool:
        return 'gpt-5' in model.lower()

    def _responses_create(self, model: str, prompt: str, max_output_tokens: int = 100):
        """Call OpenAI Responses API and return (text, approx_total_tokens)."""
        resp = self.client.responses.create(
            model=model,
            input=prompt,
            max_output_tokens=max_output_tokens,
        )
        # Prefer output_text if available
        content = getattr(resp, 'output_text', None)
        if not content:
            parts = []
            for item in getattr(resp, 'output', []) or []:
                if getattr(item, 'type', '') == 'message':
                    for c in getattr(item, 'content', []) or []:
                        if getattr(c, 'type', '') == 'output_text':
                            parts.append(getattr(c, 'text', '') or '')
            content = ''.join(parts)
        content = (content or '').strip()
        # Approximate token usage (input + output) for reporting only
        total_tokens = None
        if self.encoder:
            try:
                total_tokens = len(self.encoder.encode(prompt)) + len(self.encoder.encode(content))
            except Exception:
                total_tokens = None
        return content, total_tokens

    def test_basic_responses(self, model: str) -> Tuple[bool, str]:
        """Test that model returns non-empty responses"""
        print(f"\n[{model}] Testing basic responses...")
        for i, query in enumerate(TEST_QUERIES, 1):
            try:
                if self._is_gpt5(model):
                    content, tokens = self._responses_create(model, query, max_output_tokens=100)
                else:
                    response = self.client.chat.completions.create(
                        model=model,
                        messages=[{"role": "user", "content": query}],
                        max_tokens=100,
                        temperature=0
                    )
                    content = response.choices[0].message.content
                    tokens = getattr(getattr(response, 'usage', None), 'total_tokens', None)
                if not content or len(content.strip()) == 0:
                    return False, f"Empty response for query {i}: '{query}'"
                tokens_str = f", {tokens} tokens" if tokens is not None else ""
                print(f" ✓ Query {i}: {len(content)} chars{tokens_str}")
            except Exception as e:
                return False, f"Error on query {i}: {str(e)}"
        return True, "All basic responses OK"

    def test_token_limits(self, model: str) -> Tuple[bool, str]:
        """Test that model handles large contexts"""
        print(f"\n[{model}] Testing token limits...")
        try:
            if self._is_gpt5(model):
                content, tokens = self._responses_create(model, LARGE_CONTEXT_TEST, max_output_tokens=200)
                # If we couldn't compute tokens, approximate with encoder
                if tokens is None and self.encoder:
                    try:
                        tokens = len(self.encoder.encode(LARGE_CONTEXT_TEST)) + len(self.encoder.encode(content))
                    except Exception:
                        tokens = None
            else:
                response = self.client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": LARGE_CONTEXT_TEST}],
                    max_tokens=200,
                    temperature=0
                )
                content = response.choices[0].message.content
                tokens = getattr(getattr(response, 'usage', None), 'total_tokens', None)
            if not content or len(content.strip()) == 0:
                return False, "Empty response for large context"
            tokens_str = f"{tokens} tokens" if tokens is not None else "unknown tokens"
            print(f" ✓ Large context: {tokens_str}, response: {len(content)} chars")
            # Check if claimed context window is accurate
            if self._is_gpt5(model) and tokens is not None and tokens > 200000:
                return False, f"Token usage {tokens} exceeds claimed 200k limit"
            return True, f"Token handling OK ({tokens_str})"
        except Exception as e:
            return False, f"Error with large context: {str(e)}"

    def test_consistency(self, model: str) -> Tuple[bool, str]:
        """Test that model returns consistent results"""
        print(f"\n[{model}] Testing consistency (5 runs)...")
        test_query = "What is the capital of France?"
        responses = []
        for i in range(5):
            try:
                if self._is_gpt5(model):
                    content, _ = self._responses_create(model, test_query, max_output_tokens=50)
                else:
                    response = self.client.chat.completions.create(
                        model=model,
                        messages=[{"role": "user", "content": test_query}],
                        max_tokens=50,
                        temperature=0
                    )
                    content = response.choices[0].message.content
                if not content:
                    return False, f"Empty response on run {i+1}"
                responses.append(content.strip().lower())
                print(f" Run {i+1}: {content[:50]}...")
                time.sleep(0.5)  # Rate limiting
            except Exception as e:
                return False, f"Error on run {i+1}: {str(e)}"
        # All responses should contain "paris"
        if not all('paris' in r for r in responses):
            return False, "Inconsistent responses detected"
        return True, "Consistency OK (all responses correct)"

    def test_latency(self, model: str) -> Tuple[bool, str]:
        """Test response time"""
        print(f"\n[{model}] Testing latency...")
        latencies = []
        for i in range(3):
            start = time.time()
            try:
                if self._is_gpt5(model):
                    self._responses_create(model, "Hi", max_output_tokens=50)
                else:
                    response = self.client.chat.completions.create(
                        model=model,
                        messages=[{"role": "user", "content": "Hi"}],
                        max_tokens=50,
                        temperature=0
                    )
                latency = time.time() - start
                latencies.append(latency)
                print(f" Run {i+1}: {latency:.2f}s")
                time.sleep(0.5)
            except Exception as e:
                return False, f"Error on run {i+1}: {str(e)}"
        avg_latency = sum(latencies) / len(latencies)
        if avg_latency > 10:
            return False, f"High latency: {avg_latency:.2f}s average"
        return True, f"Latency OK ({avg_latency:.2f}s average)"

    def run_all_tests(self, model: str) -> Dict[str, Tuple[bool, str]]:
        """Run all tests for a model"""
        print(f"\n{'='*60}")
        print(f"Testing Model: {model}")
        print(f"{'='*60}")
        tests = {
            'basic_responses': self.test_basic_responses,
            'token_limits': self.test_token_limits,
            'consistency': self.test_consistency,
            'latency': self.test_latency,
        }
        results = {}
        for test_name, test_func in tests.items():
            try:
                success, message = test_func(model)
                results[test_name] = (success, message)
            except Exception as e:
                results[test_name] = (False, f"Test crashed: {str(e)}")
        return results

    def print_summary(self):
        """Print test summary"""
        print(f"\n{'='*60}")
        print("TEST SUMMARY")
        print(f"{'='*60}\n")
        all_passed = True
        for model, tests in self.results.items():
            print(f"\n{model}:")
            passed = sum(1 for success, _ in tests.values() if success)
            total = len(tests)
            for test_name, (success, message) in tests.items():
                status = "✓ PASS" if success else "✗ FAIL"
                print(f" {status} | {test_name}: {message}")
            model_passed = passed == total
            print(f" Result: {passed}/{total} tests passed")
            if not model_passed:
                all_passed = False
        return all_passed
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| bases | - | - | - |
Parameter Details
__init__: No parameters required. Initializes the validator with an OpenAI client using the OPENAI_API_KEY environment variable, sets up an empty results dictionary, and attempts to load the tiktoken encoder for token counting.
Return Value
Instantiation returns a GPT5Validator object. Test methods return Tuple[bool, str] where the boolean indicates success/failure and the string provides a descriptive message. run_all_tests() returns Dict[str, Tuple[bool, str]] mapping test names to their results. print_summary() returns a boolean indicating whether all tests passed.
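For example, the return values can be consumed as follows (a minimal sketch; the model name is illustrative):
validator = GPT5Validator()
results = validator.run_all_tests('gpt-5-preview')     # Dict[str, Tuple[bool, str]]
for test_name, (success, message) in results.items():
    print(f"{test_name}: {'PASS' if success else 'FAIL'} - {message}")
ok, detail = validator.test_latency('gpt-5-preview')   # Tuple[bool, str]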
Class Interface
Methods
__init__(self)
Purpose: Initialize the validator with OpenAI client, results storage, and token encoder
Returns: None
_is_gpt5(self, model: str) -> bool
Purpose: Determine if a model is part of the GPT-5 family by checking model name
Parameters:
model: Model identifier string to check
Returns: True if 'gpt-5' is in the model name (case-insensitive), False otherwise
_responses_create(self, model: str, prompt: str, max_output_tokens: int = 100)
Purpose: Call OpenAI Responses API for GPT-5 models and extract text with approximate token count
Parameters:
model: GPT-5 model identifier
prompt: Input text/prompt to send to the model
max_output_tokens: Maximum tokens in the response (default 100)
Returns: Tuple of (response_text: str, total_tokens: int|None) where total_tokens is approximate input+output count
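A minimal sketch of calling the helper directly (the prompt and model name are illustrative; the token count is a tiktoken-based approximation that may be None):
text, approx_tokens = validator._responses_create(
    'gpt-5-preview', 'Summarize retrieval-augmented generation in one sentence.', max_output_tokens=80
)
print(len(text), approx_tokens)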
test_basic_responses(self, model: str) -> Tuple[bool, str]
Purpose: Test that the model returns non-empty responses for all queries in TEST_QUERIES
Parameters:
model: Model identifier to test
Returns: Tuple of (success: bool, message: str) indicating test result and details
test_token_limits(self, model: str) -> Tuple[bool, str]
Purpose: Test that the model handles large contexts (LARGE_CONTEXT_TEST) and respects token limits
Parameters:
model: Model identifier to test
Returns: Tuple of (success: bool, message: str) with token usage information
test_consistency(self, model: str) -> Tuple[bool, str]
Purpose: Test that the model returns consistent correct answers across 5 runs with temperature=0
Parameters:
model: Model identifier to test
Returns: Tuple of (success: bool, message: str) indicating whether all responses contained 'paris'
test_latency(self, model: str) -> Tuple[bool, str]
Purpose: Test response time across 3 runs and verify average latency is under 10 seconds
Parameters:
model: Model identifier to test
Returns: Tuple of (success: bool, message: str) with average latency information
run_all_tests(self, model: str) -> Dict[str, Tuple[bool, str]]
Purpose: Execute all test methods for a given model and return comprehensive results
Parameters:
model: Model identifier to test
Returns: Dictionary mapping test names ('basic_responses', 'token_limits', 'consistency', 'latency') to their (success, message) tuples
print_summary(self) -> bool
Purpose: Print formatted summary of all test results stored in self.results
Returns: Boolean indicating whether all tests for all models passed
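Because print_summary() returns an overall boolean, it can gate an automated run; a minimal sketch (the exit-code convention is this example's assumption, not behavior of the class):
validator.results['gpt-4'] = validator.run_all_tests('gpt-4')
all_passed = validator.print_summary()
sys.exit(0 if all_passed else 1)  # non-zero exit code signals failure to CI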
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| client | OpenAI | OpenAI API client instance initialized with API key from environment | instance |
| results | Dict[str, Dict[str, Tuple[bool, str]]] | Storage for test results, mapping model names to their test outcomes | instance |
| encoder | tiktoken.Encoding or None | Tiktoken encoder for approximate token counting (cl100k_base), None if loading fails | instance |
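Because encoder may be None, downstream code should treat it as optional; a minimal sketch of the fallback pattern (approx_token_count is a hypothetical helper, not part of the class):
def approx_token_count(validator: GPT5Validator, text: str) -> int:
    # Use tiktoken when available, otherwise fall back to a rough word count
    if validator.encoder is not None:
        return len(validator.encoder.encode(text))
    return len(text.split())  # crude approximation used only in this example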
Dependencies
openai, tiktoken, os, sys, time, typing, traceback
Required Imports
import os
import sys
import time
from openai import OpenAI
from typing import Dict, List, Tuple
import tiktoken
import traceback
Conditional/Optional Imports
These imports are only needed under specific conditions:
import tiktoken
Condition: Required for token counting functionality; gracefully degrades if unavailable
Optional
Usage Example
# Set up environment
import os
os.environ['OPENAI_API_KEY'] = 'your-api-key'
# Define required test constants
TEST_QUERIES = [
    'What is 2+2?',
    'Explain photosynthesis briefly',
    'Name three colors'
]
LARGE_CONTEXT_TEST = 'Lorem ipsum ' * 1000 # Large text
# Create validator instance
validator = GPT5Validator()
# Test a single model
results = validator.run_all_tests('gpt-5-preview')
# Store results for multiple models
validator.results['gpt-5-preview'] = results
validator.results['gpt-4'] = validator.run_all_tests('gpt-4')
# Print comprehensive summary
all_passed = validator.print_summary()
# Run individual tests
success, message = validator.test_basic_responses('gpt-5-preview')
print(f'Basic test: {message}')
success, message = validator.test_consistency('gpt-4')
print(f'Consistency: {message}')
Best Practices
- Always set OPENAI_API_KEY environment variable before instantiating the class
- Define TEST_QUERIES and LARGE_CONTEXT_TEST constants in your module before using the validator
- Use run_all_tests() for comprehensive validation rather than individual test methods
- Store results in the validator.results dictionary before calling print_summary()
- Be aware of rate limiting - the class includes sleep delays between requests
- The class automatically detects GPT-5 models by checking if 'gpt-5' is in the model name (case-insensitive)
- Token counting is approximate and may be None if tiktoken encoder fails to load
- Test methods have side effects (print statements) - capture stdout if you need silent operation (see the sketch after this list)
- The consistency test expects 'paris' in responses for the capital of France question - modify for different test queries
- Latency threshold is hardcoded to 10 seconds - responses slower than this are considered failures
- The class maintains state in self.results - clear or create new instance for fresh test runs
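As noted above, the print-based side effects can be captured for silent operation; a minimal sketch using the standard library (the log filename is illustrative):
import io
from contextlib import redirect_stdout

buffer = io.StringIO()
with redirect_stdout(buffer):
    results = validator.run_all_tests('gpt-4')  # console output is captured
with open('validation_log.txt', 'w', encoding='utf-8') as fh:  # illustrative path
    fh.write(buffer.getvalue())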
Tags
Similar Components
Components with related functionality, identified by AI-powered semantic similarity:
- function main_v11 (76.0% similar)
- class OpenAIResponsesLLM (68.5% similar)
- class OpenAIChatLLM (64.3% similar)
- class LLMClient (57.0% similar)
- function test_api_models_endpoint (55.1% similar)