class GPT5Validator
A comprehensive testing and validation class for OpenAI GPT models, with special support for GPT-5 family models using the Responses API.
File: /tf/active/vicechatdev/docchat/test_gpt5_readiness.py
Lines: 32-255
Complexity: moderate
Purpose
GPT5Validator provides a complete test suite for validating OpenAI language models across four dimensions: basic response generation (non-empty output), large-context token handling, answer consistency, and latency. It automatically detects GPT-5 models and routes them to the appropriate API (the Responses API for GPT-5, the Chat Completions API for all other models). The class stores per-model test results and provides summary reporting.
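For illustration, the routing decision can be inspected directly; a minimal sketch assuming OPENAI_API_KEY is set and using example model names:
validator = GPT5Validator()
# GPT-5 family names are routed to the Responses API,
# everything else to the Chat Completions API
print(validator._is_gpt5('gpt-5-mini'))  # True  -> Responses API
print(validator._is_gpt5('gpt-4o'))      # False -> Chat Completions API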
Source Code
class GPT5Validator:
    def __init__(self):
        self.client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
        self.results = {}
        # Token encoder for approximate token counts across models
        try:
            self.encoder = tiktoken.get_encoding("cl100k_base")
        except Exception:
            self.encoder = None

    # --- Helpers for Responses API (GPT-5 family) ---
    def _is_gpt5(self, model: str) -> bool:
        return 'gpt-5' in model.lower()

    def _responses_create(self, model: str, prompt: str, max_output_tokens: int = 100):
        """Call OpenAI Responses API and return (text, approx_total_tokens)."""
        resp = self.client.responses.create(
            model=model,
            input=prompt,
            max_output_tokens=max_output_tokens,
        )
        # Prefer output_text if available
        content = getattr(resp, 'output_text', None)
        if not content:
            parts = []
            for item in getattr(resp, 'output', []) or []:
                if getattr(item, 'type', '') == 'message':
                    for c in getattr(item, 'content', []) or []:
                        if getattr(c, 'type', '') == 'output_text':
                            parts.append(getattr(c, 'text', '') or '')
            content = ''.join(parts)
        content = (content or '').strip()
        # Approximate token usage (input + output) for reporting only
        total_tokens = None
        if self.encoder:
            try:
                total_tokens = len(self.encoder.encode(prompt)) + len(self.encoder.encode(content))
            except Exception:
                total_tokens = None
        return content, total_tokens

    def test_basic_responses(self, model: str) -> Tuple[bool, str]:
        """Test that model returns non-empty responses"""
        print(f"\n[{model}] Testing basic responses...")
        for i, query in enumerate(TEST_QUERIES, 1):
            try:
                if self._is_gpt5(model):
                    content, tokens = self._responses_create(model, query, max_output_tokens=100)
                else:
                    response = self.client.chat.completions.create(
                        model=model,
                        messages=[{"role": "user", "content": query}],
                        max_tokens=100,
                        temperature=0
                    )
                    content = response.choices[0].message.content
                    tokens = getattr(getattr(response, 'usage', None), 'total_tokens', None)
                if not content or len(content.strip()) == 0:
                    return False, f"Empty response for query {i}: '{query}'"
                tokens_str = f", {tokens} tokens" if tokens is not None else ""
                print(f" ✓ Query {i}: {len(content)} chars{tokens_str}")
            except Exception as e:
                return False, f"Error on query {i}: {str(e)}"
        return True, "All basic responses OK"

    def test_token_limits(self, model: str) -> Tuple[bool, str]:
        """Test that model handles large contexts"""
        print(f"\n[{model}] Testing token limits...")
        try:
            if self._is_gpt5(model):
                content, tokens = self._responses_create(model, LARGE_CONTEXT_TEST, max_output_tokens=200)
                # If we couldn't compute tokens, approximate with encoder
                if tokens is None and self.encoder:
                    try:
                        tokens = len(self.encoder.encode(LARGE_CONTEXT_TEST)) + len(self.encoder.encode(content))
                    except Exception:
                        tokens = None
            else:
                response = self.client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": LARGE_CONTEXT_TEST}],
                    max_tokens=200,
                    temperature=0
                )
                content = response.choices[0].message.content
                tokens = getattr(getattr(response, 'usage', None), 'total_tokens', None)
            if not content or len(content.strip()) == 0:
                return False, "Empty response for large context"
            tokens_str = f"{tokens} tokens" if tokens is not None else "unknown tokens"
            print(f" ✓ Large context: {tokens_str}, response: {len(content)} chars")
            # Check if claimed context window is accurate
            if self._is_gpt5(model) and tokens is not None and tokens > 200000:
                return False, f"Token usage {tokens} exceeds claimed 200k limit"
            return True, f"Token handling OK ({tokens_str})"
        except Exception as e:
            return False, f"Error with large context: {str(e)}"

    def test_consistency(self, model: str) -> Tuple[bool, str]:
        """Test that model returns consistent results"""
        print(f"\n[{model}] Testing consistency (5 runs)...")
        test_query = "What is the capital of France?"
        responses = []
        for i in range(5):
            try:
                if self._is_gpt5(model):
                    content, _ = self._responses_create(model, test_query, max_output_tokens=50)
                else:
                    response = self.client.chat.completions.create(
                        model=model,
                        messages=[{"role": "user", "content": test_query}],
                        max_tokens=50,
                        temperature=0
                    )
                    content = response.choices[0].message.content
                if not content:
                    return False, f"Empty response on run {i+1}"
                responses.append(content.strip().lower())
                print(f" Run {i+1}: {content[:50]}...")
                time.sleep(0.5)  # Rate limiting
            except Exception as e:
                return False, f"Error on run {i+1}: {str(e)}"
        # All responses should contain "paris"
        if not all('paris' in r for r in responses):
            return False, "Inconsistent responses detected"
        return True, "Consistency OK (all responses correct)"

    def test_latency(self, model: str) -> Tuple[bool, str]:
        """Test response time"""
        print(f"\n[{model}] Testing latency...")
        latencies = []
        for i in range(3):
            start = time.time()
            try:
                if self._is_gpt5(model):
                    self._responses_create(model, "Hi", max_output_tokens=50)
                else:
                    response = self.client.chat.completions.create(
                        model=model,
                        messages=[{"role": "user", "content": "Hi"}],
                        max_tokens=50,
                        temperature=0
                    )
                latency = time.time() - start
                latencies.append(latency)
                print(f" Run {i+1}: {latency:.2f}s")
                time.sleep(0.5)
            except Exception as e:
                return False, f"Error on run {i+1}: {str(e)}"
        avg_latency = sum(latencies) / len(latencies)
        if avg_latency > 10:
            return False, f"High latency: {avg_latency:.2f}s average"
        return True, f"Latency OK ({avg_latency:.2f}s average)"

    def run_all_tests(self, model: str) -> Dict[str, Tuple[bool, str]]:
        """Run all tests for a model"""
        print(f"\n{'='*60}")
        print(f"Testing Model: {model}")
        print(f"{'='*60}")
        tests = {
            'basic_responses': self.test_basic_responses,
            'token_limits': self.test_token_limits,
            'consistency': self.test_consistency,
            'latency': self.test_latency,
        }
        results = {}
        for test_name, test_func in tests.items():
            try:
                success, message = test_func(model)
                results[test_name] = (success, message)
            except Exception as e:
                results[test_name] = (False, f"Test crashed: {str(e)}")
        return results

    def print_summary(self):
        """Print test summary"""
        print(f"\n{'='*60}")
        print("TEST SUMMARY")
        print(f"{'='*60}\n")
        all_passed = True
        for model, tests in self.results.items():
            print(f"\n{model}:")
            passed = sum(1 for success, _ in tests.values() if success)
            total = len(tests)
            for test_name, (success, message) in tests.items():
                status = "✓ PASS" if success else "✗ FAIL"
                print(f" {status} | {test_name}: {message}")
            model_passed = passed == total
            print(f" Result: {passed}/{total} tests passed")
            if not model_passed:
                all_passed = False
        return all_passed
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| bases | - | - | - |
Parameter Details
__init__: No parameters required. Initializes the validator with an OpenAI client using the OPENAI_API_KEY environment variable, sets up an empty results dictionary, and attempts to load the tiktoken encoder for token counting.
Return Value
Instantiation returns a GPT5Validator object. Test methods return Tuple[bool, str] where the boolean indicates success/failure and the string provides a descriptive message. run_all_tests() returns Dict[str, Tuple[bool, str]] mapping test names to their results. print_summary() returns a boolean indicating whether all tests passed.
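For example, the return values can be consumed as follows (a minimal sketch; the model name is illustrative):
validator = GPT5Validator()
results = validator.run_all_tests('gpt-5-preview')     # Dict[str, Tuple[bool, str]]
for test_name, (success, message) in results.items():
    print(f"{test_name}: {'PASS' if success else 'FAIL'} - {message}")
ok, detail = validator.test_latency('gpt-5-preview')   # Tuple[bool, str]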
Class Interface
Methods
__init__(self)
Purpose: Initialize the validator with OpenAI client, results storage, and token encoder
Returns: None
_is_gpt5(self, model: str) -> bool
Purpose: Determine if a model is part of the GPT-5 family by checking model name
Parameters:
model: Model identifier string to check
Returns: True if 'gpt-5' is in the model name (case-insensitive), False otherwise
_responses_create(self, model: str, prompt: str, max_output_tokens: int = 100)
Purpose: Call OpenAI Responses API for GPT-5 models and extract text with approximate token count
Parameters:
model: GPT-5 model identifier
prompt: Input text/prompt to send to the model
max_output_tokens: Maximum tokens in the response (default 100)
Returns: Tuple of (response_text: str, total_tokens: int|None) where total_tokens is approximate input+output count
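A minimal sketch of calling the helper directly (the prompt and model name are illustrative; the token count is a tiktoken-based approximation that may be None):
text, approx_tokens = validator._responses_create(
    'gpt-5-preview', 'Summarize retrieval-augmented generation in one sentence.', max_output_tokens=80
)
print(len(text), approx_tokens)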
test_basic_responses(self, model: str) -> Tuple[bool, str]
Purpose: Test that the model returns non-empty responses for all queries in TEST_QUERIES
Parameters:
model: Model identifier to test
Returns: Tuple of (success: bool, message: str) indicating test result and details
test_token_limits(self, model: str) -> Tuple[bool, str]
Purpose: Test that the model handles large contexts (LARGE_CONTEXT_TEST) and respects token limits
Parameters:
model: Model identifier to test
Returns: Tuple of (success: bool, message: str) with token usage information
test_consistency(self, model: str) -> Tuple[bool, str]
Purpose: Test that the model returns consistent correct answers across 5 runs with temperature=0
Parameters:
model: Model identifier to test
Returns: Tuple of (success: bool, message: str) indicating whether all responses contained 'paris'
test_latency(self, model: str) -> Tuple[bool, str]
Purpose: Test response time across 3 runs and verify average latency is under 10 seconds
Parameters:
model: Model identifier to test
Returns: Tuple of (success: bool, message: str) with average latency information
run_all_tests(self, model: str) -> Dict[str, Tuple[bool, str]]
Purpose: Execute all test methods for a given model and return comprehensive results
Parameters:
model: Model identifier to test
Returns: Dictionary mapping test names ('basic_responses', 'token_limits', 'consistency', 'latency') to their (success, message) tuples
print_summary(self) -> bool
Purpose: Print formatted summary of all test results stored in self.results
Returns: Boolean indicating whether all tests for all models passed
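Because print_summary() returns an overall boolean, it can gate an automated run; a minimal sketch (the exit-code convention is this example's assumption, not behavior of the class):
validator.results['gpt-4'] = validator.run_all_tests('gpt-4')
all_passed = validator.print_summary()
sys.exit(0 if all_passed else 1)  # non-zero exit code signals failure to CI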
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| client | OpenAI | OpenAI API client instance initialized with API key from environment | instance |
| results | Dict[str, Dict[str, Tuple[bool, str]]] | Storage for test results, mapping model names to their test outcomes | instance |
| encoder | tiktoken.Encoding or None | Tiktoken encoder for approximate token counting (cl100k_base), None if loading fails | instance |
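Because encoder may be None, downstream code should treat it as optional; a minimal sketch of the fallback pattern (approx_token_count is a hypothetical helper, not part of the class):
def approx_token_count(validator: GPT5Validator, text: str) -> int:
    # Use tiktoken when available, otherwise fall back to a rough word count
    if validator.encoder is not None:
        return len(validator.encoder.encode(text))
    return len(text.split())  # crude approximation used only in this example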
Dependencies
openai, tiktoken, os, sys, time, typing, traceback
Required Imports
import os
import sys
import time
from openai import OpenAI
from typing import Dict, List, Tuple
import tiktoken
import traceback
Conditional/Optional Imports
These imports are only needed under specific conditions:
import tiktoken
Condition: Required for token counting functionality; gracefully degrades if unavailable
Optional
Usage Example
# Set up environment
import os
os.environ['OPENAI_API_KEY'] = 'your-api-key'
# Define required test constants
TEST_QUERIES = [
    'What is 2+2?',
    'Explain photosynthesis briefly',
    'Name three colors'
]
LARGE_CONTEXT_TEST = 'Lorem ipsum ' * 1000 # Large text
# Create validator instance
validator = GPT5Validator()
# Test a single model
results = validator.run_all_tests('gpt-5-preview')
# Store results for multiple models
validator.results['gpt-5-preview'] = results
validator.results['gpt-4'] = validator.run_all_tests('gpt-4')
# Print comprehensive summary
all_passed = validator.print_summary()
# Run individual tests
success, message = validator.test_basic_responses('gpt-5-preview')
print(f'Basic test: {message}')
success, message = validator.test_consistency('gpt-4')
print(f'Consistency: {message}')
Best Practices
- Always set OPENAI_API_KEY environment variable before instantiating the class
- Define TEST_QUERIES and LARGE_CONTEXT_TEST constants in your module before using the validator
- Use run_all_tests() for comprehensive validation rather than individual test methods
- Store results in the validator.results dictionary before calling print_summary()
- Be aware of rate limiting - the class includes sleep delays between requests
- The class automatically detects GPT-5 models by checking if 'gpt-5' is in the model name (case-insensitive)
- Token counting is approximate and may be None if tiktoken encoder fails to load
- Test methods have side effects (print statements) - capture stdout if you need silent operation (see the sketch after this list)
- The consistency test expects 'paris' in responses for the capital of France question - modify for different test queries
- Latency threshold is hardcoded to 10 seconds - responses slower than this are considered failures
- The class maintains state in self.results - clear or create new instance for fresh test runs
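As noted above, the print-based side effects can be captured for silent operation; a minimal sketch using the standard library (the log filename is illustrative):
import io
from contextlib import redirect_stdout

buffer = io.StringIO()
with redirect_stdout(buffer):
    results = validator.run_all_tests('gpt-4')  # console output is captured
with open('validation_log.txt', 'w', encoding='utf-8') as fh:  # illustrative path
    fh.write(buffer.getvalue())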
Tags
Similar Components
Components with related functionality, identified by AI-powered semantic similarity:
- function main_v11 (76.0% similar)
- class OpenAIResponsesLLM (68.5% similar)
- class OpenAIChatLLM (64.3% similar)
- class LLMClient (57.0% similar)
- function test_api_models_endpoint (55.1% similar)