class ScriptExecutor_v1
A sandboxed Python script executor that safely runs user-provided Python code with timeout controls, security restrictions, and isolated execution environments for data analysis tasks.
/tf/active/vicechatdev/full_smartstat/script_executor.py
38 - 464
complex
Purpose
ScriptExecutor provides a secure environment for executing untrusted Python scripts on pandas DataFrames. It validates scripts for dangerous operations, executes them in restricted namespaces with timeout protection, captures outputs and plots, and manages session-based file storage. Designed for data analysis workflows where users submit custom Python code that needs to be executed safely without compromising system security.
Source Code
class ScriptExecutor:
"""Sandboxed Python script executor with timeout and restrictions"""
def __init__(self, config: Config):
self.config = config
self.sandbox_dir = config.SANDBOX_FOLDER
self.timeout = config.SCRIPT_TIMEOUT
self.allowed_imports = config.ALLOWED_IMPORTS
# Ensure sandbox directory exists
os.makedirs(self.sandbox_dir, exist_ok=True)
def execute_script(self, script: str, data: pd.DataFrame,
session_id: str, step_id: str) -> Dict[str, Any]:
"""
Execute Python script in sandboxed environment
Returns execution results, outputs, and any errors
"""
execution_result = {
'success': False,
'output': '',
'error': '',
'results': {},
'plots': [],
'execution_time': 0,
'warnings': []
}
start_time = time.time()
try:
# Validate script safety
validation_result = self._validate_script(script)
if not validation_result['safe']:
execution_result['error'] = f"Script validation failed: {validation_result['reason']}"
return execution_result
# Create isolated execution environment
execution_context = self._create_execution_context(data, session_id, step_id)
# Execute script with timeout
output, error, results, plots = self._execute_with_timeout(script, execution_context)
execution_result.update({
'success': error == '',
'output': output,
'error': error,
'results': results,
'plots': plots,
'execution_time': time.time() - start_time
})
except Exception as e:
try:
error_msg = f"Execution failed: {str(e)}"
# Try to add traceback if possible
exc_type, exc_value, exc_traceback = sys.exc_info()
if exc_traceback:
try:
tb_lines = traceback.format_tb(exc_traceback)
error_msg += "\n" + "".join(tb_lines)
except Exception:
error_msg += "\n(Traceback unavailable)"
except Exception:
error_msg = f"Execution failed: {str(e)}"
execution_result['error'] = error_msg
execution_result['execution_time'] = time.time() - start_time
logger.error(f"Script execution error: {error_msg}")
return execution_result
def _validate_script(self, script: str) -> Dict[str, Any]:
"""Validate script for security and safety"""
validation = {'safe': True, 'reason': '', 'warnings': []}
# Check for dangerous imports
dangerous_imports = ['os', 'sys', 'subprocess', 'eval', 'exec', 'compile',
'open', '__import__', 'vars']
for dangerous in dangerous_imports:
if dangerous in script:
# More precise checking
import re
patterns = [
rf'\bimport\s+{dangerous}\b',
rf'\bfrom\s+{dangerous}\s+import\b',
rf'\b{dangerous}\s*\(',
]
for pattern in patterns:
if re.search(pattern, script):
validation['safe'] = False
validation['reason'] = f"Dangerous import/function detected: {dangerous}"
return validation
# Check for file operations (except matplotlib save)
file_ops = ['open(', 'file(', 'read(', 'write(']
for op in file_ops:
if op in script and 'plt.savefig' not in script:
validation['warnings'].append(f"File operation detected: {op}")
# Check script length
if len(script) > 10000: # 10KB limit
validation['safe'] = False
validation['reason'] = "Script too long"
return validation
# Check for infinite loops (basic detection)
if 'while True:' in script:
validation['warnings'].append("Potential infinite loop detected")
return validation
def _create_execution_context(self, data: pd.DataFrame, session_id: str, step_id: str) -> Dict[str, Any]:
"""Create isolated execution context"""
# Create session-specific directory
session_dir = self.sandbox_dir / session_id
os.makedirs(session_dir, exist_ok=True)
# Create step-specific directory for outputs
step_dir = session_dir / step_id
os.makedirs(step_dir, exist_ok=True)
context = {
'df': data.copy(), # Provide data as 'df'
'session_dir': str(session_dir),
'step_dir': str(step_dir),
'plots_dir': str(step_dir / 'plots'),
'np': np,
'pd': pd,
'plt': plt,
'sns': sns,
'stats': stats,
'sm': sm,
'warnings': warnings
}
# Create plots directory
os.makedirs(context['plots_dir'], exist_ok=True)
return context
def _execute_with_timeout(self, script: str, context: Dict[str, Any]) -> Tuple[str, str, Dict, List[str]]:
"""Execute script with timeout and capture outputs"""
# Prepare script with context setup
full_script = self._prepare_script(script, context)
# Capture stdout and stderr
old_stdout = sys.stdout
old_stderr = sys.stderr
stdout_capture = io.StringIO()
stderr_capture = io.StringIO()
results = {}
plots = []
try:
# Redirect output
sys.stdout = stdout_capture
sys.stderr = stderr_capture
# Use signal-based timeout but restore original handler afterwards
original_handler = None
if hasattr(signal, 'SIGALRM'):
original_handler = signal.signal(signal.SIGALRM, signal.SIG_DFL)
try:
def timeout_handler(signum, frame):
raise TimeoutError("Script execution timed out")
if hasattr(signal, 'SIGALRM'):
signal.signal(signal.SIGALRM, timeout_handler)
signal.alarm(self.timeout)
# Execute script in restricted environment
exec_globals = self._create_restricted_globals(context)
exec_locals = {}
exec(full_script, exec_globals, exec_locals)
# Extract results if available (check both locals and globals)
if 'results' in exec_locals:
results = exec_locals['results']
elif 'results' in exec_globals:
results = exec_globals['results']
# Find generated plots
plots = self._collect_plots(context['plots_dir'])
finally:
# Always restore original signal handler and cancel alarm
if hasattr(signal, 'SIGALRM'):
signal.alarm(0)
if original_handler is not None:
signal.signal(signal.SIGALRM, original_handler)
output = stdout_capture.getvalue()
error = stderr_capture.getvalue()
except TimeoutError:
error = f"Script execution timed out after {self.timeout} seconds"
output = stdout_capture.getvalue()
except Exception as e:
# Get exception info without using format_exc which might have compatibility issues
exc_type, exc_value, exc_traceback = sys.exc_info()
error_lines = [f"Execution error: {str(e)}"]
# Try to get traceback info safely
try:
if exc_traceback:
tb_lines = traceback.format_tb(exc_traceback)
error_lines.extend(tb_lines)
except Exception:
error_lines.append("(Traceback unavailable due to compatibility issue)")
error = "\n".join(error_lines)
output = stdout_capture.getvalue()
finally:
# Restore stdout/stderr
sys.stdout = old_stdout
sys.stderr = old_stderr
return output, error, results, plots
def _prepare_script(self, script: str, context: Dict[str, Any]) -> str:
"""Prepare script with necessary setup and context"""
setup_code = f"""
# Setup code - automatically added
# Modules are pre-imported and available as: pd, np, plt, sns, stats, sm, warnings
# Suppress warnings
warnings.filterwarnings('ignore')
# Set up matplotlib for non-interactive use
plt.ioff()
plt.style.use('default')
# Set up directories
plots_dir = r"{context['plots_dir']}"
# Data is available as 'df' and 'data'
data = df # Alias for compatibility
# Initialize results dictionary at global scope
results = {{
'summary_statistics': {{}},
'test_results': {{}},
'plots': [],
'assumptions': {{}},
'interpretation': ''
}}
# Override plt.show to auto-save instead
original_show = plt.show
def show_and_save():
global results
try:
# Get current figure and save it directly
fig = plt.gcf()
if fig.get_axes(): # Only save if figure has content
filename = f"plot_{{len(results.get('plots', []))}}.png"
filepath = plots_dir + "/" + filename
fig.savefig(filepath, dpi=150, bbox_inches='tight')
if 'plots' not in results:
results['plots'] = []
results['plots'].append(filename)
plt.close(fig)
except Exception as e:
print(f"Warning: Could not save plot: {{e}}")
plt.show = show_and_save
"""
return setup_code + "\n" + script
def _create_restricted_globals(self, context: Dict[str, Any]) -> Dict[str, Any]:
"""Create restricted global namespace for execution"""
# Start with minimal safe builtins
safe_builtins = {
'abs', 'all', 'any', 'bool', 'dict', 'enumerate', 'filter',
'float', 'int', 'len', 'list', 'map', 'max', 'min', 'range',
'round', 'set', 'sorted', 'str', 'sum', 'tuple', 'type', 'zip',
'print', 'isinstance', 'hasattr', 'getattr', 'setattr', 'locals', 'globals',
# Add exception types for error handling
'Exception', 'ValueError', 'TypeError', 'KeyError', 'IndexError', 'AttributeError'
}
# Create restricted builtins
restricted_builtins = {name: __builtins__[name] for name in safe_builtins if name in __builtins__}
# Add mathematical functions
import math
restricted_builtins['math'] = math
globals_dict = {
'__builtins__': restricted_builtins,
# Add allowed modules
'pd': pd,
'np': np,
'plt': plt,
'sns': sns,
'stats': stats,
'sm': sm,
'warnings': warnings,
# Add data
'df': context['df']
}
return globals_dict
def _collect_plots(self, plots_dir: str) -> List[str]:
"""Collect generated plot files"""
plots = []
if os.path.exists(plots_dir):
for file in os.listdir(plots_dir):
if file.endswith(('.png', '.jpg', '.jpeg', '.svg', '.pdf')):
plots.append(os.path.join(plots_dir, file))
return plots
def extract_results_summary(self, results: Dict[str, Any]) -> Dict[str, Any]:
"""Extract and format key results for display"""
summary = {
'statistics': {},
'tests': {},
'plots_count': 0,
'key_findings': []
}
try:
# Extract summary statistics
if 'summary_statistics' in results:
summary['statistics'] = results['summary_statistics']
# Extract test results
if 'test_results' in results:
summary['tests'] = results['test_results']
# Count plots
if 'plots' in results:
summary['plots_count'] = len(results['plots'])
# Extract interpretation
if 'interpretation' in results:
summary['interpretation'] = results['interpretation']
# Try to extract key numerical findings
for key, value in results.items():
if isinstance(value, (int, float)) and not np.isnan(value):
summary['key_findings'].append(f"{key}: {value}")
elif isinstance(value, dict):
for subkey, subvalue in value.items():
if isinstance(subvalue, (int, float)) and not np.isnan(subvalue):
summary['key_findings'].append(f"{key}.{subkey}: {subvalue}")
except Exception as e:
logger.warning(f"Error extracting results summary: {str(e)}")
return summary
def cleanup_session(self, session_id: str, keep_recent: int = 5):
"""Clean up old session files"""
try:
session_dir = self.sandbox_dir / session_id
if session_dir.exists():
# Get all step directories
step_dirs = [d for d in session_dir.iterdir() if d.is_dir()]
step_dirs.sort(key=lambda x: x.stat().st_mtime, reverse=True)
# Remove old step directories
for step_dir in step_dirs[keep_recent:]:
import shutil
shutil.rmtree(step_dir)
except Exception as e:
logger.warning(f"Error cleaning up session {session_id}: {str(e)}")
def get_plot_data(self, plot_path: str) -> Optional[str]:
"""Get plot as base64 encoded string for web display"""
try:
if os.path.exists(plot_path):
with open(plot_path, 'rb') as f:
plot_data = base64.b64encode(f.read()).decode('utf-8')
return plot_data
except Exception as e:
logger.error(f"Error reading plot {plot_path}: {str(e)}")
return None
def validate_data_access(self, script: str, available_columns: List[str]) -> Dict[str, Any]:
"""Validate that script only accesses available columns"""
validation = {
'valid': True,
'warnings': [],
'accessed_columns': []
}
import re
# Find column references in script
column_patterns = [
r"df\[['\"](.*?)['\"]\]", # df['column']
r"df\.(\w+)", # df.column
]
accessed_columns = set()
for pattern in column_patterns:
matches = re.findall(pattern, script)
accessed_columns.update(matches)
validation['accessed_columns'] = list(accessed_columns)
# Check if columns exist
missing_columns = accessed_columns - set(available_columns)
if missing_columns:
validation['warnings'].append(f"Columns not found in dataset: {list(missing_columns)}")
return validation
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| bases | - | - | - |
Parameter Details
config: A Config object containing execution settings including SANDBOX_FOLDER (directory for isolated execution), SCRIPT_TIMEOUT (maximum execution time in seconds), and ALLOWED_IMPORTS (list of permitted module imports)
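For illustration, a minimal stand-in for Config with just the three attributes the executor reads; this dataclass is hypothetical, and the real class is imported from config (see the usage example below):
from dataclasses import dataclass, field
from pathlib import Path
from typing import List

@dataclass
class MinimalConfig:  # hypothetical stand-in; the real project uses config.Config
    SANDBOX_FOLDER: Path = Path('./sandbox')  # root for isolated execution (joined with /, so a Path is safest)
    SCRIPT_TIMEOUT: int = 30                  # maximum execution time in seconds
    ALLOWED_IMPORTS: List[str] = field(default_factory=lambda: [
        'pandas', 'numpy', 'matplotlib', 'seaborn', 'scipy', 'statsmodels'
    ])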
Return Value
Instantiation returns a ScriptExecutor object. The main execute_script method returns a dictionary with keys: 'success' (bool), 'output' (captured stdout), 'error' (error messages), 'results' (dict of script results), 'plots' (list of plot file paths), 'execution_time' (float in seconds), and 'warnings' (list of warning messages).
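For reference, a sketch of that structure with illustrative values (the keys mirror execute_script above; the path and numbers are made up):
example_result = {
    'success': True,               # False when validation or execution fails
    'output': 'Mean of A: 2.0\n',  # captured stdout
    'error': '',                   # stderr / exception text, empty on success
    'results': {'summary_statistics': {}, 'test_results': {}, 'plots': ['plot_0.png'],
                'assumptions': {}, 'interpretation': ''},
    'plots': ['<sandbox>/session_123/step_001/plots/plot_0.png'],
    'execution_time': 0.42,        # seconds
    'warnings': [],
}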
Class Interface
Methods
__init__(self, config: Config)
Purpose: Initialize the ScriptExecutor with configuration settings and create sandbox directory
Parameters:
config: Config object containing SANDBOX_FOLDER, SCRIPT_TIMEOUT, and ALLOWED_IMPORTS
Returns: None
execute_script(self, script: str, data: pd.DataFrame, session_id: str, step_id: str) -> Dict[str, Any]
Purpose: Main method to execute a Python script in a sandboxed environment with the provided DataFrame
Parameters:
script: Python code string to execute
data: pandas DataFrame to make available in the script as 'df' and 'data'
session_id: Unique identifier for the user session (used for file organization)
step_id: Unique identifier for this execution step (used for file organization)
Returns: Dictionary with keys: 'success' (bool), 'output' (str), 'error' (str), 'results' (dict), 'plots' (list of file paths), 'execution_time' (float), 'warnings' (list)
_validate_script(self, script: str) -> Dict[str, Any]
Purpose: Validate script for security issues like dangerous imports, file operations, and excessive length
Parameters:
script: Python code string to validate
Returns: Dictionary with keys: 'safe' (bool), 'reason' (str explaining why unsafe), 'warnings' (list of non-critical issues)
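A quick sketch of what the validator reports, where executor is an initialized ScriptExecutor; _validate_script is private, so calling it directly is only for illustration:
blocked = executor._validate_script("import os\nprint(os.listdir('.'))")
# -> {'safe': False, 'reason': 'Dangerous import/function detected: os', 'warnings': []}

allowed = executor._validate_script("results['mean'] = df['A'].mean()")
# -> {'safe': True, 'reason': '', 'warnings': []}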
_create_execution_context(self, data: pd.DataFrame, session_id: str, step_id: str) -> Dict[str, Any]
Purpose: Create an isolated execution context with data, directories, and pre-imported modules
Parameters:
data: pandas DataFrame to include in context
session_id: Session identifier for directory creation
step_id: Step identifier for directory creation
Returns: Dictionary containing 'df' (DataFrame copy), directory paths, and pre-imported modules (np, pd, plt, sns, stats, sm, warnings)
_execute_with_timeout(self, script: str, context: Dict[str, Any]) -> Tuple[str, str, Dict, List[str]]
Purpose: Execute script with timeout protection, capturing stdout, stderr, results, and plots
Parameters:
script: Python code to execute
context: Execution context dictionary from _create_execution_context
Returns: Tuple of (stdout output, stderr/error output, results dictionary, list of plot file paths)
_prepare_script(self, script: str, context: Dict[str, Any]) -> str
Purpose: Prepend setup code to user script including imports, matplotlib configuration, and results dictionary initialization
Parameters:
script: User's Python code
context: Execution context with directory paths
Returns: Complete script string with setup code prepended
_create_restricted_globals(self, context: Dict[str, Any]) -> Dict[str, Any]
Purpose: Create a restricted global namespace with only safe builtins and allowed modules
Parameters:
context: Execution context containing DataFrame and modules
Returns: Dictionary to use as globals in exec(), containing restricted builtins and safe modules
_collect_plots(self, plots_dir: str) -> List[str]
Purpose: Collect all plot files generated during script execution from the plots directory
Parameters:
plots_dir: Path to directory containing generated plots
Returns: List of full file paths to plot files (png, jpg, jpeg, svg, pdf)
extract_results_summary(self, results: Dict[str, Any]) -> Dict[str, Any]
Purpose: Extract and format key results from script execution for display purposes
Parameters:
results: Results dictionary from script execution
Returns: Formatted summary dictionary with keys: 'statistics', 'tests', 'plots_count', 'key_findings', 'interpretation'
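A sketch of the typical call chain, assuming result is a successful return value of execute_script:
summary = executor.extract_results_summary(result['results'])
print(summary['statistics'])     # contents of results['summary_statistics']
print(summary['tests'])          # contents of results['test_results']
print(summary['plots_count'])    # number of plots recorded by the script
for finding in summary['key_findings']:
    print(finding)               # e.g. "mean_A: 2.0" for numeric entries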
cleanup_session(self, session_id: str, keep_recent: int = 5)
Purpose: Remove old step directories for a session, keeping only the most recent ones to manage disk space
Parameters:
session_id: Session identifier to clean up
keep_recent: Number of most recent step directories to keep (default 5)
Returns: None
get_plot_data(self, plot_path: str) -> Optional[str]
Purpose: Read a plot file and return it as base64-encoded string for web display
Parameters:
plot_path: Full path to plot file
Returns: Base64-encoded string of plot image, or None if file cannot be read
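A sketch of turning that payload into an inline image for a web page, assuming result is the dictionary returned by execute_script and at least one plot was saved; the plot is assumed to be a PNG, which is what the overridden plt.show() writes:
encoded = executor.get_plot_data(result['plots'][0])
if encoded is not None:
    img_tag = f'<img src="data:image/png;base64,{encoded}" alt="analysis plot"/>'
    # embed img_tag in the rendered HTML response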
validate_data_access(self, script: str, available_columns: List[str]) -> Dict[str, Any]
Purpose: Validate that script only accesses columns that exist in the DataFrame
Parameters:
script: Python code to analyze
available_columns: List of column names available in the DataFrame
Returns: Dictionary with keys: 'valid' (bool), 'warnings' (list of issues), 'accessed_columns' (list of column names found in script)
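A sketch of a pre-flight column check before execution, with script and df as in the usage example below; column names are illustrative:
check = executor.validate_data_access(script, available_columns=list(df.columns))
for warning in check['warnings']:
    print(warning)                  # e.g. "Columns not found in dataset: ['weight']"
print(check['accessed_columns'])    # names matched via df['...'] or df.<attr>
# note: the df.<attr> pattern also matches method calls, so names like 'mean' can appear here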
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| config | Config | Configuration object containing execution settings | instance |
| sandbox_dir | Path or str | Root directory for sandboxed script execution and file storage | instance |
| timeout | int | Maximum execution time in seconds for scripts | instance |
| allowed_imports | list | List of module names that scripts are permitted to import | instance |
Dependencies
os, sys, subprocess, tempfile, json, logging, traceback, io, contextlib, time, signal, threading, queue, pathlib, pickle, base64, pandas, numpy, matplotlib, seaborn, scipy, statsmodels, warnings, math, re, shutil
Required Imports
import os
import sys
import subprocess
import tempfile
import json
import logging
import traceback
import io
import contextlib
import time
import signal
import threading
import queue
from typing import Dict, List, Any, Optional, Tuple
from pathlib import Path
import pickle
import base64
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import warnings
from config import Config
import math
import re
import shutil
Conditional/Optional Imports
These imports are only needed under specific conditions:
import signal
Condition: only on Unix-like systems for SIGALRM-based timeout; Windows systems will skip signal-based timeout
Optional
Usage Example
from config import Config
import pandas as pd
from pathlib import Path
# Setup configuration
config = Config()
config.SANDBOX_FOLDER = Path('./sandbox')
config.SCRIPT_TIMEOUT = 30
config.ALLOWED_IMPORTS = ['pandas', 'numpy', 'matplotlib', 'seaborn', 'scipy', 'statsmodels']
# Create executor
executor = ScriptExecutor(config)
# Prepare data
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
# Execute script
script = '''
# pd, np, plt, sns, stats and sm are pre-injected; import statements are unnecessary (and blocked by the restricted builtins)
results['mean_A'] = df['A'].mean()
results['sum_B'] = df['B'].sum()
print(f"Mean of A: {results['mean_A']}")
plt.figure()
plt.plot(df['A'], df['B'])
plt.title('A vs B')
plt.show()
'''
result = executor.execute_script(
script=script,
data=df,
session_id='session_123',
step_id='step_001'
)
if result['success']:
print(f"Output: {result['output']}")
print(f"Results: {result['results']}")
print(f"Plots: {result['plots']}")
print(f"Execution time: {result['execution_time']}s")
else:
print(f"Error: {result['error']}")
# Get plot as base64 for web display
if result['plots']:
plot_data = executor.get_plot_data(result['plots'][0])
# Cleanup old files
executor.cleanup_session('session_123', keep_recent=5)
Best Practices
- Always validate scripts before execution using the built-in validation
- Use unique session_id and step_id for each execution to prevent file conflicts
- Regularly call cleanup_session to prevent disk space issues from accumulated plots
- Set appropriate SCRIPT_TIMEOUT values based on expected analysis complexity (default 30 seconds recommended)
- The executor creates a 'results' dictionary in the script namespace - users should populate this with their findings (see the sketch after this list)
- Scripts should use plt.show() to save plots automatically; direct file operations are restricted
- The DataFrame is available as both 'df' and 'data' in the script namespace
- Signal-based timeout only works on Unix-like systems; Windows will execute without timeout protection
- Scripts are executed with restricted builtins - dangerous functions like eval, exec, open are blocked
- All script execution is isolated - changes to data don't affect the original DataFrame
- Use get_plot_data() to retrieve plots as base64 for web display
- The executor automatically suppresses warnings and configures matplotlib for non-interactive use
- Scripts have access to: pandas (pd), numpy (np), matplotlib (plt), seaborn (sns), scipy.stats (stats), statsmodels (sm)
- File operations are restricted to the session-specific sandbox directory
- Maximum script length is 10KB to prevent resource exhaustion
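As a reference for the results convention mentioned above, a sketch of a user script that fills the pre-initialized dictionary; the keys match the setup code injected by _prepare_script, while the statistical content is purely illustrative:
# Submitted to execute_script; df, pd, np, plt, sns, stats, sm and the
# pre-initialized 'results' dict are already in scope, so no imports are needed.
results['summary_statistics']['A'] = {
    'mean': float(df['A'].mean()),
    'std': float(df['A'].std()),
}

t_stat, p_value = stats.ttest_ind(df['A'], df['B'])
results['test_results']['t_test_A_vs_B'] = {'t': float(t_stat), 'p': float(p_value)}

results['assumptions']['equal_variance'] = 'not checked'
results['interpretation'] = 'Illustrative two-sample comparison of columns A and B.'

plt.figure()
sns.histplot(df['A'])
plt.title('Distribution of A')
plt.show()  # overridden by the executor: saves the figure to plots_dir and records it in results['plots']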
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- class ScriptExecutor (96.6% similar)
- class AgentExecutor_v1 (58.9% similar)
- class AgentExecutor_v2 (58.4% similar)
- class AgentExecutor (57.7% similar)
- function test_agent_executor (50.9% similar)