function smartstat_upload_files
Flask API endpoint that handles multi-file uploads (CSV, Excel, PDF, Word, PowerPoint) to a SmartStat session, processing data files as datasets and documents as information sheets.
/tf/active/vicechatdev/vice_ai/new_app.py
4998 - 5206
complex
Purpose
This endpoint enables users to upload multiple files to a SmartStat analysis session. It processes tabular data files (CSV, Excel) as datasets for statistical analysis and document files (PDF, Word, PowerPoint) as contextual information sheets. The function validates user ownership, extracts content from various file formats, stores datasets and information sheets in the session, and updates metadata for tracking.
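The listing below shows only the view function; the route decorator is not part of the excerpt. Based on the client-side usage example later on this page, the registration is assumed to look roughly like the following (path and HTTP method are inferred, not confirmed by the source):

# Assumed registration, inferred from the usage example further down this page
@app.route('/api/smartstat/<session_id>/upload-files', methods=['POST'])
def smartstat_upload_files(session_id):
    ...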
Source Code
def smartstat_upload_files(session_id):
    """Upload multiple files (CSV data + PDF/Word/PPT info sheets) to SmartStat"""
    user_email = get_current_user()

    session = smartstat_service.get_session(session_id)
    if not session:
        return jsonify({'error': 'Session not found'}), 404

    # Verify ownership
    data_section = data_section_service.get_data_section(session.data_section_id)
    if not data_section or data_section.owner != user_email:
        return jsonify({'error': 'Access denied'}), 403

    try:
        files = request.files.getlist('files[]')
        if not files:
            return jsonify({'error': 'No files provided'}), 400

        datasets_info = []
        info_sheets_info = []
        all_columns = []
        total_rows = 0

        # Process each uploaded file
        for file in files:
            if file.filename == '':
                continue

            # Validate file type
            file_ext = os.path.splitext(file.filename)[1].lower()

            if file_ext in ['.csv', '.xlsx', '.xls']:
                # Handle CSV and Excel as datasets
                try:
                    import tempfile
                    from pathlib import Path

                    # Save temporarily
                    temp_dir = tempfile.mkdtemp()
                    temp_file = Path(temp_dir) / file.filename
                    file.save(str(temp_file))

                    if file_ext == '.csv':
                        # Read CSV
                        from smartstat_service import smart_read_csv
                        df = smart_read_csv(str(temp_file))

                        # Add to session
                        dataset_name = os.path.splitext(file.filename)[0]
                        result = smartstat_service.add_dataset(session_id, dataset_name, df)

                        if result['success'] and result['type'] == 'tabular_data':
                            datasets_info.append(result['dataset_info'])
                            all_columns.extend(df.columns.tolist())
                            total_rows += len(df)
                    else:  # Excel files
                        # For Excel, we need to handle multiple sheets
                        from smartstat_service import read_excel_file

                        # Get all sheets first
                        excel_info = read_excel_file(str(temp_file))
                        sheets_info = excel_info.get('sheets', [])

                        for sheet_info in sheets_info:
                            sheet_name = sheet_info['name']

                            # Load individual sheet
                            sheet_data = read_excel_file(str(temp_file), sheet_name=sheet_name)
                            df = sheet_data['dataframe']
                            validation = sheet_data.get('format_validation')
                            context = sheet_data.get('context')

                            # Add to session datasets with validation and context
                            result = smartstat_service.add_dataset(
                                session_id,
                                sheet_name,
                                df,
                                validation=validation,
                                context=context
                            )

                            if result['success']:
                                if result['type'] == 'information_sheet':
                                    info_sheets_info.append({
                                        'name': sheet_name,
                                        'type': 'information_sheet',
                                        'validation': validation,
                                        'context_length': result.get('context_length', 0)
                                    })
                                else:
                                    datasets_info.append(result['dataset_info'])
                                    all_columns.extend(df.columns.tolist())
                                    total_rows += len(df)

                    # Cleanup
                    temp_file.unlink()

                except Exception as e:
                    logger.error(f"Error processing data file {file.filename}: {e}")
                    continue

            elif file_ext in ['.pdf', '.doc', '.docx', '.ppt', '.pptx', '.rtf', '.odt']:
                # Handle document as information sheet
                try:
                    import tempfile
                    from pathlib import Path

                    # Save temporarily
                    temp_dir = tempfile.mkdtemp()
                    temp_file = Path(temp_dir) / file.filename
                    file.save(str(temp_file))

                    # Extract text content using document processor
                    from document_processor import DocumentProcessor
                    processor = DocumentProcessor()

                    try:
                        # Use the main process_document method
                        processed_result = processor.process_document(str(temp_file))
                        # Get combined text from the result
                        text_content = processor.get_combined_text(processed_result)

                        if not text_content or len(text_content.strip()) == 0:
                            text_content = f"Unable to extract text from {file.filename}"
                    except Exception as doc_error:
                        logger.error(f"Document processing failed for {file.filename}: {doc_error}")
                        # Fallback for text files
                        try:
                            with open(temp_file, 'r', encoding='utf-8', errors='ignore') as f:
                                text_content = f.read()
                        except:
                            text_content = f"Failed to process document: {file.filename}"

                    # Format as markdown info sheet
                    sheet_name = os.path.splitext(file.filename)[0]
                    formatted_content = f"# Document: {sheet_name}\n\n"
                    formatted_content += f"**Source**: {file.filename}\n"
                    formatted_content += f"**Type**: {file_ext.upper()} Document\n\n"
                    formatted_content += "## Content\n\n"
                    formatted_content += text_content

                    # Add to session info sheets
                    session.info_sheets[sheet_name] = formatted_content

                    info_sheets_info.append({
                        'name': sheet_name,
                        'type': 'document_sheet',
                        'filename': file.filename,
                        'format': file_ext,
                        'context_length': len(formatted_content)
                    })

                    # Cleanup
                    temp_file.unlink()

                except Exception as e:
                    logger.error(f"Error processing document {file.filename}: {e}")
                    continue
            else:
                logger.warning(f"Unsupported file type: {file_ext} for {file.filename}")
                continue

        session.updated_at = datetime.now()

        # Save info sheets to JSON file if any were added
        if session.info_sheets:
            session_scripts_dir = Path(f"smartstat_scripts/{session_id}")
            session_scripts_dir.mkdir(exist_ok=True)
            project_path = session_scripts_dir / "project_1"
            project_path.mkdir(exist_ok=True)

            import json
            info_sheets_file = project_path / "info_sheets.json"
            with open(info_sheets_file, 'w', encoding='utf-8') as f:
                json.dump(session.info_sheets, f, indent=2, ensure_ascii=False)
            logger.info(f"Saved {len(session.info_sheets)} info sheets to {info_sheets_file}")

        # Generate combined dataset info
        combined_info = {
            'rows': total_rows,
            'columns': len(set(all_columns)) if all_columns else 0,
            'column_names': list(set(all_columns)) if all_columns else [],
            'datasets': datasets_info,
            'info_sheets': info_sheets_info,
            'dataset_count': len(datasets_info),
            'info_sheet_count': len(info_sheets_info)
        }

        # Update metadata
        if not data_section.metadata:
            data_section.metadata = {}
        data_section.metadata['dataset_count'] = len(datasets_info)
        data_section.metadata['info_sheet_count'] = len(info_sheets_info)
        data_section.metadata['datasets'] = {ds['name']: {'rows': ds['rows'], 'columns': ds['columns']} for ds in datasets_info}
        data_section.metadata['info_sheets'] = [sheet['name'] for sheet in info_sheets_info]
        data_section_service.update_data_section(data_section)

        return jsonify({
            'success': True,
            'dataset_info': combined_info,
            'files_processed': len([f for f in files if f.filename != ''])
        })

    except Exception as e:
        logger.error(f"Error uploading files to SmartStat: {e}")
        return jsonify({'error': str(e)}), 500
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| session_id | - | - | positional_or_keyword |
Parameter Details
session_id: String identifier for the SmartStat session. Must be a valid existing session ID. Used to retrieve the session and associate uploaded files with it. The session must belong to a data section owned by the authenticated user.
Return Value
Returns a Flask JSON response. On success (200): {'success': True, 'dataset_info': {...}, 'files_processed': int} where dataset_info contains rows, columns, column_names, datasets array, info_sheets array, and counts. On error: 404 if session not found, 403 if access denied, 400 if no files provided, or 500 with error message for processing failures.
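For illustration only, a successful response might look like the following; every value is hypothetical, but the keys match the structure assembled in the source:

# Hypothetical example of a 200 response body (values invented for illustration)
example_response = {
    'success': True,
    'files_processed': 2,
    'dataset_info': {
        'rows': 1500,
        'columns': 4,
        'column_names': ['subject_id', 'visit', 'dose', 'response'],
        'datasets': [{'name': 'trial_data', 'rows': 1500, 'columns': 4}],
        'info_sheets': [{'name': 'protocol', 'type': 'document_sheet',
                         'filename': 'protocol.pdf', 'format': '.pdf',
                         'context_length': 12040}],
        'dataset_count': 1,
        'info_sheet_count': 1
    }
}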
Dependencies
flask, os, json, logging, datetime, pathlib, tempfile, pandas, werkzeug, smartstat_service, data_section_service, document_processor
Required Imports
from flask import request, jsonify
import os
import json
import logging
from datetime import datetime
from pathlib import Path
from services import DataSectionService
from smartstat_service import SmartStatService
Conditional/Optional Imports
These imports are only needed under specific conditions:
- import tempfile (required, conditional): always needed for file processing
- from pathlib import Path (required, conditional): always needed for file path operations
- from smartstat_service import smart_read_csv (required, conditional): when processing CSV files
- from smartstat_service import read_excel_file (required, conditional): when processing Excel files (.xlsx, .xls)
- from document_processor import DocumentProcessor (required, conditional): when processing document files (PDF, Word, PowerPoint)
Usage Example
// Client-side usage (JavaScript fetch example)
const formData = new FormData();
formData.append('files[]', csvFile);
formData.append('files[]', pdfFile);
formData.append('files[]', excelFile);

fetch('/api/smartstat/session_123/upload-files', {
    method: 'POST',
    headers: {
        'Authorization': 'Bearer <token>'
    },
    body: formData
})
.then(response => response.json())
.then(data => {
    console.log('Files processed:', data.files_processed);
    console.log('Datasets:', data.dataset_info.datasets);
    console.log('Info sheets:', data.dataset_info.info_sheets);
});

# Server-side testing example
with app.test_client() as client:
    with open('data.csv', 'rb') as csv_file, open('info.pdf', 'rb') as pdf_file:
        response = client.post(
            '/api/smartstat/session_123/upload-files',
            data={'files[]': [csv_file, pdf_file]},
            content_type='multipart/form-data'
        )
    result = response.get_json()
    print(f"Success: {result['success']}")
    print(f"Datasets: {result['dataset_info']['dataset_count']}")
Best Practices
- Always validate user ownership before allowing file uploads to prevent unauthorized access
- Use temporary directories for file processing and ensure cleanup with unlink() to prevent disk space issues
- Handle multiple file formats gracefully with try-except blocks to prevent one file failure from stopping the entire upload
- Log errors for individual file processing failures but continue processing remaining files
- Validate file extensions before processing to reject unsupported formats early
- Store information sheets in both session memory and persistent JSON files for durability
- Update metadata atomically after all files are processed to maintain consistency
- Use secure_filename or similar validation in production to prevent path traversal attacks (see the hardening sketch after this list)
- Consider implementing file size limits to prevent memory exhaustion (also covered in the sketch after this list)
- For Excel files, process each sheet separately as they may contain different types of data
- Use the DocumentProcessor for consistent text extraction across different document formats
- Maintain separate tracking for datasets vs information sheets as they serve different purposes
- Return detailed information about processed files to help users understand what was uploaded
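A minimal hardening sketch for the filename, size-limit, and temp-file cleanup points above, assuming werkzeug's secure_filename and Flask's MAX_CONTENT_LENGTH setting; the helper name and the 50 MB limit are illustrative, not part of the source:

# Hardening sketch only; not taken from new_app.py
import tempfile
from pathlib import Path

from flask import Flask
from werkzeug.utils import secure_filename

app = Flask(__name__)
# Requests larger than 50 MB are rejected by Flask with HTTP 413 (example limit)
app.config['MAX_CONTENT_LENGTH'] = 50 * 1024 * 1024

def save_upload_safely(file_storage):
    """Save a werkzeug FileStorage under a sanitized name inside a fresh temp dir.

    Returns (temp_dir, path); the caller should call temp_dir.cleanup()
    once processing is finished so disk space is always reclaimed.
    """
    safe_name = secure_filename(file_storage.filename)
    if not safe_name:
        raise ValueError(f"Unusable filename: {file_storage.filename!r}")
    temp_dir = tempfile.TemporaryDirectory()
    target = Path(temp_dir.name) / safe_name
    file_storage.save(str(target))
    return temp_dir, target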
Similar Components
AI-powered semantic similarity - components with related functionality:
- function smartstat_upload_data (88.2% similar)
- function smartstat_select_sheet (75.6% similar)
- function upload_analysis_dataset (73.6% similar)
- function upload_data_section_dataset (71.2% similar)
- function upload_data (71.2% similar)