function add_data_section_to_docx
Adds a data analysis section to a Word document, including analysis metadata, statistical conclusions, and embedded visualizations from saved content or legacy analysis history.
/tf/active/vicechatdev/vice_ai/new_app.py
2672 - 2953
complex
Purpose
This function is designed to populate a Word document with comprehensive data analysis results. It handles two content sources: (1) saved content from selective saves with plots, and (2) legacy analysis_history metadata. It formats and embeds analysis information, statistical conclusions (supporting HTML and markdown), and visualization images into the document with proper styling and structure.
Source Code
# Attribute-less tags whose presence marks content as HTML rather than markdown.
_HTML_TAG_RE = re.compile(r'<h[1-6]>|<p>|<strong>|<ul>|<li>')

# Word heading level used for each HTML heading format type.
_HTML_HEADING_LEVELS = {'h1': 2, 'h2': 3, 'h3': 4, 'h4': 4}

# Format types rendered as a standalone bold paragraph.
_BOLD_FORMATS = ('h5', 'h6', 'bold')


def _format_iso_timestamp(raw):
    """Format an ISO-8601 string as 'YYYY-MM-DD HH:MM:SS'; return None if that fails."""
    try:
        # datetime.fromisoformat() rejects a trailing 'Z' before Python 3.11.
        if raw.endswith('Z'):
            raw = raw[:-1] + '+00:00'
        return datetime.fromisoformat(raw).strftime('%Y-%m-%d %H:%M:%S')
    except (AttributeError, TypeError, ValueError):
        # Non-string input or an unparseable value: let the caller degrade.
        return None


def _add_html_elements(doc, html):
    """Render the (text, format_type) pairs parsed from *html* as headings and paragraphs."""
    for text, format_type in html_to_plain_text_with_formatting(html):
        if not text.strip():
            continue
        if format_type in _HTML_HEADING_LEVELS:
            doc.add_heading(text, _HTML_HEADING_LEVELS[format_type])
        elif format_type in _BOLD_FORMATS:
            doc.add_paragraph().add_run(text).bold = True
        else:
            doc.add_paragraph(text)


def _add_markdown_paragraphs(doc, text):
    """Render blank-line-separated markdown blocks, handling '#' headers and **bold** spans."""
    for block in text.split('\n\n'):
        block = block.strip()
        if not block:
            continue

        if block.startswith('#'):
            # Strip only the leading marker so '#' characters inside the
            # heading text (e.g. "# Issue #5") are preserved.
            title = block.lstrip('#')
            hash_count = len(block) - len(title)
            # '#' -> level 2, '##' -> level 3, '###' or deeper -> level 4.
            doc.add_heading(title.strip(), min(hash_count, 3) + 1)
            continue

        para = doc.add_paragraph()
        # The capture group keeps the **bold** delimiters in the split result.
        for part in re.split(r'(\*\*[^*]+\*\*)', block):
            if part.startswith('**') and part.endswith('**'):
                para.add_run(part[2:-2]).bold = True
            elif part:
                para.add_run(part)


def _embed_plots(doc, data_section, plot_files):
    """Embed each plot found under smartstat_scripts/<session>/project_*/, or note why not."""
    session_id = data_section.analysis_session_id or data_section.id
    # str() keeps os.path.join from raising TypeError on a non-string id.
    base_path = os.path.join('smartstat_scripts', str(session_id))

    # The directory listing is the same for every plot, so read it once.
    try:
        project_dirs = [
            entry for entry in os.listdir(base_path)
            if entry.startswith('project_')
        ]
    except OSError:
        # Missing or unreadable session directory: every plot is "not found".
        project_dirs = []

    for i, plot_file in enumerate(plot_files, 1):
        # Plot entries may be URLs or paths; only the final component is used.
        plot_name = plot_file.split('/')[-1]

        plot_path = None
        for project_dir in project_dirs:
            candidate_path = os.path.join(base_path, project_dir, plot_name)
            if os.path.exists(candidate_path):
                plot_path = candidate_path
                break

        if plot_path is None:
            logger.warning(f"Plot file not found for {plot_name} in {base_path}")
            doc.add_paragraph().add_run(f"{i}. {plot_name} (file not found)")
            continue

        try:
            doc.add_paragraph().add_run(f"Figure {i}: {plot_name}").bold = True
            # 6 inches wide to fit the page with margins.
            doc.add_picture(plot_path, width=Inches(6))
            doc.add_paragraph()  # Spacing after image
        except Exception as e:
            # Deliberately broad: one bad image must not abort the whole export.
            logger.warning(f"Error adding plot {plot_name} to DOCX: {e}")
            doc.add_paragraph().add_run(f"{i}. {plot_name} (image error)")


def _add_saved_results(doc, data_section, has_saved_content, has_saved_plots):
    """Write the selective-save view: last-saved stamp, saved content, selected plots."""
    logger.info(f"DOCX: Using saved content for data section '{data_section.title}'")

    metadata = data_section.metadata
    if metadata and metadata.get('last_saved'):
        saved_at = _format_iso_timestamp(metadata['last_saved'])
        # An unparseable timestamp is skipped rather than shown raw.
        if saved_at is not None:
            doc.add_paragraph().add_run(f"Last Saved: {saved_at}").italic = True
            doc.add_paragraph()

    if has_saved_content:
        doc.add_heading("Analysis Results", 3)
        content = data_section.current_content
        if _HTML_TAG_RE.search(content):
            _add_html_elements(doc, content)
        else:
            # Plain text or markdown: emitted verbatim, one paragraph per block.
            for block in content.split('\n\n'):
                block = block.strip()
                if block:
                    doc.add_paragraph(block)
        doc.add_paragraph()

    if has_saved_plots:
        plots = data_section.generated_plots
        doc.add_heading(f"Selected Visualizations ({len(plots)})", 3)
        _embed_plots(doc, data_section, plots)

    doc.add_paragraph()


def _add_legacy_analysis(doc, data_section):
    """Write the legacy view built from the last entry of metadata['analysis_history']."""
    logger.info(f"DOCX: Using legacy analysis_history for data section '{data_section.title}'")

    metadata = data_section.metadata
    history = metadata['analysis_history'] if metadata and 'analysis_history' in metadata else None
    latest_analysis = history[-1] if history else None

    if not latest_analysis:
        doc.add_paragraph().add_run(
            "No analysis results available for this data section."
        ).italic = True
        return

    # PART 1: Analysis Metadata
    doc.add_heading("Analysis Information", 3)

    timestamp = latest_analysis.get('timestamp', 'N/A')
    # Fall back to the raw value (including 'N/A') when it is not ISO-8601.
    timestamp = _format_iso_timestamp(timestamp) or timestamp

    rows = (
        ('Query', latest_analysis.get('query', 'N/A')),
        ('Timestamp', timestamp),
        ('LLM Model', latest_analysis.get('model', 'gpt-4o (default)')),
        ('Status', '✓ Completed'),
    )
    table = doc.add_table(rows=len(rows), cols=2)
    table.style = 'Light Grid Accent 1'
    for row_idx, (label, value) in enumerate(rows):
        table.cell(row_idx, 0).text = label
        table.cell(row_idx, 1).text = str(value)

    # Dataset info may be stored as a JSON string or as an already-parsed mapping.
    if data_section.dataset_info:
        try:
            dataset_info = data_section.dataset_info
            if isinstance(dataset_info, str):
                dataset_info = json.loads(dataset_info)
            if 'rows' in dataset_info and 'columns' in dataset_info:
                doc.add_paragraph()
                para = doc.add_paragraph()
                para.add_run('Dataset: ').bold = True
                para.add_run(f"{dataset_info['rows']} rows × {dataset_info['columns']} columns")
        except Exception as e:
            # Best-effort: malformed dataset info must not abort the export.
            logger.warning(f"Error parsing dataset_info for DOCX: {e}")

    doc.add_paragraph()

    # PART 2: Statistical Conclusions
    doc.add_heading("Statistical Conclusions", 3)
    conclusion_text = extract_conclusion_text_for_pdf(latest_analysis)

    if not conclusion_text:
        doc.add_paragraph().add_run("No conclusions available.").italic = True
    elif _HTML_TAG_RE.search(conclusion_text):
        _add_html_elements(doc, conclusion_text)
    else:
        # Fallback: parse as markdown (legacy format)
        _add_markdown_paragraphs(doc, conclusion_text)

    doc.add_paragraph()

    # PART 3: Visualizations
    plots = latest_analysis.get('plots')
    if plots:
        doc.add_heading(f"Visualizations ({len(plots)})", 3)
        _embed_plots(doc, data_section, plots)
    else:
        doc.add_paragraph("No visualizations generated in this analysis.")

    doc.add_paragraph()  # Spacing after data section


def add_data_section_to_docx(doc, data_section):
    """Add a data section to the Word document with analysis info, conclusions, and plots.

    Saved content (``current_content`` and/or ``generated_plots``) takes
    priority. When neither is present, the last entry of
    ``metadata['analysis_history']`` is rendered instead, and a short notice
    is written if that is missing too.

    Args:
        doc: Document object being built; it is modified in place through
            ``add_heading``, ``add_paragraph``, ``add_table`` and
            ``add_picture``.
        data_section: Object exposing ``title``, ``description``,
            ``current_content``, ``generated_plots``, ``metadata``,
            ``dataset_info``, ``analysis_session_id`` and ``id``.

    Returns:
        None.
    """
    # Section title with icon
    doc.add_heading(f"📊 {data_section.title}", 2)

    if data_section.description:
        desc_para = doc.add_paragraph(data_section.description)
        desc_para.style = 'Intense Quote'

    # Check for saved content first (from selective save), fall back to metadata
    saved_content = data_section.current_content
    has_saved_content = bool(saved_content and saved_content.strip())
    has_saved_plots = bool(data_section.generated_plots)

    if has_saved_content or has_saved_plots:
        _add_saved_results(doc, data_section, has_saved_content, has_saved_plots)
    else:
        _add_legacy_analysis(doc, data_section)
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| doc | - | - | positional_or_keyword |
| data_section | - | - | positional_or_keyword |
Parameter Details
doc: A python-docx Document object representing the Word document being created. This object is modified in-place by adding headings, paragraphs, tables, and images.
data_section: A DataSection model object containing analysis results. Expected attributes include: title, description, current_content, generated_plots, metadata (with analysis_history and last_saved), dataset_info, analysis_session_id, and id. The function checks for saved content first, then falls back to metadata.analysis_history.
Return Value
Returns None. The function modifies the 'doc' parameter in-place by adding formatted content sections including headings, paragraphs, tables, and embedded images.
Dependencies
python-docx, datetime, os, json, re, logging
Required Imports
from docx import Document
from docx.shared import Inches
from datetime import datetime
import os
import json
import re
import logging
Conditional/Optional Imports
These imports are only needed under specific conditions:
from models import DataSection
Condition: Required for type checking and understanding the data_section parameter structure
Required (conditional)
Custom function: html_to_plain_text_with_formatting
Condition: Required when processing HTML content in current_content or conclusions. Must be defined in the same module or imported.
Required (conditional)
Custom function: extract_conclusion_text_for_pdf
Condition: Required for extracting conclusion text from legacy analysis_history format. Must be defined in the same module or imported.
Required (conditional)
Usage Example
# Usage example: build a document, append one data section, and save it.
# NOTE(review): add_data_section_to_docx is not imported in this snippet; it
# presumably assumes the function lives in (or is imported into) the same
# module as this code — confirm before copying.
from docx import Document
from docx.shared import Inches
from datetime import datetime
import os
import json
import re
import logging
# Setup logger (the function logs through a module-level name `logger`)
logger = logging.getLogger(__name__)
# Create a Word document
doc = Document()
# Assume data_section is a DataSection object with analysis results.
# It is NOT created in this snippet; the commented assignments below only
# illustrate the attributes the function reads.
# data_section.title = 'Sales Analysis'
# data_section.description = 'Quarterly sales performance'
# data_section.current_content = '<h2>Key Findings</h2><p>Sales increased by 15%</p>'
# data_section.generated_plots = ['sales_chart.png', 'trend_plot.png']
# data_section.metadata = {'last_saved': '2024-01-15T10:30:00'}
# data_section.analysis_session_id = 'session_123'
# Add the data section to the document (modifies doc in place, returns None)
add_data_section_to_docx(doc, data_section)
# Save the document
doc.save('analysis_report.docx')
Best Practices
- Ensure the DataSection object has either current_content/generated_plots OR metadata.analysis_history populated before calling this function
- Plot files must exist in the expected directory structure: 'smartstat_scripts/{session_id}/project_*/{plot_name}'
- The function expects helper functions html_to_plain_text_with_formatting() and extract_conclusion_text_for_pdf() to be available in scope
- HTML content should use standard tags (h1-h6, p, strong, ul, li) for proper formatting
- Markdown content should use standard syntax (# for headers, ** for bold)
- Images are embedded at 6 inches width to fit standard page margins
- The function logs warnings for missing plots or parsing errors, so ensure logger is configured
- Dataset info can be either a JSON string or dictionary - the function handles both formats
- The function modifies the doc object in-place, so no return value is needed
- For large documents with many plots, ensure sufficient memory and file system access
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- function add_data_section_to_pdf 79.9% similar
- function export_to_docx 66.3% similar
- function export_to_docx_v1 64.5% similar
- class DataSection 62.6% similar
- function add_formatted_content_to_word_v1 62.3% similar