function add_data_section_to_pdf
Adds a data analysis section to a PDF document story, including analysis metadata, statistical conclusions, and embedded visualizations from saved content or analysis history.
/tf/active/vicechatdev/vice_ai/new_app.py
3064 - 3321
complex
Purpose
This function is part of a PDF export system for data analysis reports. It handles two content sources: (1) selective save content with user-chosen plots, or (2) legacy analysis_history metadata. It formats and adds section titles, descriptions, analysis information (query, timestamp, model), statistical conclusions with HTML/markdown parsing, and embeds plot images from the filesystem. The function intelligently searches for plot files across project directories and handles various text formatting scenarios including HTML tags and markdown syntax.
Source Code
def add_data_section_to_pdf(story, data_section, styles, heading2_style, heading3_style):
    """Append one data-analysis section to a ReportLab story.

    Two content sources are supported, checked in this order:

    1. Selectively saved content (``current_content`` and/or
       ``generated_plots``), rendered as "Analysis Results" followed by
       "Selected Visualizations".
    2. The last entry of ``metadata['analysis_history']`` (legacy format),
       rendered as "Analysis Information", "Statistical Conclusions" and
       "Visualizations".

    Args:
        story: List of flowables; extended in place.
        data_section: Object exposing ``title``, ``description``,
            ``current_content``, ``generated_plots``, ``metadata``,
            ``dataset_info``, ``analysis_session_id`` and ``id``.
        styles: Style mapping providing ``'Normal'``, ``'Italic'`` and
            ``'Heading4'``.
        heading2_style: Paragraph style used for the section title and
            top-level headings found in the content.
        heading3_style: Paragraph style used for sub-section headings.

    Returns:
        None. All output is appended to ``story``.
    """
    normal_style = styles['Normal']

    # Section title and optional description
    story.append(Paragraph(f"📊 {html.escape(data_section.title)}", heading2_style))
    if data_section.description:
        story.append(Paragraph(html.escape(data_section.description), normal_style))
    story.append(Spacer(1, 12))

    metadata = data_section.metadata or {}
    saved_content = data_section.current_content
    saved_plots = data_section.generated_plots
    has_saved_content = bool(saved_content and saved_content.strip())
    has_saved_plots = bool(saved_plots)

    # str() keeps os.path.join from raising when the id is not a string
    # (for example an integer key, or None when neither id is set).
    session_id = str(data_section.analysis_session_id or data_section.id)
    base_path = os.path.join('smartstat_scripts', session_id)

    # Saved content (from selective save) takes precedence over the metadata
    if has_saved_content or has_saved_plots:
        logger.info(f"PDF: Using saved content for data section '{data_section.title}'")

        last_saved = metadata.get('last_saved')
        if last_saved:
            formatted = _pdf_format_timestamp(last_saved)
            if formatted is None:
                # Best effort: an unreadable timestamp must not abort the export
                logger.warning(f"PDF: could not parse last_saved timestamp {last_saved!r}")
            else:
                story.append(Paragraph(f"<i>Last Saved: {formatted}</i>", normal_style))
                story.append(Spacer(1, 12))

        if has_saved_content:
            story.append(Paragraph("Analysis Results", heading3_style))
            if _pdf_looks_like_html(saved_content):
                _pdf_append_html_elements(story, saved_content, styles, heading2_style, heading3_style)
            else:
                # Plain text: one paragraph per blank-line-separated chunk
                for chunk in saved_content.split('\n\n'):
                    chunk = chunk.strip()
                    if chunk:
                        story.append(Paragraph(html.escape(chunk), normal_style))
                        story.append(Spacer(1, 6))
            story.append(Spacer(1, 12))

        if has_saved_plots:
            story.append(Paragraph(f"Selected Visualizations ({len(saved_plots)})", heading3_style))
            story.append(Spacer(1, 12))
            _pdf_append_plots(
                story,
                saved_plots,
                base_path,
                styles,
                caption_format="<b>Figure {index}: {name}</b>",
                caption_gap=6,
                image_width=6 * inch,
                image_height=4 * inch,
                not_found_detail="",
            )

        story.append(Spacer(1, 12))
        return

    # Fall back to old metadata format (analysis_history)
    logger.info(f"PDF: Using legacy analysis_history for data section '{data_section.title}'")
    analysis_history = metadata.get('analysis_history')
    latest_analysis = analysis_history[-1] if analysis_history else None
    if not latest_analysis:
        story.append(Paragraph("<i>No analysis results available for this data section.</i>", styles['Italic']))
        return

    # PART 1: Analysis Metadata
    story.append(Paragraph("Analysis Information", heading3_style))
    model_name = latest_analysis.get('model', 'gpt-4o (default)')
    query = latest_analysis.get('query', 'N/A')
    timestamp = latest_analysis.get('timestamp', 'N/A')
    if timestamp != 'N/A':
        # Keep the raw value when it is not a parseable ISO timestamp
        timestamp = _pdf_format_timestamp(timestamp) or timestamp
    info_text = f"""
    <b>Query:</b> {html.escape(str(query))}<br/>
    <b>Timestamp:</b> {html.escape(str(timestamp))}<br/>
    <b>LLM Model:</b> {html.escape(str(model_name))}<br/>
    <b>Status:</b> <font color="green">✓ Completed</font>
    """
    story.append(Paragraph(info_text, normal_style))
    story.append(Spacer(1, 12))

    # Dataset info (stored either as a JSON string or as an already-decoded object)
    if data_section.dataset_info:
        try:
            dataset_info = data_section.dataset_info
            if isinstance(dataset_info, str):
                dataset_info = json.loads(dataset_info)
            if isinstance(dataset_info, dict) and 'rows' in dataset_info and 'columns' in dataset_info:
                # Escaped because the values end up inside paragraph markup
                rows = html.escape(str(dataset_info['rows']))
                columns = html.escape(str(dataset_info['columns']))
                story.append(Paragraph(
                    f"<b>Dataset:</b> {rows} rows × {columns} columns",
                    normal_style
                ))
                story.append(Spacer(1, 12))
        except (TypeError, ValueError) as e:
            logger.warning(f"Error parsing dataset_info for PDF: {e}")

    # PART 2: Statistical Conclusions
    story.append(Paragraph("Statistical Conclusions", heading3_style))
    conclusion_text = extract_conclusion_text_for_pdf(latest_analysis)
    if conclusion_text:
        if _pdf_looks_like_html(conclusion_text):
            _pdf_append_html_elements(story, conclusion_text, styles, heading2_style, heading3_style)
        else:
            # Fallback: parse as markdown (legacy format)
            _pdf_append_markdown(story, conclusion_text, styles, heading2_style, heading3_style)
    else:
        story.append(Paragraph("<i>No conclusions available.</i>", styles['Italic']))
    story.append(Spacer(1, 12))

    # PART 3: Visualizations
    plots = latest_analysis.get('plots')
    if plots:
        story.append(Paragraph(f"Visualizations ({len(plots)})", heading3_style))
        story.append(Spacer(1, 8))
        _pdf_append_plots(
            story,
            plots,
            base_path,
            styles,
            caption_format="<b>Figure {index}:</b> {name}",
            caption_gap=4,
            image_width=5 * inch,
            image_height=3.5 * inch,
            not_found_detail=f" in {base_path}",
        )
    else:
        story.append(Paragraph("No visualizations generated in this analysis.", normal_style))


def _pdf_looks_like_html(text):
    """Return True when *text* contains one of the HTML tags the exporter recognises."""
    return bool(re.search(r'<h[1-6]>|<p>|<strong>|<ul>|<li>', text))


def _pdf_format_timestamp(raw):
    """Format an ISO-8601 string (or datetime) as 'YYYY-MM-DD HH:MM:SS'; None if unparseable."""
    if isinstance(raw, datetime):
        return raw.strftime('%Y-%m-%d %H:%M:%S')
    try:
        # A trailing 'Z' is rewritten because fromisoformat() rejects it before Python 3.11
        return datetime.fromisoformat(raw.replace('Z', '+00:00')).strftime('%Y-%m-%d %H:%M:%S')
    except (AttributeError, TypeError, ValueError):
        return None


def _pdf_append_html_elements(story, html_text, styles, heading2_style, heading3_style):
    """Append the (text, format_type) pairs parsed from *html_text* as styled paragraphs."""
    for text, format_type in html_to_plain_text_with_formatting(html_text):
        if not text.strip():
            continue
        escaped_text = html.escape(text)
        if format_type == 'h1':
            story.append(Paragraph(f"<b>{escaped_text}</b>", heading2_style))
        elif format_type == 'h2':
            story.append(Paragraph(f"<b>{escaped_text}</b>", heading3_style))
        elif format_type in ('h3', 'h4', 'h5', 'h6'):
            story.append(Paragraph(f"<b>{escaped_text}</b>", styles['Heading4']))
        elif format_type == 'bold':
            story.append(Paragraph(f"<b>{escaped_text}</b>", styles['Normal']))
        else:  # normal
            story.append(Paragraph(escaped_text, styles['Normal']))
        story.append(Spacer(1, 6))


def _pdf_append_markdown(story, text, styles, heading2_style, heading3_style):
    """Append blank-line-separated markdown paragraphs, handling '#' headings and **bold**."""
    for block in text.split('\n\n'):
        block = block.strip()
        if not block:
            continue
        if block.startswith('#'):
            # Only the leading marker is removed, so '#' inside the heading text survives
            heading = block.lstrip('#')
            level = len(block) - len(heading)
            if level >= 3:
                heading_style = styles['Heading4']
            elif level == 2:
                heading_style = heading3_style
            else:
                heading_style = heading2_style
            story.append(Paragraph(f"<b>{html.escape(heading.strip())}</b>", heading_style))
        else:
            # Escape first, then introduce our own <b> tags, so that literal
            # '<', '>' and '&' in the text cannot be read as paragraph markup.
            escaped = html.escape(block, quote=False)
            markup = re.sub(r'\*\*([^*]+)\*\*', r'<b>\1</b>', escaped)
            story.append(Paragraph(markup, styles['Normal']))
        story.append(Spacer(1, 6))


def _pdf_append_plots(story, plot_files, base_path, styles, *, caption_format,
                      caption_gap, image_width, image_height, not_found_detail):
    """Embed each plot found under base_path/project_*/, or append a placeholder line."""
    # List the project directories once instead of once per plot; sorted so the
    # match is deterministic when the same file name exists in several projects.
    project_dirs = []
    if os.path.isdir(base_path):
        project_dirs = sorted(
            os.path.join(base_path, entry)
            for entry in os.listdir(base_path)
            if entry.startswith('project_')
        )

    for index, plot_file in enumerate(plot_files, 1):
        # Extract filename from URL or path
        plot_name = str(plot_file).split('/')[-1]
        safe_name = html.escape(plot_name)

        plot_path = None
        for project_dir in project_dirs:
            candidate_path = os.path.join(project_dir, plot_name)
            # isfile() rather than exists(): an empty or '..' name resolves to a directory
            if os.path.isfile(candidate_path):
                plot_path = candidate_path
                break

        if plot_path is None:
            logger.warning(f"Plot file not found for {plot_name}{not_found_detail}")
            story.append(Paragraph(f"{index}. {safe_name} (file not found)", styles['Normal']))
            continue

        try:
            # Built before the caption is appended so a failure here leaves no orphan caption.
            # NOTE: width and height are both fixed, so the source aspect ratio is not preserved.
            img = RLImage(plot_path, width=image_width, height=image_height)
        except Exception as e:
            logger.warning(f"Error adding plot {plot_name} to PDF: {e}")
            story.append(Paragraph(f"{index}. {safe_name} (image error)", styles['Normal']))
            continue

        story.append(Paragraph(caption_format.format(index=index, name=safe_name), styles['Normal']))
        story.append(Spacer(1, caption_gap))
        story.append(img)
        story.append(Spacer(1, 12))
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| story | - | - | positional_or_keyword |
| data_section | - | - | positional_or_keyword |
| styles | - | - | positional_or_keyword |
| heading2_style | - | - | positional_or_keyword |
| heading3_style | - | - | positional_or_keyword |
Parameter Details
story: A ReportLab story list (list of flowable elements) that will be built into a PDF document. Elements are appended to this list to construct the PDF content.
data_section: A DataSection model object containing the analysis data to be added to the PDF. Must have attributes: title, description, current_content, generated_plots, metadata (dict with 'analysis_history' and 'last_saved'), dataset_info (JSON string or dict), analysis_session_id, and id.
styles: A ReportLab StyleSheet object (typically from getSampleStyleSheet()) containing predefined paragraph styles like 'Normal', 'Italic', 'Heading4' used for text formatting.
heading2_style: A ReportLab ParagraphStyle object used for level 2 headings (section titles). Should be configured with appropriate font size, weight, and spacing.
heading3_style: A ReportLab ParagraphStyle object used for level 3 headings (subsection titles like 'Analysis Information', 'Statistical Conclusions'). Should be configured with appropriate font size and weight.
Return Value
This function returns None. It modifies the 'story' list in-place by appending ReportLab flowable elements (Paragraphs, Spacers, Images) that represent the formatted data section content.
Dependencies
reportlab, datetime, os, json, re, html, logging
Required Imports
from reportlab.platypus import Paragraph, Spacer, Image as RLImage
from reportlab.lib.units import inch
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from datetime import datetime
import os
import json
import re
import html
import logging
Conditional/Optional Imports
These imports are only needed under specific conditions:
from models import DataSection
Condition: Required for type checking and understanding the data_section parameter structure
Required (conditional)
Custom helper functions: html_to_plain_text_with_formatting, extract_conclusion_text_for_pdf
Condition: These appear to be custom utility functions defined elsewhere in the codebase for parsing HTML/markdown content
Required (conditional)
Usage Example
from reportlab.platypus import SimpleDocTemplate
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.lib.enums import TA_LEFT
import io
# Render into an in-memory buffer instead of a file on disk
buffer = io.BytesIO()
doc = SimpleDocTemplate(buffer, pagesize=letter)
story = []

# Stock ReportLab styles, plus the two custom heading styles the function expects
styles = getSampleStyleSheet()
heading2_style = ParagraphStyle('CustomHeading2', parent=styles['Heading2'], fontSize=16, textColor='blue', spaceAfter=12)
heading3_style = ParagraphStyle('CustomHeading3', parent=styles['Heading3'], fontSize=14, spaceAfter=10)

# data_section has to be supplied by the caller: a DataSection object loaded from your database, e.g.
# data_section = DataSection.query.get(section_id)

# Append the section's flowables to the story
add_data_section_to_pdf(story=story, data_section=data_section, styles=styles, heading2_style=heading2_style, heading3_style=heading3_style)

# Lay out the story, then collect the finished PDF bytes before closing the buffer
doc.build(story)
pdf_bytes = buffer.getvalue()
buffer.close()
Best Practices
- Ensure the DataSection object has either current_content/generated_plots (selective save) or metadata['analysis_history'] (legacy format) populated before calling this function
- Plot files must exist in the filesystem at the expected paths (smartstat_scripts/{session_id}/project_*/) or they will be listed as 'file not found'
- The function uses html.escape() extensively to prevent injection attacks - maintain this pattern when modifying
- The function handles both HTML and markdown formatted text - test with both formats when making changes
- Image dimensions are hardcoded (6x4 inch for saved plots, 5x3.5 inch for legacy) - adjust based on your page layout requirements
- The function logs warnings for missing plots and parsing errors - monitor these logs to identify data issues
- Consider implementing error boundaries around the entire function to prevent PDF generation failures from crashing the application
- The function searches all project_* directories for plots - this could be slow with many projects; consider caching or indexing plot locations
- HTML parsing relies on custom helper functions (html_to_plain_text_with_formatting, extract_conclusion_text_for_pdf) - ensure these are available and tested
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- function add_data_section_to_docx 79.9% similar
- class DataSection 66.2% similar
- function add_formatted_content_to_pdf 65.4% similar
- function add_formatted_content_to_pdf_v1 64.9% similar
- function export_to_pdf 63.7% similar