class HybridPDFGenerator
A class that generates hybrid PDF documents combining formatted text content with embedded graphics, optimized for e-ink displays.
/tf/active/vicechatdev/e-ink-llm/hybrid_pdf_generator.py
86 - 461
complex
Purpose
HybridPDFGenerator creates professional PDF documents that integrate text content with dynamically generated graphics. It processes text with markdown-style formatting, identifies graphic placeholders, embeds base64-encoded images at appropriate positions, and applies custom styling optimized for e-ink readers. The class handles the complete workflow from content parsing to PDF generation, including metadata sections, processing summaries, and proper layout management.
Source Code
class HybridPDFGenerator:
    """Generate hybrid PDFs that interleave formatted text with embedded graphics.

    Text is parsed for a small markdown subset (``#``/``##`` headers, fenced
    code blocks, ``**bold**``, ``*italic*`` and inline code) and converted to
    ReportLab flowables. Graphics are inserted where their placeholder markers
    occur in the text.

    All user-supplied text is XML-escaped before it is handed to ``Paragraph``,
    because ``Paragraph`` interprets its input as markup.
    """

    def __init__(self):
        """Load the sample stylesheet, register custom styles and compile the placeholder regex."""
        self.styles = getSampleStyleSheet()
        self.setup_hybrid_styles()
        # Matches markers of the form [GRAPHIC:id:type:description].
        self.placeholder_pattern = re.compile(
            r'\[GRAPHIC:(\w+):([^:]+):([^\]]+)\]',
            re.IGNORECASE
        )

    def setup_hybrid_styles(self):
        """Register the custom paragraph styles used by hybrid PDFs on ``self.styles``."""
        # Main title style
        self.styles.add(ParagraphStyle(
            name='HybridTitle',
            parent=self.styles['Title'],
            fontSize=18,
            leading=24,
            alignment=TA_CENTER,
            spaceAfter=20,
            textColor=colors.black,
            fontName='Helvetica-Bold'
        ))
        # Section header style
        self.styles.add(ParagraphStyle(
            name='HybridHeader',
            parent=self.styles['Heading1'],
            fontSize=14,
            leading=18,
            spaceAfter=12,
            spaceBefore=16,
            textColor=colors.black,
            fontName='Helvetica-Bold'
        ))
        # Sub-header style
        self.styles.add(ParagraphStyle(
            name='HybridSubHeader',
            parent=self.styles['Heading2'],
            fontSize=12,
            leading=16,
            spaceAfter=8,
            spaceBefore=12,
            textColor=colors.black,
            fontName='Helvetica-Bold'
        ))
        # Body text optimized for e-ink
        self.styles.add(ParagraphStyle(
            name='HybridBody',
            parent=self.styles['Normal'],
            fontSize=11,
            leading=15,
            alignment=TA_JUSTIFY,
            spaceAfter=8,
            textColor=colors.black,
            fontName='Helvetica'
        ))
        # Graphics caption style
        self.styles.add(ParagraphStyle(
            name='GraphicCaption',
            parent=self.styles['Normal'],
            fontSize=9,
            leading=12,
            alignment=TA_CENTER,
            spaceAfter=12,
            spaceBefore=4,
            textColor=colors.grey,
            fontName='Helvetica-Oblique'
        ))
        # Metadata style
        self.styles.add(ParagraphStyle(
            name='HybridMeta',
            parent=self.styles['Normal'],
            fontSize=9,
            leading=12,
            alignment=TA_LEFT,
            spaceAfter=4,
            textColor=colors.grey,
            fontName='Helvetica-Oblique'
        ))

    @staticmethod
    def _escape_markup(text: Any) -> str:
        """Return ``text`` as a string with ``&``, ``<`` and ``>`` XML-escaped."""
        # Imported locally so this class does not depend on a new file-level import.
        from xml.sax.saxutils import escape

        return escape(str(text))

    async def create_hybrid_pdf(self,
                                text_content: str,
                                placeholders: List[Any],
                                graphics: Dict[str, Any],
                                metadata: Dict[str, Any],
                                output_path: str,
                                conversation_id: Optional[str] = None,
                                exchange_number: Optional[int] = None) -> str:
        """
        Create a hybrid PDF with text and embedded graphics

        Args:
            text_content: Text content with graphic placeholders
            placeholders: List of graphic placeholders found in text; each is
                read for ``id``, ``position_marker`` and ``description``
            graphics: Generated graphics keyed by ID; each is read for
                ``image_data``, ``type``, ``width``, ``height`` and ``id``
            metadata: Processing metadata; the ``source_file`` and
                ``dimensions`` keys are used when present
            output_path: Path for output PDF
            conversation_id: Session conversation ID
            exchange_number: Exchange number

        Returns:
            Path to generated hybrid PDF (``output_path`` unchanged)
        """
        print(f"📄 Creating hybrid PDF with {len(graphics)} graphics...")

        # Use custom hybrid document template
        doc = HybridSessionDocTemplate(
            output_path,
            conversation_id=conversation_id,
            exchange_number=exchange_number,
            pagesize=letter,
            rightMargin=72,
            leftMargin=72,
            topMargin=72,
            bottomMargin=72
        )

        meta_style = self.styles['HybridMeta']
        header_style = self.styles['HybridHeader']
        story = []

        # Title
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        title = f"AI Hybrid Response - {timestamp}"
        story.append(Paragraph(title, self.styles['HybridTitle']))
        story.append(Spacer(1, 20))

        # Metadata section. Values come from the caller, so escape them before
        # they reach Paragraph's markup parser.
        story.append(Paragraph("Document Information", header_style))
        source_name = self._escape_markup(metadata.get('source_file', 'Unknown'))
        story.append(Paragraph(f"Source: {source_name}", meta_style))
        if metadata.get('dimensions'):
            dims = self._escape_markup(metadata['dimensions'])
            story.append(Paragraph(f"Original dimensions: {dims}", meta_style))
        story.append(Paragraph("Processing mode: Hybrid (Text + Graphics)", meta_style))
        story.append(Paragraph(f"Generated graphics: {len(graphics)}", meta_style))
        story.append(Spacer(1, 20))

        # Main content with graphics inserted at their markers
        story.extend(
            self._process_content_with_graphics(text_content, placeholders, graphics)
        )

        # Processing summary on its own page
        story.append(PageBreak())
        story.append(Paragraph("Processing Summary", header_style))
        embedded_count = sum(1 for g in graphics.values() if g.image_data)
        summary_items = [
            f"• Generated graphics: {len(graphics)}",
            f"• Successfully embedded: {embedded_count}",
            "• Response optimization: E-ink display",
            f"• Timestamp: {timestamp}"
        ]
        for item in summary_items:
            story.append(Paragraph(item, meta_style))

        # Build PDF
        doc.build(story)
        print(f"✅ Hybrid PDF created: {Path(output_path).name}")
        return output_path

    def _process_content_with_graphics(self,
                                       text_content: str,
                                       placeholders: List[Any],
                                       graphics: Dict[str, Any]) -> List[Any]:
        """Process text content and insert graphics at placeholder positions.

        Placeholders whose marker does not occur in ``text_content`` are
        skipped. When the same marker string is used by several placeholders,
        each one is matched to the next occurrence of the marker; if there are
        fewer occurrences than placeholders, the extra ones fall back to the
        first occurrence.

        Args:
            text_content: Text containing the placeholder markers
            placeholders: Objects exposing ``position_marker``, ``id`` and
                ``description``
            graphics: Graphics keyed by placeholder id

        Returns:
            Flowables for the text and graphics, in document order
        """
        elements = []

        # Locate every placeholder, advancing past earlier hits of the same marker.
        located = []
        next_search_start = {}
        for placeholder in placeholders:
            marker = placeholder.position_marker
            pos = text_content.find(marker, next_search_start.get(marker, 0))
            if pos < 0:
                pos = text_content.find(marker)
            if pos < 0:
                continue
            next_search_start[marker] = pos + max(len(marker), 1)
            located.append((pos, placeholder))
        # The sort is stable, so placeholders at the same position keep caller order.
        located.sort(key=lambda item: item[0])

        current_pos = 0
        for pos, placeholder in located:
            # Text between the previous marker and this one
            if pos > current_pos:
                elements.extend(
                    self._convert_text_to_elements(text_content[current_pos:pos])
                )

            if placeholder.id in graphics:
                elements.extend(
                    self._create_graphic_elements(graphics[placeholder.id], placeholder)
                )
            else:
                # No graphic was generated for this placeholder
                description = self._escape_markup(placeholder.description)
                elements.append(Paragraph(
                    f"[Graphic placeholder: {description}]",
                    self.styles['GraphicCaption']
                ))

            # Never move backwards: overlapping markers must not re-emit text.
            current_pos = max(current_pos, pos + len(placeholder.position_marker))

        # Remaining text after the last placeholder
        if current_pos < len(text_content):
            elements.extend(
                self._convert_text_to_elements(text_content[current_pos:])
            )
        return elements

    def _convert_text_to_elements(self, text: str) -> List[Any]:
        """Convert text content to PDF elements with proper formatting.

        Headers become header paragraphs, body sections are split on blank
        lines into paragraphs (each followed by a small spacer), and code
        sections become a single paragraph in the stylesheet's ``Code`` style.
        """
        elements = []
        for section_type, content in self._parse_markdown_sections(text):
            if section_type == 'header1':
                elements.append(Paragraph(
                    self._escape_markup(content), self.styles['HybridHeader']
                ))
            elif section_type == 'header2':
                elements.append(Paragraph(
                    self._escape_markup(content), self.styles['HybridSubHeader']
                ))
            elif section_type == 'body':
                for paragraph in content.split('\n\n'):
                    if paragraph.strip():
                        formatted = self._process_markdown_formatting(paragraph)
                        elements.append(Paragraph(formatted, self.styles['HybridBody']))
                        elements.append(Spacer(1, 6))
            elif section_type == 'code':
                elements.append(Paragraph(
                    self._format_code_block(content), self.styles['Code']
                ))
        return elements

    def _format_code_block(self, content: str) -> str:
        """Turn raw code text into paragraph markup that keeps lines and indentation.

        Each line is escaped, leading spaces become ``&nbsp;`` and lines are
        joined with ``<br/>``.
        """
        rendered = []
        for line in content.split('\n'):
            line = line.expandtabs(4)
            body = line.lstrip(' ')
            indent = len(line) - len(body)
            rendered.append('&nbsp;' * indent + self._escape_markup(body))
        return '<br/>'.join(rendered)

    def _parse_markdown_sections(self, text: str) -> List[tuple]:
        """Parse text into ``(section_type, content)`` tuples.

        ``section_type`` is one of ``'header1'``, ``'header2'``, ``'body'`` or
        ``'code'``. Lines starting with three backticks toggle code mode.
        Inside a code block, lines are kept verbatim apart from trailing
        whitespace, so ``#`` comments and indentation survive. Outside code
        blocks, lines are stripped and joined with newlines.
        """
        sections: List[tuple] = []
        buffer: List[str] = []
        current_type = 'body'

        for raw_line in text.split('\n'):
            stripped = raw_line.strip()

            # Fence lines are checked first so header detection never fires
            # on the contents of a code block.
            if stripped.startswith('```'):
                if buffer:
                    sections.append((current_type, '\n'.join(buffer)))
                    buffer = []
                current_type = 'body' if current_type == 'code' else 'code'
                continue

            if current_type == 'code':
                buffer.append(raw_line.rstrip())
                continue

            if stripped.startswith('# '):
                header = ('header1', stripped[2:])
            elif stripped.startswith('## '):
                header = ('header2', stripped[3:])
            else:
                buffer.append(stripped)
                continue

            # A header ends the body section collected so far.
            if buffer:
                sections.append(('body', '\n'.join(buffer)))
                buffer = []
            sections.append(header)

        # Final section (an unterminated code block is still emitted as code)
        if buffer:
            sections.append((current_type, '\n'.join(buffer)))
        return sections

    def _process_markdown_formatting(self, text: str) -> str:
        """Convert basic inline markdown to ReportLab paragraph markup.

        The text is XML-escaped first, then ``**bold**``, ``*italic*`` and
        inline code are converted to ``<b>``, ``<i>`` and a Courier ``<font>``
        tag. Emphasis delimiters must be adjacent to non-space characters, so
        expressions such as ``2 * 3 * 4`` are left untouched.
        """
        text = self._escape_markup(text)
        # Convert **bold** to <b>bold</b>
        text = re.sub(r'\*\*(?=\S)(.+?)(?<=\S)\*\*', r'<b>\1</b>', text)
        # Convert *italic* to <i>italic</i>
        text = re.sub(r'(?<!\*)\*(?=\S)(.+?)(?<=\S)\*(?!\*)', r'<i>\1</i>', text)
        # Convert `code` to monospace
        text = re.sub(r'`(.+?)`', r'<font name="Courier">\1</font>', text)
        return text

    def _create_graphic_elements(self,
                                 graphic: Any,
                                 placeholder: Any) -> List[Any]:
        """Create PDF elements for a graphic.

        Args:
            graphic: Object exposing ``image_data`` (base64 text), ``width``,
                ``height``, ``type`` and ``id``
            placeholder: Object exposing ``description``, used for the caption

        Returns:
            A one-element list: either a ``KeepTogether`` group holding the
            image and its caption, or a caption-styled error paragraph when
            the image data is missing or cannot be embedded.
        """
        elements = []
        description = self._escape_markup(placeholder.description)

        if not graphic.image_data:
            # No image data available
            elements.append(Paragraph(
                f"[Graphic generation failed: {description}]",
                self.styles['GraphicCaption']
            ))
            return elements

        try:
            # Decode image data
            image_buffer = io.BytesIO(base64.b64decode(graphic.image_data))

            # Bounding box for graphics
            max_width = 6 * inch
            max_height = 4 * inch

            if graphic.width and graphic.height:
                # Scale to fit the bounding box while keeping the aspect ratio
                aspect_ratio = graphic.width / graphic.height
                if aspect_ratio > max_width / max_height:
                    # Width is limiting factor
                    img_width = max_width
                    img_height = max_width / aspect_ratio
                else:
                    # Height is limiting factor
                    img_height = max_height
                    img_width = max_height * aspect_ratio
            else:
                # Dimensions unknown: fall back to the full bounding box
                img_width = max_width
                img_height = max_height

            img_element = Image(image_buffer, width=img_width, height=img_height)

            # Prefix the caption with the graphic type when one is available.
            # A missing ``type`` must not discard an otherwise valid image.
            caption_text = description
            type_value = getattr(getattr(graphic, 'type', None), 'value', None)
            if type_value:
                type_label = self._escape_markup(str(type_value).title())
                caption_text = f"{type_label}: {caption_text}"

            # Keep spacing, image and caption on the same page
            elements.append(KeepTogether([
                Spacer(1, 12),
                img_element,
                Paragraph(caption_text, self.styles['GraphicCaption']),
                Spacer(1, 12)
            ]))
        except Exception as e:
            # Best-effort boundary: one bad graphic must not abort the whole PDF.
            print(f"Error embedding graphic {graphic.id}: {e}")
            elements.append(Paragraph(
                f"[Error embedding graphic: {description}]",
                self.styles['GraphicCaption']
            ))
        return elements
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
bases |
- | - |
Parameter Details
No constructor parameters: The __init__ method takes no parameters. It initializes the class with default styles and sets up a regex pattern for detecting graphic placeholders.
Return Value
Instantiation returns a HybridPDFGenerator object. The main method create_hybrid_pdf returns a string containing the path to the generated PDF file. Helper methods return various types: _process_content_with_graphics returns List[Any] of PDF elements, _convert_text_to_elements returns List[Any] of formatted elements, _parse_markdown_sections returns List[tuple] of section types and content, _process_markdown_formatting returns str with HTML-like tags, and _create_graphic_elements returns List[Any] of graphic elements.
Class Interface
Methods
__init__(self)
Purpose: Initialize the HybridPDFGenerator with default styles and placeholder pattern
Returns: None - initializes instance attributes
setup_hybrid_styles(self)
Purpose: Configure custom paragraph styles optimized for hybrid PDFs and e-ink displays
Returns: None - modifies self.styles by adding custom ParagraphStyle objects
async create_hybrid_pdf(self, text_content: str, placeholders: List[Any], graphics: Dict[str, Any], metadata: Dict[str, Any], output_path: str, conversation_id: Optional[str] = None, exchange_number: Optional[int] = None) -> str
Purpose: Generate a complete hybrid PDF document with embedded text and graphics
Parameters:
text_content: String containing the main text content with graphic placeholder markers
placeholders: List of placeholder objects with id, position_marker, and description attributes
graphics: Dictionary mapping graphic IDs to graphic objects with image_data, type, width, height
metadata: Dictionary containing document metadata like source_file and dimensions
output_path: File system path where the PDF should be saved
conversation_id: Optional session conversation identifier for document template
exchange_number: Optional exchange number for document template
Returns: String containing the path to the generated PDF file
_process_content_with_graphics(self, text_content: str, placeholders: List[Any], graphics: Dict[str, Any]) -> List[Any]
Purpose: Parse text content and insert graphics at placeholder positions, returning PDF elements
Parameters:
text_content: String containing text with placeholder markers
placeholders: List of placeholder objects to locate in text
graphics: Dictionary of graphic objects to embed
Returns: List of ReportLab flowable elements (Paragraphs, Images, Spacers) ready for PDF building
_convert_text_to_elements(self, text: str) -> List[Any]
Purpose: Convert plain text with markdown formatting into styled PDF elements
Parameters:
text: String containing text with markdown-style formatting
Returns: List of ReportLab Paragraph and Spacer elements with appropriate styles applied
_parse_markdown_sections(self, text: str) -> List[tuple]
Purpose: Parse text into sections based on markdown headers and code blocks
Parameters:
text: String containing markdown-formatted text
Returns: List of tuples where each tuple is (section_type, content) - section_type is 'header1', 'header2', 'body', or 'code'
_process_markdown_formatting(self, text: str) -> str
Purpose: Convert markdown formatting syntax to ReportLab HTML-like tags
Parameters:
text: String with markdown formatting like **bold**, *italic*, `code`
Returns: String with HTML-like tags (<b>, <i>, <font>) that ReportLab can render
_create_graphic_elements(self, graphic: Any, placeholder: Any) -> List[Any]
Purpose: Create PDF elements for embedding a graphic with caption and proper sizing
Parameters:
graphic: Graphic object with image_data, type, width, height attributes
placeholder: Placeholder object with description attribute for caption
Returns: List of ReportLab elements including Image, Paragraph (caption), and Spacers, wrapped in KeepTogether
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| styles | StyleSheet1 | ReportLab stylesheet containing all paragraph styles including custom hybrid styles | instance |
| placeholder_pattern | re.Pattern | Compiled regex pattern for matching graphic placeholder markers in format [GRAPHIC:id:type:description] | instance |
Dependencies
io, base64, re, pathlib, datetime, typing, dataclasses, reportlab, PIL
Required Imports
import io
import base64
import re
from pathlib import Path
from datetime import datetime
from typing import Dict, Any, List, Optional
from dataclasses import dataclass
from reportlab.lib.pagesizes import letter, A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.lib.enums import TA_LEFT, TA_JUSTIFY, TA_CENTER
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, PageBreak, KeepTogether
from reportlab.platypus.doctemplate import PageTemplate, BaseDocTemplate
from reportlab.platypus.frames import Frame
from reportlab.lib import colors
from PIL import Image as PILImage
Usage Example
import asyncio
import os

from hybrid_pdf_generator import HybridPDFGenerator

# Instantiate the generator
generator = HybridPDFGenerator()

# Prepare content with placeholders
text_content = "# Report Title\n\nThis is body text.\n\n[GRAPHIC:chart1:bar:Sales Data]\n\nMore text here."


# Define placeholders (mock objects)
class Placeholder:
    """Minimal stand-in exposing the attributes the generator reads."""

    def __init__(self, placeholder_id, position_marker, description):
        self.id = placeholder_id
        self.position_marker = position_marker
        self.description = description


placeholders = [
    Placeholder('chart1', '[GRAPHIC:chart1:bar:Sales Data]', 'Sales Data')
]


# Define graphics (mock objects)
class GraphicType:
    """Mimics an enum member: the generator reads ``.value``."""

    def __init__(self, value):
        self.value = value


class Graphic:
    """Minimal stand-in for a generated graphic."""

    def __init__(self, graphic_id, image_data, graphic_type, width, height):
        self.id = graphic_id
        self.image_data = image_data
        self.type = graphic_type
        self.width = width
        self.height = height


graphics = {
    # NOTE: replace the string below with real base64-encoded image bytes.
    # As written it is not decodable, so the generator prints an error and
    # inserts an "[Error embedding graphic: ...]" caption instead of an image.
    'chart1': Graphic('chart1', 'base64_encoded_image_data', GraphicType('chart'), 800, 600)
}

# Metadata
metadata = {
    'source_file': 'data.csv',
    'dimensions': '1920x1080'
}


# Generate PDF
async def generate():
    # The generator writes straight to output_path, so the directory must exist.
    os.makedirs('output', exist_ok=True)
    output_path = await generator.create_hybrid_pdf(
        text_content=text_content,
        placeholders=placeholders,
        graphics=graphics,
        metadata=metadata,
        output_path='output/hybrid_report.pdf',
        conversation_id='conv_123',
        exchange_number=1
    )
    print(f'PDF generated: {output_path}')


asyncio.run(generate())
Best Practices
- Always instantiate the class before calling any methods - the constructor sets up required styles and patterns
- Do not call setup_hybrid_styles() yourself - __init__ already calls it once, and registering the same style names a second time on the same stylesheet is rejected by ReportLab
- Ensure graphic objects have valid base64-encoded image_data before passing to create_hybrid_pdf
- Placeholder position_marker strings must exactly match the text in text_content for proper replacement
- Use await when calling create_hybrid_pdf as it's an async method
- Provide complete metadata dictionary with at least 'source_file' key for proper document information
- Graphics dictionary keys must match placeholder.id values for successful embedding
- Output directory must exist before calling create_hybrid_pdf
- The class maintains state through self.styles - avoid modifying styles after initialization
- Image data should be in a format PIL can decode (PNG, JPEG, etc.) when base64 decoded
- Text content can use markdown formatting: # for headers, ## for subheaders, ** for bold, * for italic, ` for code
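- Graphics are automatically scaled to fit within 6x4 inch bounds while maintaining aspect ratio when the graphic provides width and height; without them the image is drawn at the full 6x4 inch size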
- The placeholder_pattern regex expects format: [GRAPHIC:id:type:description]
- Method call order: instantiate -> create_hybrid_pdf (which internally calls all helper methods)
- Helper methods (_process_content_with_graphics, _convert_text_to_elements, etc.) are internal and should not be called directly
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- class PDFGenerator 76.8% similar
- class HybridResponseHandler 70.7% similar
- class HybridResponse 67.2% similar
- function demo_hybrid_response 65.5% similar
- class HybridSessionDocTemplate 64.7% similar