function html_to_pdf
Converts HTML content to a PDF file using ReportLab with intelligent parsing of email-formatted HTML, including metadata extraction, body content processing, and attachment information.
/tf/active/vicechatdev/msg_to_eml.py
647 - 842
complex
Purpose
This function is designed to convert HTML-formatted email messages into well-structured PDF documents. It extracts email components (title, metadata like From/To/Date, body content, attachments) from HTML and creates a professionally formatted PDF. It includes a robust fallback mechanism that attempts a simpler conversion if the primary method fails, ensuring reliability even with malformed HTML.
Source Code
def html_to_pdf(html_content, output_path):
    """Convert email-style HTML to a PDF file using ReportLab.

    The primary path pulls the title (``<h1>``), metadata lines
    (``<strong>Key:</strong> value</p>``), the message body
    (``<div class="body">``) and attachment names out of the HTML with
    regular expressions and lays them out as a styled document. If anything
    in that path raises, a fallback strips all tags and writes the remaining
    text as plain paragraphs.

    Args:
        html_content: HTML string to convert.
        output_path: Destination path of the PDF file.

    Returns:
        ``output_path`` if a PDF was built and exists on disk, otherwise
        ``None`` (both conversion attempts failed, or no file was produced).
    """
    # Imported before either try block so the fallback never sees these names
    # as unbound locals when the primary path fails early.
    import html
    import re

    def _escape(text):
        # Paragraph() parses its text as XML-like markup, so literal
        # '&', '<' and '>' must be turned into entities first.
        return html.escape(text, quote=False)

    try:
        # Try using reportlab directly as primary method since external tools may not be available
        from reportlab.lib import colors
        from reportlab.lib.pagesizes import letter
        from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
        from reportlab.lib.units import inch
        from reportlab.platypus import HRFlowable, Paragraph, SimpleDocTemplate, Spacer

        # Extract title if available, decoding HTML entities
        title_match = re.search(r'<h1>(.*?)</h1>', html_content, re.DOTALL)
        title = html.unescape(title_match.group(1) if title_match else "Email Message")

        # Extract metadata ("<strong>Key:</strong> value</p>") from HTML
        meta_data = {}
        meta_matches = re.findall(r'<strong>(.*?):</strong>(.*?)</p>', html_content, re.DOTALL)
        for key, value in meta_matches:
            meta_data[key.strip()] = html.unescape(value.strip())

        # Try to extract body content - more robust pattern matching
        body_match = re.search(
            r'<div class=["\']body["\']>(.*?)</div>\s*(?:<div class=["\']attachments|<\/body>)',
            html_content,
            re.DOTALL | re.IGNORECASE,
        )
        body_content = body_match.group(1) if body_match else ""

        # First try to extract text from any existing pre tag
        pre_match = re.search(r'<pre>(.*?)</pre>', body_content, re.DOTALL)
        if pre_match:
            # Use the pre-formatted text as-is, one paragraph per line
            body_text = html.unescape(pre_match.group(1))
            body_paragraphs = body_text.split('\n')
        else:
            # Turn block-level tags into newlines, drop inline wrappers
            body_content = re.sub(r'<br\s*/?>', '\n', body_content)
            body_content = re.sub(r'<p[^>]*>', '\n\n', body_content)
            body_content = re.sub(r'</p>', '', body_content)
            body_content = re.sub(r'<div[^>]*>', '\n', body_content)
            body_content = re.sub(r'</div>', '', body_content)
            body_content = re.sub(r'<span[^>]*>', '', body_content)
            body_content = re.sub(r'</span>', '', body_content)
            # Remove all other HTML tags, then unescape HTML entities
            body_text = html.unescape(re.sub(r'<[^>]*>', '', body_content))
            # Collapse horizontal whitespace only. Collapsing every '\s' run
            # would also swallow the blank lines that separate paragraphs and
            # leave the whole body as a single paragraph.
            body_text = re.sub(r'[^\S\n]+', ' ', body_text)
            body_text = re.sub(r' ?\n ?', '\n', body_text)
            body_paragraphs = [p.strip() for p in re.split(r'\n{2,}', body_text) if p.strip()]

        # Extract attachment info
        attachment_matches = re.findall(
            r'<div class=["\']attachment["\']>\s*<p><strong>(.*?)</strong></p>\s*</div>',
            html_content,
            re.DOTALL,
        )
        attachments = [html.unescape(attachment) for attachment in attachment_matches]

        # Create PDF document
        doc = SimpleDocTemplate(
            output_path,
            pagesize=letter,
            rightMargin=0.75*inch,
            leftMargin=0.75*inch,
            topMargin=0.75*inch,
            bottomMargin=0.75*inch
        )

        # Define styles: the sample sheet plus custom email styles
        styles = getSampleStyleSheet()
        styles.add(ParagraphStyle(
            name='EmailTitle',
            parent=styles['Heading1'],
            fontSize=16,
            spaceAfter=0.25*inch
        ))
        styles.add(ParagraphStyle(
            name='MetaData',
            parent=styles['Normal'],
            fontSize=10,
            textColor=colors.darkslategray
        ))
        styles.add(ParagraphStyle(
            name='EmailBody',
            parent=styles['Normal'],
            fontSize=11,
            leading=14,
            spaceBefore=0.05*inch,
            spaceAfter=0.05*inch
        ))
        styles.add(ParagraphStyle(
            name='AttachmentTitle',
            parent=styles['Heading2'],
            fontSize=14,
            spaceAfter=0.15*inch,
            spaceBefore=0.25*inch,
            textColor=colors.darkblue
        ))

        # Create flowable elements, starting with the title
        elements = [Paragraph(_escape(title), styles['EmailTitle'])]

        # Add metadata - escape special characters
        for key, value in meta_data.items():
            elements.append(
                Paragraph(f"<b>{_escape(key)}:</b> {_escape(value)}", styles['MetaData'])
            )

        # Add separator. Paragraph markup has no <hr/> tag, so a rule has to
        # be drawn with the dedicated HRFlowable.
        elements.append(Spacer(1, 0.2*inch))
        elements.append(HRFlowable(width="100%", thickness=1, color=colors.lightgrey))
        elements.append(Spacer(1, 0.1*inch))

        # Add body content - process each paragraph with proper escaping
        for para in body_paragraphs:
            if para.strip():
                elements.append(Paragraph(_escape(para.strip()), styles['EmailBody']))
                elements.append(Spacer(1, 0.05*inch))

        # Add attachment info if any
        if attachments:
            elements.append(Spacer(1, 0.2*inch))
            elements.append(HRFlowable(width="100%", thickness=1, color=colors.lightgrey))
            elements.append(Paragraph(f"Attachments ({len(attachments)})", styles['AttachmentTitle']))
            for attachment in attachments:
                elements.append(Paragraph(f"• {_escape(attachment)}", styles['Normal']))

        # Build PDF
        doc.build(elements)
        if os.path.exists(output_path):
            return output_path
    except Exception as e:
        logger.error(f"Error in primary HTML to PDF conversion: {str(e)}")
        logger.error(traceback.format_exc())
        try:
            # Much simpler fallback that just creates a basic PDF with visible text.
            # Everything it needs is imported here so it does not depend on how
            # far the primary attempt got.
            from reportlab.lib.pagesizes import letter
            from reportlab.lib.styles import getSampleStyleSheet
            from reportlab.lib.units import inch
            from reportlab.platypus import Paragraph, SimpleDocTemplate, Spacer

            # Create super simple text-only version with minimal HTML parsing
            plain_text = re.sub(r'<.*?>', ' ', html_content)
            plain_text = re.sub(r'\s+', ' ', plain_text).strip()
            plain_text = html.unescape(plain_text)

            doc = SimpleDocTemplate(output_path, pagesize=letter)
            styles = getSampleStyleSheet()

            # Create elements for the document
            elements = [
                Paragraph("Email Message", styles['Title']),
                Spacer(1, 0.25*inch)
            ]

            # Add content in chunks to avoid overflow
            chunks = [plain_text[i:i+300] for i in range(0, len(plain_text), 300)]
            for chunk in chunks:
                if chunk.strip():
                    elements.append(Paragraph(_escape(chunk.strip()), styles['Normal']))
                    elements.append(Spacer(1, 0.1*inch))

            doc.build(elements)
            if os.path.exists(output_path):
                return output_path
        except Exception as e2:
            logger.error(f"Error in fallback HTML to PDF conversion: {str(e2)}")
            return None
    return None
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| html_content | - | - | positional_or_keyword |
| output_path | - | - | positional_or_keyword |
Parameter Details
html_content: String containing the HTML content to be converted. Expected to be email-formatted HTML with specific structure including h1 tags for title, strong tags for metadata labels, div with class 'body' for message content, and div with class 'attachment' for attachment information. Can handle various HTML entities and formatting.
output_path: String or Path object specifying the file system path where the generated PDF should be saved. The path must be writable and would normally end in .pdf, although the function does not check the extension. The target directory must already exist, because the function does not create it.
Return Value
Returns the output_path (string) if PDF generation is successful and the file exists at that location. Returns None if both primary and fallback conversion methods fail. The returned path can be used to verify the PDF was created and to access the file.
Dependencies
reportlab, os, re, html, logging, traceback
Required Imports
import os
import re
import html
import logging
import traceback
Conditional/Optional Imports
These imports are only needed under specific conditions:
from reportlab.lib.pagesizes import letter
Condition: always required for PDF generation
Required (conditional)
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
Condition: always required for PDF generation
Required (conditional)
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
Condition: always required for PDF styling
Required (conditional)
from reportlab.lib.units import inch
Condition: always required for PDF layout measurements
Required (conditional)
from reportlab.lib import colors
Condition: always required for PDF text coloring
Required (conditional)
Usage Example
# Standard-library modules referenced by html_to_pdf and this example
import html
import logging
import os
import re
import traceback

# ReportLab pieces used for PDF generation
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer

# html_to_pdf reports failures through a module-level ``logger``
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Minimal email-style document: title, metadata lines, body and one attachment
html_content = '''
<html>
<body>
<h1>Meeting Reminder</h1>
<p><strong>From:</strong> john@example.com</p>
<p><strong>To:</strong> team@example.com</p>
<p><strong>Date:</strong> 2024-01-15</p>
<div class='body'>
<pre>Hello Team,
This is a reminder about our meeting tomorrow at 10 AM.
Best regards,
John</pre>
</div>
<div class='attachments'>
<div class='attachment'>
<p><strong>agenda.pdf</strong></p>
</div>
</div>
</body>
</html>
'''

# Run the conversion; a falsy result means both conversion attempts failed
output_path = '/tmp/email_output.pdf'
result = html_to_pdf(html_content, output_path)
print(f'PDF created successfully at: {result}' if result else 'PDF conversion failed')
Best Practices
- Ensure a logger object is configured before calling this function to capture error messages
- Validate that the output_path directory exists and is writable before calling
- The HTML content should follow email formatting conventions with h1 for title, strong tags for metadata labels, and div with class 'body' for content
- Handle the None return value to detect conversion failures
- The function includes automatic fallback to a simpler conversion method if the primary method fails
- Special XML characters (&, <, >) are automatically escaped for ReportLab compatibility
- The function expects specific HTML structure but will attempt to extract readable text even from malformed HTML
- Consider wrapping calls in try-except blocks for additional error handling at the application level
- The function creates professional-looking PDFs with proper margins (0.75 inch) and styling
- Attachment information is extracted but not embedded - only attachment names are listed in the PDF
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
-
function convert_markdown_to_html 73.4% similar
-
function export_to_pdf_v1 72.4% similar
-
function eml_to_pdf 69.8% similar
-
function convert_markdown_to_html_v1 68.3% similar
-
function export_to_pdf 65.8% similar