html_to_pdf - Code Extractor

function html_to_pdf

Maturity: 51

Converts HTML content to a PDF file using ReportLab with intelligent parsing of email-formatted HTML, including metadata extraction, body content processing, and attachment information.

File:
/tf/active/vicechatdev/msg_to_eml.py

Lines:
647 - 842

Complexity:
complex

Purpose

This function is designed to convert HTML-formatted email messages into well-structured PDF documents. It extracts email components (title, metadata like From/To/Date, body content, attachments) from HTML and creates a professionally formatted PDF. It includes a robust fallback mechanism that attempts a simpler conversion if the primary method fails, ensuring reliability even with malformed HTML.

Source Code

def html_to_pdf(html_content, output_path):
    """Render email-style HTML as a PDF file using ReportLab.

    The primary path extracts a title, metadata rows, body paragraphs and
    attachment names from the markup (see ``_parse_email_html``) and lays
    them out with custom paragraph styles. If anything in that path raises,
    a fallback strips every tag and writes the remaining text as plain
    paragraphs under a generic "Email Message" heading.

    Args:
        html_content: HTML markup as a string.
        output_path: Destination path handed to ``SimpleDocTemplate``.

    Returns:
        ``output_path`` if a file exists there after the primary or the
        fallback build, otherwise ``None``. Exceptions raised during
        conversion are logged through the module-level ``logger`` instead
        of being propagated.
    """
    try:
        # ReportLab is imported lazily so importing this module does not
        # require it.
        from reportlab.lib import colors
        from reportlab.lib.pagesizes import letter
        from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
        from reportlab.lib.units import inch
        from reportlab.platypus import HRFlowable, Paragraph, SimpleDocTemplate, Spacer

        title, meta_data, body_paragraphs, attachments = _parse_email_html(html_content)

        doc = SimpleDocTemplate(
            output_path,
            pagesize=letter,
            rightMargin=0.75 * inch,
            leftMargin=0.75 * inch,
            topMargin=0.75 * inch,
            bottomMargin=0.75 * inch,
        )

        styles = getSampleStyleSheet()
        styles.add(ParagraphStyle(
            name='EmailTitle',
            parent=styles['Heading1'],
            fontSize=16,
            spaceAfter=0.25 * inch,
        ))
        styles.add(ParagraphStyle(
            name='MetaData',
            parent=styles['Normal'],
            fontSize=10,
            textColor=colors.darkslategray,
        ))
        styles.add(ParagraphStyle(
            name='EmailBody',
            parent=styles['Normal'],
            fontSize=11,
            leading=14,
            spaceBefore=0.05 * inch,
            spaceAfter=0.05 * inch,
        ))
        styles.add(ParagraphStyle(
            name='AttachmentTitle',
            parent=styles['Heading2'],
            fontSize=14,
            spaceAfter=0.15 * inch,
            spaceBefore=0.25 * inch,
            textColor=colors.darkblue,
        ))

        def separator():
            # A horizontal rule must be a dedicated flowable: "<hr/>" is not
            # part of ReportLab's paragraph markup. A fresh instance is
            # created for every use rather than sharing one flowable.
            return HRFlowable(width="100%", thickness=1, color=colors.grey)

        # All extracted text is escaped so the paragraph parser treats it as
        # literal text rather than markup.
        elements = [Paragraph(_escape_markup(title), styles['EmailTitle'])]
        for key, value in meta_data.items():
            elements.append(Paragraph(
                f"<b>{_escape_markup(key)}:</b> {_escape_markup(value)}",
                styles['MetaData'],
            ))

        elements.append(Spacer(1, 0.2 * inch))
        elements.append(separator())
        elements.append(Spacer(1, 0.1 * inch))

        for para in body_paragraphs:
            text = para.strip()
            if text:
                elements.append(Paragraph(_escape_markup(text), styles['EmailBody']))
                elements.append(Spacer(1, 0.05 * inch))

        if attachments:
            elements.append(Spacer(1, 0.2 * inch))
            elements.append(separator())
            elements.append(Paragraph(f"Attachments ({len(attachments)})", styles['AttachmentTitle']))
            for attachment in attachments:
                elements.append(Paragraph(f"• {_escape_markup(attachment)}", styles['Normal']))

        doc.build(elements)

        if os.path.exists(output_path):
            return output_path

    except Exception as e:
        logger.error(f"Error in primary HTML to PDF conversion: {str(e)}")
        logger.error(traceback.format_exc())
        try:
            # The fallback imports everything it uses itself, so it does not
            # depend on how far the primary path got before failing.
            import html
            import re
            import textwrap

            from reportlab.lib.pagesizes import letter
            from reportlab.lib.styles import getSampleStyleSheet
            from reportlab.lib.units import inch
            from reportlab.platypus import Paragraph, SimpleDocTemplate, Spacer

            # Text-only rendering: drop every tag (including ones that span
            # lines), collapse whitespace, then decode entities.
            plain_text = re.sub(r'<[^>]*>', ' ', html_content)
            plain_text = re.sub(r'\s+', ' ', plain_text).strip()
            plain_text = html.unescape(plain_text)

            doc = SimpleDocTemplate(output_path, pagesize=letter)
            styles = getSampleStyleSheet()

            elements = [
                Paragraph("Email Message", styles['Title']),
                Spacer(1, 0.25 * inch),
            ]

            # Emit the text as paragraphs of at most 300 characters, broken
            # at whitespace so words are not cut in half.
            for chunk in textwrap.wrap(plain_text, width=300):
                elements.append(Paragraph(_escape_markup(chunk), styles['Normal']))
                elements.append(Spacer(1, 0.1 * inch))

            doc.build(elements)

            if os.path.exists(output_path):
                return output_path

        except Exception as e2:
            logger.error(f"Error in fallback HTML to PDF conversion: {str(e2)}")
            return None

    return None


def _escape_markup(text):
    """Escape ``&``, ``<`` and ``>`` so the text is safe inside Paragraph markup."""
    return text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')


def _parse_email_html(html_content):
    """Extract the pieces of an email-style HTML page using regular expressions.

    Returns:
        A ``(title, meta_data, body_paragraphs, attachments)`` tuple:
        ``title`` is the first ``<h1>`` text or "Email Message";
        ``meta_data`` maps each ``<strong>Key:</strong>`` label to the text
        that follows it up to ``</p>``; ``body_paragraphs`` is a list of
        plain-text strings; ``attachments`` is a list of attachment names.
    """
    import html
    import re

    title_match = re.search(r'<h1>(.*?)</h1>', html_content, re.DOTALL)
    title = html.unescape(title_match.group(1)) if title_match else "Email Message"

    # Later duplicates of a key overwrite earlier ones.
    meta_data = {}
    for key, value in re.findall(r'<strong>(.*?):</strong>(.*?)</p>', html_content, re.DOTALL):
        meta_data[key.strip()] = html.unescape(value.strip())

    body_match = re.search(
        r'<div class=["\']body["\']>(.*?)</div>\s*(?:<div class=["\']attachments|<\/body>)',
        html_content,
        re.DOTALL | re.IGNORECASE,
    )
    body_content = body_match.group(1) if body_match else ""

    pre_match = re.search(r'<pre>(.*?)</pre>', body_content, re.DOTALL)
    if pre_match:
        # Pre-formatted text: every source line becomes its own paragraph.
        body_paragraphs = html.unescape(pre_match.group(1)).split('\n')
    else:
        # Turn block-level tags into newlines, drop everything else.
        body_content = re.sub(r'<br\s*/?>', '\n', body_content)
        body_content = re.sub(r'<p[^>]*>', '\n\n', body_content)
        body_content = re.sub(r'</p>', '', body_content)
        body_content = re.sub(r'<div[^>]*>', '\n', body_content)
        body_content = re.sub(r'</div>', '', body_content)
        body_content = re.sub(r'</?span[^>]*>', '', body_content)
        body_text = html.unescape(re.sub(r'<[^>]*>', '', body_content))

        # Collapse horizontal whitespace only. Newlines must survive this
        # step, otherwise the blank-line paragraph separators are lost and
        # the whole body ends up as a single paragraph.
        body_text = re.sub(r'[^\S\n]+', ' ', body_text)
        body_paragraphs = [
            part.strip()
            for part in re.split(r'\n\s*\n', body_text)
            if part.strip()
        ]

    attachments = [
        html.unescape(name)
        for name in re.findall(
            r'<div class=["\']attachment["\']>\s*<p><strong>(.*?)</strong></p>\s*</div>',
            html_content,
            re.DOTALL,
        )
    ]

    return title, meta_data, body_paragraphs, attachments

Parameters

Name	Type	Default	Kind
`html_content`	-	-	positional_or_keyword
`output_path`	-	-	positional_or_keyword

Parameter Details

html_content: String containing the HTML content to be converted. Expected to be email-formatted HTML with specific structure including h1 tags for title, strong tags for metadata labels, div with class 'body' for message content, and div with class 'attachment' for attachment information. Can handle various HTML entities and formatting.

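output_path: String or Path object specifying the file system path where the generated PDF is written. It is passed directly to ReportLab's SimpleDocTemplate, so the parent directory must already exist and be writable; the function does not create directories and does not check for a .pdf extension.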

Return Value

Returns output_path (unchanged) if a build completes and the file exists at that location. Returns None if both the primary and fallback conversion methods fail, or if a build finishes without raising but no file exists at output_path. The returned path can be used to verify the PDF was created and to access the file.

Dependencies

reportlab
os
re
html
logging
traceback

Required Imports

import os
import re
import html
import logging
import traceback

Conditional/Optional Imports

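These imports are performed inside the function body at call time rather than at module level; despite the section title, all of them are required for PDF generation: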

from reportlab.lib.pagesizes import letter

Condition: always required for PDF generation

Required (conditional)

from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer

Condition: always required for PDF generation

Required (conditional)

from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle

Condition: always required for PDF styling

Required (conditional)

from reportlab.lib.units import inch

Condition: always required for PDF layout measurements

Required (conditional)

from reportlab.lib import colors

Condition: always required for PDF text coloring

Required (conditional)

Usage Example

# Standard library
import html
import logging
import os
import re
import traceback

# ReportLab (third-party)
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer

# html_to_pdf reports failures through a module-level name called `logger`
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# A small email-shaped page: title, metadata rows, <pre> body, one attachment
sample_html = '''
<html>
<body>
<h1>Meeting Reminder</h1>
<p><strong>From:</strong> john@example.com</p>
<p><strong>To:</strong> team@example.com</p>
<p><strong>Date:</strong> 2024-01-15</p>
<div class='body'>
<pre>Hello Team,

This is a reminder about our meeting tomorrow at 10 AM.

Best regards,
John</pre>
</div>
<div class='attachments'>
<div class='attachment'>
<p><strong>agenda.pdf</strong></p>
</div>
</div>
</body>
</html>
'''

# Run the conversion; a falsy result means both conversion attempts failed
pdf_path = '/tmp/email_output.pdf'
result = html_to_pdf(sample_html, pdf_path)

print(f'PDF created successfully at: {result}' if result else 'PDF conversion failed')

Best Practices

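Ensure a module-level logger (e.g. logger = logging.getLogger(__name__)) is defined in the module that contains this function: the error handlers reference the name logger directly, so a missing logger raises NameError instead of logging the conversion failure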
Validate that the output_path directory exists and is writable before calling
The HTML content should follow email formatting conventions with h1 for title, strong tags for metadata labels, and div with class 'body' for content
Handle the None return value to detect conversion failures
The function includes automatic fallback to a simpler conversion method if the primary method fails
Special XML characters (&, <, >) are automatically escaped for ReportLab compatibility
The function expects specific HTML structure but will attempt to extract readable text even from malformed HTML
Consider wrapping calls in try-except blocks for additional error handling at the application level
The function creates professional-looking PDFs with proper margins (0.75 inch) and styling
Attachment information is extracted but not embedded - only attachment names are listed in the PDF

Similar Components

AI-powered semantic similarity - components with related functionality:

function convert_markdown_to_html 73.4% similar

Converts basic markdown formatting (bold, italic, code) to HTML markup suitable for PDF generation using ReportLab.
From: /tf/active/vicechatdev/vice_ai/complex_app.py
function export_to_pdf_v1 72.4% similar

Converts a document object with sections and references into a formatted PDF file using ReportLab, supporting multiple heading levels, text content with markdown/HTML processing, and reference management.
From: /tf/active/vicechatdev/vice_ai/complex_app.py
function eml_to_pdf 69.8% similar

Converts an .eml email file to PDF format, including the email body and all attachments merged into a single PDF document.
From: /tf/active/vicechatdev/msg_to_eml.py
function convert_markdown_to_html_v1 68.3% similar

Converts basic Markdown syntax to HTML markup compatible with ReportLab PDF generation, including support for clickable links, bold, italic, and inline code formatting.
From: /tf/active/vicechatdev/vice_ai/new_app.py
function export_to_pdf 65.8% similar

Exports a document with text and data sections to a PDF file using ReportLab, handling custom styling, section ordering, and content formatting including Quill Delta to HTML/Markdown conversion.
From: /tf/active/vicechatdev/vice_ai/new_app.py

← Back to Browse

Assistant

Hi! I can help improve this code. Tell me what you'd like to enhance (e.g., "add error handling", "optimize performance", "improve readability", "add type hints").

Code Comparison

Original Code

                            def html_to_pdf(html_content, output_path):
    """Convert email-style HTML to a PDF at ``output_path`` using ReportLab.

    Primary path: regular expressions pull out an ``<h1>`` title,
    ``<strong>Key:</strong> value</p>`` metadata rows, the content of
    ``<div class="body">`` and the names inside ``<div class="attachment">``
    blocks; the pieces are escaped and laid out with custom paragraph styles.

    Fallback path: if the primary path raises, every tag is stripped and the
    remaining text is written in 300-character paragraphs under a generic
    "Email Message" heading.

    Args:
        html_content: HTML markup as a string.
        output_path: Destination path passed to ``SimpleDocTemplate``.

    Returns:
        ``output_path`` when ``os.path.exists(output_path)`` is true after a
        build, otherwise ``None``.

    Relies on the module-level names ``os``, ``logger`` and ``traceback``.
    """
    try:
        # Try using reportlab directly as primary method since external tools may not be available
        from reportlab.lib.pagesizes import letter
        from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
        from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
        from reportlab.lib.units import inch
        from reportlab.lib import colors
        
        # Extract content from HTML
        import re
        
        # Extract title if available; fall back to a generic heading
        title_match = re.search(r'<h1>(.*?)</h1>', html_content, re.DOTALL)
        title = title_match.group(1) if title_match else "Email Message"
        
        # Clean title of HTML entities
        import html
        title = html.unescape(title)
        
        # Extract metadata from HTML: each "<strong>Key:</strong> value</p>" row
        # becomes one dict entry (a repeated key overwrites the earlier value)
        meta_data = {}
        meta_matches = re.findall(r'<strong>(.*?):</strong>(.*?)</p>', html_content, re.DOTALL)
        for key, value in meta_matches:
            meta_data[key.strip()] = html.unescape(value.strip())
        
        # Try to extract body content: the body div must be followed by either
        # the attachments div or the closing </body> tag
        body_match = re.search(r'<div class=["\']body["\']>(.*?)</div>\s*(?:<div class=["\']attachments|<\/body>)', html_content, re.DOTALL | re.IGNORECASE)
        body_content = body_match.group(1) if body_match else ""
        
        # Clean up body content - more thorough approach
        # First try to extract text from any existing pre tag
        pre_match = re.search(r'<pre>(.*?)</pre>', body_content, re.DOTALL)
        if pre_match:
            # Use the pre-formatted text as-is; every line becomes a paragraph
            body_text = html.unescape(pre_match.group(1))
            body_paragraphs = body_text.split('\n')
        else:
            # Map block-level tags to newlines and drop inline wrappers
            body_content = re.sub(r'<br\s*/?>', '\n', body_content)
            body_content = re.sub(r'<p[^>]*>', '\n\n', body_content)
            body_content = re.sub(r'</p>', '', body_content)
            body_content = re.sub(r'<div[^>]*>', '\n', body_content)
            body_content = re.sub(r'</div>', '', body_content)
            body_content = re.sub(r'<span[^>]*>', '', body_content)
            body_content = re.sub(r'</span>', '', body_content)
            
            # Remove all other HTML tags
            body_text = re.sub(r'<[^>]*>', '', body_content)
            
            # Unescape HTML entities
            body_text = html.unescape(body_text)
            
            # Clean up whitespace
            # NOTE(review): \s also matches newlines, so the second substitution
            # collapses the "\n\n" separators produced by the first one into a
            # single space before the split on '\n\n' below.
            body_text = re.sub(r'\n{3,}', '\n\n', body_text)
            body_text = re.sub(r'\s{2,}', ' ', body_text)
            body_paragraphs = [p.strip() for p in body_text.split('\n\n') if p.strip()]
        
        # Extract attachment info (names only)
        attachments = []
        attachment_matches = re.findall(r'<div class=["\']attachment["\']>\s*<p><strong>(.*?)</strong></p>\s*</div>', html_content, re.DOTALL)
        attachments.extend([html.unescape(attachment) for attachment in attachment_matches])

        # Create PDF document: letter page with 0.75 inch margins on all sides
        doc = SimpleDocTemplate(
            output_path,
            pagesize=letter,
            rightMargin=0.75*inch,
            leftMargin=0.75*inch,
            topMargin=0.75*inch,
            bottomMargin=0.75*inch
        )
        
        # Define styles
        styles = getSampleStyleSheet()
        
        # Add custom styles derived from the sample stylesheet
        styles.add(ParagraphStyle(
            name='EmailTitle',
            parent=styles['Heading1'],
            fontSize=16,
            spaceAfter=0.25*inch
        ))
        
        styles.add(ParagraphStyle(
            name='MetaData',
            parent=styles['Normal'],
            fontSize=10,
            textColor=colors.darkslategray
        ))
        
        styles.add(ParagraphStyle(
            name='EmailBody',
            parent=styles['Normal'],
            fontSize=11,
            leading=14,
            spaceBefore=0.05*inch,
            spaceAfter=0.05*inch
        ))
        
        styles.add(ParagraphStyle(
            name='AttachmentTitle',
            parent=styles['Heading2'],
            fontSize=14,
            spaceAfter=0.15*inch,
            spaceBefore=0.25*inch,
            textColor=colors.darkblue
        ))
        
        # Create flowable elements
        elements = []
        
        # Add title - escape any special ReportLab XML characters
        safe_title = title.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
        elements.append(Paragraph(safe_title, styles['EmailTitle']))
        
        # Add metadata - escape special characters
        for key, value in meta_data.items():
            safe_key = key.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
            safe_value = value.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
            elements.append(Paragraph(f"<b>{safe_key}:</b> {safe_value}", styles['MetaData']))
        
        # Add separator
        # NOTE(review): apparently intended as a horizontal rule - confirm that
        # ReportLab's Paragraph markup actually renders <hr/>.
        elements.append(Spacer(1, 0.2*inch))
        elements.append(Paragraph("<hr/>", styles['Normal']))
        elements.append(Spacer(1, 0.1*inch))
        
        # Add body content - process each paragraph with proper escaping
        for para in body_paragraphs:
            if para.strip():
                # Escape special XML characters for ReportLab
                safe_para = para.strip().replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
                elements.append(Paragraph(safe_para, styles['EmailBody']))
                elements.append(Spacer(1, 0.05*inch))
        
        # Add attachment info if any
        if attachments:
            elements.append(Spacer(1, 0.2*inch))
            elements.append(Paragraph("<hr/>", styles['Normal']))
            elements.append(Paragraph(f"Attachments ({len(attachments)})", styles['AttachmentTitle']))
            
            for attachment in attachments:
                # Escape special characters
                safe_attachment = attachment.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
                elements.append(Paragraph(f"• {safe_attachment}", styles['Normal']))
        
        # Build PDF
        doc.build(elements)
        
        # Success only if the file is actually present; otherwise fall through
        # to the final "return None"
        if os.path.exists(output_path):
            return output_path
            
    except Exception as e:
        logger.error(f"Error in primary HTML to PDF conversion: {str(e)}")
        logger.error(traceback.format_exc())
        try:
            # Much simpler fallback that just creates a basic PDF with visible text
            from reportlab.lib.pagesizes import letter
            from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
            from reportlab.lib.styles import getSampleStyleSheet
            
            # Create super simple text-only version with minimal HTML parsing
            # NOTE(review): `re` and `inch` are bound only by the imports inside
            # the try block above; if that block failed before reaching them,
            # the lines using them raise here and the inner except returns None.
            import html
            plain_text = re.sub(r'<.*?>', ' ', html_content)
            plain_text = re.sub(r'\s+', ' ', plain_text).strip()
            plain_text = html.unescape(plain_text)
            
            doc = SimpleDocTemplate(output_path, pagesize=letter)
            styles = getSampleStyleSheet()
            
            # Create elements for the document
            elements = [
                Paragraph("Email Message", styles['Title']),
                Spacer(1, 0.25*inch)
            ]
            
            # Add content in chunks to avoid overflow (fixed 300-character
            # slices, so a chunk boundary can fall in the middle of a word)
            chunks = [plain_text[i:i+300] for i in range(0, len(plain_text), 300)]
            for chunk in chunks:
                if chunk.strip():
                    safe_chunk = chunk.strip().replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
                    p = Paragraph(safe_chunk, styles['Normal'])
                    elements.append(p)
                    elements.append(Spacer(1, 0.1*inch))
            
            doc.build(elements)
            
            if os.path.exists(output_path):
                return output_path
                
        except Exception as e2:
            logger.error(f"Error in fallback HTML to PDF conversion: {str(e2)}")
            return None
    
    return None
                        

Improved Code

🔍 Code Extractor

function html_to_pdf

Purpose

Source Code

Parameters

Parameter Details

Return Value

Dependencies

Required Imports

Conditional/Optional Imports

Usage Example

Best Practices

Tags

Similar Components

function convert_markdown_to_html 73.4% similar

function export_to_pdf_v1 72.4% similar

function eml_to_pdf 69.8% similar

function convert_markdown_to_html_v1 68.3% similar

function export_to_pdf 65.8% similar

function html_to_pdf

Purpose

Source Code

Parameters

Parameter Details

Return Value

Dependencies

Required Imports

Conditional/Optional Imports

Usage Example

Best Practices

Tags

Similar Components

function convert_markdown_to_html 73.4% similar

function export_to_pdf_v1 72.4% similar

function eml_to_pdf 69.8% similar

function convert_markdown_to_html_v1 68.3% similar

function export_to_pdf 65.8% similar

✨ Improve Code: html_to_pdf

Code Comparison