function generate_html_from_msg
Converts an email message object into a formatted HTML representation with styling, headers, body content, and attachment information.
/tf/active/vicechatdev/msg_to_eml.py
476 - 645
moderate
Purpose
This function generates a complete, styled HTML document from an email message object (typically from extract_msg library). It handles both HTML and plain text email bodies, includes email metadata (subject, sender, recipients, date), preserves formatting, converts URLs to hyperlinks, and lists attachments. The output is suitable for viewing in web browsers or converting to other formats like PDF.
Source Code
# Document preamble: opening tags plus the embedded stylesheet shared by every
# generated page. Kept at module level so it is built once, not on every call.
_EMAIL_HTML_PREAMBLE = """
<html>
<head>
<meta charset="utf-8">
<style>
body {
font-family: Arial, sans-serif;
line-height: 1.6;
color: #333;
margin: 20px;
max-width: 800px;
margin: 0 auto;
padding: 20px;
}
.header {
background-color: #f5f5f5;
padding: 15px;
border-bottom: 1px solid #ddd;
margin-bottom: 20px;
border-radius: 4px;
}
.header h1 {
margin: 0 0 10px 0;
padding: 0;
color: #444;
font-size: 22px;
font-weight: bold;
}
.meta {
margin: 10px 0;
color: #666;
font-size: 14px;
}
.meta p {
margin: 5px 0;
}
.meta strong {
color: #333;
}
.body {
padding: 10px 0;
border-top: 1px solid #eee;
}
.attachments {
margin-top: 20px;
padding-top: 10px;
border-top: 1px solid #eee;
}
.attachment {
background-color: #f9f9f9;
padding: 8px;
margin-bottom: 5px;
border-left: 3px solid #ccc;
}
pre {
white-space: pre-wrap;
font-family: inherit;
margin: 0;
}
blockquote {
border-left: 3px solid #ddd;
padding-left: 10px;
color: #555;
margin: 10px 0 10px 20px;
}
</style>
</head>
<body>
"""


def _escape_text(value):
    """Return *value* as HTML-safe text; ``None`` becomes an empty string."""
    import html

    if value is None:
        return ""
    return html.escape(str(value))


def _decode_html_body(body_html):
    """Return the HTML body as ``str``, decoding bytes as UTF-8 or Latin-1."""
    if isinstance(body_html, bytes):
        try:
            return body_html.decode('utf-8')
        except UnicodeDecodeError:
            # Latin-1 maps every possible byte to a character, so this
            # fallback cannot fail and no further fallback is needed.
            return body_html.decode('latin-1')
    return str(body_html)


def _clean_html_body(body_html):
    """Decode an HTML body and strip markup that tends to break rendering.

    NOTE: this is a rendering clean-up, not a security sanitiser. Scripts and
    other active content in the message's HTML body are passed through as-is.
    """
    import re

    clean_html = _decode_html_body(body_html)
    # The generated page declares UTF-8, so keep an embedded declaration in sync.
    clean_html = clean_html.replace('charset="us-ascii"', 'charset="utf-8"')
    # Drop the message's own stylesheets so they cannot override the page CSS.
    # Tag names are case-insensitive in HTML, hence IGNORECASE.
    clean_html = re.sub(
        r'<style[^>]*>.*?</style>',
        '',
        clean_html,
        flags=re.DOTALL | re.IGNORECASE,
    )
    # Replace every table's attributes with one simple, uniform set.
    clean_html = re.sub(
        r'<table\b[^>]*>',
        '<table border="1" cellpadding="4" style="border-collapse:collapse">',
        clean_html,
        flags=re.IGNORECASE,
    )
    # A bare fragment (no <body>) is wrapped so it forms a single element.
    if '<body' not in clean_html.lower():
        clean_html = f"<div>{clean_html}</div>"
    return clean_html


def _plain_text_to_html(body_text):
    """Escape plain text for HTML and turn URLs into hyperlinks.

    The text is walked URL by URL: the stretches between URLs are escaped,
    and each URL becomes an ``<a>`` element built from escaped pieces. Doing
    it this way (instead of inserting tags and escaping afterwards) keeps the
    generated markup intact while still neutralising markup in the message.
    """
    import html
    import re

    # The result is placed inside <pre>, which already preserves newlines;
    # emitting <br> as well would double every line break.
    text = body_text.replace('\r\n', '\n')

    pieces = []
    cursor = 0
    for match in re.finditer(r'(https?://[^\s<>"]+|www\.[^\s<>"]+)', text):
        pieces.append(html.escape(text[cursor:match.start()], quote=False))
        url = match.group(0)
        # A bare "www." link needs a scheme, otherwise browsers resolve the
        # href relative to the current page.
        if url.lower().startswith(('http://', 'https://')):
            href = url
        else:
            href = f"http://{url}"
        pieces.append(
            f'<a href="{html.escape(href, quote=True)}">'
            f'{html.escape(url, quote=False)}</a>'
        )
        cursor = match.end()
    pieces.append(html.escape(text[cursor:], quote=False))
    return "".join(pieces)


def generate_html_from_msg(msg, include_headers=True):
    """Generate a styled HTML document representing an email message.

    Args:
        msg: Message object. ``subject``, ``sender`` and ``to`` are read when
            ``include_headers`` is true; ``cc``, ``date``, ``htmlBody``,
            ``html`` and ``attachments`` are optional; ``body`` is read when
            no HTML body is available. Attachments are expected to expose
            ``longFilename`` and/or ``shortFilename``.
        include_headers: When true, emit a header block with the subject,
            sender, recipients and date ahead of the body.

    Returns:
        The complete HTML document as a string. Header values, plain-text
        bodies and attachment filenames are HTML-escaped. An HTML body is
        inserted after light clean-up only and is NOT sanitised for scripts
        or other active content.
    """
    html_parts = [_EMAIL_HTML_PREAMBLE]

    # Header section with email metadata
    if include_headers:
        html_parts.append("<div class='header'>")
        html_parts.append(
            f"<h1>{_escape_text(msg.subject or '(No Subject)')}</h1>"
        )
        html_parts.append("<div class='meta'>")
        html_parts.append(
            f"<p><strong>From:</strong> {_escape_text(msg.sender)}</p>"
        )
        html_parts.append(f"<p><strong>To:</strong> {_escape_text(msg.to)}</p>")
        cc = getattr(msg, 'cc', None)
        if cc:
            html_parts.append(f"<p><strong>CC:</strong> {_escape_text(cc)}</p>")
        date = getattr(msg, 'date', None)
        if date:
            html_parts.append(
                f"<p><strong>Date:</strong> {_escape_text(date)}</p>"
            )
        html_parts.append("</div>")  # Close meta
        html_parts.append("</div>")  # Close header

    # Body content - prefer HTML if available, otherwise use plain text
    html_parts.append("<div class='body'>")
    body_html = getattr(msg, 'htmlBody', None) or getattr(msg, 'html', None)
    if body_html:
        html_parts.append(_clean_html_body(body_html))
    else:
        body_text = msg.body or '(No content)'
        html_parts.append(f"<pre>{_plain_text_to_html(body_text)}</pre>")
    html_parts.append("</div>")  # Close body

    # Attachment info section: filenames only, content is not embedded
    attachments = getattr(msg, 'attachments', None) or []
    if len(attachments) > 0:
        html_parts.append("<div class='attachments'>")
        html_parts.append(f"<h2>Attachments ({len(attachments)})</h2>")
        for attachment in attachments:
            filename = (
                getattr(attachment, 'longFilename', None)
                or getattr(attachment, 'shortFilename', None)
                or 'attachment'
            )
            html_parts.append("<div class='attachment'>")
            html_parts.append(f"<p><strong>{_escape_text(filename)}</strong></p>")
            html_parts.append("</div>")
        html_parts.append("</div>")  # Close attachments

    html_parts.append("</body></html>")
    return "\n".join(html_parts)
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| msg | - | - | positional_or_keyword |
| include_headers | - | True | positional_or_keyword |
Parameter Details
msg: An email message object (typically from extract_msg library) containing properties like subject, sender, to, cc, date, body, htmlBody/html, and attachments. Expected to have attributes: subject, sender, to, cc (optional), date (optional), body, htmlBody or html (optional), and attachments list.
include_headers: Boolean flag (default: True) that controls whether to include the email header section (subject, from, to, cc, date) in the generated HTML. Set to False to generate only the body content without metadata.
Return Value
Returns a string containing a complete HTML document with embedded CSS styling. The HTML includes a header section with email metadata (if include_headers=True), the email body content (preferring HTML format if available, otherwise formatted plain text), and an attachments section listing all attached files with their filenames. The HTML is UTF-8 encoded and includes responsive styling with a maximum width of 800px.
Dependencies
html, re
Required Imports
import html
import re
Usage Example
import extract_msg
import html
import re
# Load an email message from .msg file
msg = extract_msg.Message('path/to/email.msg')
# Generate HTML with headers
html_output = generate_html_from_msg(msg, include_headers=True)
# Save to file
with open('email_output.html', 'w', encoding='utf-8') as f:
f.write(html_output)
# Or generate without headers (body only)
html_body_only = generate_html_from_msg(msg, include_headers=False)
Best Practices
- Ensure the msg object has the expected attributes (subject, sender, to, body, attachments) before calling this function
- The function handles both HTML and plain text email bodies, preferring HTML when available
- Character encoding is handled automatically for both UTF-8 and Latin-1 encoded content
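- The function cleans up HTML bodies by removing style blocks and simplifying table tags to prevent rendering issues; it does not remove scripts or other active content, so treat HTML bodies from untrusted senders with care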
- URLs in plain text emails are automatically converted to clickable hyperlinks
- The generated HTML is self-contained with embedded CSS, making it portable and easy to save or display
- For large emails with many attachments, consider the performance impact of generating the full HTML
- The function uses html.escape() to prevent XSS vulnerabilities when displaying plain text content
- Attachment content is not embedded; only filenames are listed in the output
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
-
function generate_simple_html_from_eml 82.0% similar
-
function msg_to_eml 67.1% similar
-
function msg_to_pdf 66.5% similar
-
function msg_to_eml_alternative 65.5% similar
-
function html_to_pdf 64.7% similar