function eml_to_pdf
Converts an .eml email file to PDF format, including the email body and all attachments merged into a single PDF document.
/tf/active/vicechatdev/msg_to_eml.py
1150 - 1291
complex
Purpose
This function provides comprehensive email-to-PDF conversion by parsing .eml files, rendering the email content as HTML, converting it to PDF, extracting and converting attachments (images, documents, etc.) to PDF format, and merging everything into a single output PDF file. It handles multiple conversion methods with fallbacks, supports various attachment types, and includes robust error handling for production use.
Source Code
def _safe_attachment_filename(raw_name, fallback):
    """Return a filesystem-safe attachment name, or *fallback* if none survives."""
    cleaned = ''.join(c for c in (raw_name or '') if c.isalnum() or c in '._- ')
    # Names made only of dots/spaces ('', '.', '..') resolve to a directory,
    # not a file, so they cannot be used as an output file name.
    if not cleaned.strip('. '):
        return fallback
    return cleaned


def eml_to_pdf(eml_path, pdf_path):
    """Convert an .eml file to PDF format with attachments included.

    The email body is rendered to HTML and converted to PDF (wkhtmltopdf when
    it is on PATH, otherwise the module's ``html_to_pdf`` fallback). Every part
    whose content disposition is ``attachment`` is then extracted, converted to
    PDF where possible, and appended to the body PDF.

    Relies on module-level names not defined in this block: ``logger``,
    ``generate_simple_html_from_eml``, ``html_to_pdf``, ``PDFConverter`` and
    ``merge_pdfs``.

    Args:
        eml_path: Path to the input .eml file.
        pdf_path: Path the resulting PDF is written to. The parent directory
            is not created by this function.

    Returns:
        True if a PDF was written to ``pdf_path``; False if the input is
        missing, the body could not be converted, merging failed, or any
        unexpected exception occurred (the error is logged, never raised).
        Attachments that cannot be converted are skipped with a warning and
        do not cause a False result.
    """
    try:
        if not os.path.exists(eml_path):
            logger.error(f"Input EML file not found: {eml_path}")
            return False

        # Create a temporary directory for processing
        with tempfile.TemporaryDirectory() as temp_dir:
            # Parse the EML file
            with open(eml_path, 'rb') as f:
                parsed_email = email.message_from_binary_file(f)

            # Create a simpler, more reliable HTML representation
            email_html = generate_simple_html_from_eml(parsed_email)
            html_path = os.path.join(temp_dir, "email.html")
            with open(html_path, 'w', encoding='utf-8') as f:
                f.write(email_html)

            # Convert email body to PDF first
            email_body_pdf = os.path.join(temp_dir, "email_body.pdf")

            # Try using wkhtmltopdf (more reliable HTML renderer) if available
            body_pdf_created = False
            if shutil.which('wkhtmltopdf'):
                try:
                    cmd = [
                        'wkhtmltopdf',
                        '--encoding', 'utf-8',
                        '--quiet',
                        html_path,
                        email_body_pdf,
                    ]
                    subprocess.run(cmd, check=True, timeout=30)
                    body_pdf_created = os.path.exists(email_body_pdf)
                except Exception as e:
                    logger.warning(f"wkhtmltopdf conversion failed: {str(e)}")

            # Fall back to ReportLab if needed
            if not body_pdf_created:
                body_pdf_created = html_to_pdf(email_html, email_body_pdf)

            if not body_pdf_created:
                logger.error(f"Failed to convert email body to PDF for {eml_path}")
                return False

            # List to collect all PDFs for merging (starting with the email body PDF)
            pdf_files_to_merge = [email_body_pdf]

            # Process attachments
            pdf_converter = PDFConverter(temp_dir)

            # Raw attachments live in their own subdirectory so that a sender-chosen
            # name such as "email_body.pdf" or "attachment_1.pdf" can never overwrite
            # (or be the same file as) one of the working files in temp_dir.
            raw_attachments_dir = os.path.join(temp_dir, "raw_attachments")
            os.makedirs(raw_attachments_dir, exist_ok=True)

            # Find and extract all attachments
            attachment_counter = 0
            if parsed_email.is_multipart():
                for i, part in enumerate(parsed_email.walk()):
                    if part.get_content_disposition() != 'attachment':
                        continue

                    # Bound before the try so the error log below always has a name
                    default_name = f'attachment_{i+1}'
                    filename = default_name
                    try:
                        # Get filename and clean it up (remove problematic characters)
                        filename = _safe_attachment_filename(part.get_filename(), default_name)
                        logger.info(f"Processing EML attachment: {filename}")

                        # Extract attachment data
                        attachment_data = part.get_payload(decode=True)
                        if not attachment_data:
                            logger.warning(f"Skipping empty attachment: {filename}")
                            continue

                        # Save attachment to temp file
                        attachment_path = os.path.join(raw_attachments_dir, filename)
                        with open(attachment_path, 'wb') as f:
                            f.write(attachment_data)

                        # Skip conversion for very small files (likely empty)
                        if os.path.getsize(attachment_path) < 10:
                            logger.warning(f"Skipping tiny attachment: {filename}")
                            continue

                        # Try to convert attachment to PDF if possible
                        attachment_counter += 1
                        attachment_pdf = os.path.join(temp_dir, f"attachment_{attachment_counter}.pdf")

                        # If already PDF, just use it as is
                        if filename.lower().endswith('.pdf'):
                            shutil.copy2(attachment_path, attachment_pdf)
                            pdf_files_to_merge.append(attachment_pdf)
                            continue

                        # Try to convert to PDF
                        conversion_result = pdf_converter.convert_to_pdf(attachment_path, attachment_pdf)
                        if conversion_result and os.path.exists(attachment_pdf):
                            pdf_files_to_merge.append(attachment_pdf)
                            logger.info(f"Successfully converted attachment: {filename}")
                        else:
                            logger.warning(f"Could not convert attachment to PDF: {filename}")

                            # For images that failed normal conversion, try direct image-to-pdf
                            ext = os.path.splitext(filename.lower())[1]
                            if ext in pdf_converter.IMAGE_EXTENSIONS:
                                try:
                                    pdf_converter._convert_image_to_pdf(attachment_path, attachment_pdf)
                                    if os.path.exists(attachment_pdf):
                                        pdf_files_to_merge.append(attachment_pdf)
                                        logger.info(f"Converted image using direct method: {filename}")
                                except Exception as e:
                                    logger.error(f"Failed direct image conversion: {str(e)}")
                    except Exception as e:
                        # Best effort: one bad attachment must not abort the whole conversion
                        logger.error(f"Error processing attachment {filename}: {str(e)}")

            # Merge all PDFs if we have multiple
            if len(pdf_files_to_merge) > 1:
                logger.info(f"Merging {len(pdf_files_to_merge)} PDFs")
                merge_result = merge_pdfs(pdf_files_to_merge, pdf_path)
                if merge_result and os.path.exists(pdf_path):
                    logger.info(f"Successfully merged PDFs to: {pdf_path}")
                    return True
                logger.error("Failed to merge PDFs")
                return False

            # The list always contains the body PDF, so reaching this point means
            # no attachment was added: just copy the single PDF to the destination.
            shutil.copy2(pdf_files_to_merge[0], pdf_path)
            logger.info(f"Created PDF without attachments: {pdf_path}")
            return True

    except Exception as e:
        logger.error(f"Error converting {eml_path} to PDF: {str(e)}")
        logger.error(traceback.format_exc())
        return False
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| eml_path | - | - | positional_or_keyword |
| pdf_path | - | - | positional_or_keyword |
Parameter Details
eml_path: String path to the input .eml email file to be converted. Must be a valid file path pointing to an existing .eml file. The function will validate existence before processing.
pdf_path: String path where the output PDF file should be saved. This is the destination for the merged PDF containing the email body and all converted attachments. The parent directory must already exist and be writable; the function does not create it.
Return Value
Returns a boolean value: True if the conversion was successful and the PDF was created at pdf_path, False if any error occurred during parsing, conversion, or merging. The function logs detailed error messages for debugging.
Dependencies
extract_msg, email, os, mimetypes, logging, traceback, tempfile, sys, base64, shutil, subprocess, pathlib, datetime, argparse, html, re, reportlab, time, PIL, fitz, PyPDF2
Required Imports
import os
import email
import tempfile
import shutil
import subprocess
import traceback
import logging
Conditional/Optional Imports
These imports are only needed under specific conditions:
from reportlab.lib.pagesizes import letter
Condition: Required for PDF generation using ReportLab fallback method
Required (conditional)
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
Condition: Required for PDF generation using ReportLab fallback method
Required (conditional)
from reportlab.lib.styles import getSampleStyleSheet
Condition: Required for PDF generation using ReportLab fallback method
Required (conditional)
from PyPDF2 import PdfMerger
Condition: Required for merging multiple PDFs (email body + attachments)
Required (conditional)
from PIL import Image
Condition: Required for image attachment processing and conversion
Required (conditional)
import fitz
Condition: Required for PDF manipulation (PyMuPDF library)
Required (conditional)
Usage Example
import logging
import os

# Configure logging first so messages emitted by eml_to_pdf are visible
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Input .eml file and destination PDF
eml_file = '/path/to/email.eml'
output_pdf = '/path/to/output.pdf'

# Run the conversion; eml_to_pdf reports failure via its boolean return value
success = eml_to_pdf(eml_file, output_pdf)
if not success:
    print('Conversion failed. Check logs for details.')
else:
    print(f'Successfully converted {eml_file} to {output_pdf}')
    print(f'Output file size: {os.path.getsize(output_pdf)} bytes')
Best Practices
- Ensure the logger is properly configured before calling this function to capture detailed error messages
- Verify that all required helper functions (generate_simple_html_from_eml, html_to_pdf, merge_pdfs) and the PDFConverter class are defined in the same module
- Install wkhtmltopdf system binary for better HTML rendering quality, though the function will fall back to ReportLab if unavailable
- Ensure sufficient disk space for temporary files, especially when processing emails with large attachments
- The function automatically cleans up temporary files using context managers, but ensure the process has write permissions
- Handle the boolean return value to implement proper error handling in calling code
- Be aware that very large attachments or many attachments may cause memory issues or slow processing
- The function skips attachments smaller than 10 bytes to avoid processing empty or corrupted files
- Attachment filenames are sanitized to remove problematic characters for filesystem compatibility
- Consider implementing timeout handling for very large email files or numerous attachments
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- function html_to_pdf 69.8% similar
- function msg_to_pdf_improved 69.4% similar
- function msg_to_pdf 69.0% similar
- function msg_to_eml 65.0% similar
- function generate_simple_html_from_eml 63.3% similar