class EmailSearchApp
A class for authenticating with Microsoft Graph API and searching emails in a user's mailbox, with support for downloading PDF attachments and maintaining download records.
/tf/active/vicechatdev/mailsearch/email_search_app.py
40 - 433
complex
Purpose
EmailSearchApp provides a comprehensive interface for interacting with Microsoft 365 mailboxes via the Graph API. It handles OAuth authentication (both public and confidential client flows), searches emails by sender and keyword, downloads PDF attachments, and maintains a CSV register of downloaded files. The class can search either the authenticated user's own mailbox or, given the necessary permissions, another user's mailbox.
Source Code
class EmailSearchApp:
    """Search a Microsoft 365 mailbox via Microsoft Graph and save PDF attachments.

    Typical call order: construct, ``authenticate()``, ``search_emails()``,
    ``download_pdf_attachments()`` for each hit, then
    ``save_download_register()`` to append the results to a CSV register.

    Progress and errors that are not fatal are reported with ``print``.
    """

    # Root of the Microsoft Graph v1.0 REST API used by every request.
    GRAPH_BASE_URL = "https://graph.microsoft.com/v1.0"
    # Seconds before an HTTP call is abandoned; without a timeout a stalled
    # connection would block the caller indefinitely.
    REQUEST_TIMEOUT = 60

    def __init__(self, tenant_id: str, client_id: str, client_secret: Optional[str] = None, target_mailbox: Optional[str] = None):
        """Store the app-registration details; no network access happens here.

        Args:
            tenant_id: Azure AD tenant ID, used to build the authority URL.
            client_id: Application (client) ID of the app registration.
            client_secret: Optional secret. When given (not None) the app
                authenticates as a confidential client.
            target_mailbox: Optional mailbox address to query instead of the
                signed-in user's own mailbox.
        """
        self.tenant_id = tenant_id
        self.client_id = client_id
        self.client_secret = client_secret
        self.target_mailbox = target_mailbox
        self.access_token: Optional[str] = None
        self.authority = f"https://login.microsoftonline.com/{tenant_id}"
        self.is_confidential = client_secret is not None

    def authenticate(self, scopes: List[str]) -> str:
        """Acquire a Graph access token and remember it on the instance.

        A confidential client (secret supplied) uses the client-credentials
        grant. A public client first tries a silent acquisition for any
        account MSAL already knows, then falls back to the interactive device
        code flow, printing the URL and code the user must enter.

        Args:
            scopes: OAuth scopes to request.

        Returns:
            The access token, which is also stored in ``self.access_token``.

        Raises:
            RuntimeError: If the device flow cannot be started or no token is
                issued.
        """
        if self.is_confidential:
            app = msal.ConfidentialClientApplication(
                client_id=self.client_id,
                client_credential=self.client_secret,
                authority=self.authority,
            )
            # The device code flow is only implemented by
            # PublicClientApplication; a confidential client has to use the
            # client-credentials grant instead.
            result = app.acquire_token_for_client(scopes=scopes)
        else:
            app = msal.PublicClientApplication(
                client_id=self.client_id,
                authority=self.authority,
            )

            # Try to get a token from the cache first.
            accounts = app.get_accounts()
            if accounts:
                print("Using cached authentication...")
                cached = app.acquire_token_silent(scopes=scopes, account=accounts[0])
                if cached and "access_token" in cached:
                    self.access_token = cached["access_token"]
                    return self.access_token

            # Interactive device code flow.
            print("\n=== Authentication Required ===")
            flow = app.initiate_device_flow(scopes=scopes)
            if "user_code" not in flow:
                raise RuntimeError(
                    "Failed to initiate device flow. "
                    "Check your app registration and permissions."
                )
            print(f"\nTo authenticate, open: {flow['verification_uri']}")
            print(f"Enter code: {flow['user_code']}")
            print("\nWaiting for authentication...")
            result = app.acquire_token_by_device_flow(flow)

        if not result or "access_token" not in result:
            error_desc = (result or {}).get("error_description", "Unknown error")
            raise RuntimeError(f"Authentication failed: {error_desc}")

        self.access_token = result["access_token"]
        print("✓ Authentication successful!\n")
        return self.access_token

    def _auth_headers(self) -> Dict[str, str]:
        """Return the HTTP headers sent with every Graph request."""
        return {
            "Authorization": f"Bearer {self.access_token}",
            "Accept": "application/json",
        }

    def _messages_url(self) -> str:
        """Return the messages collection URL for the mailbox being queried."""
        if self.target_mailbox:
            return f"{self.GRAPH_BASE_URL}/users/{self.target_mailbox}/messages"
        return f"{self.GRAPH_BASE_URL}/me/messages"

    @staticmethod
    def _escape_odata_literal(value: str) -> str:
        """Escape *value* for use inside a single-quoted OData string literal."""
        # OData represents a literal single quote by doubling it.
        return value.replace("'", "''")

    @staticmethod
    def _matches_keyword(message: Dict, keyword_lower: str) -> bool:
        """Return True if the lowercase keyword occurs in subject, preview or body.

        Fields that are missing or null are treated as empty text.
        """
        body = message.get("body")
        body_content = body.get("content") if isinstance(body, dict) else ""
        candidates = (message.get("subject"), message.get("bodyPreview"), body_content)
        return any(keyword_lower in (text or "").lower() for text in candidates)

    @staticmethod
    def _format_received(received: Optional[str]) -> str:
        """Format an ISO-8601 timestamp string as ``YYYY-MM-DD HH:MM:SS``.

        Returns "Unknown" for a missing value, and the raw string unchanged
        when it cannot be parsed, so one odd timestamp does not abort a run.
        """
        if not received:
            return "Unknown"
        try:
            # fromisoformat() on older Python versions rejects a trailing "Z".
            parsed = datetime.fromisoformat(received.replace("Z", "+00:00"))
        except ValueError:
            return received
        return parsed.strftime("%Y-%m-%d %H:%M:%S")

    @staticmethod
    def _sender_details(email: Dict) -> tuple:
        """Return ``(name, address)`` of the sender, with placeholder fallbacks."""
        from_data = email.get("from") or {}
        sender_info = from_data.get("emailAddress") or {}
        return (
            sender_info.get("name") or "Unknown",
            sender_info.get("address") or "unknown@unknown.com",
        )

    def search_emails(
        self,
        sender: str,
        keyword: str,
        max_results: int = 50
    ) -> List[Dict]:
        """Return messages from *sender* whose subject or body contains *keyword*.

        The sender restriction is applied by Graph through ``$filter``; every
        result page is then fetched and the keyword is matched locally,
        case-insensitively, against subject, body preview and full body.

        Args:
            sender: Email address of the sender.
            keyword: Text to look for; an empty keyword matches every message.
            max_results: Page size sent as ``$top``. All pages are followed,
                so this does not cap the total number of messages returned.

        Returns:
            The matching message dictionaries as returned by Graph.

        Raises:
            RuntimeError: If not authenticated, or if Graph answers with a
                non-200 status.
        """
        if not self.access_token:
            raise RuntimeError("Not authenticated. Call authenticate() first.")

        headers = self._auth_headers()
        url = self._messages_url()
        if self.target_mailbox:
            print(f"Searching mailbox: {self.target_mailbox}")
        else:
            print("Searching authenticated user's mailbox")

        # $search and $filter are not combined here: the sender filter runs on
        # the server and the keyword filter runs in this process.
        escaped_sender = self._escape_odata_literal(sender)
        request_params: Optional[Dict[str, str]] = {
            "$filter": f"from/emailAddress/address eq '{escaped_sender}'",
            "$top": str(max_results),
            "$select": "id,subject,from,receivedDateTime,bodyPreview,body,hasAttachments,importance,isRead",
        }

        all_messages: List[Dict] = []
        page = 1

        print(f"Searching for emails from '{sender}' containing '{keyword}'...\n")

        while url:
            response = requests.get(
                url,
                headers=headers,
                params=request_params,
                timeout=self.REQUEST_TIMEOUT,
            )
            if response.status_code != 200:
                raise RuntimeError(
                    f"API request failed: {response.status_code}\n"
                    f"Response: {response.text}"
                )

            data = response.json()
            batch = data.get("value", [])
            all_messages.extend(batch)
            print(f"Retrieved page {page}: {len(batch)} emails from sender")

            # The nextLink already carries the query, so later requests must
            # not send the parameters a second time.
            url = data.get("@odata.nextLink")
            request_params = None
            page += 1

        print(f"\nFiltering for keyword '{keyword}'...")
        keyword_lower = keyword.lower()
        filtered_messages = [
            msg for msg in all_messages
            if self._matches_keyword(msg, keyword_lower)
        ]

        if not filtered_messages and all_messages:
            print(f"\nNote: Found {len(all_messages)} emails from sender but none contained '{keyword}'")
            print("First few subjects for reference:")
            for i, msg in enumerate(all_messages[:3], 1):
                print(f" {i}. {msg.get('subject', '(no subject)')}")

        print(f"Found {len(filtered_messages)} emails matching keyword from {len(all_messages)} total")
        return filtered_messages

    def download_pdf_attachments(self, email: Dict, output_dir: str) -> List[Dict]:
        """Save the PDF file attachments of one message into *output_dir*.

        Only ``#microsoft.graph.fileAttachment`` entries whose name ends in
        ``.pdf`` or whose content type mentions ``pdf`` are saved. Failures
        for individual attachments are printed and skipped.

        Args:
            email: Message dictionary; its ``id`` selects the message, and
                subject, received date and sender are copied into the result.
            output_dir: Directory to write into; created when first needed.

        Returns:
            One dictionary per saved file with the keys ``filename``,
            ``filepath``, ``subject``, ``received_date``, ``sender``,
            ``file_size`` and ``email_id``. Empty if the attachment listing
            could not be fetched.

        Raises:
            RuntimeError: If not authenticated.
        """
        if not self.access_token:
            raise RuntimeError("Not authenticated. Call authenticate() first.")

        message_id = email.get("id")
        if not message_id:
            print(" ✗ Email has no id; cannot fetch attachments")
            return []

        message_subject = email.get("subject") or "(No Subject)"
        received_date = self._format_received(email.get("receivedDateTime"))
        _, sender_email = self._sender_details(email)

        url = f"{self._messages_url()}/{message_id}/attachments"
        response = requests.get(
            url,
            headers=self._auth_headers(),
            timeout=self.REQUEST_TIMEOUT,
        )
        if response.status_code != 200:
            print(f" ✗ Failed to get attachments: {response.status_code}")
            return []

        attachments = response.json().get("value", [])
        downloaded_metadata = []

        for att in attachments:
            # Only file attachments carry contentBytes; item attachments such
            # as embedded emails are skipped.
            if att.get("@odata.type") != "#microsoft.graph.fileAttachment":
                continue

            name = att.get("name") or "attachment"
            content_type = att.get("contentType") or ""
            content_bytes = att.get("contentBytes")

            if not (name.lower().endswith('.pdf') or 'pdf' in content_type.lower()):
                print(f" ⊘ Skipping non-PDF: {name}")
                continue

            if not content_bytes:
                print(f" ⚠ No content for: {name}")
                continue

            Path(output_dir).mkdir(parents=True, exist_ok=True)

            safe_filename = self._generate_safe_filename(name, output_dir)
            file_path = os.path.join(output_dir, safe_filename)

            try:
                # Decode before opening the file so a corrupt payload does
                # not leave an empty file behind.
                payload = base64.b64decode(content_bytes)
                with open(file_path, "wb") as f:
                    f.write(payload)
            except (OSError, ValueError, TypeError) as e:
                print(f" ✗ Error saving {name}: {str(e)}")
                continue

            file_size = len(payload)
            print(f" ✓ Downloaded: {safe_filename} ({file_size:,} bytes)")

            downloaded_metadata.append({
                "filename": safe_filename,
                "filepath": file_path,
                "subject": message_subject,
                "received_date": received_date,
                "sender": sender_email,
                "file_size": file_size,
                "email_id": message_id,
            })

        return downloaded_metadata

    def _generate_safe_filename(self, filename: str, output_dir: str) -> str:
        """Return a file name that is free in *output_dir* and has no path parts.

        Args:
            filename: Name as supplied by the attachment.
            output_dir: Directory the file is going to be written to.

        Returns:
            The bare file name, with ``_1``, ``_2``, ... inserted before the
            extension if the plain name is already taken. Names that reduce to
            nothing (or to ``.`` / ``..``) become ``attachment``.
        """
        # Treat backslashes as separators too, so Windows-style paths are
        # stripped even when running on POSIX.
        filename = os.path.basename((filename or "").replace("\\", "/"))
        if filename in ("", ".", ".."):
            filename = "attachment"

        base, ext = os.path.splitext(filename)
        candidate = filename
        counter = 1
        while os.path.exists(os.path.join(output_dir, candidate)):
            candidate = f"{base}_{counter}{ext}"
            counter += 1
        return candidate

    def save_download_register(self, download_records: List[Dict], register_file: str):
        """Append download metadata to a CSV register, creating it if needed.

        Each record is stamped in place with a ``download_timestamp`` key
        holding the current local time, so the caller's dictionaries are
        modified. The header row is written when the file is new or empty.

        Args:
            download_records: Metadata dictionaries as produced by
                ``download_pdf_attachments``; nothing is written if empty.
            register_file: Path to the CSV register file.
        """
        if not download_records:
            print("\nNo files to register.")
            return

        parent_dir = os.path.dirname(register_file)
        if parent_dir:
            Path(parent_dir).mkdir(parents=True, exist_ok=True)

        fieldnames = ["filename", "filepath", "subject", "received_date", "sender", "file_size", "email_id", "download_timestamp"]

        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        for record in download_records:
            record["download_timestamp"] = timestamp

        # An existing but empty file still needs its header row.
        needs_header = (
            not os.path.exists(register_file)
            or os.path.getsize(register_file) == 0
        )

        with open(register_file, mode='a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            if needs_header:
                writer.writeheader()
            writer.writerows(download_records)

        print(f"\n✓ Register updated: {register_file}")
        print(f" Added {len(download_records)} file(s) to register")

    def display_email_list(self, emails: List[Dict]):
        """Print a numbered, human-readable summary of each message.

        Args:
            emails: Message dictionaries as returned by ``search_emails``.
                Missing or null fields are shown with placeholder text.
        """
        if not emails:
            print("\nNo emails found matching the search criteria.")
            return

        print(f"\n{'='*80}")
        print(f"Found {len(emails)} email(s)")
        print(f"{'='*80}\n")

        for idx, email in enumerate(emails, 1):
            email_id = email.get("id", "N/A")
            subject = email.get("subject") or "(No Subject)"
            sender_name, sender_email = self._sender_details(email)
            received_str = self._format_received(email.get("receivedDateTime"))

            has_attachments = email.get("hasAttachments", False)
            is_read = email.get("isRead", False)
            importance = email.get("importance", "normal")
            body_preview = (email.get("bodyPreview") or "")[:100]  # First 100 chars

            print(f"[{idx}] Email ID: {email_id}")
            print(f" Subject: {subject}")
            print(f" From: {sender_name} <{sender_email}>")
            print(f" Received: {received_str}")
            print(f" Status: {'Read' if is_read else 'Unread'} | "
                  f"Importance: {importance} | "
                  f"Attachments: {'Yes' if has_attachments else 'No'}")
            print(f" Preview: {body_preview}...")
            print(f" {'-'*76}\n")
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
bases |
- | - |
Parameter Details
tenant_id: Azure AD tenant ID (GUID) for the Microsoft 365 organization. This identifies which Azure AD instance to authenticate against.
client_id: Application (client) ID from Azure AD app registration. This identifies your application to Microsoft's identity platform.
client_secret: Optional client secret for confidential client authentication. If provided, uses client credentials flow; if None, uses device code flow for public clients.
target_mailbox: Optional email address of a specific user's mailbox to search. If None, searches the authenticated user's own mailbox. Requires appropriate delegated permissions.
Return Value
Instantiation returns an EmailSearchApp object. Key method returns: authenticate() returns an access token string; search_emails() returns a list of email dictionaries with fields like id, subject, from, receivedDateTime, body, etc.; download_pdf_attachments() returns a list of dictionaries containing download metadata (filename, filepath, subject, received_date, sender, file_size, email_id); display_email_list() returns None but prints formatted output.
Class Interface
Methods
__init__(self, tenant_id: str, client_id: str, client_secret: Optional[str] = None, target_mailbox: Optional[str] = None)
Purpose: Initialize the EmailSearchApp with Azure AD credentials and optional target mailbox
Parameters:
tenant_id: Azure AD tenant ID
client_id: Application client ID from Azure AD
client_secret: Optional client secret for confidential client flow
target_mailbox: Optional email address of mailbox to search (if not the authenticated user's)
Returns: None
authenticate(self, scopes: List[str]) -> str
Purpose: Authenticate with Microsoft identity platform using device code flow (public client) or client credentials (confidential client), with token caching support
Parameters:
scopes: List of OAuth scopes required (e.g., ['https://graph.microsoft.com/Mail.Read'])
Returns: Access token string that is also stored in self.access_token
search_emails(self, sender: str, keyword: str, max_results: int = 50) -> List[Dict]
Purpose: Search for emails from a specific sender containing a keyword in subject or body, with pagination support
Parameters:
sender: Email address of the sender to filter by
keyword: Keyword to search for (case-insensitive) in subject, bodyPreview, or body content
max_results: Maximum number of results to retrieve per page (default 50)
Returns: List of email message dictionaries with fields: id, subject, from, receivedDateTime, bodyPreview, body, hasAttachments, importance, isRead
download_pdf_attachments(self, email: Dict, output_dir: str) -> List[Dict]
Purpose: Download all PDF attachments from a specific email message to a local directory
Parameters:
email: Email message dictionary (from search_emails result) containing at minimum an 'id' field
output_dir: Directory path where PDF files will be saved (created if it doesn't exist)
Returns: List of dictionaries with download metadata: filename, filepath, subject, received_date, sender, file_size, email_id
_generate_safe_filename(self, filename: str, output_dir: str) -> str
Purpose: Generate a safe, unique filename by adding counters to avoid overwriting existing files
Parameters:
filename: Original filename from attachment
output_dir: Target directory to check for existing files
Returns: Safe filename string with counter suffix if needed (e.g., 'file_1.pdf')
save_download_register(self, download_records: List[Dict], register_file: str)
Purpose: Append download metadata to a CSV register file for tracking downloaded attachments
Parameters:
download_records: List of download metadata dictionaries from download_pdf_attachments()
register_file: Path to CSV file (created if it doesn't exist, appended to if it exists)
Returns: None (prints confirmation message)
display_email_list(self, emails: List[Dict])
Purpose: Display a formatted, human-readable list of email entries to console
Parameters:
emails: List of email message dictionaries from search_emails()
Returns: None (prints formatted output to console)
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| tenant_id | str | Azure AD tenant ID for authentication | instance |
| client_id | str | Application client ID from Azure AD app registration | instance |
| client_secret | Optional[str] | Client secret for confidential client authentication (None for public clients) | instance |
| target_mailbox | Optional[str] | Email address of target mailbox to search (None for authenticated user's mailbox) | instance |
| access_token | Optional[str] | OAuth access token obtained after authentication, used for Graph API calls | instance |
| authority | str | Microsoft identity platform authority URL constructed from tenant_id | instance |
| is_confidential | bool | Flag indicating whether this is a confidential client (True if client_secret provided) | instance |
Dependencies
msal, requests, os, base64, csv, typing, datetime, pathlib
Required Imports
import os
import base64
import csv
import msal
import requests
from typing import List, Dict, Optional
from datetime import datetime
from pathlib import Path
Usage Example
# Public client (device code flow): no client_secret is passed
app = EmailSearchApp(
    tenant_id='your-tenant-id',
    client_id='your-client-id'
)
# Authenticate; the returned token is also kept on the instance
scopes = ['https://graph.microsoft.com/Mail.Read']
app.authenticate(scopes)
# Search emails from one sender that mention the keyword
emails = app.search_emails(
    sender='sender@example.com',
    keyword='invoice',
    max_results=50
)
# Display results
app.display_email_list(emails)
# Download PDF attachments from first email
if emails:
    metadata = app.download_pdf_attachments(
        email=emails[0],
        output_dir='./downloads'
    )
    # Save download register (kept under the guard: metadata only exists
    # when at least one email was found)
    app.save_download_register(
        download_records=metadata,
        register_file='./downloads/register.csv'
    )
# Confidential client example: a client_secret is supplied and another
# user's mailbox is targeted
app_confidential = EmailSearchApp(
    tenant_id='your-tenant-id',
    client_id='your-client-id',
    client_secret='your-client-secret',
    target_mailbox='user@example.com'
)
app_confidential.authenticate(['https://graph.microsoft.com/.default'])
Best Practices
- Always call authenticate() before calling any other methods that require API access (search_emails, download_pdf_attachments)
- The access_token is stored as an instance attribute and reused across method calls within the same session
- For confidential clients, use the '.default' scope (e.g., 'https://graph.microsoft.com/.default')
- For public clients, specify explicit scopes (e.g., 'https://graph.microsoft.com/Mail.Read')
- When searching another user's mailbox (target_mailbox), ensure your app has appropriate delegated permissions
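- The search_emails() method filters by sender on the server using the Graph API $filter, then applies the keyword filter in memory because $search and $filter are not combined in one request; every message from the sender is downloaded first, so searches against prolific senders can be slow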
- When a PDF attachment's filename already exists in the output directory, a numeric counter is appended (e.g. 'file_1.pdf') so existing files are never overwritten
- The save_download_register() method appends to existing CSV files, maintaining a cumulative record
- Handle RuntimeError exceptions from authenticate() and search_emails() for proper error handling
- The class supports token caching through MSAL - subsequent authentications may use cached tokens
- Method call order: instantiate -> authenticate() -> search_emails() -> download_pdf_attachments() -> save_download_register()
- The is_confidential attribute determines authentication flow automatically based on client_secret presence
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- function main_v99 72.2% similar
- function main_v13 69.4% similar
- class O365Client 67.7% similar
- function search_messages 61.6% similar
- function download_attachments_for_message 61.1% similar