class SessionDetector
Detects session information (conversation ID and exchange number) from PDF files using multiple detection methods including metadata, filename, footer, and content analysis.
/tf/active/vicechatdev/e-ink-llm/session_detector.py
33 - 243
moderate
Purpose
This class provides automatic session detection capabilities for PDF files to enable conversation continuation. It attempts to extract conversation IDs and exchange numbers from PDFs using a prioritized set of detection methods, returning the most reliable match with confidence scores. The class is designed to work with PDFs that contain session information embedded in various locations (metadata, filenames, footers, or content) and supports the conversation ID format 'conv_YYYYMMDD_HHMMSS_XXXXXXXX' along with exchange numbers.
Source Code
class SessionDetector:
    """Detect session information from PDF files for automatic conversation continuation.

    Four detection strategies are tried in turn (metadata, filename, footer,
    first-page content). Each returns a ``SessionInfo`` carrying a fixed
    confidence score; the highest-confidence result wins, and the search stops
    early once a result with confidence >= 0.9 has been found.
    """

    def __init__(self):
        """Create the logger and compile the regex patterns used by the detectors."""
        self.logger = logging.getLogger(__name__)

        # Regex patterns for session detection
        # Conversation ID: conv_<8 digits>_<6 digits>_<8 lowercase hex chars>;
        # group(1) is everything after the "conv_" prefix.
        self.conv_id_pattern = re.compile(r'conv_(\d{8}_\d{6}_[a-f0-9]{8})')
        # Exchange number: "ex"/"exchange", optional '#'/whitespace, then digits.
        # The lookbehind rejects an "ex" that is merely the tail of another
        # word ("index 5", "Annex 2") while still accepting "_ex001".
        self.exchange_pattern = re.compile(
            r'(?<![A-Za-z])ex(?:change)?[#\s]*(\d+)', re.IGNORECASE
        )
        # Footer line: "Session: <conv id> | Exchange #<n>"
        self.session_footer_pattern = re.compile(
            r'Session:\s*(conv_\d{8}_\d{6}_[a-f0-9]{8})\s*\|\s*Exchange\s*#?(\d+)',
            re.IGNORECASE
        )

    def detect_session_from_pdf(self, pdf_path: str) -> Optional["SessionInfo"]:
        """
        Detect session information from a PDF file using multiple methods

        Args:
            pdf_path: Path to the PDF file

        Returns:
            SessionInfo if detected, None otherwise (also None when no PDF
            reader library is available or the path is not an existing file)
        """
        if not PDF_READER_AVAILABLE:
            self.logger.warning("PDF reading not available - install PyPDF2 or pypdf")
            return None

        pdf_path = Path(pdf_path)
        # is_file() rather than exists(): a directory whose name happens to
        # match the session pattern must not produce a filename-based result.
        if not pdf_path.is_file():
            self.logger.error(f"PDF file not found: {pdf_path}")
            return None

        # Try multiple detection methods in order of reliability
        methods = [
            self._detect_from_metadata,
            self._detect_from_filename,
            self._detect_from_footer,
            self._detect_from_content
        ]

        best_match = None
        for method in methods:
            try:
                result = method(pdf_path)
            except Exception as e:
                # One failing strategy must not prevent the others from running
                self.logger.debug(f"Detection method {method.__name__} failed: {e}")
                continue

            if result and (not best_match or result.confidence > best_match.confidence):
                best_match = result
                # If we have high confidence, use it immediately
                if result.confidence >= 0.9:
                    break

        if best_match:
            self.logger.info(f"Detected session: {best_match.conversation_id} "
                             f"exchange #{best_match.exchange_number} "
                             f"(confidence: {best_match.confidence:.2f}, source: {best_match.source})")
        else:
            self.logger.debug(f"No session information detected in {pdf_path.name}")

        return best_match

    def _detect_from_metadata(self, pdf_path: Path) -> Optional["SessionInfo"]:
        """Detect session info from PDF metadata.

        Checks the subject, title and creator fields (both attribute-style and
        raw '/Key' access). A field only counts when it contains both a
        conversation ID and an exchange number. Returns a SessionInfo with
        confidence 0.95 and source 'metadata', or None.
        """
        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = PdfReader(file)
                metadata = pdf_reader.metadata

                if not metadata:
                    return None

                # Check various metadata fields
                fields_to_check = [
                    getattr(metadata, 'subject', ''),
                    getattr(metadata, 'title', ''),
                    getattr(metadata, 'creator', ''),
                    str(metadata.get('/Subject', '')),
                    str(metadata.get('/Title', '')),
                    str(metadata.get('/Creator', ''))
                ]

                for field in fields_to_check:
                    if not field:
                        continue
                    field_text = str(field)
                    conv_match = self.conv_id_pattern.search(field_text)
                    if not conv_match:
                        continue
                    conv_id = f"conv_{conv_match.group(1)}"

                    # Look for exchange number in same field
                    ex_match = self.exchange_pattern.search(field_text)
                    if ex_match:
                        return SessionInfo(
                            conversation_id=conv_id,
                            exchange_number=int(ex_match.group(1)),
                            confidence=0.95,
                            source='metadata'
                        )

        except Exception as e:
            self.logger.debug(f"Metadata detection failed: {e}")

        return None

    def _detect_from_filename(self, pdf_path: Path) -> Optional["SessionInfo"]:
        """Detect session info from filename.

        Returns a SessionInfo with source 'filename': confidence 0.9 when the
        exchange number directly follows the conversation ID as '_ex<digits>',
        0.7 when an exchange number is found elsewhere in the name. Returns
        None when there is no conversation ID or no exchange number.
        """
        filename = pdf_path.name

        # Look for session-aware filename pattern
        # RESPONSE_conv_20250731_224420_6a63a783_ex001_filename.pdf
        # ERROR_conv_20250731_224420_6a63a783_ex002_filename.pdf
        conv_match = self.conv_id_pattern.search(filename)
        if not conv_match:
            return None
        conv_id = f"conv_{conv_match.group(1)}"

        # Look for exchange number after conversation ID
        ex_match = re.search(rf'{re.escape(conv_id)}_ex(\d+)', filename)
        if ex_match:
            return SessionInfo(
                conversation_id=conv_id,
                exchange_number=int(ex_match.group(1)),
                confidence=0.9,
                source='filename'
            )

        # Fallback: any exchange number in filename
        ex_match = self.exchange_pattern.search(filename)
        if ex_match:
            return SessionInfo(
                conversation_id=conv_id,
                exchange_number=int(ex_match.group(1)),
                confidence=0.7,
                source='filename'
            )

        return None

    def _detect_from_footer(self, pdf_path: Path) -> Optional["SessionInfo"]:
        """Detect session info from PDF footer content.

        Scans up to the last three pages, starting from the final page, for a
        line of the form "Session: conv_id | Exchange #num". Returns a
        SessionInfo with confidence 0.85 and source 'footer', or None.
        """
        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = PdfReader(file)

                # Check last few pages for footer information
                page_count = len(pdf_reader.pages)
                for offset in range(1, min(3, page_count) + 1):
                    try:
                        # Negative index: start from the last page.
                        # `or ''` guards against a reader returning None.
                        text = pdf_reader.pages[-offset].extract_text() or ''
                    except Exception as e:
                        # A single unreadable page must not abort the scan of
                        # the remaining candidate pages.
                        self.logger.debug(
                            f"Footer text extraction failed on page {page_count - offset + 1}: {e}"
                        )
                        continue

                    # Look for footer pattern: "Session: conv_id | Exchange #num"
                    footer_match = self.session_footer_pattern.search(text)
                    if footer_match:
                        return SessionInfo(
                            conversation_id=footer_match.group(1),
                            exchange_number=int(footer_match.group(2)),
                            confidence=0.85,
                            source='footer'
                        )

        except Exception as e:
            self.logger.debug(f"Footer detection failed: {e}")

        return None

    def _detect_from_content(self, pdf_path: Path) -> Optional["SessionInfo"]:
        """Detect session info from PDF content (last resort).

        Only the first page is examined. Both a conversation ID and an
        exchange number must appear in its text. Returns a SessionInfo with
        confidence 0.6 and source 'content', or None.
        """
        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = PdfReader(file)

                # Extract text from first page (most likely to contain session info)
                if len(pdf_reader.pages) > 0:
                    # `or ''` guards against a reader returning None
                    text = pdf_reader.pages[0].extract_text() or ''

                    # Look for conversation ID anywhere in content
                    conv_match = self.conv_id_pattern.search(text)
                    if conv_match:
                        conv_id = f"conv_{conv_match.group(1)}"

                        # Look for exchange number in same content
                        ex_match = self.exchange_pattern.search(text)
                        if ex_match:
                            return SessionInfo(
                                conversation_id=conv_id,
                                exchange_number=int(ex_match.group(1)),
                                confidence=0.6,
                                source='content'
                            )

        except Exception as e:
            self.logger.debug(f"Content detection failed: {e}")

        return None

    def get_next_exchange_number(self, conversation_id: str, detected_exchange: int) -> int:
        """
        Calculate the next exchange number for continuation

        Args:
            conversation_id: The conversation ID (not used by the calculation)
            detected_exchange: The exchange number from the PDF

        Returns:
            Next exchange number to use
        """
        # For response PDFs, the next exchange is detected + 1
        # For error PDFs, we might want to retry the same exchange or increment
        return detected_exchange + 1
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
bases |
- | - |
Parameter Details
No constructor parameters: The __init__ method takes no parameters. It initializes a logger and compiles regex patterns for session detection internally.
Return Value
Instantiation returns a SessionDetector object. The main method detect_session_from_pdf returns Optional[SessionInfo] - either a SessionInfo object containing conversation_id, exchange_number, confidence score (0.0-1.0), and source of detection, or None if no session information could be detected. The get_next_exchange_number method returns an integer representing the next exchange number to use.
Class Interface
Methods
__init__(self)
Purpose: Initialize the SessionDetector with logger and compile regex patterns for session detection
Returns: None - initializes the instance
detect_session_from_pdf(self, pdf_path: str) -> Optional[SessionInfo]
Purpose: Main public method to detect session information from a PDF file using multiple detection methods in order of reliability
Parameters:
pdf_path: Path to the PDF file as a string. Can be relative or absolute path.
Returns: SessionInfo object containing conversation_id, exchange_number, confidence (0.0-1.0), and source if detected; None if no session information found or PDF reading unavailable
_detect_from_metadata(self, pdf_path: Path) -> Optional[SessionInfo]
Purpose: Private method to detect session info from PDF metadata fields (subject, title, creator). Highest confidence method (0.95).
Parameters:
pdf_path: Path object pointing to the PDF file
Returns: SessionInfo with confidence 0.95 and source 'metadata' if found, None otherwise
_detect_from_filename(self, pdf_path: Path) -> Optional[SessionInfo]
Purpose: Private method to detect session info from the PDF filename. Looks for patterns like 'conv_YYYYMMDD_HHMMSS_XXXXXXXX_exNNN'. Confidence 0.7-0.9.
Parameters:
pdf_path: Path object pointing to the PDF file
Returns: SessionInfo with confidence 0.7-0.9 and source 'filename' if found, None otherwise
_detect_from_footer(self, pdf_path: Path) -> Optional[SessionInfo]
Purpose: Private method to detect session info from PDF footer content. Checks last 3 pages for footer pattern 'Session: conv_id | Exchange #num'. Confidence 0.85.
Parameters:
pdf_path: Path object pointing to the PDF file
Returns: SessionInfo with confidence 0.85 and source 'footer' if found, None otherwise
_detect_from_content(self, pdf_path: Path) -> Optional[SessionInfo]
Purpose: Private method to detect session info from PDF content (first page). Last resort method with lowest confidence (0.6).
Parameters:
pdf_path: Path object pointing to the PDF file
Returns: SessionInfo with confidence 0.6 and source 'content' if found, None otherwise
get_next_exchange_number(self, conversation_id: str, detected_exchange: int) -> int
Purpose: Calculate the next exchange number for conversation continuation based on the detected exchange number
Parameters:
conversation_id: The conversation ID string (currently not used in calculation but provided for future extensibility)
detected_exchange: The exchange number detected from the PDF
Returns: Integer representing the next exchange number (detected_exchange + 1)
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| logger | logging.Logger | Logger instance for the class, used to log detection progress, warnings, and debug information | instance |
| conv_id_pattern | re.Pattern | Compiled regex pattern for matching conversation IDs in format 'conv_YYYYMMDD_HHMMSS_XXXXXXXX' | instance |
| exchange_pattern | re.Pattern | Compiled regex pattern for matching exchange numbers in various formats (ex001, exchange 1, ex#1, etc.) | instance |
| session_footer_pattern | re.Pattern | Compiled regex pattern for matching session footer format 'Session: conv_id \| Exchange #num' | instance |
Dependencies
re, logging, pathlib, typing, dataclasses, PyPDF2, pypdf
Required Imports
import re
import logging
from pathlib import Path
from typing import Optional
Conditional/Optional Imports
These imports are only needed under specific conditions:
from PyPDF2 import PdfReader
Condition: Required for PDF reading functionality. If not available, the class will log warnings and return None from detect_session_from_pdf
Optional
from pypdf import PdfReader
Condition: Alternative to PyPDF2 for PDF reading. Either PyPDF2 or pypdf must be installed for the class to function
Optional
Usage Example
# Instantiate the detector
detector = SessionDetector()
# Detect session from a PDF file
pdf_path = 'RESPONSE_conv_20250731_224420_6a63a783_ex001_report.pdf'
session_info = detector.detect_session_from_pdf(pdf_path)
if session_info:
print(f"Conversation ID: {session_info.conversation_id}")
print(f"Exchange Number: {session_info.exchange_number}")
print(f"Confidence: {session_info.confidence}")
print(f"Source: {session_info.source}")
# Get next exchange number for continuation
next_exchange = detector.get_next_exchange_number(
session_info.conversation_id,
session_info.exchange_number
)
print(f"Next exchange: {next_exchange}")
else:
print("No session information detected")
Best Practices
- Always check if the returned SessionInfo is None before accessing its attributes
- The class tries multiple detection methods in order of reliability (metadata > filename > footer > content), stopping early if high confidence (>=0.9) is achieved
- Confidence scores range from 0.6 (content detection) to 0.95 (metadata detection), use these to determine if you should trust the detection
- The class is stateless - each detection is independent and the detector can be reused for multiple PDFs
- Ensure PyPDF2 or pypdf is installed before using this class, otherwise all detections will return None
- The conversation ID format expected is 'conv_YYYYMMDD_HHMMSS_XXXXXXXX' where XXXXXXXX is an 8-character hex string
- Exchange numbers are expected to be integers, typically formatted as 'ex001', 'ex002', etc. in filenames
- For production use, configure logging appropriately to capture detection failures and debug information
- The detector reads PDF files from disk, ensure proper file permissions and handle file not found errors
- Detection methods fail gracefully - if one method throws an exception, others are still attempted
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
-
function test_session_detection 74.4% similar
-
class SessionInfo 74.0% similar
-
function detect_session_from_file 66.1% similar
-
function test_pdf_session_integration 62.3% similar
-
class AnnotationDetector 58.0% similar