class MultiPagePDFProcessor
A class for processing multi-page PDF documents with context-aware analysis, OCR, and summarization capabilities.
/tf/active/vicechatdev/e-ink-llm/multi_page_processor.py
39 - 372
complex
Purpose
MultiPagePDFProcessor handles extraction, analysis, and summarization of multi-page PDF documents. It converts PDF pages to high-quality images, extracts text content, maintains context across pages, generates document-level summaries, classifies document types, and creates visualizations. The class is designed for comprehensive document understanding with AI-assisted analysis, supporting workflows that require page-by-page processing with awareness of document structure and flow.
Source Code
class MultiPagePDFProcessor:
    """Process multi-page PDFs with context awareness and summarization"""

    def __init__(self, max_pages: int = 50, high_quality: bool = True):
        """
        Initialize multi-page PDF processor

        Args:
            max_pages: Maximum number of pages to process (safety limit)
            high_quality: Use high DPI for better OCR and analysis
        """
        self.max_pages = max_pages
        self.high_quality = high_quality
        # 2.0x render matrix for OCR-grade images, 1.5x for faster/lighter output.
        self.dpi_scale = 2.0 if high_quality else 1.5
        self.logger = logging.getLogger(__name__)

    def extract_all_pages(self, pdf_path: Path) -> "Tuple[List[PageAnalysis], Dict[str, Any]]":
        """
        Extract all pages from PDF with metadata

        Args:
            pdf_path: Path to PDF file

        Returns:
            Tuple of (list of page analyses, document metadata)

        Raises:
            FileNotFoundError: If pdf_path does not exist.
            Exception: Re-raised after logging if extraction fails.
        """
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")
        try:
            doc = fitz.open(pdf_path)
            try:
                page_count = len(doc)
                if page_count > self.max_pages:
                    # Safety cap: truncate to the configured maximum.
                    self.logger.warning(f"PDF has {page_count} pages, processing first {self.max_pages}")
                    page_count = self.max_pages
                # Document-level metadata (total_pages reflects the real file,
                # processed_pages reflects the capped count actually extracted).
                doc_metadata = {
                    'source_type': 'multi_page_pdf',
                    'source_file': str(pdf_path),
                    'total_pages': len(doc),
                    'processed_pages': page_count,
                    'file_size': pdf_path.stat().st_size,
                    'creation_date': doc.metadata.get('creationDate', ''),
                    'title': doc.metadata.get('title', ''),
                    'author': doc.metadata.get('author', ''),
                    'subject': doc.metadata.get('subject', '')
                }
                pages = []
                for page_num in range(page_count):
                    page_analysis = self._extract_single_page(doc, page_num)
                    pages.append(page_analysis)
                    self.logger.info(f"Extracted page {page_num + 1}/{page_count}: "
                                     f"{len(page_analysis.text_content)} chars text")
            finally:
                # Always release the document handle, even when a page fails to
                # render mid-loop (the previous version leaked it on error).
                doc.close()
            return pages, doc_metadata
        except Exception as e:
            self.logger.error(f"Error extracting PDF pages: {e}")
            raise

    def _extract_single_page(self, doc: "fitz.Document", page_num: int) -> "PageAnalysis":
        """Extract single page with image and text

        Args:
            doc: Open fitz.Document object
            page_num: Zero-based page number to extract

        Returns:
            PageAnalysis with 1-based page number, base64 PNG image,
            stripped text content, and rendered (width, height).
        """
        page = doc[page_num]
        # Extract text content
        text_content = page.get_text()
        # Render page as high-quality image
        mat = fitz.Matrix(self.dpi_scale, self.dpi_scale)
        pix = page.get_pixmap(matrix=mat)
        # Convert to PIL Image
        img_data = pix.tobytes("png")
        img = PILImage.open(io.BytesIO(img_data))
        # Convert to base64
        buffer = io.BytesIO()
        img.save(buffer, format='PNG')
        img_b64 = base64.b64encode(buffer.getvalue()).decode()
        return PageAnalysis(
            page_number=page_num + 1,
            image_b64=img_b64,
            text_content=text_content.strip(),
            dimensions=(img.width, img.height)
        )

    def create_context_aware_prompt(self, pages: "List[PageAnalysis]",
                                    current_page: int,
                                    conversation_context: str = "") -> str:
        """
        Create context-aware prompt for current page analysis

        Args:
            pages: All page analyses
            current_page: Current page number (1-based)
            conversation_context: Previous conversation context

        Returns:
            Context-aware prompt for LLM
        """
        total_pages = len(pages)
        current_idx = current_page - 1
        # Build context from previous pages (most recent first, then reversed
        # so the prompt reads in document order).
        previous_context = ""
        if current_page > 1:
            prev_summaries = []
            for i in range(min(3, current_idx)):  # Last 3 pages for context
                page = pages[current_idx - 1 - i]
                if page.analysis_result:
                    prev_summaries.append(f"Page {page.page_number}: {page.analysis_result[:200]}...")
            if prev_summaries:
                previous_context = "\n\nPrevious pages context:\n" + "\n".join(reversed(prev_summaries))
        # Build forward context (if available)
        forward_context = ""
        if current_page < total_pages:
            next_pages_text = []
            for i in range(current_idx + 1, min(current_idx + 3, total_pages)):  # Next 2 pages
                if pages[i].text_content:
                    next_pages_text.append(f"Page {i + 1} preview: {pages[i].text_content[:100]}...")
            if next_pages_text:
                forward_context = "\n\nUpcoming pages preview:\n" + "\n".join(next_pages_text)
        # Conversation context
        conv_context = ""
        if conversation_context:
            conv_context = f"\n\nConversation history:\n{conversation_context}"
        # Main prompt (string content kept verbatim; it is a runtime artifact).
        prompt = f"""You are analyzing page {current_page} of {total_pages} from a document.
DOCUMENT CONTEXT:
- Current page: {current_page}/{total_pages}
- This is {'the first' if current_page == 1 else 'a middle' if current_page < total_pages else 'the final'} page
{previous_context}
{forward_context}
{conv_context}
Please analyze this page considering:
1. Content on this specific page
2. How it relates to previous pages (if any)
3. Document flow and continuity
4. Key information that should be highlighted
5. Any questions or clarifications needed
Provide a comprehensive analysis that builds upon the document context."""
        return prompt

    def generate_document_summary(self, pages: "List[PageAnalysis]",
                                  document_metadata: Dict[str, Any]) -> "DocumentSummary":
        """
        Generate comprehensive document summary

        Args:
            pages: All analyzed pages
            document_metadata: Document-level metadata

        Returns:
            DocumentSummary with comprehensive analysis
        """
        total_pages = len(pages)
        # Collect page summaries and topics.
        page_summaries = []
        main_topics = set()
        # NOTE(review): key_findings is currently never populated — kept for
        # interface compatibility; callers receive an empty list.
        key_findings = []
        for page in pages:
            if page.analysis_result:
                page_summaries.append(f"Page {page.page_number}: {page.analysis_result[:150]}...")
                # Extract topics (simplified - could be enhanced with NLP)
                if page.key_elements:
                    main_topics.update(page.key_elements)
        # Determine document type based on content
        doc_type = self._classify_document_type(pages, document_metadata)
        # Generate overall summary
        overall_summary = self._generate_overall_summary(pages, doc_type)
        # Calculate confidence score
        confidence = self._calculate_confidence_score(pages)
        return DocumentSummary(
            total_pages=total_pages,
            document_type=doc_type,
            main_topics=list(main_topics),
            key_findings=key_findings,
            page_summaries=page_summaries,
            overall_summary=overall_summary,
            confidence_score=confidence
        )

    def _classify_document_type(self, pages: "List[PageAnalysis]",
                                metadata: Dict[str, Any]) -> str:
        """Classify document type based on content and metadata

        Simple keyword matching over the concatenated page text; the first
        matching category wins, so order of checks matters.
        """
        text_content = " ".join(page.text_content for page in pages)
        text_lower = text_content.lower()
        # Check for common document types
        if any(word in text_lower for word in ['research', 'study', 'methodology', 'results', 'conclusion']):
            return 'research_paper'
        elif any(word in text_lower for word in ['contract', 'agreement', 'terms', 'conditions']):
            return 'legal_document'
        elif any(word in text_lower for word in ['financial', 'budget', 'revenue', 'expenses']):
            return 'financial_report'
        elif any(word in text_lower for word in ['manual', 'instructions', 'guide', 'how to']):
            return 'instructional'
        elif any(word in text_lower for word in ['meeting', 'agenda', 'minutes', 'discussion']):
            return 'meeting_document'
        elif len(pages) > 10 and any(word in text_lower for word in ['chapter', 'section']):
            return 'book_document'
        else:
            return 'general_document'

    def _generate_overall_summary(self, pages: "List[PageAnalysis]", doc_type: str) -> str:
        """Generate overall document summary

        Args:
            pages: List of PageAnalysis objects
            doc_type: Classified document type string

        Returns:
            Narrative summary of page counts plus type-specific remarks.
        """
        total_pages = len(pages)
        # Pages with more than 100 chars of text count as "substantial".
        content_pages = sum(1 for page in pages if len(page.text_content) > 100)
        summary = f"This {doc_type.replace('_', ' ')} contains {total_pages} pages "
        summary += f"with {content_pages} pages of substantial content. "
        if any(page.analysis_result for page in pages):
            summary += "The document has been analyzed with AI assistance, "
            summary += "providing detailed insights for each page. "
        # Add type-specific summary elements
        if doc_type == 'research_paper':
            summary += "Key sections likely include methodology, results, and conclusions."
        elif doc_type == 'legal_document':
            summary += "Contains legal terms and conditions requiring careful review."
        elif doc_type == 'financial_report':
            summary += "Includes financial data and metrics for analysis."
        elif doc_type == 'instructional':
            summary += "Provides step-by-step guidance and instructions."
        return summary

    def _calculate_confidence_score(self, pages: "List[PageAnalysis]") -> float:
        """Calculate confidence score for document analysis

        Weighted blend: text coverage 30%, analysis coverage 40%,
        normalized average text length 30%. Returns 0.0 for no pages.
        """
        if not pages:
            return 0.0
        text_coverage = sum(1 for page in pages if page.text_content) / len(pages)
        analysis_coverage = sum(1 for page in pages if page.analysis_result) / len(pages)
        # Average text length per page, normalized against 500 chars.
        avg_text_length = sum(len(page.text_content) for page in pages) / len(pages)
        text_score = min(avg_text_length / 500, 1.0)
        confidence = (text_coverage * 0.3 + analysis_coverage * 0.4 + text_score * 0.3)
        return round(confidence, 2)

    def create_combined_visualization(self, pages: "List[PageAnalysis]",
                                      max_width: int = 2400) -> str:
        """
        Create combined visualization of multiple pages

        Args:
            pages: List of page analyses
            max_width: Maximum width for combined image

        Returns:
            Base64 encoded combined image

        Raises:
            ValueError: If pages is empty.
        """
        if not pages:
            raise ValueError("No pages to combine")
        # Load all page images
        images = []
        for page in pages:
            img_data = base64.b64decode(page.image_b64)
            images.append(PILImage.open(io.BytesIO(img_data)))
        # Calculate combined dimensions (pages stacked vertically).
        total_height = sum(img.height for img in images)
        max_width_actual = max(img.width for img in images)
        # Scale uniformly if the widest page exceeds max_width.
        if max_width_actual > max_width:
            scale_factor = max_width / max_width_actual
            scaled_images = []
            total_height = 0
            for img in images:
                new_width = int(img.width * scale_factor)
                new_height = int(img.height * scale_factor)
                scaled_img = img.resize((new_width, new_height), PILImage.Resampling.LANCZOS)
                scaled_images.append(scaled_img)
                total_height += new_height
            images = scaled_images
            combined_width = max_width
        else:
            combined_width = max_width_actual
        # Create combined image on a white canvas.
        combined = PILImage.new('RGB', (combined_width, total_height), 'white')
        y_offset = 0
        for img in images:
            combined.paste(img, (0, y_offset))
            y_offset += img.height
        # Convert to base64
        buffer = io.BytesIO()
        combined.save(buffer, format='PNG')
        return base64.b64encode(buffer.getvalue()).decode()
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| max_pages | int | 50 | keyword |
| high_quality | bool | True | keyword |
| bases | - | - | - |
Parameter Details
max_pages: Maximum number of pages to process from a PDF document. Acts as a safety limit to prevent processing extremely large documents. Default is 50 pages. If a PDF has more pages than this limit, only the first max_pages will be processed.
high_quality: Boolean flag to enable high-quality image rendering for better OCR and analysis. When True, uses DPI scale of 2.0; when False, uses 1.5. Higher quality produces better results but larger images and slower processing. Default is True.
Return Value
Instantiation returns a MultiPagePDFProcessor object configured with the specified parameters. Key method returns: extract_all_pages() returns a tuple of (List[PageAnalysis], Dict[str, Any]) containing page analyses and document metadata; create_context_aware_prompt() returns a string prompt for LLM analysis; generate_document_summary() returns a DocumentSummary object; create_combined_visualization() returns a base64-encoded string of the combined page images.
Class Interface
Methods
__init__(self, max_pages: int = 50, high_quality: bool = True)
Purpose: Initialize the MultiPagePDFProcessor with configuration parameters
Parameters:
max_pages: Maximum number of pages to process (default: 50)
high_quality: Use high DPI for better quality (default: True)
Returns: None - initializes instance attributes
extract_all_pages(self, pdf_path: Path) -> Tuple[List[PageAnalysis], Dict[str, Any]]
Purpose: Extract all pages from a PDF file with metadata, converting each page to an image and extracting text content
Parameters:
pdf_path: Path object pointing to the PDF file to process
Returns: Tuple containing: (1) List of PageAnalysis objects, one per page with image and text data, (2) Dictionary with document-level metadata including source_type, source_file, total_pages, processed_pages, file_size, creation_date, title, author, subject
_extract_single_page(self, doc: fitz.Document, page_num: int) -> PageAnalysis
Purpose: Internal method to extract a single page from an open PDF document, rendering it as an image and extracting text
Parameters:
doc: Open fitz.Document object
page_num: Zero-based page number to extract
Returns: PageAnalysis object containing page_number (1-based), image_b64 (base64-encoded PNG), text_content (extracted text), and dimensions (width, height)
create_context_aware_prompt(self, pages: List[PageAnalysis], current_page: int, conversation_context: str = '') -> str
Purpose: Generate a context-aware prompt for LLM analysis of a specific page, including context from previous pages, upcoming pages preview, and conversation history
Parameters:
pages: List of all PageAnalysis objects for the document
current_page: 1-based page number to create prompt for
conversation_context: Optional previous conversation history to include in prompt
Returns: String containing a comprehensive prompt for LLM analysis with document context, previous page summaries (up to 3), upcoming page previews (up to 2), and conversation history
generate_document_summary(self, pages: List[PageAnalysis], document_metadata: Dict[str, Any]) -> DocumentSummary
Purpose: Generate a comprehensive summary of the entire document including classification, topics, findings, and confidence score
Parameters:
pages: List of all analyzed PageAnalysis objects
document_metadata: Document-level metadata dictionary from extract_all_pages()
Returns: DocumentSummary object containing total_pages, document_type (classified), main_topics (list of topics), key_findings, page_summaries (list of per-page summaries), overall_summary (narrative summary), and confidence_score (0.0-1.0)
_classify_document_type(self, pages: List[PageAnalysis], metadata: Dict[str, Any]) -> str
Purpose: Internal method to classify document type based on content patterns and keywords
Parameters:
pages: List of PageAnalysis objects
metadata: Document metadata dictionary
Returns: String representing document type: 'research_paper', 'legal_document', 'financial_report', 'instructional', 'meeting_document', 'book_document', or 'general_document'
_generate_overall_summary(self, pages: List[PageAnalysis], doc_type: str) -> str
Purpose: Internal method to generate a narrative overall summary of the document
Parameters:
pages: List of PageAnalysis objects
doc_type: Classified document type string
Returns: String containing a narrative summary describing the document's content, structure, and type-specific characteristics
_calculate_confidence_score(self, pages: List[PageAnalysis]) -> float
Purpose: Internal method to calculate a confidence score for the document analysis based on text coverage, analysis coverage, and content quality
Parameters:
pages: List of PageAnalysis objects
Returns: Float between 0.0 and 1.0 representing confidence in the analysis, rounded to 2 decimal places. Calculated from text_coverage (30%), analysis_coverage (40%), and text_score (30%)
create_combined_visualization(self, pages: List[PageAnalysis], max_width: int = 2400) -> str
Purpose: Create a single combined image visualization of multiple pages stacked vertically
Parameters:
pages: List of PageAnalysis objects to combine
max_width: Maximum width for the combined image in pixels (default: 2400). Images are scaled down if wider.
Returns: Base64-encoded string of the combined PNG image with all pages stacked vertically
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| max_pages | int | Maximum number of pages to process from a PDF document, set during initialization | instance |
| high_quality | bool | Flag indicating whether to use high-quality rendering for page images | instance |
| dpi_scale | float | DPI scaling factor for page rendering. Set to 2.0 if high_quality is True, otherwise 1.5 | instance |
| logger | logging.Logger | Logger instance for logging processing information, warnings, and errors | instance |
Dependencies
fitz (PyMuPDF), base64, io, pathlib, typing, dataclasses, PIL (Pillow), logging, sys
Required Imports
import fitz
import base64
import io
from pathlib import Path
from typing import List, Dict, Any, Tuple, Optional
from dataclasses import dataclass
from PIL import Image as PILImage
import logging
Usage Example
from pathlib import Path
from dataclasses import dataclass
from typing import List, Optional
@dataclass
class PageAnalysis:
    # Per-page extraction result consumed by MultiPagePDFProcessor.
    page_number: int                          # 1-based page index
    image_b64: str                            # base64-encoded PNG render of the page
    text_content: str                         # text extracted from the page
    dimensions: tuple                         # (width, height) of the rendered image
    analysis_result: Optional[str] = None     # filled in by a later analysis step
    key_elements: Optional[List[str]] = None  # topics/keywords, if extracted

@dataclass
class DocumentSummary:
    # Document-level result returned by generate_document_summary().
    total_pages: int
    document_type: str           # classified type, e.g. 'research_paper'
    main_topics: List[str]
    key_findings: List[str]
    page_summaries: List[str]    # one truncated summary string per analyzed page
    overall_summary: str
    confidence_score: float      # 0.0-1.0

# Initialize processor
processor = MultiPagePDFProcessor(max_pages=50, high_quality=True)
# Extract all pages from PDF
pdf_path = Path('document.pdf')
pages, metadata = processor.extract_all_pages(pdf_path)
print(f"Processed {len(pages)} pages")
print(f"Document title: {metadata['title']}")
# Create context-aware prompt for page 1
prompt = processor.create_context_aware_prompt(pages, current_page=1)
# Generate document summary
summary = processor.generate_document_summary(pages, metadata)
print(f"Document type: {summary.document_type}")
print(f"Confidence: {summary.confidence_score}")
# Create combined visualization
combined_image = processor.create_combined_visualization(pages, max_width=2400)
Best Practices
- Always check that the PDF file exists before calling extract_all_pages() to avoid FileNotFoundError
- Set max_pages appropriately based on your use case to prevent memory issues with very large PDFs
- Use high_quality=True for documents requiring accurate OCR or detailed analysis, but be aware of increased processing time and memory usage
- The class maintains no persistent state between method calls except for configuration (max_pages, high_quality, dpi_scale), making it safe to reuse for multiple documents
- PageAnalysis and DocumentSummary dataclasses must be defined before using this class
- Call extract_all_pages() first to get page data before using other methods like create_context_aware_prompt() or generate_document_summary()
- The logger attribute uses __name__, so configure logging at the module level for proper log output
- For large documents, consider processing pages in batches rather than all at once to manage memory
- The create_combined_visualization() method can produce very large images; adjust max_width parameter based on your needs
- Document classification in _classify_document_type() uses simple keyword matching; consider enhancing with NLP for production use
- Context-aware prompts include up to 3 previous and 2 upcoming pages; adjust these limits in create_context_aware_prompt() if needed
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
-
class DocumentProcessor_v3 74.8% similar
-
class MultiPageLLMHandler 72.8% similar
-
class MultiPageAnalysisResult 69.4% similar
-
function process_multi_page_pdf 68.7% similar
-
class DocumentProcessor 68.6% similar