class DocumentProcessor_v3
A comprehensive PDF document processor that handles text extraction, OCR (Optical Character Recognition), layout analysis, table detection, and metadata extraction from PDF files.
/tf/active/vicechatdev/invoice_extraction/core/document_processor.py
12 - 451
complex
Purpose
This class provides a complete pipeline for processing PDF invoices and documents. It intelligently combines native PDF text extraction with OCR capabilities using Tesseract, analyzes document layout including headers/footers/body sections, detects tables, and extracts structured information. It's designed to handle both text-based PDFs and scanned/image-based PDFs, automatically determining when OCR is needed. The processor maintains document structure including page-level information, text blocks with bounding boxes, confidence scores, and positional metadata.
Source Code
class DocumentProcessor:
    """Handles PDF processing, OCR, and text extraction.

    The processor opens a PDF with PyMuPDF (``fitz``), extracts text for every
    page either natively or through Tesseract OCR, classifies text blocks as
    header/body/footer by vertical position, applies a simple row-alignment
    heuristic to detect tables, and collects the document metadata.

    Supported ``config`` keys: ``tesseract_cmd``, ``tesseract_langs``,
    ``ocr_dpi``, ``ocr_threshold`` and ``force_ocr``.
    """

    def __init__(self, config=None):
        """Store the configuration and check that Tesseract is reachable.

        Args:
            config: Optional dict of settings; missing keys use the defaults
                visible below. ``None`` is treated as an empty dict.
        """
        self.config = config or {}

        # Configure Tesseract
        self.tesseract_cmd = self.config.get('tesseract_cmd', 'tesseract')
        self.tesseract_langs = self.config.get('tesseract_langs', 'eng+fra+nld')

        # Configure OCR settings
        self.ocr_dpi = self.config.get('ocr_dpi', 300)
        # Low threshold on purpose so that more (uncertain) text is kept.
        self.ocr_threshold = self.config.get('ocr_threshold', 0.3)

        # Only override pytesseract's executable when the caller explicitly
        # configured one; otherwise pytesseract keeps its own default.
        if 'tesseract_cmd' in self.config:
            pytesseract.pytesseract.tesseract_cmd = self.tesseract_cmd

        # Check Tesseract installation; a failure is only a warning because
        # OCR errors are handled per page later on.
        try:
            pytesseract.get_tesseract_version()
        except Exception as e:
            logger.warning(f"Tesseract not properly configured: {e}")
            logger.warning("OCR capabilities may be limited")

    def process(self, file_path):
        """
        Process a PDF invoice to extract structured text.

        Args:
            file_path: Path to the invoice PDF (``str`` or ``os.PathLike``)

        Returns:
            Document dict with extracted text and layout information

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the file does not have a ``.pdf`` extension or
                cannot be opened as a PDF.
        """
        # Normalise pathlib.Path & co. so the string checks below work.
        file_path = os.fspath(file_path)
        logger.info(f"Processing document: {file_path}")

        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        # Check file extension
        if not file_path.lower().endswith('.pdf'):
            raise ValueError("Only PDF files are currently supported")

        # Process PDF
        return self._process_pdf(file_path)

    def _process_pdf(self, pdf_path):
        """Process a PDF file to extract text and structure.

        Args:
            pdf_path: Path of the PDF to open.

        Returns:
            Dict with keys ``path``, ``filename``, ``pages``, ``text``,
            ``blocks`` and ``metadata``.

        Raises:
            ValueError: If PyMuPDF cannot open the file.
            Exception: Any error raised while processing pages is logged and
                re-raised unchanged.
        """
        document = {
            'path': pdf_path,
            'filename': os.path.basename(pdf_path),
            'pages': [],
            'text': "",
            'blocks': [],
        }

        pdf = None
        try:
            # Open PDF with PyMuPDF - with error handling
            try:
                pdf = fitz.Document(pdf_path)
            except AttributeError:
                # Fallback if Document() doesn't work (older versions)
                pdf = fitz.open(pdf_path)
            except Exception as e:
                logger.error(f"Failed to open PDF with PyMuPDF: {e}")
                raise ValueError(f"Could not open PDF: {e}") from e

            # Process each page
            page_texts = []
            for page_idx, page in enumerate(pdf):
                page_data = self._process_page(page, page_idx)
                document['pages'].append(page_data)
                page_texts.append(page_data['text'])
                document['blocks'].extend(page_data['blocks'])

            # Every page's text is followed by a blank line.
            document['text'] = "".join(f"{text}\n\n" for text in page_texts)

            # Add metadata
            document['metadata'] = self._extract_metadata(pdf)

            logger.info(f"Successfully processed {len(document['pages'])} pages")
            return document

        except Exception as e:
            logger.error(f"Error processing PDF {pdf_path}: {str(e)}")
            logger.debug(f"Error details: {e}", exc_info=True)
            raise
        finally:
            # Release the file handle on success and on failure alike.
            if pdf is not None:
                pdf.close()

    def _process_page(self, page, page_idx):
        """Process a single PDF page.

        Args:
            page: PyMuPDF page object.
            page_idx: Zero-based index of the page.

        Returns:
            Dict with ``index``, ``width``, ``height``, ``text``, ``blocks``,
            ``tables`` and, when footer blocks exist, ``footer``.
        """
        # Basic page info
        page_data = {
            'index': page_idx,
            'width': page.rect.width,
            'height': page.rect.height,
            'text': "",
            'blocks': [],
            'tables': [],
        }

        # Try to extract text directly first
        native_text = page.get_text()

        # OCR is forced by default (this keeps the previously hard-coded
        # behaviour); set config['force_ocr'] = False to OCR only pages whose
        # native text looks insufficient.
        force_ocr = self.config.get('force_ocr', True)

        if force_ocr or self._needs_ocr(native_text):
            logger.info(f"Using OCR for page {page_idx+1}")
            text, blocks = self._ocr_page(page)
            if not text.strip() and not blocks and native_text.strip():
                # OCR failed or found nothing: keep the native text instead of
                # returning an empty page.
                logger.warning(
                    f"OCR produced no text for page {page_idx+1}; "
                    "falling back to native text extraction"
                )
                text, blocks = native_text, self._extract_blocks(page)
        else:
            # Use native PDF text extraction with layout analysis
            text, blocks = native_text, self._extract_blocks(page)

        page_data['text'] = text
        page_data['blocks'] = blocks

        # Detect tables on the page
        page_data['tables'] = self._detect_tables(page, page_data['blocks'])

        # Explicitly extract footer content
        footer_info = self._extract_footer(page, page_data['blocks'])
        if footer_info:
            page_data['footer'] = footer_info
            # Add footer text to the main text content to ensure it's not lost
            page_data['text'] += "\n\nFOOTER:\n" + footer_info['text']

        logger.debug(f"page text: {page_data['text']}")
        return page_data

    def _needs_ocr(self, text):
        """Determine if OCR is needed based on text extraction results.

        Args:
            text: Text obtained from native PDF extraction (may be ``None``).

        Returns:
            True when fewer than 50 non-blank-trimmed characters were
            extracted or when the text contains no alphanumeric character.
        """
        stripped = (text or "").strip()

        # If almost no text was extracted, we likely need OCR
        if len(stripped) < 50:
            return True

        # Check for common PDF issues (like text as curves/images)
        return not any(char.isalnum() for char in stripped)

    def _ocr_page(self, page):
        """Perform OCR on a page.

        The page is rendered at ``self.ocr_dpi`` and passed to Tesseract twice:
        once for hOCR (block positions) and once for plain text.

        Args:
            page: PyMuPDF page object.

        Returns:
            Tuple ``(text, blocks)``. On any rendering/OCR failure the error
            is logged and ``("", [])`` is returned instead of raising.
        """
        try:
            # Render page to an image at higher DPI for better OCR results
            # (PDF user space is 72 units per inch).
            zoom = self.ocr_dpi / 72
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))

            # Convert to PIL Image
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            width, height = page.rect.width, page.rect.height
            try:
                # Languages go through ``lang`` and the output format through
                # ``extension``; ``config`` only carries engine/segmentation
                # flags.
                hocr = pytesseract.image_to_pdf_or_hocr(
                    img,
                    extension='hocr',
                    lang=self.tesseract_langs,
                    config='--oem 3 --psm 3',
                )

                # Also get plain text
                text = pytesseract.image_to_string(img, lang=self.tesseract_langs)

                # Parse HOCR to extract block information
                blocks = self._parse_hocr(hocr, width, height)
                if not blocks and text.strip():
                    # Text was recognised but no layout could be parsed: keep
                    # the text reachable through a single page-sized block.
                    blocks = [self._whole_page_block(text, width, height)]
            except Exception as ocr_error:
                logger.warning(f"OCR error: {ocr_error}. Falling back to simple text extraction.")
                # Fallback to simple text extraction
                text = pytesseract.image_to_string(img, lang=self.tesseract_langs)
                blocks = [self._whole_page_block(text, width, height)]

            return text, blocks

        except Exception as e:
            logger.error(f"Error during OCR processing: {e}")
            # Return empty results rather than failing
            return "", []

    @staticmethod
    def _whole_page_block(text, page_width, page_height):
        """Build a single fallback block that spans the whole page."""
        return {
            'id': "ocr-block-0",
            'type': 'text',
            'text': text,
            'bbox': [0, 0, page_width, page_height],
            'confidence': 0.5,
            'position': "body",
        }

    def _extract_blocks(self, page):
        """Extract text blocks from a PDF page using PyMuPDF.

        Args:
            page: PyMuPDF page object.

        Returns:
            List of block dicts (``id``, ``type``, ``text``, ``bbox``,
            ``confidence``, ``position``); blocks without text are skipped.
        """
        blocks = []

        # Extract text blocks with bounding boxes
        page_dict = page.get_text("dict")

        # Convert PyMuPDF blocks to our standard format
        for block_idx, block in enumerate(page_dict.get("blocks", [])):
            # Entries without "lines" carry no text, so they are skipped.
            if "lines" not in block:
                continue

            # One output line per PyMuPDF line, spans concatenated as-is.
            block_text = "\n".join(
                "".join(span.get("text", "") for span in line.get("spans", []))
                for line in block.get("lines", [])
            ).strip()

            # Skip empty blocks
            if not block_text:
                continue

            # Create standardized block representation
            blocks.append({
                'id': f"block-{page.number}-{block_idx}",
                'type': 'text',
                'text': block_text,
                'bbox': list(block["bbox"][:4]),
                'confidence': 1.0,  # Native PDF text has high confidence
                'position': self._determine_block_position(block["bbox"], page.rect),
            })

        return blocks

    def _determine_block_position(self, bbox, page_rect):
        """Determine if block is in header, footer, or body.

        Args:
            bbox: Sequence ``[x0, y0, x1, y1]``.
            page_rect: Object exposing the page ``height``.

        Returns:
            ``"header"``, ``"footer"`` or ``"body"``.
        """
        y_center = (bbox[1] + bbox[3]) / 2
        return self._classify_vertical_position(y_center, page_rect.height)

    @staticmethod
    def _classify_vertical_position(y_center, page_height):
        """Classify a vertical centre: top 20% header, bottom 40% footer."""
        if y_center < page_height * 0.2:
            return "header"
        # Footer starts at 60% of the page height (deliberately generous so
        # that more footer content is captured).
        if y_center > page_height * 0.6:
            return "footer"
        return "body"

    def _parse_hocr(self, hocr_data, page_width, page_height):
        """Parse HOCR output from Tesseract to extract blocks with positions.

        Every ``ocr_carea`` element becomes one block. Its text is the
        space-joined content of the contained ``ocrx_word`` elements and its
        bounding box is converted from image pixels (at ``self.ocr_dpi``) to
        PDF units. Confidence is taken from a ``confidence``/``x_wconf`` value
        in the area title when present, otherwise from the mean word
        ``x_wconf``; 0.5 is used when no confidence is available at all.

        Args:
            hocr_data: hOCR markup as ``bytes`` (UTF-8) or ``str``.
            page_width: Page width in PDF units.
            page_height: Page height in PDF units.

        Returns:
            List of block dicts; areas without text or bbox are skipped.
        """
        import re
        from html.parser import HTMLParser

        if isinstance(hocr_data, (bytes, bytearray)):
            hocr_text = bytes(hocr_data).decode('utf-8', errors='replace')
        else:
            hocr_text = hocr_data or ""

        bbox_re = re.compile(r'bbox\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)')
        conf_re = re.compile(r'(?:x_wconf|confidence)\s+(\d+(?:\.\d+)?)')

        class _HocrCollector(HTMLParser):
            """Collects title, words and word confidences per ocr_carea."""

            def __init__(self):
                super().__init__(convert_charrefs=True)
                self.areas = []
                self._area = None      # area currently being read
                self._div_depth = 0    # nesting depth of <div> inside the area
                self._word = None      # text chunks of the word being read
                self._span_depth = 0   # nesting depth of <span> inside the word

            def handle_starttag(self, tag, attrs):
                attr_map = dict(attrs)
                classes = (attr_map.get('class') or '').split()
                title = attr_map.get('title') or ''
                if tag == 'div':
                    if self._area is not None:
                        self._div_depth += 1
                    elif 'ocr_carea' in classes:
                        self._area = {'title': title, 'words': [], 'confs': []}
                        self._div_depth = 1
                elif tag == 'span' and self._area is not None:
                    if self._word is not None:
                        self._span_depth += 1
                    elif 'ocrx_word' in classes:
                        self._word = []
                        self._span_depth = 1
                        conf = conf_re.search(title)
                        if conf:
                            self._area['confs'].append(float(conf.group(1)))

            def handle_data(self, data):
                # Only text inside a word element counts; inline markup such
                # as <strong>/<em> is transparent.
                if self._word is not None:
                    self._word.append(data)

            def handle_endtag(self, tag):
                if tag == 'span' and self._word is not None:
                    self._span_depth -= 1
                    if self._span_depth == 0:
                        word = ''.join(self._word).strip()
                        if word:
                            self._area['words'].append(word)
                        self._word = None
                elif tag == 'div' and self._area is not None:
                    self._div_depth -= 1
                    if self._div_depth == 0:
                        self.finish_area()

            def finish_area(self):
                if self._area is not None:
                    self.areas.append(self._area)
                self._area = None
                self._word = None

        try:
            collector = _HocrCollector()
            collector.feed(hocr_text)
            collector.close()
            collector.finish_area()  # keep an area left open by truncated input

            blocks = []
            for block_idx, area in enumerate(collector.areas):
                block_text = ' '.join(area['words'])
                bbox_match = bbox_re.search(area['title'])

                # Skip empty blocks and blocks without a usable position
                if not block_text.strip() or bbox_match is None:
                    continue

                # Normalize coordinates to PDF space
                x0, y0, x1, y1 = (
                    float(value) / self.ocr_dpi * 72 for value in bbox_match.groups()
                )

                area_conf = conf_re.search(area['title'])
                if area_conf:
                    confidence = float(area_conf.group(1))
                elif area['confs']:
                    confidence = sum(area['confs']) / len(area['confs'])
                else:
                    confidence = 50.0

                # Create block with position information
                blocks.append({
                    'id': f"ocr-block-{block_idx}",
                    'type': 'text',
                    'text': block_text,
                    'bbox': [x0, y0, x1, y1],
                    'confidence': confidence / 100.0,
                    'position': self._classify_vertical_position(
                        (y0 + y1) / 2, page_height
                    ),
                })
            return blocks

        except Exception as e:
            logger.error(f"Error parsing HOCR: {e}")
            # Fallback to simple text without blocks
            blocks = []
            try:
                from bs4 import BeautifulSoup
                soup = BeautifulSoup(hocr_text, 'html.parser')
                blocks.append(
                    self._whole_page_block(soup.get_text(), page_width, page_height)
                )
            except Exception as e2:
                logger.error(f"Fallback parsing failed: {e2}")
            return blocks

    def _detect_tables(self, page, blocks):
        """Detect potential tables on the page based on text blocks.

        Blocks whose vertical centres lie within a small tolerance form a row;
        at least three vertically contiguous rows with two or more blocks each
        are reported as a table. This is a simplified heuristic, not a real
        table-detection model.

        Side effect: every block that takes part in the grouping gets a
        ``'y_group'`` key (its row id).

        Args:
            page: PyMuPDF page object (currently unused).
            blocks: List of block dicts for the page.

        Returns:
            List of table dicts (see ``_add_table_from_blocks``).
        """
        from collections import Counter

        tables = []

        # Skip blocks with very low confidence
        valid_blocks = [b for b in blocks if b.get('confidence', 0) >= self.ocr_threshold]

        # Group blocks by similar y-positions that might represent table rows
        y_tolerance = 10  # same unit as the block bboxes
        row_centers = {}  # row id -> vertical centre of the row's first block
        grouped = []
        for block in valid_blocks:
            # Skip footer blocks for table detection unless they have high
            # confidence; this prevents footer text from being treated as a
            # table.
            if block.get('position') == 'footer' and block.get('confidence', 0) < 0.7:
                continue

            y_mid = (block['bbox'][1] + block['bbox'][3]) / 2

            # Find or create a group for this y-position
            group_id = next(
                (gid for gid, y_val in row_centers.items()
                 if abs(y_val - y_mid) < y_tolerance),
                None,
            )
            if group_id is None:
                group_id = len(row_centers)
                row_centers[group_id] = y_mid

            # Add extra attribute to the block for grouping
            block['y_group'] = group_id
            grouped.append(block)

        # A table row needs 2+ cells; a table needs at least 3 such rows.
        row_sizes = Counter(block['y_group'] for block in grouped)
        table_rows = {gid for gid, count in row_sizes.items() if count >= 2}
        if len(table_rows) < 3:
            return tables

        # Sort by y position to find contiguous rows
        table_blocks = sorted(
            (b for b in grouped if b['y_group'] in table_rows),
            key=lambda b: b['bbox'][1],
        )

        def flush(candidate):
            # Only keep candidates that span at least three distinct rows.
            if len({b['y_group'] for b in candidate}) >= 3:
                self._add_table_from_blocks(tables, candidate)

        max_gap = 20  # Max vertical gap between rows of the same table
        current_table = []
        current_y = None
        for block in table_blocks:
            y_mid = (block['bbox'][1] + block['bbox'][3]) / 2
            if current_y is not None and abs(y_mid - current_y) > max_gap:
                # Gap too large: close the current candidate, start a new one
                flush(current_table)
                current_table = []
            current_table.append(block)
            current_y = y_mid

        # Check if the last table has enough rows
        flush(current_table)
        return tables

    def _add_table_from_blocks(self, tables, blocks):
        """Create a table from a set of text blocks and append it to ``tables``.

        Args:
            tables: List that receives the new table dict (modified in place).
            blocks: Blocks forming the table; nothing happens when empty.
        """
        if not blocks:
            return

        # Count unique y-groups to estimate rows
        row_ids = {b.get('y_group') for b in blocks}

        tables.append({
            'type': 'table',
            'bbox': [
                min(b['bbox'][0] for b in blocks),
                min(b['bbox'][1] for b in blocks),
                max(b['bbox'][2] for b in blocks),
                max(b['bbox'][3] for b in blocks),
            ],
            'rows': len(row_ids),
            'cells': blocks,
        })

    def _extract_footer(self, page, blocks):
        """Extract footer information specifically.

        Args:
            page: PyMuPDF page object (currently unused).
            blocks: List of block dicts for the page.

        Returns:
            Dict with the joined footer ``text``, the enclosing ``bbox`` and
            the footer ``blocks``, or ``None`` when no block is marked as
            footer.
        """
        # Blocks are marked "footer" by _determine_block_position
        footer_blocks = [b for b in blocks if b.get('position') == "footer"]
        if not footer_blocks:
            return None

        return {
            # Combine text from all footer blocks
            'text': "\n".join(b['text'] for b in footer_blocks),
            # Smallest box enclosing every footer block
            'bbox': [
                min(b['bbox'][0] for b in footer_blocks),
                min(b['bbox'][1] for b in footer_blocks),
                max(b['bbox'][2] for b in footer_blocks),
                max(b['bbox'][3] for b in footer_blocks),
            ],
            'blocks': footer_blocks,
        }

    def _extract_metadata(self, pdf):
        """Extract metadata from the PDF document.

        Args:
            pdf: Object exposing a ``metadata`` mapping (may be ``None``).

        Returns:
            Dict with every truthy metadata value as a string; values that
            cannot be converted are replaced by ``"Unable to decode"``.
        """
        metadata = pdf.metadata or {}

        # Convert to standard dict and handle encoding issues
        result = {}
        for key, value in metadata.items():
            if not value:
                continue
            if isinstance(value, str):
                result[key] = value
                continue
            try:
                result[key] = str(value)
            except Exception:
                result[key] = "Unable to decode"

        return result
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
bases |
- | - |
Parameter Details
config: Optional dictionary containing configuration settings. Supported keys: 'tesseract_cmd' (path to tesseract executable, default: 'tesseract'), 'tesseract_langs' (OCR languages as '+' separated string, default: 'eng+fra+nld'), 'ocr_dpi' (DPI resolution for OCR rendering, default: 300), 'ocr_threshold' (minimum confidence threshold for OCR text blocks, default: 0.3), 'force_ocr' (boolean to force OCR even when native text exists, currently hardcoded to True in code). If None, uses default values for all settings.
Return Value
The class instantiation returns a DocumentProcessor object. The main 'process()' method returns a dictionary (Document object) with keys: 'path' (original file path), 'filename' (base filename), 'pages' (list of page dictionaries), 'text' (concatenated text from all pages), 'blocks' (list of all text blocks across pages), 'metadata' (PDF metadata dictionary). Each page dictionary contains: 'index', 'width', 'height', 'text', 'blocks', 'tables', and optionally 'footer'. Each block contains: 'id', 'type', 'text', 'bbox' (bounding box coordinates), 'confidence', and 'position' (header/body/footer).
Class Interface
Methods
__init__(self, config=None)
Purpose: Initialize the DocumentProcessor with optional configuration settings and verify Tesseract installation
Parameters:
config: Optional dictionary with keys: tesseract_cmd, tesseract_langs, ocr_dpi, ocr_threshold, force_ocr
Returns: None (constructor)
process(self, file_path) -> dict
Purpose: Main entry point to process a PDF file and extract structured text and layout information
Parameters:
file_path: String path to the PDF file to process
Returns: Dictionary (Document object) containing extracted text, pages, blocks, tables, and metadata
_process_pdf(self, pdf_path) -> dict
Purpose: Internal method to process entire PDF document, iterating through all pages
Parameters:
pdf_path: String path to the PDF file
Returns: Dictionary with document structure including all pages, text, blocks, and metadata
_process_page(self, page, page_idx) -> dict
Purpose: Process a single PDF page, extracting text, blocks, tables, and footer information
Parameters:
page: PyMuPDF page object
page_idx: Integer index of the page (0-based)
Returns: Dictionary with page data including index, dimensions, text, blocks, tables, and optional footer
_needs_ocr(self, text) -> bool
Purpose: Determine if OCR is needed based on the quality and quantity of extracted text
Parameters:
text: String of text extracted from PDF using native methods
Returns: Boolean indicating whether OCR should be performed (True if text is insufficient or malformed)
_ocr_page(self, page) -> tuple[str, list]
Purpose: Perform OCR on a page by rendering it to an image and using Tesseract with HOCR output
Parameters:
page: PyMuPDF page object to perform OCR on
Returns: Tuple of (extracted_text_string, list_of_block_dictionaries)
_extract_blocks(self, page) -> list
Purpose: Extract text blocks from PDF page using native PyMuPDF text extraction with layout information
Parameters:
page: PyMuPDF page object
Returns: List of block dictionaries with id, type, text, bbox, confidence, and position
_determine_block_position(self, bbox, page_rect) -> str
Purpose: Classify a text block as being in header, footer, or body based on vertical position
Parameters:
bbox: List of 4 coordinates [x0, y0, x1, y1] representing block bounding box
page_rect: PyMuPDF Rect object representing page dimensions
Returns: String: 'header' (top 20%), 'footer' (bottom 40%), or 'body' (middle 40%)
_parse_hocr(self, hocr_data, page_width, page_height) -> list
Purpose: Parse HOCR XML output from Tesseract to extract structured text blocks with positions and confidence
Parameters:
hocr_data: Binary HOCR data from Tesseract
page_width: Float width of the page in PDF units
page_height: Float height of the page in PDF units
Returns: List of block dictionaries with normalized coordinates and confidence scores
_detect_tables(self, page, blocks) -> list
Purpose: Detect potential tables on the page by analyzing text block alignment and grouping
Parameters:
page: PyMuPDF page object
blocks: List of text block dictionaries from the page
Returns: List of table dictionaries with type, bbox, rows count, and cells
_add_table_from_blocks(self, tables, blocks) -> None
Purpose: Create a table structure from a set of aligned text blocks and add it to the tables list
Parameters:
tables: List to append the new table dictionary to
blocks: List of text block dictionaries that form the table
Returns: None (modifies tables list in place)
_extract_footer(self, page, blocks) -> dict | None
Purpose: Extract footer information from blocks positioned in the bottom 40% of the page
Parameters:
page: PyMuPDF page object
blocks: List of text block dictionaries from the page
Returns: Dictionary with footer text, bbox, and blocks, or None if no footer found
_extract_metadata(self, pdf) -> dict
Purpose: Extract metadata from the PDF document (author, title, creation date, etc.)
Parameters:
pdf: PyMuPDF Document object
Returns: Dictionary of metadata key-value pairs with encoding issues handled
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
config |
dict | Configuration dictionary storing all settings passed during initialization | instance |
tesseract_cmd |
str | Path or command to invoke Tesseract OCR executable (default: 'tesseract') | instance |
tesseract_langs |
str | Plus-separated string of language codes for Tesseract OCR (default: 'eng+fra+nld') | instance |
ocr_dpi |
int | DPI resolution for rendering PDF pages to images for OCR (default: 300) | instance |
ocr_threshold |
float | Minimum confidence threshold (0.0-1.0) for including OCR text blocks (default: 0.3) | instance |
Dependencies
logging, os, tempfile, pathlib, fitz (PyMuPDF), pytesseract, PIL (Pillow), numpy, re, beautifulsoup4
Required Imports
import logging
import os
import tempfile
from pathlib import Path
import fitz
import pytesseract
from PIL import Image
import numpy as np
import re
from bs4 import BeautifulSoup
Conditional/Optional Imports
These imports are only needed under specific conditions:
from bs4 import BeautifulSoup
Condition: only used as fallback when HOCR parsing fails with regex errors
Required (conditional)
Usage Example
# Basic usage with default configuration
processor = DocumentProcessor()
document = processor.process('invoice.pdf')
print(f"Extracted text: {document['text']}")
print(f"Number of pages: {len(document['pages'])}")
print(f"Number of blocks: {len(document['blocks'])}")
# Advanced usage with custom configuration
config = {
'tesseract_cmd': '/usr/local/bin/tesseract',
'tesseract_langs': 'eng+deu+spa',
'ocr_dpi': 400,
'ocr_threshold': 0.5
}
processor = DocumentProcessor(config=config)
document = processor.process('scanned_invoice.pdf')
# Access page-level information
for page in document['pages']:
print(f"Page {page['index']}: {len(page['blocks'])} blocks, {len(page['tables'])} tables")
if 'footer' in page:
print(f"Footer text: {page['footer']['text']}")
# Access blocks with position information
for block in document['blocks']:
print(f"Block {block['id']}: position={block['position']}, confidence={block['confidence']}")
print(f"Text: {block['text'][:100]}...")
Best Practices
- Always check that Tesseract is properly installed before processing documents that may require OCR
- Handle FileNotFoundError and ValueError exceptions when calling process() method
- The class currently has force_ocr hardcoded to True in _process_page(), which means OCR will always be used regardless of native text availability
- For large PDFs, be aware of memory usage as pages are rendered at high DPI (300 default) for OCR
- The ocr_threshold parameter (default 0.3) filters out low-confidence text blocks - adjust based on document quality
- Footer detection uses bottom 40% of page (changed from 30% in code) - blocks in this area are marked with position='footer'
- Table detection requires at least 3 rows with 2+ cells each - simple heuristic that may need tuning for complex layouts
- The processor modifies page text by appending footer content with 'FOOTER:' prefix to ensure it's not lost
- HOCR parsing uses regex which may fail on malformed output - fallback to BeautifulSoup parsing is implemented
- Confidence scores are 1.0 for native PDF text and 0.0-1.0 for OCR text based on Tesseract confidence
- Block IDs follow format 'block-{page_number}-{block_index}' for native extraction or 'ocr-block-{index}' for OCR
- The class logs extensively - configure logging appropriately to capture warnings and errors
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
-
class DocumentProcessor_v2 75.3% similar
-
class DocumentProcessor_v1 74.8% similar
-
class DocumentProcessor 74.0% similar
-
function test_enhanced_pdf_processing 68.3% similar
-
class PDFTextExtractor 67.5% similar