class DocumentProcessor_v2
A document processing class that extracts text from PDF and Word documents using llmsherpa as the primary method with fallback support for PyPDF2, pdfplumber, and python-docx.
/tf/active/vicechatdev/contract_validity_analyzer/utils/document_processor_old.py
13 - 290
complex
Purpose
DocumentProcessor handles document text extraction from various file formats (PDF, DOC, DOCX) with intelligent fallback mechanisms. It primarily uses llmsherpa for PDF processing (via an API endpoint), with automatic fallback to PyPDF2 or pdfplumber if llmsherpa fails. For Word documents, it uses python-docx. The class manages file validation, size limits, format detection, and provides comprehensive metadata about processed documents. It's designed for robust document ingestion pipelines where text extraction reliability is critical.
Source Code
class DocumentProcessor:
    """Handles document processing and text extraction using llmsherpa.

    PDFs are processed primarily through the llmsherpa API endpoint; if
    llmsherpa is unavailable or fails, extraction falls back to pdfplumber
    and then PyPDF2. Word documents (.doc/.docx) are processed with
    python-docx. Library availability is probed once at construction time.
    """

    def __init__(self, config: Dict[str, Any]):
        """
        Initialize document processor.

        Args:
            config: Configuration dictionary with processing settings.
                Recognized keys: 'supported_extensions' (list, default
                ['.pdf', '.doc', '.docx']), 'max_file_size_mb' (default 50),
                'text_extraction_timeout' (seconds, default 300).
        """
        self.config = config
        self.supported_extensions = config.get('supported_extensions', ['.pdf', '.doc', '.docx'])
        self.max_file_size_mb = config.get('max_file_size_mb', 50)
        self.timeout = config.get('text_extraction_timeout', 300)

        # llmsherpa API endpoint (same as offline_docstore_multi_vice.py)
        self.llmsherpa_api_url = "http://llmsherpa:5001/api/parseDocument?renderFormat=all&useNewIndentParser=yes"

        # Probe optional libraries once so extraction methods can pick a strategy.
        self.llmsherpa_available = self._check_llmsherpa_support()
        self.fallback_pdf_available = self._check_fallback_pdf_support()
        self.word_available = self._check_word_support()

    def _check_llmsherpa_support(self) -> bool:
        """Check if llmsherpa is available."""
        try:
            from llmsherpa.readers import LayoutPDFReader  # noqa: F401
            return True
        except ImportError:
            logger.warning("llmsherpa not found. Install with: pip install llmsherpa")
            return False

    def _check_fallback_pdf_support(self) -> bool:
        """Check if fallback PDF processing libraries are available.

        Returns:
            True if either PyPDF2 or pdfplumber can be imported.
        """
        try:
            import PyPDF2  # noqa: F401
            return True
        except ImportError:
            try:
                import pdfplumber  # noqa: F401
                return True
            except ImportError:
                logger.warning("No fallback PDF processing library found.")
                return False

    def _check_word_support(self) -> bool:
        """Check if Word document processing libraries are available."""
        try:
            import docx  # noqa: F401
            return True
        except ImportError:
            logger.warning("python-docx not found. Word document processing unavailable.")
            return False

    def extract_text(self, file_path: str) -> Optional[str]:
        """
        Extract text from a document file using llmsherpa.

        Args:
            file_path: Path to the document file

        Returns:
            Extracted text as string, or None if failed
        """
        if not os.path.exists(file_path):
            logger.error(f"File not found: {file_path}")
            return None

        # Enforce the configured size limit before attempting extraction.
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
        if file_size_mb > self.max_file_size_mb:
            logger.warning(f"File too large ({file_size_mb:.1f} MB): {file_path}")
            return None

        # Dispatch on file extension (case-insensitive).
        _, ext = os.path.splitext(file_path)
        ext = ext.lower()

        try:
            if ext == '.pdf':
                return self._extract_pdf_text_llmsherpa(file_path)
            elif ext in ['.doc', '.docx']:
                return self._extract_word_text(file_path)
            else:
                logger.warning(f"Unsupported file format: {ext}")
                return None
        except Exception as e:
            logger.error(f"Error extracting text from {file_path}: {e}")
            return None

    def _extract_pdf_text_llmsherpa(self, file_path: str) -> Optional[str]:
        """Extract text from PDF file using llmsherpa (same as offline_docstore_multi_vice.py)."""
        if not self.llmsherpa_available:
            logger.warning("llmsherpa not available, trying fallback method")
            return self._extract_pdf_text_fallback(file_path)

        try:
            from llmsherpa.readers import LayoutPDFReader

            logger.debug(f"Processing PDF with llmsherpa: {file_path}")
            # Use same approach as offline_docstore_multi_vice.py
            pdf_reader = LayoutPDFReader(self.llmsherpa_api_url)
            doc = pdf_reader.read_pdf(str(file_path))

            # Accumulate chunk text into segments of at least min_chunk_len
            # characters before committing them to text_chunks.
            text_chunks = []
            min_chunk_len = 4000  # Same as reference code
            text_chunk_interim = ""
            for chunk in doc.chunks():
                # Handle paragraph chunks (same logic as reference)
                if hasattr(chunk, 'to_text'):
                    clean_text = chunk.to_text().replace("- ","").replace("\n","")
                    text_chunk_interim = clean_text if text_chunk_interim == "" else text_chunk_interim + "\n" + clean_text
                    if len(text_chunk_interim) > min_chunk_len:
                        text_chunks.append(text_chunk_interim)
                        text_chunk_interim = ""

            # Flush any remaining text shorter than min_chunk_len.
            if text_chunk_interim:
                text_chunks.append(text_chunk_interim)

            # Combine all chunks into one text
            full_text = "\n\n".join(text_chunks)
            logger.debug(f"Extracted {len(full_text)} characters from PDF using llmsherpa")
            return full_text if full_text.strip() else None
        except Exception as e:
            logger.error(f"Error processing PDF with llmsherpa: {e}")
            logger.info("Trying fallback PDF extraction method")
            return self._extract_pdf_text_fallback(file_path)

    def _extract_pdf_text_fallback(self, file_path: str) -> Optional[str]:
        """Extract text from PDF using fallback methods (pdfplumber first, then PyPDF2).

        FIX: the original source was missing this method's `def` header and the
        `text` accumulator initialization, leaving the body orphaned inside
        `_extract_pdf_text_llmsherpa`; both are restored here to match the
        documented interface.

        Args:
            file_path: Path to the PDF file

        Returns:
            Extracted text as string if successful, None if all fallback
            methods fail.
        """
        text = ""

        # Try pdfplumber first.
        try:
            import pdfplumber
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text + "\n"
            if text.strip():
                return text.strip()
        except ImportError:
            pass
        except Exception as e:
            logger.debug(f"pdfplumber failed for {file_path}: {e}")

        # Fallback to PyPDF2. Reset the accumulator so pages extracted by a
        # partially-failed pdfplumber pass are not duplicated.
        text = ""
        try:
            import PyPDF2
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page in pdf_reader.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text + "\n"
            return text.strip() if text.strip() else None
        except ImportError:
            logger.error("No PDF processing library available")
            return None
        except Exception as e:
            logger.error(f"PyPDF2 failed for {file_path}: {e}")
            return None

    def _extract_word_text(self, file_path: str) -> Optional[str]:
        """Extract text from Word document (paragraphs and table cells)."""
        if not self.word_available:
            logger.error("Word document processing not available")
            return None

        try:
            import docx
            doc = docx.Document(file_path)
            text_parts = []

            # Extract text from paragraphs
            for paragraph in doc.paragraphs:
                if paragraph.text.strip():
                    text_parts.append(paragraph.text.strip())

            # Extract text from tables
            for table in doc.tables:
                for row in table.rows:
                    for cell in row.cells:
                        if cell.text.strip():
                            text_parts.append(cell.text.strip())

            return "\n".join(text_parts) if text_parts else None
        except Exception as e:
            logger.error(f"Error extracting Word text from {file_path}: {e}")
            return None

    def process_document(self, file_path: str, filename: str = None) -> Dict[str, Any]:
        """
        Process a document and extract metadata.

        Args:
            file_path: Path to the document file
            filename: Original filename (if different from file_path)

        Returns:
            Dictionary with keys: 'filename', 'file_path', 'success' (bool),
            'text' (str or None), 'error' (str or None), 'metadata' (dict).
        """
        filename = filename or os.path.basename(file_path)

        result = {
            'filename': filename,
            'file_path': file_path,
            'success': False,
            'text': None,
            'error': None,
            'metadata': {}
        }

        try:
            # Extract basic file information
            if os.path.exists(file_path):
                stat = os.stat(file_path)
                result['metadata'] = {
                    'size_bytes': stat.st_size,
                    'size_mb': stat.st_size / (1024 * 1024),
                    'modified_time': stat.st_mtime,
                    'extension': os.path.splitext(filename)[1].lower()
                }

            # Extract text
            text = self.extract_text(file_path)
            if text:
                result['text'] = text
                result['success'] = True
                result['metadata']['text_length'] = len(text)
                result['metadata']['word_count'] = len(text.split())
            else:
                result['error'] = "Failed to extract text from document"
        except Exception as e:
            result['error'] = f"Document processing failed: {str(e)}"
            # FIX: log the actual filename instead of the literal "(unknown)".
            logger.error(f"Error processing document {filename}: {e}")

        return result

    def is_supported_format(self, filename: str) -> bool:
        """
        Check if a file format is supported.

        Args:
            filename: Name of the file

        Returns:
            True if supported, False otherwise
        """
        _, ext = os.path.splitext(filename)
        return ext.lower() in self.supported_extensions

    def cleanup_temp_file(self, file_path: str):
        """
        Clean up a temporary file.

        Args:
            file_path: Path to the temporary file
        """
        try:
            if os.path.exists(file_path):
                os.unlink(file_path)
                logger.debug(f"Cleaned up temporary file: {file_path}")
        except Exception as e:
            logger.warning(f"Failed to clean up temporary file {file_path}: {e}")
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| bases | - | - | - |
Parameter Details
config: A dictionary containing configuration settings for document processing. Expected keys include: 'supported_extensions' (list of file extensions like ['.pdf', '.doc', '.docx'], defaults to these three), 'max_file_size_mb' (maximum file size in megabytes, defaults to 50), 'text_extraction_timeout' (timeout in seconds for extraction operations, defaults to 300). The config dictionary allows flexible configuration of processing behavior without modifying code.
Return Value
The constructor returns a DocumentProcessor instance. Key methods return: extract_text() returns Optional[str] (extracted text or None on failure), process_document() returns Dict[str, Any] with keys 'filename', 'file_path', 'success' (bool), 'text' (extracted text or None), 'error' (error message or None), and 'metadata' (dict with size_bytes, size_mb, modified_time, extension, text_length, word_count), is_supported_format() returns bool indicating if file format is supported, cleanup_temp_file() returns None.
Class Interface
Methods
__init__(self, config: Dict[str, Any])
Purpose: Initialize the DocumentProcessor with configuration settings and check for available processing libraries
Parameters:
config: Dictionary with keys: supported_extensions (list), max_file_size_mb (int), text_extraction_timeout (int)
Returns: None - initializes instance attributes
_check_llmsherpa_support(self) -> bool
Purpose: Check if llmsherpa library is available for PDF processing
Returns: True if llmsherpa can be imported, False otherwise
_check_fallback_pdf_support(self) -> bool
Purpose: Check if fallback PDF processing libraries (PyPDF2 or pdfplumber) are available
Returns: True if either PyPDF2 or pdfplumber can be imported, False otherwise
_check_word_support(self) -> bool
Purpose: Check if python-docx library is available for Word document processing
Returns: True if docx can be imported, False otherwise
extract_text(self, file_path: str) -> Optional[str]
Purpose: Extract text from a document file using appropriate method based on file type
Parameters:
file_path: Path to the document file to process
Returns: Extracted text as string if successful, None if extraction fails or file doesn't exist
_extract_pdf_text_llmsherpa(self, file_path: str) -> Optional[str]
Purpose: Extract text from PDF using llmsherpa API, with automatic fallback to other methods if it fails
Parameters:
file_path: Path to the PDF file
Returns: Extracted text as string if successful, None if all methods fail
_extract_pdf_text_fallback(self, file_path: str) -> Optional[str]
Purpose: Extract text from PDF using fallback methods (pdfplumber first, then PyPDF2)
Parameters:
file_path: Path to the PDF file
Returns: Extracted text as string if successful, None if all fallback methods fail
_extract_word_text(self, file_path: str) -> Optional[str]
Purpose: Extract text from Word documents (.doc, .docx) including paragraphs and tables
Parameters:
file_path: Path to the Word document file
Returns: Extracted text as string with paragraphs and table content, None if extraction fails
process_document(self, file_path: str, filename: str = None) -> Dict[str, Any]
Purpose: Process a document and extract both text and comprehensive metadata
Parameters:
file_path: Path to the document file
filename: Original filename if different from file_path basename (optional)
Returns: Dictionary with keys: filename, file_path, success (bool), text (str or None), error (str or None), metadata (dict with size_bytes, size_mb, modified_time, extension, text_length, word_count)
is_supported_format(self, filename: str) -> bool
Purpose: Check if a file format is supported based on its extension
Parameters:
filename: Name of the file to check
Returns: True if the file extension is in supported_extensions list, False otherwise
cleanup_temp_file(self, file_path: str)
Purpose: Delete a temporary file from the filesystem
Parameters:
file_path: Path to the temporary file to delete
Returns: None - logs warnings if cleanup fails
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
config |
Dict[str, Any] | Configuration dictionary passed during initialization | instance |
supported_extensions |
List[str] | List of supported file extensions (e.g., ['.pdf', '.doc', '.docx']) | instance |
max_file_size_mb |
int | Maximum allowed file size in megabytes (default 50) | instance |
timeout |
int | Timeout in seconds for text extraction operations (default 300) | instance |
llmsherpa_api_url |
str | URL endpoint for llmsherpa API service (http://llmsherpa:5001/api/parseDocument?renderFormat=all&useNewIndentParser=yes) | instance |
llmsherpa_available |
bool | Flag indicating if llmsherpa library is available for use | instance |
fallback_pdf_available |
bool | Flag indicating if fallback PDF libraries (PyPDF2 or pdfplumber) are available | instance |
word_available |
bool | Flag indicating if python-docx library is available for Word document processing | instance |
Dependencies
logging, os, tempfile, typing, pathlib, llmsherpa, PyPDF2, python-docx, pdfplumber
Required Imports
import logging
import os
import tempfile
from typing import Optional, Dict, Any
from pathlib import Path
Conditional/Optional Imports
These imports are only needed under specific conditions:
from llmsherpa.readers import LayoutPDFReader
Condition: only if llmsherpa is available and being used for PDF processing (primary method)
Optional
import PyPDF2
Condition: only if llmsherpa fails or is unavailable, used as fallback for PDF processing
Optional
import pdfplumber
Condition: only if llmsherpa fails or is unavailable, used as fallback for PDF processing (tried before PyPDF2)
Optional
import docx
Condition: only if processing Word documents (.doc, .docx files)
Optional
Usage Example
# Basic usage
config = {
'supported_extensions': ['.pdf', '.doc', '.docx'],
'max_file_size_mb': 50,
'text_extraction_timeout': 300
}
processor = DocumentProcessor(config)
# Check if file format is supported
if processor.is_supported_format('document.pdf'):
# Extract text only
text = processor.extract_text('/path/to/document.pdf')
if text:
print(f"Extracted {len(text)} characters")
# Process document with full metadata
result = processor.process_document('/path/to/document.pdf', 'document.pdf')
if result['success']:
print(f"Text: {result['text'][:100]}...")
print(f"Metadata: {result['metadata']}")
else:
print(f"Error: {result['error']}")
# Clean up temporary files
processor.cleanup_temp_file('/tmp/temp_document.pdf')
Best Practices
- Always check is_supported_format() before attempting to process a file to avoid unnecessary processing attempts
- The class automatically handles fallback mechanisms - llmsherpa is tried first for PDFs, then pdfplumber, then PyPDF2
- Use process_document() for comprehensive processing with metadata, or extract_text() for simple text extraction
- File size limits are enforced automatically based on max_file_size_mb configuration to prevent memory issues
- The class checks for library availability at initialization and sets flags (llmsherpa_available, fallback_pdf_available, word_available) to determine processing capabilities
- Always call cleanup_temp_file() after processing temporary files to prevent disk space issues
- The llmsherpa API endpoint must be accessible and running for primary PDF processing to work
- Error handling is built-in - methods return None or set error fields rather than raising exceptions in most cases
- For PDFs, llmsherpa chunks text into 4000+ character segments before combining, which may affect downstream processing
- Word document processing extracts both paragraph text and table content
- The class is stateless after initialization - you can reuse the same instance for multiple documents
- Check the 'success' field in process_document() results before using the extracted text
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
-
class DocumentProcessor_v1 99.2% similar
-
class DocumentProcessor_v3 84.0% similar
-
class TestDocumentProcessor 76.4% similar
-
function test_document_processor 76.3% similar
-
class DocumentProcessor 72.2% similar