class DocumentProcessor_v1
A document processing class that extracts text from PDF and Word documents, using llmsherpa as the primary PDF method, PyPDF2 and pdfplumber as fallbacks, and python-docx for Word files.
File: /tf/active/vicechatdev/contract_validity_analyzer/utils/document_processor_new.py
Lines: 13-302
Complexity: complex
Purpose
DocumentProcessor handles document text extraction from various file formats (PDF, DOC, DOCX) with intelligent fallback mechanisms. It primarily uses llmsherpa for PDF processing (via an API endpoint), with automatic fallback to PyPDF2 or pdfplumber if llmsherpa fails. For Word documents, it uses python-docx. The class manages file validation, size limits, format detection, and provides comprehensive metadata about processed documents. It's designed for robust document ingestion pipelines where text extraction reliability is critical.
Source Code
class DocumentProcessor:
    """Handles document processing and text extraction using llmsherpa."""

    def __init__(self, config: Dict[str, Any]):
        """
        Initialize document processor.

        Args:
            config: Configuration dictionary with processing settings
        """
        self.config = config
        self.supported_extensions = config.get('supported_extensions', ['.pdf', '.doc', '.docx'])
        self.max_file_size_mb = config.get('max_file_size_mb', 50)
        self.timeout = config.get('text_extraction_timeout', 300)

        # llmsherpa API endpoint (same as offline_docstore_multi_vice.py)
        self.llmsherpa_api_url = "http://llmsherpa:5001/api/parseDocument?renderFormat=all&useNewIndentParser=yes"

        # Check for required libraries
        self.llmsherpa_available = self._check_llmsherpa_support()
        self.fallback_pdf_available = self._check_fallback_pdf_support()
        self.word_available = self._check_word_support()

    def _check_llmsherpa_support(self) -> bool:
        """Check if llmsherpa is available."""
        try:
            from llmsherpa.readers import LayoutPDFReader
            return True
        except ImportError:
            logger.warning("llmsherpa not found. Install with: pip install llmsherpa")
            return False

    def _check_fallback_pdf_support(self) -> bool:
        """Check if fallback PDF processing libraries are available."""
        try:
            import PyPDF2
            return True
        except ImportError:
            try:
                import pdfplumber
                return True
            except ImportError:
                logger.warning("No fallback PDF processing library found.")
                return False

    def _check_word_support(self) -> bool:
        """Check if Word document processing libraries are available."""
        try:
            import docx
            return True
        except ImportError:
            logger.warning("python-docx not found. Word document processing unavailable.")
            return False

    def extract_text(self, file_path: str) -> Optional[str]:
        """
        Extract text from a document file using llmsherpa.

        Args:
            file_path: Path to the document file

        Returns:
            Extracted text as string, or None if failed
        """
        if not os.path.exists(file_path):
            logger.error(f"File not found: {file_path}")
            return None

        # Check file size
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
        if file_size_mb > self.max_file_size_mb:
            logger.warning(f"File too large ({file_size_mb:.1f} MB): {file_path}")
            return None

        # Get file extension
        _, ext = os.path.splitext(file_path)
        ext = ext.lower()

        try:
            if ext == '.pdf':
                return self._extract_pdf_text_llmsherpa(file_path)
            elif ext in ['.doc', '.docx']:
                return self._extract_word_text(file_path)
            else:
                logger.warning(f"Unsupported file format: {ext}")
                return None
        except Exception as e:
            logger.error(f"Error extracting text from {file_path}: {e}")
            return None

    def _extract_pdf_text_llmsherpa(self, file_path: str) -> Optional[str]:
        """Extract text from PDF file using llmsherpa (same as offline_docstore_multi_vice.py)."""
        if not self.llmsherpa_available:
            logger.warning("llmsherpa not available, trying fallback method")
            return self._extract_pdf_text_fallback(file_path)

        try:
            from llmsherpa.readers import LayoutPDFReader

            logger.debug(f"Processing PDF with llmsherpa: {file_path}")

            # Use same approach as offline_docstore_multi_vice.py
            pdf_reader = LayoutPDFReader(self.llmsherpa_api_url)
            doc = pdf_reader.read_pdf(str(file_path))

            # Extract text chunks and combine them
            text_chunks = []
            min_chunk_len = 4000  # Same as reference code
            text_chunk_interim = ""

            for chunk in doc.chunks():
                # Handle paragraph chunks (same logic as reference)
                if hasattr(chunk, 'to_text'):
                    clean_text = chunk.to_text().replace("- ", "").replace("\n", "")
                    text_chunk_interim = clean_text if text_chunk_interim == "" else text_chunk_interim + "\n" + clean_text
                    if len(text_chunk_interim) > min_chunk_len:
                        text_chunks.append(text_chunk_interim)
                        text_chunk_interim = ""

            # Add any remaining text
            if text_chunk_interim:
                text_chunks.append(text_chunk_interim)

            # Combine all chunks into one text
            full_text = "\n\n".join(text_chunks)

            logger.debug(f"Extracted {len(full_text)} characters from PDF using llmsherpa")
            return full_text if full_text.strip() else None
        except Exception as e:
            logger.error(f"Error processing PDF with llmsherpa: {e}")
            logger.info("Trying fallback PDF extraction method")
            return self._extract_pdf_text_fallback(file_path)

    def _extract_pdf_text_fallback(self, file_path: str) -> Optional[str]:
        """Fallback PDF text extraction using basic libraries."""
        if not self.fallback_pdf_available:
            logger.error("No PDF processing libraries available")
            return None

        text = ""

        # Try pdfplumber first (better for complex layouts)
        try:
            import pdfplumber
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text + "\n"
            if text.strip():
                logger.debug(f"Extracted {len(text)} characters using pdfplumber")
                return text.strip()
        except ImportError:
            pass
        except Exception as e:
            logger.warning(f"pdfplumber extraction failed: {e}")

        # Fallback to PyPDF2
        try:
            import PyPDF2
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page in pdf_reader.pages:
                    text += page.extract_text() + "\n"
            if text.strip():
                logger.debug(f"Extracted {len(text)} characters using PyPDF2")
                return text.strip()
        except ImportError:
            logger.error("No PDF processing library available")
        except Exception as e:
            logger.error(f"PyPDF2 extraction failed: {e}")

        return None

    def _extract_word_text(self, file_path: str) -> Optional[str]:
        """Extract text from Word document."""
        if not self.word_available:
            logger.error("Word document processing not available")
            return None

        try:
            import docx
            doc = docx.Document(file_path)

            text_parts = []

            # Extract text from paragraphs
            for paragraph in doc.paragraphs:
                if paragraph.text.strip():
                    text_parts.append(paragraph.text.strip())

            # Extract text from tables
            for table in doc.tables:
                for row in table.rows:
                    for cell in row.cells:
                        if cell.text.strip():
                            text_parts.append(cell.text.strip())

            return "\n".join(text_parts) if text_parts else None
        except Exception as e:
            logger.error(f"Error extracting Word text from {file_path}: {e}")
            return None

    def process_document(self, file_path: str, filename: str = None) -> Dict[str, Any]:
        """
        Process a document and extract metadata.

        Args:
            file_path: Path to the document file
            filename: Original filename (if different from file_path)

        Returns:
            Dictionary with processing results
        """
        filename = filename or os.path.basename(file_path)

        result = {
            'filename': filename,
            'file_path': file_path,
            'success': False,
            'text': None,
            'error': None,
            'metadata': {}
        }

        try:
            # Extract basic file information
            if os.path.exists(file_path):
                stat = os.stat(file_path)
                result['metadata'] = {
                    'size_bytes': stat.st_size,
                    'size_mb': stat.st_size / (1024 * 1024),
                    'modified_time': stat.st_mtime,
                    'extension': os.path.splitext(filename)[1].lower()
                }

            # Extract text
            text = self.extract_text(file_path)
            if text:
                result['text'] = text
                result['success'] = True
                result['metadata']['text_length'] = len(text)
                result['metadata']['word_count'] = len(text.split())
            else:
                result['error'] = "Failed to extract text from document"
        except Exception as e:
            result['error'] = f"Document processing failed: {str(e)}"
            logger.error(f"Error processing document {filename}: {e}")

        return result

    def is_supported_format(self, filename: str) -> bool:
        """
        Check if a file format is supported.

        Args:
            filename: Name of the file

        Returns:
            True if supported, False otherwise
        """
        _, ext = os.path.splitext(filename)
        return ext.lower() in self.supported_extensions

    def cleanup_temp_file(self, file_path: str):
        """
        Clean up a temporary file.

        Args:
            file_path: Path to the temporary file
        """
        try:
            if os.path.exists(file_path):
                os.unlink(file_path)
                logger.debug(f"Cleaned up temporary file: {file_path}")
        except Exception as e:
            logger.warning(f"Failed to clean up temporary file {file_path}: {e}")
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| bases | - | - | - |
Parameter Details
config: A dictionary containing configuration settings for document processing. Expected keys include: 'supported_extensions' (list of file extensions like ['.pdf', '.doc', '.docx'], defaults to these three), 'max_file_size_mb' (maximum file size in megabytes, defaults to 50), and 'text_extraction_timeout' (timeout in seconds for extraction operations, defaults to 300). The config dictionary controls processing behavior and limits.
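For illustration, a config built from only the documented keys might look like the sketch below (values are arbitrary examples, not recommendations):
# Illustrative configuration using only the keys read in __init__
config = {
    'supported_extensions': ['.pdf', '.docx'],  # subset of the defaults
    'max_file_size_mb': 25,                     # reject files larger than 25 MB
    'text_extraction_timeout': 300,             # stored on self.timeout
}
processor = DocumentProcessor(config)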
Return Value
The constructor returns a DocumentProcessor instance. Key methods return: extract_text() returns Optional[str] (extracted text or None on failure), process_document() returns Dict[str, Any] with keys 'filename', 'file_path', 'success' (bool), 'text' (Optional[str]), 'error' (Optional[str]), and 'metadata' (dict with size_bytes, size_mb, modified_time, extension, text_length, word_count), is_supported_format() returns bool indicating format support, and cleanup_temp_file() returns None.
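A minimal sketch of consuming the process_document() result, assuming `processor` is an instance constructed as above and the path is a placeholder:
result = processor.process_document('/path/to/contract.pdf')
if result['success']:
    meta = result['metadata']
    print(f"{result['filename']}: {meta['word_count']} words, {meta['size_mb']:.2f} MB")
else:
    # 'error' is populated whenever 'success' is False
    print(f"Failed: {result['error']}")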
Class Interface
Methods
__init__(self, config: Dict[str, Any])
Purpose: Initialize the DocumentProcessor with configuration settings and check for available processing libraries
Parameters:
config: Configuration dictionary with keys: supported_extensions (list), max_file_size_mb (int), text_extraction_timeout (int)
Returns: None - initializes instance attributes
_check_llmsherpa_support(self) -> bool
Purpose: Check if llmsherpa library is available for PDF processing
Returns: True if llmsherpa can be imported, False otherwise
_check_fallback_pdf_support(self) -> bool
Purpose: Check if fallback PDF processing libraries (PyPDF2 or pdfplumber) are available
Returns: True if at least one fallback library is available, False otherwise
_check_word_support(self) -> bool
Purpose: Check if python-docx library is available for Word document processing
Returns: True if python-docx can be imported, False otherwise
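All three checks follow the same import-probe pattern; the standalone sketch below (hypothetical helper name `library_available`) shows that pattern in isolation:
import importlib
import logging

logger = logging.getLogger(__name__)

def library_available(module_name: str) -> bool:
    """Hypothetical helper mirroring the _check_* probes."""
    try:
        importlib.import_module(module_name)
        return True
    except ImportError:
        logger.warning(f"{module_name} not found; related features are disabled.")
        return False

# e.g. library_available('pdfplumber'), library_available('docx')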
extract_text(self, file_path: str) -> Optional[str]
Purpose: Extract text content from a document file using appropriate extraction method based on file type
Parameters:
file_path: Path to the document file to process
Returns: Extracted text as a string if successful, None if extraction fails or file is unsupported
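Because every failure mode collapses to None, callers are expected to branch on the return value; a minimal sketch with a placeholder path:
text = processor.extract_text('/path/to/report.pdf')
if text is None:
    # Covers missing file, oversize file, unsupported extension, or extraction errors
    print("Extraction failed; check the log for the reason")
else:
    print(f"Extracted {len(text)} characters")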
_extract_pdf_text_llmsherpa(self, file_path: str) -> Optional[str]
Purpose: Extract text from PDF using llmsherpa API with chunking logic (primary PDF extraction method)
Parameters:
file_path: Path to the PDF file
Returns: Extracted text with chunks combined, or None if extraction fails (falls back to _extract_pdf_text_fallback)
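The llmsherpa calls this method relies on (LayoutPDFReader, read_pdf(), chunks(), to_text()) can also be exercised outside the class; a minimal sketch using the same hardcoded endpoint, without the 4000-character batching, with a placeholder PDF path:
from llmsherpa.readers import LayoutPDFReader

api_url = "http://llmsherpa:5001/api/parseDocument?renderFormat=all&useNewIndentParser=yes"
reader = LayoutPDFReader(api_url)
doc = reader.read_pdf("/path/to/contract.pdf")

# Join chunk text the same way the method does, minus chunk batching
full_text = "\n\n".join(
    chunk.to_text() for chunk in doc.chunks() if hasattr(chunk, "to_text")
)
print(f"{len(full_text)} characters extracted")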
_extract_pdf_text_fallback(self, file_path: str) -> Optional[str]
Purpose: Fallback PDF text extraction using pdfplumber (preferred) or PyPDF2
Parameters:
file_path: Path to the PDF file
Returns: Extracted text as string, or None if all fallback methods fail
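The same two-stage fallback can be reproduced standalone; this sketch mirrors the method's order (pdfplumber first, then PyPDF2) using only the library calls that appear in the source:
def extract_pdf_fallback(path: str):
    """Sketch of the fallback order: pdfplumber, then PyPDF2."""
    try:
        import pdfplumber
        with pdfplumber.open(path) as pdf:
            text = "\n".join(page.extract_text() or "" for page in pdf.pages)
        if text.strip():
            return text.strip()
    except Exception:
        pass  # fall through to PyPDF2

    try:
        import PyPDF2
        with open(path, "rb") as fh:
            reader = PyPDF2.PdfReader(fh)
            text = "\n".join(page.extract_text() or "" for page in reader.pages)
        return text.strip() or None
    except Exception:
        return None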
_extract_word_text(self, file_path: str) -> Optional[str]
Purpose: Extract text from Word documents (.doc, .docx) including paragraphs and tables
Parameters:
file_path: Path to the Word document file
Returns: Extracted text with paragraphs and table content, or None if extraction fails
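A standalone sketch of the same paragraph-plus-table walk with python-docx (hypothetical helper name):
import docx

def extract_docx_text(path: str):
    """Sketch mirroring the method: paragraphs first, then table cells."""
    document = docx.Document(path)
    parts = [p.text.strip() for p in document.paragraphs if p.text.strip()]
    for table in document.tables:
        for row in table.rows:
            parts.extend(cell.text.strip() for cell in row.cells if cell.text.strip())
    return "\n".join(parts) or None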
process_document(self, file_path: str, filename: str = None) -> Dict[str, Any]
Purpose: Process a document and extract both text and comprehensive metadata
Parameters:
file_path: Path to the document file
filename: Optional original filename if different from the file_path basename
Returns: Dictionary with keys: filename, file_path, success (bool), text (Optional[str]), error (Optional[str]), metadata (dict with size_bytes, size_mb, modified_time, extension, text_length, word_count)
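For a batch ingestion pass, the result dictionaries can be collected and split on the 'success' flag; a sketch assuming `processor` exists and '/data/contracts' is a placeholder directory:
from pathlib import Path

reports = []
for path in Path('/data/contracts').iterdir():
    if processor.is_supported_format(path.name):
        reports.append(processor.process_document(str(path), path.name))

succeeded = [r for r in reports if r['success']]
failed = [r for r in reports if not r['success']]
print(f"{len(succeeded)} extracted, {len(failed)} failed")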
is_supported_format(self, filename: str) -> bool
Purpose: Check if a file format is supported based on its extension
Parameters:
filename: Name or path of the file to check
Returns: True if the file extension is in supported_extensions list, False otherwise
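A quick sketch of pre-filtering filenames before doing any extraction work, assuming the default extension list:
uploads = ['scan.pdf', 'notes.txt', 'agreement.docx', 'photo.png']
accepted = [name for name in uploads if processor.is_supported_format(name)]
# With the default extensions this yields ['scan.pdf', 'agreement.docx']
print(accepted)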
cleanup_temp_file(self, file_path: str)
Purpose: Delete a temporary file from the filesystem
Parameters:
file_path: Path to the temporary file to delete
Returns: None - logs warnings if cleanup fails
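A typical pairing is writing an upload to a temporary file, processing it, and always cleaning up; a sketch where `uploaded_bytes` is assumed to hold the raw upload:
import tempfile

with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp:
    tmp.write(uploaded_bytes)  # uploaded_bytes: assumed to exist in the caller
    tmp_path = tmp.name

try:
    result = processor.process_document(tmp_path, 'upload.pdf')
finally:
    processor.cleanup_temp_file(tmp_path)  # remove the temp file even on failure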
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| config | Dict[str, Any] | Configuration dictionary passed during initialization containing processing settings | instance |
| supported_extensions | List[str] | List of supported file extensions (e.g., ['.pdf', '.doc', '.docx']), extracted from config or defaults | instance |
| max_file_size_mb | int | Maximum allowed file size in megabytes, extracted from config or defaults to 50 | instance |
| timeout | int | Timeout in seconds for text extraction operations, extracted from config or defaults to 300 | instance |
| llmsherpa_api_url | str | URL endpoint for the llmsherpa API service, hardcoded to 'http://llmsherpa:5001/api/parseDocument?renderFormat=all&useNewIndentParser=yes' | instance |
| llmsherpa_available | bool | Flag indicating whether the llmsherpa library is available, set during initialization | instance |
| fallback_pdf_available | bool | Flag indicating whether fallback PDF libraries (PyPDF2 or pdfplumber) are available, set during initialization | instance |
| word_available | bool | Flag indicating whether python-docx is available for Word document processing, set during initialization | instance |
Dependencies
llmsherpa, PyPDF2, pdfplumber, python-docx, logging, os, tempfile, typing, pathlib
Required Imports
import logging
import os
import tempfile
from typing import Optional, Dict, Any
from pathlib import Path
Conditional/Optional Imports
These imports are only needed under specific conditions:
from llmsherpa.readers import LayoutPDFReader
Condition: Required for primary PDF processing via the llmsherpa API. Checked at runtime via _check_llmsherpa_support(). Optional.
import PyPDF2
Condition: Used as fallback for PDF processing when llmsherpa fails or is unavailable. Checked via _check_fallback_pdf_support(). Optional.
import pdfplumber
Condition: Preferred fallback for PDF processing (better for complex layouts). Checked via _check_fallback_pdf_support(). Optional.
import docx
Condition: Required for Word document (.doc, .docx) processing. Checked via _check_word_support(). Optional.
Usage Example
# Basic usage
config = {
    'supported_extensions': ['.pdf', '.doc', '.docx'],
    'max_file_size_mb': 50,
    'text_extraction_timeout': 300
}
processor = DocumentProcessor(config)

# Check if format is supported
if processor.is_supported_format('document.pdf'):
    # Extract text only
    text = processor.extract_text('/path/to/document.pdf')
    if text:
        print(f"Extracted {len(text)} characters")

# Process document with full metadata
result = processor.process_document('/path/to/document.pdf', 'document.pdf')
if result['success']:
    print(f"Text: {result['text'][:100]}...")
    print(f"Metadata: {result['metadata']}")
else:
    print(f"Error: {result['error']}")

# Clean up temporary file if needed
processor.cleanup_temp_file('/tmp/temp_document.pdf')
Best Practices
- Always check is_supported_format() before attempting to process a file to avoid unnecessary processing attempts
- The class automatically handles fallback mechanisms, so no manual intervention is needed if primary extraction fails
- Use process_document() instead of extract_text() when you need comprehensive metadata along with text
- The llmsherpa API endpoint must be accessible at initialization time; ensure the service is running before creating instances
- File size limits are enforced automatically; configure max_file_size_mb appropriately for your use case
- The class checks for library availability at initialization and sets flags (llmsherpa_available, fallback_pdf_available, word_available); these determine available functionality (see the sketch after this list)
- For PDF processing, llmsherpa is attempted first, then pdfplumber, then PyPDF2 as fallbacks
- Text extraction may return None on failure - always check return values before using extracted text
- Use cleanup_temp_file() to remove temporary files after processing to avoid disk space issues
- The class is stateless after initialization - safe to reuse for multiple documents
- Logger warnings indicate missing optional dependencies; install them for full functionality
- For production use, ensure at least one PDF library and python-docx are installed to support all advertised formats
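A sketch of honouring the availability flags up front rather than discovering missing libraries mid-pipeline (assumes `config` and `logger` are already defined):
processor = DocumentProcessor(config)

if not (processor.llmsherpa_available or processor.fallback_pdf_available):
    raise RuntimeError("No PDF extraction backend installed")
if not processor.word_available:
    logger.warning("python-docx missing; .doc/.docx files cannot be processed")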
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- class DocumentProcessor_v2 (99.2% similar)
- class DocumentProcessor_v3 (84.3% similar)
- class TestDocumentProcessor (76.5% similar)
- function test_document_processor (76.4% similar)
- class DocumentProcessor (72.0% similar)