class DocumentProcessor_v3
Handles document processing and text extraction using llmsherpa (same approach as offline_docstore_multi_vice.py).
/tf/active/vicechatdev/docchat/document_processor.py
44 - 611
moderate
Purpose
Handles document processing and text extraction using llmsherpa (same approach as offline_docstore_multi_vice.py).
Source Code
class DocumentProcessor:
    """Handles document processing and text extraction using llmsherpa (same approach as offline_docstore_multi_vice.py)."""

    def __init__(self, config: Dict[str, Any]):
        """
        Initialize document processor.

        Args:
            config: Configuration dictionary with processing settings.
                Recognized keys (all optional): 'supported_extensions'
                (default ['.pdf', '.doc', '.docx']), 'max_file_size_mb'
                (default 50), 'text_extraction_timeout' (default 300).
        """
        self.config = config
        self.supported_extensions = config.get('supported_extensions', ['.pdf', '.doc', '.docx'])
        self.max_file_size_mb = config.get('max_file_size_mb', 50)
        self.timeout = config.get('text_extraction_timeout', 300)

        # llmsherpa API endpoint (same as offline_docstore_multi_vice.py)
        self.llmsherpa_api_url = "http://llmsherpa:5001/api/parseDocument?renderFormat=all&useNewIndentParser=yes"

        # Scratch directory for intermediate files (e.g. Word -> PDF conversions).
        # Fixed: tempfile.mkdtemp() already creates the directory, so the
        # redundant os.makedirs(...) call that followed it has been removed.
        self.temp_dir = Path(tempfile.mkdtemp())

        # Probe optional dependencies once up front so callers can branch cheaply.
        self.llmsherpa_available = self._check_llmsherpa_support()
        self.word_available = self._check_word_support()
def _check_llmsherpa_support(self) -> bool:
    """Report whether the llmsherpa package can be imported."""
    try:
        from llmsherpa.readers import LayoutPDFReader  # noqa: F401
        import llmsherpa.readers.layout_reader  # noqa: F401
    except ImportError:
        logger.warning("llmsherpa not found. Install with: pip install llmsherpa")
        return False
    return True
def _check_word_support(self) -> bool:
    """Report whether python-docx is importable for Word document handling."""
    try:
        import docx  # noqa: F401
    except ImportError:
        logger.warning("python-docx not found. Word document processing unavailable.")
        return False
    return True
def extract_text(self, file_path: str) -> Optional[str]:
    """
    Extract text from a document file using llmsherpa.

    Args:
        file_path: Path to the document file

    Returns:
        Extracted text as string, or None if failed
    """
    # Guard: the file must exist before any processing is attempted.
    if not os.path.exists(file_path):
        logger.error(f"File not found: {file_path}")
        return None

    # Guard: reject files above the configured size limit up front.
    size_mb = os.path.getsize(file_path) / (1024 * 1024)
    if size_mb > self.max_file_size_mb:
        logger.warning(f"File too large ({size_mb:.1f} MB): {file_path}")
        return None

    # Dispatch on the lowercased extension.
    extension = os.path.splitext(file_path)[1].lower()
    try:
        if extension == '.pdf':
            return self._process_pdf_document(file_path)
        if extension in ('.doc', '.docx'):
            return self._process_word_document(file_path)
        logger.warning(f"Unsupported file format: {extension}")
        return None
    except Exception as e:
        logger.error(f"Error extracting text from {file_path}: {e}")
        return None
def _convert_to_pdf(self, input_file):
    """Convert a document to PDF using LibreOffice with unoconv fallback (same as offline_docstore_multi_vice.py)

    Args:
        input_file: Path to the source document (typically .doc/.docx).

    Returns:
        Path to the generated PDF inside self.temp_dir, or None on failure.
    """
    input_path = Path(input_file)
    output_pdf = self.temp_dir / f"{input_path.stem}.pdf"
    # First attempt: Use LibreOffice directly
    try:
        # First check if the input file exists
        if not input_path.exists():
            logger.error(f"Input file does not exist: {input_path}")
            return None
        # Make sure the temp directory exists
        os.makedirs(self.temp_dir, exist_ok=True)
        # Absolute paths to avoid directory issues when LibreOffice changes cwd
        abs_input = input_path.absolute()
        abs_output_dir = self.temp_dir.absolute()
        logger.info(f"Converting {abs_input} to PDF in {abs_output_dir}")
        # Use LibreOffice for conversion with expanded command
        cmd = [
            'libreoffice',
            '--headless',
            '--norestore',
            '--nofirststartwizard',
            '--convert-to', 'pdf',
            '--outdir', str(abs_output_dir),
            str(abs_input)
        ]
        # Run with increased timeout
        process = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=180  # 3 minute timeout
        )
        # Check if there was an error
        if process.returncode != 0:
            logger.error(f"LibreOffice error: {process.stderr}")
            # Don't return None here, try unoconv instead
        else:
            # Verify the file was actually created
            if not output_pdf.exists():
                # Sometimes LibreOffice creates files with slightly different names
                # Try to find a matching PDF
                potential_pdfs = list(self.temp_dir.glob(f"{input_path.stem}*.pdf"))
                if potential_pdfs:
                    output_pdf = potential_pdfs[0]
                    logger.info(f"Found PDF with alternative name: {output_pdf}")
                    return output_pdf
                # If no file was found, continue to unoconv
            else:
                logger.info(f"Successfully converted to PDF with LibreOffice: {output_pdf}")
                return output_pdf
        # If we get here, LibreOffice failed or didn't create the file
        logger.info(f"Trying alternative conversion with unoconv for {input_path}")
        # Second attempt: Use unoconv (distinct output name avoids clobbering)
        try:
            alt_output_pdf = self.temp_dir / f"{input_path.stem}_unoconv.pdf"
            # Run unoconv with timeout
            unoconv_cmd = [
                'unoconv',
                '-f', 'pdf',
                '-o', str(alt_output_pdf),
                str(abs_input)
            ]
            unoconv_process = subprocess.run(
                unoconv_cmd,
                capture_output=True,
                text=True,
                timeout=180  # 3 minute timeout
            )
            if unoconv_process.returncode != 0:
                logger.error(f"Unoconv error: {unoconv_process.stderr}")
                return None
            else:
                logger.info(f"Successfully converted to PDF with unoconv: {alt_output_pdf}")
                return alt_output_pdf
        except subprocess.TimeoutExpired:
            logger.error(f"Unoconv timeout for {input_path}")
            return None
        except Exception as e:
            logger.error(f"Unoconv failed for {input_path}: {str(e)}")
            return None
    except subprocess.TimeoutExpired:
        logger.error(f"LibreOffice timeout for {input_path}")
        return None
    except Exception as e:
        logger.error(f"Conversion failed for {input_path}: {str(e)}")
        return None
def _process_word_document(self, file_path):
    """Convert a Word file to PDF, then run it through the PDF pipeline (same as offline_docstore_multi_vice.py)."""
    logger.info(f"Processing Word document: {file_path}")
    pdf_path = self._convert_to_pdf(file_path)
    # Conversion failure propagates as None; otherwise reuse the PDF path.
    return self._process_pdf_document(pdf_path) if pdf_path else None
def _process_pdf_document(self, file_path):
    """Process PDF documents using llmsherpa (same as offline_docstore_multi_vice.py)

    Tries llmsherpa first; falls back to OCR when llmsherpa is unavailable,
    fails, or yields empty/poor-quality text. Returns extracted text or None.
    """
    logger.info(f"Processing PDF: {file_path}")
    if not self.llmsherpa_available:
        logger.error("llmsherpa not available")
        # If llmsherpa is not available, try OCR directly
        if OCR_AVAILABLE:
            logger.info("Attempting OCR as primary method since llmsherpa unavailable")
            ocr_text = self._extract_text_with_ocr(file_path)
            if ocr_text:
                return ocr_text
        return None
    full_text = None
    llmsherpa_failed = False
    try:
        from llmsherpa.readers import LayoutPDFReader
        import llmsherpa.readers.layout_reader
        logger.debug(f"Processing PDF with llmsherpa: {file_path}")
        # Use same approach as offline_docstore_multi_vice.py
        pdf_reader = LayoutPDFReader(self.llmsherpa_api_url)
        # Paragraphs are accumulated until they exceed this many characters.
        min_chunk_len = 4000
        try:
            doc = pdf_reader.read_pdf(str(file_path))
        except Exception as e:
            logger.error(f"Error processing PDF {file_path} with llmsherpa: {str(e)}")
            llmsherpa_failed = True
        if not llmsherpa_failed:
            try:
                # Extract text chunks using the exact same logic as offline_docstore_multi_vice.py
                text_chunks = []
                table_chunks = []
                text_chunk_interim = ""
                for d in doc.chunks():
                    # Adding a minimum chunk length (same as reference)
                    if isinstance(d, llmsherpa.readers.layout_reader.Paragraph):
                        # Strip hyphenation artifacts and hard line breaks.
                        clean_text = d.to_text().replace("- ","").replace("\n","")
                        text_chunk_interim = clean_text if text_chunk_interim == "" else text_chunk_interim + "\n" + clean_text
                        if len(text_chunk_interim) > min_chunk_len:
                            text_chunks.append(text_chunk_interim)
                            text_chunk_interim = ""
                    if isinstance(d, llmsherpa.readers.layout_reader.Table):
                        table_chunks.append(d.to_text())
                # Add any remaining text
                if text_chunk_interim != "":
                    text_chunks.append(text_chunk_interim)
                # Combine all chunks into one text (paragraph chunks first, then tables)
                all_text = []
                all_text.extend(text_chunks)
                all_text.extend(table_chunks)
                full_text = "\n\n".join(all_text)
                logger.debug(f"Extracted {len(full_text) if full_text else 0} characters from PDF using llmsherpa")
            except Exception as e:
                logger.error(f"Error extracting text from llmsherpa document: {str(e)}")
                llmsherpa_failed = True
    except Exception as e:
        logger.error(f"Error initializing llmsherpa PDF reader: {e}")
        llmsherpa_failed = True
    # Determine if we should use OCR
    use_ocr = False
    if llmsherpa_failed:
        logger.info("llmsherpa processing failed, attempting OCR as fallback")
        use_ocr = True
    elif full_text and self._should_use_ocr(full_text):
        logger.info("Text extraction quality is poor, attempting OCR fallback")
        use_ocr = True
    elif not full_text or not full_text.strip():
        logger.info("No text extracted by llmsherpa, attempting OCR as fallback")
        use_ocr = True
    # Try OCR if needed
    if use_ocr and OCR_AVAILABLE:
        ocr_text = self._extract_text_with_ocr(file_path)
        if ocr_text:
            # Prefer OCR output only when it recovered more text than llmsherpa.
            if not full_text or len(ocr_text) > len(full_text):
                logger.info("OCR produced better results, using OCR text")
                return ocr_text
            else:
                logger.info("llmsherpa text was better than OCR, using llmsherpa text")
    elif use_ocr and not OCR_AVAILABLE:
        logger.warning("OCR fallback needed but OCR libraries not available")
    return full_text if full_text and full_text.strip() else None
def process_document(self, file_path: str, filename: str = None) -> Dict[str, Any]:
    """
    Process a document and extract metadata.

    Args:
        file_path: Path to the document file
        filename: Original filename (if different from file_path)

    Returns:
        Dictionary with processing results: keys 'filename', 'file_path',
        'success', 'text', 'error', 'metadata'.
    """
    filename = filename or os.path.basename(file_path)
    result = {
        'filename': filename,
        'file_path': file_path,
        'success': False,
        'text': None,
        'error': None,
        'metadata': {}
    }
    try:
        # Collect basic file stats before attempting extraction.
        if os.path.exists(file_path):
            stat = os.stat(file_path)
            result['metadata'] = {
                'size_bytes': stat.st_size,
                'size_mb': stat.st_size / (1024 * 1024),
                'modified_time': stat.st_mtime,
                'extension': os.path.splitext(filename)[1].lower()
            }
        # Extract text
        text = self.extract_text(file_path)
        if text:
            # Check for Adobe Reader redirect message (common in secure document systems)
            if self._is_adobe_reader_redirect(text):
                result['error'] = "Document contains Adobe Reader redirect - likely points to external secure document system (e.g., Ansarada)"
                result['metadata']['document_type'] = 'redirect_placeholder'
                # Fixed: log the actual filename instead of the literal "(unknown)".
                logger.warning(f"Document {filename} contains Adobe Reader redirect message")
            else:
                result['text'] = text
                result['success'] = True
                result['metadata']['text_length'] = len(text)
                result['metadata']['word_count'] = len(text.split())
                result['metadata']['document_type'] = 'content_document'
        else:
            result['error'] = "Failed to extract text from document"
    except Exception as e:
        result['error'] = f"Document processing failed: {str(e)}"
        # Fixed: log the actual filename instead of the literal "(unknown)".
        logger.error(f"Error processing document {filename}: {e}")
    return result
def process_document_with_ocr(self, file_path: str, filename: str = None) -> Dict[str, Any]:
    """
    Process a document using OCR extraction (forced OCR mode).

    Args:
        file_path: Path to the document file
        filename: Original filename (if different from file_path)

    Returns:
        Dictionary with processing results ('success' plus 'text'/'metadata'
        on success or 'error' on failure). Non-PDF files fall back to
        process_document().
    """
    filename = filename or os.path.basename(file_path)
    try:
        _, ext = os.path.splitext(file_path)
        ext = ext.lower()
        if ext != '.pdf':
            # OCR forcing only applies to PDFs; use the regular pipeline otherwise.
            return self.process_document(file_path, filename)
        if not OCR_AVAILABLE:
            return {
                'success': False,
                'error': 'OCR libraries not available'
            }
        # Fixed: log the actual filename instead of the literal "(unknown)".
        logger.info(f"Forcing OCR extraction for {filename}")
        ocr_text = self._extract_text_with_ocr(file_path)
        if not ocr_text:
            return {
                'success': False,
                'error': 'OCR extraction failed'
            }
        return {
            'success': True,
            'text': ocr_text,
            'metadata': {
                'extraction_method': 'OCR',
                'file_extension': ext,
                'file_size_bytes': os.path.getsize(file_path),
                'text_length': len(ocr_text)
            }
        }
    except Exception as e:
        # Fixed: log the actual filename instead of the literal "(unknown)".
        logger.error(f"Error in OCR processing for {filename}: {e}")
        return {
            'success': False,
            'error': f'OCR processing error: {str(e)}'
        }
def _is_adobe_reader_redirect(self, text: str) -> bool:
"""
Check if the extracted text contains Adobe Reader redirect messages.
Args:
text: Extracted text content
Returns:
True if text appears to be an Adobe Reader redirect
"""
if not text or len(text) < 50:
return False
# Common patterns in Adobe Reader redirect messages
adobe_patterns = [
"Adobe Reader needs to be installed",
"attempting to view this document in your browser",
"You need to open this secured document using Adobe Acrobat Reader",
"ansarada.com", # Common secure document platform
"Need a hand? Read our help and support guide",
"Don't have Adobe Reader? Download for free"
]
text_lower = text.lower()
# If text contains multiple Adobe Reader patterns, it's likely a redirect
pattern_count = sum(1 for pattern in adobe_patterns if pattern.lower() in text_lower)
# If text is short and contains Adobe Reader patterns, it's likely a redirect
if len(text) < 500 and pattern_count >= 2:
return True
# If text contains "Adobe Reader" and other specific patterns
if "adobe reader" in text_lower and ("browser" in text_lower or "ansarada" in text_lower):
return True
return False
def is_supported_format(self, filename: str) -> bool:
"""
Check if a file format is supported.
Args:
filename: Name of the file
Returns:
True if supported, False otherwise
"""
_, ext = os.path.splitext(filename)
return ext.lower() in self.supported_extensions
def cleanup_temp_file(self, file_path: str):
    """
    Clean up a temporary file.

    Args:
        file_path: Path to the temporary file
    """
    try:
        if not os.path.exists(file_path):
            return
        os.unlink(file_path)
        logger.debug(f"Cleaned up temporary file: {file_path}")
    except Exception as e:
        # Best-effort: a failed cleanup is logged, never raised.
        logger.warning(f"Failed to clean up temporary file {file_path}: {e}")
def __del__(self):
"""Clean up temp directory on destruction."""
try:
import shutil
if hasattr(self, 'temp_dir') and self.temp_dir.exists():
shutil.rmtree(self.temp_dir)
except Exception:
pass # Ignore cleanup errors
def _extract_text_with_ocr(self, file_path: str) -> Optional[str]:
    """
    Extract text from PDF using OCR when standard methods fail.
    Uses both pytesseract and easyocr for better results.
    """
    if not OCR_AVAILABLE:
        logger.warning("OCR libraries not available for fallback text extraction")
        return None

    logger.info(f"Attempting OCR text extraction for: {file_path}")
    try:
        # Rasterize the first 10 pages at 300 dpi for the OCR engines.
        images = convert_from_path(file_path, dpi=300, first_page=1, last_page=10)
        logger.debug(f"Converted PDF to {len(images)} images for OCR")

        page_texts = []
        for page_no, img in enumerate(images, start=1):
            logger.debug(f"Processing page {page_no} with OCR")

            # Preferred engine: pytesseract (usually faster), when installed.
            if TESSERACT_AVAILABLE:
                try:
                    tess_text = pytesseract.image_to_string(img, lang='eng', config='--psm 6')
                    if tess_text and len(tess_text.strip()) > 50:
                        page_texts.append(f"=== Page {page_no} ===\n{tess_text}")
                        continue
                except Exception as e:
                    logger.debug(f"Pytesseract failed on page {page_no}: {e}")

            # Fallback engine: easyocr (works without a tesseract binary).
            try:
                import numpy as np
                pixels = np.array(img)
                # Lazily build and cache one reader per processor instance.
                if not hasattr(self, '_easyocr_reader'):
                    logger.debug("Initializing EasyOCR reader (CPU mode)")
                    self._easyocr_reader = easyocr.Reader(['en'], gpu=False, verbose=False)
                detections = self._easyocr_reader.readtext(pixels)
                # Keep only detections with confidence above 0.5.
                easy_text = '\n'.join(d[1] for d in detections if d[2] > 0.5)
                if easy_text and len(easy_text.strip()) > 50:
                    page_texts.append(f"=== Page {page_no} ===\n{easy_text}")
                else:
                    logger.debug(f"Insufficient text extracted from page {page_no}")
            except Exception as e:
                logger.debug(f"EasyOCR failed on page {page_no}: {e}")

        if not page_texts:
            logger.warning("OCR failed to extract meaningful text from any pages")
            return None

        combined = '\n\n'.join(page_texts)
        logger.info(f"OCR extracted {len(combined)} characters from {len(page_texts)} pages")
        return combined
    except Exception as e:
        logger.error(f"OCR processing failed for {file_path}: {e}")
        return None
def _should_use_ocr(self, extracted_text: str) -> bool:
"""
Determine if OCR should be used based on the quality of extracted text.
"""
if not extracted_text or len(extracted_text.strip()) < 100:
return True
# Check for signs of poor text extraction (lots of garbled characters)
garbled_chars = sum(1 for c in extracted_text if ord(c) > 127 and c not in 'àáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ')
garbled_ratio = garbled_chars / len(extracted_text) if extracted_text else 0
if garbled_ratio > 0.1: # More than 10% garbled characters
logger.debug(f"High garbled character ratio ({garbled_ratio:.2%}), considering OCR")
return True
# Check for very short lines (sign of poor extraction)
lines = extracted_text.split('\n')
short_lines = sum(1 for line in lines if len(line.strip()) < 10 and line.strip())
if len(lines) > 10 and short_lines / len(lines) > 0.5:
logger.debug("Many short lines detected, considering OCR")
return True
return False
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| config | Dict[str, Any] | required | positional |
Parameter Details
config: Configuration dictionary with processing settings. Recognized keys: supported_extensions (default ['.pdf', '.doc', '.docx']), max_file_size_mb (default 50), text_extraction_timeout (default 300).
Return Value
Instantiation returns a configured DocumentProcessor instance.
Class Interface
Methods
__init__(self, config)
Purpose: Initialize document processor. Args: config: Configuration dictionary with processing settings
Parameters:
config: Type: Dict[str, Any]
Returns: None
_check_llmsherpa_support(self) -> bool
Purpose: Check if llmsherpa is available.
Returns: Returns bool
_check_word_support(self) -> bool
Purpose: Check if Word document processing libraries are available.
Returns: Returns bool
extract_text(self, file_path) -> Optional[str]
Purpose: Extract text from a document file using llmsherpa. Args: file_path: Path to the document file Returns: Extracted text as string, or None if failed
Parameters:
file_path: Type: str
Returns: Returns Optional[str]
_convert_to_pdf(self, input_file)
Purpose: Convert a document to PDF using LibreOffice with unoconv fallback (same as offline_docstore_multi_vice.py)
Parameters:
input_file: Parameter
Returns: None
_process_word_document(self, file_path)
Purpose: Process Word documents by converting to PDF and using llmsherpa (same as offline_docstore_multi_vice.py)
Parameters:
file_path: Parameter
Returns: None
_process_pdf_document(self, file_path)
Purpose: Process PDF documents using llmsherpa (same as offline_docstore_multi_vice.py)
Parameters:
file_path: Parameter
Returns: None
process_document(self, file_path, filename) -> Dict[str, Any]
Purpose: Process a document and extract metadata. Args: file_path: Path to the document file filename: Original filename (if different from file_path) Returns: Dictionary with processing results
Parameters:
file_path: Type: strfilename: Type: str
Returns: Returns Dict[str, Any]
process_document_with_ocr(self, file_path, filename) -> Dict[str, Any]
Purpose: Process a document using OCR extraction (forced OCR mode). Args: file_path: Path to the document file filename: Original filename (if different from file_path) Returns: Dictionary with processing results
Parameters:
file_path: Type: strfilename: Type: str
Returns: Returns Dict[str, Any]
_is_adobe_reader_redirect(self, text) -> bool
Purpose: Check if the extracted text contains Adobe Reader redirect messages. Args: text: Extracted text content Returns: True if text appears to be an Adobe Reader redirect
Parameters:
text: Type: str
Returns: Returns bool
is_supported_format(self, filename) -> bool
Purpose: Check if a file format is supported. Args: filename: Name of the file Returns: True if supported, False otherwise
Parameters:
filename: Type: str
Returns: Returns bool
cleanup_temp_file(self, file_path)
Purpose: Clean up a temporary file. Args: file_path: Path to the temporary file
Parameters:
file_path: Type: str
Returns: None
__del__(self)
Purpose: Clean up temp directory on destruction.
Returns: None
_extract_text_with_ocr(self, file_path) -> Optional[str]
Purpose: Extract text from PDF using OCR when standard methods fail. Uses both pytesseract and easyocr for better results.
Parameters:
file_path: Type: str
Returns: Returns Optional[str]
_should_use_ocr(self, extracted_text) -> bool
Purpose: Determine if OCR should be used based on the quality of extracted text.
Parameters:
extracted_text: Type: str
Returns: Returns bool
Required Imports
import logging
import os
import tempfile
import subprocess
from pathlib import Path
from typing import Any, Dict, Optional
Usage Example
# Example usage:
# processor = DocumentProcessor(config)
# result = processor.process_document("/path/to/document.pdf")
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
-
class DocumentProcessor_v1 84.3% similar
-
class DocumentProcessor_v2 84.0% similar
-
class TestDocumentProcessor 70.8% similar
-
function test_document_processor 68.2% similar
-
class DocumentProcessor_v7 64.1% similar