function test_enhanced_pdf_processing
A comprehensive test function that validates PDF processing capabilities, including text extraction, cleaning, chunking, and table detection across multiple PDF processing libraries.
/tf/active/vicechatdev/vice_ai/test_enhanced_pdf.py
14 - 117
moderate
Purpose
This function serves as a diagnostic and testing tool for the DocumentProcessor class. It reports the availability of the optional PDF processing libraries (llmsherpa, PyPDF2, pymupdf, pytesseract, pdf2image), exercises the text cleaning and chunking functionality, and, if any PDF files exist in the directory that contains the test script, processes the first one found. It prints detailed console output showing which capabilities are available and the results of each processing step, making it useful for debugging and verifying a PDF processing setup.
Source Code
def test_enhanced_pdf_processing():
    """Run a console diagnostic of the enhanced PDF processing pipeline.

    The steps run in this order:

    1. Report which optional PDF backends ``document_processor`` flags as
       available.
    2. Run ``DocumentProcessor._clean_pdf_text`` on a short messy sample.
    3. Run ``DocumentProcessor.chunk_text`` on a 2,500-character string.
    4. Process the first ``*.pdf`` returned by a glob of the directory that
       contains this file, if there is one.
    5. List up to three of the most recently modified ``*.json`` files in the
       sibling ``extracted`` directory, if it exists.

    All results are printed; nothing is asserted and nothing is returned.
    """
    # NOTE(review): several messages below begin with a bare "π", and the
    # cleaning sample contains a bare "ο¬". These are emoji/ligatures whose
    # original bytes were lost to a bad decode; they are left untouched rather
    # than guessed. Confirm against the real source file.
    print("🧪 Testing Enhanced PDF Processing")
    print("=" * 50)

    # Initialize document processor
    processor = DocumentProcessor()

    # Check available capabilities
    print("\nπ Available PDF Processing Methods:")

    # Imported here, as in the original, so the flags are read at call time.
    from document_processor import (
        LLMSHERPA_AVAILABLE,
        PDF2IMAGE_AVAILABLE,
        PYMUPDF_AVAILABLE,
        PYPDF2_AVAILABLE,
        PYTESSERACT_AVAILABLE,
    )

    # (availability flag, human-readable label), in display order.
    backends = (
        (LLMSHERPA_AVAILABLE, "llmsherpa (advanced layout parsing)"),
        (PYPDF2_AVAILABLE, "PyPDF2 (basic text extraction)"),
        (PYMUPDF_AVAILABLE, "pymupdf (enhanced PDF + table extraction)"),
        (PYTESSERACT_AVAILABLE, "pytesseract (OCR capabilities)"),
        (PDF2IMAGE_AVAILABLE, "pdf2image (PDF to image conversion)"),
    )
    capabilities = [
        f"{'✅' if available else '❌'} {label}" for available, label in backends
    ]
    for capability in capabilities:
        print(f" {capability}")

    # Test text cleaning function
    print("\n🧽 Testing Text Cleaning:")
    # "\ufb01" is the "fi" ligature, a typical PDF text-extraction artifact.
    sample_messy_text = (
        "This is a test\n\n\n\nwith excessive spaces\nand\n"
        "fragmented text \ufb01le with ο¬ ligatures."
    )
    cleaned = processor._clean_pdf_text(sample_messy_text)
    print(f" Original: {repr(sample_messy_text)}")
    print(f" Cleaned: {repr(cleaned)}")

    # Test chunking
    print("\nπ Testing Text Chunking:")
    long_text = "This is a test document. " * 100
    chunks = processor.chunk_text(long_text, max_chunk_size=200, overlap=50)
    print(f" Original length: {len(long_text)} characters")
    print(f" Number of chunks: {len(chunks)}")
    print(f" First chunk length: {len(chunks[0]) if chunks else 0}")

    # Check for existing PDF files to test with. The search is relative to
    # this file's location, not the process's working directory.
    print("\nπ Looking for PDF files to test...")
    current_dir = Path(__file__).parent
    pdf_files = list(current_dir.glob("*.pdf"))

    if pdf_files:
        test_file = pdf_files[0]
        print(f" Found test file: {test_file.name}")

        # Test the enhanced processing
        print(f"\nπ Testing enhanced PDF processing on: {test_file.name}")
        try:
            result = processor.process_document(test_file)

            print("π Processing Results:")
            print(f" Text chunks: {len(result.get('text_chunks', []))}")
            print(f" Tables: {len(result.get('tables', []))}")
            print(f" Has error: {result.get('error', 'No')}")

            if result.get('text_chunks'):
                total_length = sum(len(chunk) for chunk in result['text_chunks'])
                print(f" Total text length: {total_length} characters")
                print(f" First chunk preview: {result['text_chunks'][0][:100]}...")

            if result.get('tables'):
                # assumes table entries are sliceable (e.g. strings) — TODO confirm
                print(f" First table preview: {result['tables'][0][:200]}...")
        except Exception as e:
            # Diagnostic boundary: report the failure and carry on so the
            # remaining checks still run.
            print(f" ❌ Error testing PDF: {e}")
    else:
        print(" No PDF files found in current directory for testing")

    # Check extracted directory
    extracted_dir = current_dir / "extracted"
    if extracted_dir.exists():
        debug_files = list(extracted_dir.glob("*.json"))
        print(f"\nπ Debug logs available: {len(debug_files)}")
        if debug_files:
            print(" Recent debug files:")
            # Oldest-to-newest by modification time; keep the last three.
            by_mtime = sorted(debug_files, key=lambda f: f.stat().st_mtime)
            for debug_file in by_mtime[-3:]:
                print(f" - {debug_file.name}")

    print("\n✅ Enhanced PDF processing test completed!")
Return Value
This function does not return any value (implicitly returns None). It performs testing operations and outputs results directly to the console via print statements, including capability checks, text processing results, and PDF processing outcomes.
Dependencies
logging, pathlib, document_processor
Required Imports
import logging
from pathlib import Path
from document_processor import DocumentProcessor
from document_processor import LLMSHERPA_AVAILABLE
from document_processor import PYPDF2_AVAILABLE
from document_processor import PYMUPDF_AVAILABLE
from document_processor import PYTESSERACT_AVAILABLE
from document_processor import PDF2IMAGE_AVAILABLE
Usage Example
# Run the test function directly
from pathlib import Path
from document_processor import DocumentProcessor
from document_processor import (
    LLMSHERPA_AVAILABLE, PYPDF2_AVAILABLE, PYMUPDF_AVAILABLE,
    PYTESSERACT_AVAILABLE, PDF2IMAGE_AVAILABLE
)
# The function under test is defined in test_enhanced_pdf.py and must be
# imported before it can be called.
from test_enhanced_pdf import test_enhanced_pdf_processing

# Execute the test
test_enhanced_pdf_processing()
# The function will:
# 1. Display available PDF processing capabilities
# 2. Test text cleaning with sample text
# 3. Test text chunking functionality
# 4. Search for PDF files in the directory containing test_enhanced_pdf.py
# 5. Process the first PDF file found (if any) and display results
# 6. Show debug log information if available
Best Practices
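- Place at least one sample PDF in the same directory as test_enhanced_pdf.py to get comprehensive test results; the function searches the script's own directory (not the current working directory) and processes only the first PDF it finds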
- Ensure the document_processor module is properly installed and configured before running
- Review console output to identify which PDF processing libraries are available and which are missing
- Check the 'extracted' directory after running to review debug logs and processing artifacts
- Use this function during initial setup to verify that all desired PDF processing capabilities are properly installed
- The function is designed for testing and debugging purposes, not for production use
- If specific capabilities show as unavailable, install the corresponding libraries (e.g., `pip install llmsherpa PyPDF2 pymupdf pytesseract pdf2image`)
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
-
function test_document_processor 81.2% similar
-
function test_document_processing 78.3% similar
-
function test_document_extractor 68.6% similar
-
class TestDocumentProcessor 68.2% similar
-
function test_extraction_debugging 66.8% similar