function test_ocr_retry_logic
Tests the OCR retry logic for extracting contract end dates by first attempting normal text extraction, then falling back to OCR-based extraction if the end date is not found.
/tf/active/vicechatdev/contract_validity_analyzer/test_ocr_retry.py
34 - 146
complex
Purpose
This test function validates a two-stage document processing pipeline: it first processes a PDF document using normal text extraction and LLM analysis to find contract dates. If the end date is missing, it retries using OCR-based extraction to handle scanned or image-based PDFs. The function compares both methods, saves extracted text to files for inspection, and logs detailed results at each step. This is useful for testing document processing reliability and OCR fallback mechanisms in contract analysis systems.
Source Code
def _has_end_date(value):
    """Return True when *value* is a usable end date rather than a placeholder.

    Falsy values (None, '') count as missing. String values are stripped and
    compared case-insensitively, so textual placeholders such as 'null',
    'NULL' or 'None' are also treated as missing instead of being reported
    as a found date.
    """
    if isinstance(value, str):
        return value.strip().lower() not in ('', 'null', 'none')
    return bool(value)


def _log_banner(logger, title):
    """Log *title* framed by two 60-character '=' separator lines."""
    logger.info("=" * 60)
    logger.info(title)
    logger.info("=" * 60)


def _log_analysis(logger, label, analysis, end_date):
    """Log the fields of one LLM analysis result under the heading *label*."""
    logger.info(f"{label} analysis results:")
    logger.info(f" Contract Type: {analysis.get('contract_type', 'Unknown')}")
    logger.info(f" Start Date: {analysis.get('start_date', 'Not found')}")
    logger.info(f" End Date: {end_date}")
    logger.info(f" Analysis Notes: {analysis.get('analysis_notes', 'None')}")


def _save_text(path, text):
    """Write *text* to *path* as UTF-8, replacing any existing file."""
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)


def test_ocr_retry_logic():
    """Test the OCR retry logic when no end date is found.

    Runs normal text extraction and LLM analysis on LOCAL_DOCUMENT_PATH.
    If that analysis yields no end date, the document is re-processed with
    OCR and analysed again, provided the OCR text is longer than half the
    length of the normally extracted text.

    Side effects: writes 'normal_extracted_text.txt' and, when the OCR retry
    extraction succeeds, 'ocr_extracted_text.txt' to the current working
    directory.

    Returns:
        True if an end date was found by the normal analysis or by the OCR
        retry. False if the document is missing, an extraction step reports
        failure, the OCR text is not substantial, no end date is found, or
        any exception is raised along the way.
    """
    logger = setup_test_logging()
    logger.info("Starting test of OCR retry logic for end date extraction")
    try:
        # Check if local document exists
        if not os.path.exists(LOCAL_DOCUMENT_PATH):
            logger.error(f"Local document not found: {LOCAL_DOCUMENT_PATH}")
            return False

        # Load configuration and initialize components
        config = Config()
        doc_processor = DocumentProcessor(config.get_section('document_processing'))
        llm_client = LLMClient(config.get_section('llm'))
        logger.info(f"Testing OCR retry logic with document: {LOCAL_DOCUMENT_PATH}")

        # Step 1: Normal extraction
        _log_banner(logger, "STEP 1: Normal text extraction")
        normal_result = doc_processor.process_document(LOCAL_DOCUMENT_PATH)
        if not normal_result.get('success'):
            logger.error(f"Normal extraction failed: {normal_result.get('error')}")
            return False
        normal_text = normal_result.get('text', '')
        logger.info(f"Normal extraction: {len(normal_text)} characters")

        # Save normal extracted text for comparison
        _save_text('normal_extracted_text.txt', normal_text)
        logger.info("Normal extracted text saved to normal_extracted_text.txt")

        # Step 2: Normal LLM analysis
        _log_banner(logger, "STEP 2: Normal LLM analysis")
        normal_analysis = llm_client.analyze_contract(normal_text, "test_document.pdf")
        normal_end_date = normal_analysis.get('end_date')
        _log_analysis(logger, "Normal", normal_analysis, normal_end_date)

        if _has_end_date(normal_end_date):
            logger.info("✓ Normal extraction already found end date, no retry needed")
            logger.info(f"End date found: {normal_end_date}")
            return True

        # Step 3: Simulate OCR retry since no end date was found
        _log_banner(logger, "STEP 3: OCR retry (no end date found)")
        ocr_result = doc_processor.process_document_with_ocr(LOCAL_DOCUMENT_PATH)
        if not ocr_result.get('success'):
            logger.warning(f"OCR extraction failed: {ocr_result.get('error')}")
            return False
        ocr_text = ocr_result.get('text', '')
        logger.info(f"OCR extraction: {len(ocr_text)} characters")

        # Save OCR extracted text for comparison
        _save_text('ocr_extracted_text.txt', ocr_text)
        logger.info("OCR extracted text saved to ocr_extracted_text.txt")

        # Only spend a second LLM call when OCR produced more than half as
        # much text as the normal extraction did.
        if len(ocr_text) <= len(normal_text) * 0.5:
            logger.info("OCR did not provide substantial additional text")
            return False
        logger.info(f"OCR found substantial text ({len(ocr_text)} vs {len(normal_text)} chars)")

        # Step 4: OCR LLM analysis
        _log_banner(logger, "STEP 4: OCR LLM analysis")
        ocr_analysis = llm_client.analyze_contract(ocr_text, "test_document.pdf")
        ocr_end_date = ocr_analysis.get('end_date')
        _log_analysis(logger, "OCR", ocr_analysis, ocr_end_date)

        # Step 5: Compare results
        _log_banner(logger, "STEP 5: Results comparison")
        if not _has_end_date(ocr_end_date):
            logger.info("✗ OCR retry did not find end date either")
            return False
        logger.info("✓ SUCCESS: OCR retry found an end date!")
        logger.info(f" Normal method: {normal_end_date or 'None'}")
        logger.info(f" OCR method: {ocr_end_date}")
        return True
    except Exception as e:
        # Top-level boundary of the test: report failure via the return
        # value, but keep the traceback in the log for diagnosis.
        logger.exception(f"Test failed with error: {e}")
        return False
Return Value
Returns a boolean value. True if an end date is found, either by the normal extraction and analysis (in which case no OCR retry is performed) or by the OCR retry. False if the document is not found, if normal or OCR extraction fails, if the OCR text is not longer than half the length of the normally extracted text, if neither method finds an end date, or if an exception occurs during testing.
Dependencies
os, sys, json, pathlib, logging
Required Imports
import os
import sys
import json
from pathlib import Path
from config.config import Config
from utils.document_processor import DocumentProcessor
from utils.llm_client import LLMClient
import logging
Usage Example
# Ensure prerequisites are set up: test_ocr_retry_logic() reads the
# module-level LOCAL_DOCUMENT_PATH and calls setup_test_logging().
# Placeholder path -- replace with the document to analyze.
LOCAL_DOCUMENT_PATH = '/path/to/contract.pdf'
def setup_test_logging():
    """Configure root logging at INFO level and return this module's logger."""
    logging.basicConfig(level=logging.INFO)
    module_logger = logging.getLogger(__name__)
    return module_logger
# Run the test
result = test_ocr_retry_logic()
if result:
    print('Test passed: End date extraction successful')
else:
    print('Test failed: Could not extract end date')

# Check generated files for comparison. The files are written as UTF-8, so
# read them back with the same encoding. Either file may be absent:
# normal_extracted_text.txt is only written once normal extraction succeeds,
# and ocr_extracted_text.txt only when the OCR retry ran. A file left over
# from an earlier run will still be picked up here.
try:
    with open('normal_extracted_text.txt', 'r', encoding='utf-8') as f:
        normal_text = f.read()
except FileNotFoundError:
    normal_text = None

try:
    with open('ocr_extracted_text.txt', 'r', encoding='utf-8') as f:
        ocr_text = f.read()
except FileNotFoundError:
    ocr_text = None
Best Practices
- Ensure LOCAL_DOCUMENT_PATH points to a valid PDF document before running the test
- Review the generated text files (normal_extracted_text.txt and ocr_extracted_text.txt) to understand extraction differences
- Monitor the detailed logging output to understand each step of the extraction and analysis process
- The function performs file I/O operations - ensure write permissions in the working directory
- This is an integration test that requires external services (LLM API) to be available and properly configured
- The test may take significant time to complete due to OCR processing and LLM API calls
- Consider the cost implications of LLM API calls when running this test repeatedly
- The function expects specific configuration sections ('document_processing' and 'llm') in the Config object
- OCR retry is only triggered when the normal extraction fails to find an end date
- The function creates files in the current working directory - clean up these files after testing if needed
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- function test_single_document (74.5% similar)
- function test_local_document (74.0% similar)
- function test_ocr_fallback (73.1% similar)
- function test_end_date_extraction (72.3% similar)
- function test_simulated_document (67.4% similar)