function test_local_document
Integration test function that validates end date extraction from a local PDF document using document processing and LLM-based analysis.
/tf/active/vicechatdev/contract_validity_analyzer/test_local_document.py
34 - 164
complex
Purpose
This function performs a comprehensive end-to-end test of contract document processing capabilities. It loads a local PDF document, extracts text content, searches for contract term-related keywords, and uses an LLM to extract structured contract information including start dates, end dates, contract types, and third parties. The function logs detailed diagnostic information at each step and saves results to files for inspection, making it useful for debugging and validating the document processing pipeline.
Source Code
def test_local_document():
    """Test end date extraction for the local document.

    Extracts text from ``LOCAL_DOCUMENT_PATH``, logs which term- and
    duration-related keywords occur in it, then runs the two LLM steps
    (``_find_expiry_dates`` followed by ``_extract_complete_contract_info``).
    Diagnostics are written to ``extracted_text.txt`` and
    ``local_test_results.json`` in the current working directory.

    Returns:
        bool: True when step 2 reports a non-empty ``end_date`` other than
        the string ``'null'``. False when the document does not exist, the
        extracted text is shorter than 100 characters after stripping
        surrounding whitespace, no end date was reported, or any exception
        was raised (exceptions are logged with a traceback, not re-raised).
    """
    logger = setup_test_logging()
    logger.info("Starting test on local document for end date extraction")

    try:
        # Check if local document exists
        if not os.path.exists(LOCAL_DOCUMENT_PATH):
            logger.error(f"Local document not found: {LOCAL_DOCUMENT_PATH}")
            return False

        # Load configuration
        config = Config()

        # Initialize components
        doc_processor = DocumentProcessor(config.get_section('document_processing'))
        llm_client = LLMClient(config.get_section('llm'))

        logger.info(f"Testing document: {LOCAL_DOCUMENT_PATH}")

        # Extract text from document
        logger.info("Extracting text from document...")
        document_text = doc_processor.extract_text(LOCAL_DOCUMENT_PATH)

        if not document_text or len(document_text.strip()) < 100:
            logger.error("No meaningful text extracted from document")
            logger.info(f"Extracted text length: {len(document_text) if document_text else 0}")
            return False

        logger.info(f"Successfully extracted {len(document_text)} characters of text")

        # Save full extracted text to a separate file for inspection
        with open('extracted_text.txt', 'w', encoding='utf-8') as f:
            f.write("FULL EXTRACTED TEXT FROM PDF:\n")
            f.write("=" * 80 + "\n")
            f.write(document_text)
            f.write("\n" + "=" * 80 + "\n")

        logger.info("Full extracted text saved to 'extracted_text.txt'")

        # Show first 2000 characters in log for quick preview
        logger.info("=" * 80)
        logger.info("EXTRACTED TEXT PREVIEW (first 2000 chars):")
        logger.info("=" * 80)
        logger.info(document_text[:2000])
        if len(document_text) > 2000:
            logger.info("... (truncated, full text in extracted_text.txt)")
        logger.info("=" * 80)

        # Lower-case the document once; both keyword scans below are
        # case-insensitive and would otherwise copy the full text per keyword.
        lowered_text = document_text.lower()

        # Check if the key term clause is present
        term_keywords = [
            "Term:",
            "shall commence",
            "period of one (1) year",
            "thirty (30)",
            "five (5) years",
            "Agreement shall commence and be effective",
        ]
        found_keywords = []
        for keyword in term_keywords:
            if keyword.lower() in lowered_text:
                found_keywords.append(keyword)
                logger.info(f"Found keyword: '{keyword}' in extracted text")

        logger.info(f"Total found term keywords: {len(found_keywords)} out of {len(term_keywords)}")
        logger.info(f"Found keywords: {found_keywords}")

        # Also check for any text that might indicate duration or terms
        duration_patterns = ["year", "month", "term", "period", "expire", "terminate", "commence", "effective"]
        duration_matches = [
            pattern for pattern in duration_patterns
            if pattern.lower() in lowered_text
        ]
        logger.info(f"Duration-related words found: {duration_matches}")

        # Step 1: Test expiry date finding
        logger.info("STEP 1: Testing expiry date extraction...")
        step1_result = llm_client._find_expiry_dates(document_text, LOCAL_DOCUMENT_PATH)

        logger.info("STEP 1 RESULT:")
        logger.info("-" * 60)
        expiry_analysis = step1_result.get('expiry_analysis', 'No analysis available')
        logger.info(expiry_analysis)
        logger.info("-" * 60)

        # Step 2: Test complete contract analysis
        logger.info("STEP 2: Testing complete contract analysis...")
        step2_result = llm_client._extract_complete_contract_info(step1_result, document_text, LOCAL_DOCUMENT_PATH)

        logger.info("STEP 2 RESULT:")
        logger.info("-" * 60)
        if step2_result.get('error'):
            logger.error(f"Step 2 failed: {step2_result.get('error')}")
        else:
            logger.info(f"Contract Type: {step2_result.get('contract_type', 'Unknown')}")
            logger.info(f"Third Parties: {step2_result.get('third_parties', [])}")
            logger.info(f"Start Date: {step2_result.get('start_date', 'Not found')}")
            logger.info(f"End Date: {step2_result.get('end_date', 'Not found')}")
            logger.info(f"Is In Effect: {step2_result.get('is_in_effect', 'Unknown')}")
            logger.info(f"Confidence: {step2_result.get('confidence', 0.0)}")
            logger.info(f"Analysis Notes: {step2_result.get('analysis_notes', 'None')}")
        logger.info("-" * 60)

        # Save results to JSON for further analysis. default=str stringifies
        # any value in the step results that json cannot serialize natively.
        result_data = {
            'document_path': LOCAL_DOCUMENT_PATH,
            'text_length': len(document_text),
            'found_keywords': found_keywords,
            'step1_result': step1_result,
            'step2_result': step2_result
        }

        with open('local_test_results.json', 'w', encoding='utf-8') as f:
            json.dump(result_data, f, indent=2, default=str)

        # Check if end date was successfully extracted. The truthiness test
        # already rejects None and ''; the literal string 'null' is rejected
        # explicitly.
        end_date = step2_result.get('end_date')
        if end_date and end_date != 'null':
            logger.info("✓ SUCCESS: End date extracted successfully!")
            logger.info(f"End date found: {end_date}")
            return True

        logger.warning("✗ FAILED: End date not extracted")

        # If the term clause keywords were found but end date wasn't extracted,
        # this indicates a prompt improvement opportunity
        if len(found_keywords) >= 3:
            logger.warning("Term clause appears to be present but wasn't properly processed")
            logger.warning("This suggests the LLM prompts need improvement")

        return False

    except Exception as e:
        # Top-level boundary of the test: report the failure as False rather
        # than propagating, but keep the full traceback in the log.
        logger.error(f"Test failed with error: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return False
Return Value
Returns a boolean value: True if the end date was successfully extracted from the document (and is not null or empty), False if extraction failed, the document wasn't found, text extraction failed, or an exception occurred during processing.
Dependencies
os, sys, json, pathlib, logging, traceback
Required Imports
import os
import sys
import json
from pathlib import Path
from config.config import Config
from utils.document_processor import DocumentProcessor
from utils.llm_client import LLMClient
import logging
import traceback
Usage Example
# test_local_document() looks up LOCAL_DOCUMENT_PATH and setup_test_logging in
# the globals of the module that defines it, so they are set on that module;
# defining them only in this calling script would have no effect.
import logging

import test_module


def setup_test_logging():
    logging.basicConfig(level=logging.INFO)
    return logging.getLogger(__name__)


test_module.LOCAL_DOCUMENT_PATH = '/path/to/contract.pdf'
test_module.setup_test_logging = setup_test_logging

# Execute the test
success = test_module.test_local_document()
if success:
    print('Test passed: End date extracted successfully')
    # Check extracted_text.txt for full document text
    # Check local_test_results.json for detailed results
else:
    print('Test failed: Check logs for details')
Best Practices
- Ensure LOCAL_DOCUMENT_PATH points to a valid PDF file before calling this function
- Review the generated 'extracted_text.txt' file to verify text extraction quality
- Check 'local_test_results.json' for detailed analysis results and debugging information
- Monitor log output for diagnostic information about keyword matches and extraction steps
- The function expects at least 100 characters of meaningful text to be extracted from the document
- Ensure proper configuration files are set up for both DocumentProcessor and LLMClient components
- The function performs two-step LLM analysis: first finding expiry dates, then complete contract analysis
- Use this function in a testing or development environment, not in production code paths
- The function writes 'extracted_text.txt' and 'local_test_results.json' to the current working directory; ensure the process has write permission there
- Consider the cost implications of LLM API calls when running this test repeatedly
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- function test_single_document 86.2% similar
- function test_end_date_extraction 85.0% similar
- function test_simulated_document 82.2% similar
- function test_ocr_retry_logic 74.0% similar
- function test_document_processing 73.3% similar