function test_single_document
Tests end date extraction from a specific PDF document by downloading it from FileCloud, extracting text, and using LLM-based analysis to identify contract expiry dates.
/tf/active/vicechatdev/contract_validity_analyzer/test_single_document.py
36 - 153
complex
Purpose
This function serves as an integration test for the document processing pipeline, specifically validating the end date extraction capability. It tests the complete workflow: FileCloud connection, document search and download, text extraction, and two-step LLM analysis (expiry date finding and complete contract analysis). The function is designed to debug and validate the extraction of contract end dates from a specific test document ('241029_VICEBIO LEUKOCARE MTA_executed.pdf').
Source Code
def test_single_document():
    """Test end date extraction for a specific document.

    Connects to FileCloud, looks up the test PDF in the search results,
    downloads it, extracts its text and runs the two LLM steps
    (``_find_expiry_dates`` followed by ``_extract_complete_contract_info``).
    Every stage is logged so the run can be used for debugging the prompts.

    Returns:
        bool: True when step 2 yields a non-empty ``end_date`` other than the
        string ``'null'``; False when the connection fails, the document is
        not found or cannot be downloaded, fewer than 100 characters of text
        are extracted, or any exception is raised along the way.
    """
    logger = setup_test_logging()
    logger.info("Starting focused test on single document for end date extraction")

    # Single source of truth for the file name used in search and LLM calls.
    target_filename = "241029_VICEBIO LEUKOCARE MTA_executed.pdf"

    try:
        # Load configuration
        config = Config()

        # Initialize components
        fc_client = FileCloudClient(config.get_section('filecloud'))
        doc_processor = DocumentProcessor(config.get_section('document_processing'))
        llm_client = LLMClient(config.get_section('llm'))

        # Connect to FileCloud
        if not fc_client.connect():
            logger.error("Failed to connect to FileCloud")
            return False

        logger.info(f"Testing document: {TEST_DOCUMENT_PATH}")

        # Stays None until the download succeeds, so the finally block knows
        # whether there is anything to delete.
        local_path = None
        try:
            # First, search for the document to get its details
            logger.info("Searching for document...")
            documents = fc_client.search_documents()

            target_doc = None
            for doc in documents:
                if target_filename in doc['filename']:
                    target_doc = doc
                    logger.info(f"Found document: {doc}")
                    break

            if not target_doc:
                logger.error("Document not found in search results")
                return False

            # Download the document
            logger.info("Downloading document...")
            local_path = fc_client.download_document(target_doc['full_path'])
            if not local_path or not os.path.exists(local_path):
                logger.error("Failed to download document")
                return False

            # Extract text from document
            logger.info("Extracting text from document...")
            document_text = doc_processor.extract_text(local_path)
            if not document_text or len(document_text.strip()) < 100:
                logger.error("No meaningful text extracted from document")
                logger.info(f"Extracted text length: {len(document_text) if document_text else 0}")
                return False

            logger.info(f"Successfully extracted {len(document_text)} characters of text")

            # Show first 1000 characters of extracted text for debugging
            logger.info("=" * 60)
            logger.info("EXTRACTED TEXT PREVIEW:")
            logger.info("=" * 60)
            logger.info(document_text[:1000])
            logger.info("=" * 60)

            # Step 1: Test expiry date finding
            logger.info("STEP 1: Testing expiry date extraction...")
            step1_result = llm_client._find_expiry_dates(document_text, target_filename)
            logger.info("STEP 1 RESULT:")
            logger.info("-" * 40)
            expiry_analysis = step1_result.get('expiry_analysis', 'No analysis available')
            logger.info(expiry_analysis)
            logger.info("-" * 40)

            # Step 2: Test complete contract analysis
            logger.info("STEP 2: Testing complete contract analysis...")
            step2_result = llm_client._extract_complete_contract_info(
                step1_result, document_text, target_filename
            )
            logger.info("STEP 2 RESULT:")
            logger.info("-" * 40)
            if step2_result.get('error'):
                logger.error(f"Step 2 failed: {step2_result.get('error')}")
            else:
                logger.info(f"Contract Type: {step2_result.get('contract_type', 'Unknown')}")
                logger.info(f"Third Parties: {step2_result.get('third_parties', [])}")
                logger.info(f"Start Date: {step2_result.get('start_date', 'Not found')}")
                logger.info(f"End Date: {step2_result.get('end_date', 'Not found')}")
                logger.info(f"Is In Effect: {step2_result.get('is_in_effect', 'Unknown')}")
                logger.info(f"Confidence: {step2_result.get('confidence', 0.0)}")
                logger.info(f"Analysis Notes: {step2_result.get('analysis_notes', 'None')}")
            logger.info("-" * 40)

            # Check if end date was successfully extracted.  The truthiness
            # test already rules out None and ''; 'null' is the remaining
            # "no value" marker the original check rejected.
            end_date = step2_result.get('end_date')
            if end_date and end_date != 'null':
                logger.info("✓ SUCCESS: End date extracted successfully!")
                logger.info(f"End date found: {end_date}")
                return True

            logger.warning("✗ FAILED: End date not extracted")
            logger.info("Need to improve the prompts further")
            return False

        except Exception as e:
            logger.error(f"Error processing document: {e}")
            return False

        finally:
            # Runs on every exit path above (the previous cleanup sat after
            # the return statements and was never executed).
            if local_path and os.path.exists(local_path):
                try:
                    os.remove(local_path)
                except OSError as cleanup_error:
                    # Best effort: a leftover file must not change the result.
                    logger.warning(
                        f"Could not remove downloaded file {local_path}: {cleanup_error}"
                    )

            # Disconnect from FileCloud; a failure here must not mask the
            # test outcome, so it is only logged.
            try:
                fc_client.disconnect()
            except Exception as disconnect_error:
                logger.warning(f"Error disconnecting from FileCloud: {disconnect_error}")

    except Exception as e:
        logger.error(f"Test failed with error: {e}")
        return False
Return Value
Returns a boolean value: True if the end date was successfully extracted from the document (non-null, non-empty value), False if the extraction failed, document was not found, or any error occurred during processing. The function also logs detailed information about each step of the process.
Dependencies
os, sys, json, pathlib, logging
Required Imports
import os
import sys
import json
from pathlib import Path
from config.config import Config
from utils.filecloud_client import FileCloudClient
from utils.document_processor import DocumentProcessor
from utils.llm_client import LLMClient
from utils.logging_utils import setup_logging
from utils.logging_utils import get_logger
import logging
Usage Example
# Ensure all required configuration is set up
# config/config.py should contain filecloud, document_processing, and llm sections

# Define TEST_DOCUMENT_PATH before calling
TEST_DOCUMENT_PATH = '/path/to/test/document'


def setup_test_logging():
    """Configure root logging at INFO level and return this module's logger."""
    logging.basicConfig(level=logging.INFO)
    return logging.getLogger(__name__)


# Run the test
result = test_single_document()
if result:
    print('Test passed: End date extracted successfully')
else:
    print('Test failed: End date extraction unsuccessful')
Best Practices
- This function is designed for testing/debugging purposes and should not be used in production code
- Ensure FileCloud connection is properly configured before running this test
- The function expects a specific document filename ('241029_VICEBIO LEUKOCARE MTA_executed.pdf') - modify the search logic for different documents
- Review the logged output to understand each step of the extraction process
- The function performs cleanup by removing downloaded files, but may leave files if an exception occurs
- Consider wrapping the file cleanup in a finally block for more robust resource management
- The function returns False on any error, check logs for detailed error information
- Requires at least 100 characters of extracted text to proceed with analysis
- The two-step LLM analysis (_find_expiry_dates and _extract_complete_contract_info) must be implemented in LLMClient
- Monitor LLM API usage and costs when running this test repeatedly
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- function test_end_date_extraction 90.4% similar
- function test_local_document 86.2% similar
- function test_simulated_document 80.8% similar
- function test_ocr_retry_logic 74.5% similar
- function test_with_simulated_content 71.8% similar