test_single_document - Code Extractor

function test_single_document

Maturity: 50

Tests end date extraction from a specific PDF document by downloading it from FileCloud, extracting text, and using LLM-based analysis to identify contract expiry dates.

File:
/tf/active/vicechatdev/contract_validity_analyzer/test_single_document.py

Lines:
36 - 153

Complexity:
complex

Purpose

This function serves as an integration test for the document processing pipeline, specifically validating the end date extraction capability. It tests the complete workflow: FileCloud connection, document search and download, text extraction, and two-step LLM analysis (expiry date finding and complete contract analysis). The function is designed to debug and validate the extraction of contract end dates from a specific test document ('241029_VICEBIO LEUKOCARE MTA_executed.pdf').

Source Code

def _analyze_test_document(logger, fc_client, doc_processor, llm_client, document_name):
    """Locate, download and analyse one document; report whether an end date was found.

    Searches FileCloud for a document whose filename contains *document_name*,
    downloads it, extracts its text and runs the two LLM steps
    (``_find_expiry_dates`` then ``_extract_complete_contract_info``), logging
    the intermediate results along the way.

    Args:
        logger: Logger used for all progress and diagnostic output.
        fc_client: Connected FileCloud client (search + download).
        doc_processor: Document processor providing ``extract_text``.
        llm_client: LLM client providing the two analysis steps.
        document_name: Filename fragment identifying the document under test.

    Returns:
        True if step 2 yields a truthy ``end_date`` other than the string
        ``'null'``; False if the document is missing, cannot be downloaded,
        yields fewer than 100 characters of text, or any step raises.
    """
    local_path = None
    try:
        # First, search for the document to get its details
        logger.info("Searching for document...")
        documents = fc_client.search_documents()
        target_doc = next(
            (doc for doc in documents if document_name in doc['filename']),
            None,
        )

        if not target_doc:
            logger.error("Document not found in search results")
            return False
        logger.info(f"Found document: {target_doc}")

        # Download the document
        logger.info("Downloading document...")
        local_path = fc_client.download_document(target_doc['full_path'])

        if not local_path or not os.path.exists(local_path):
            logger.error("Failed to download document")
            return False

        # Extract text from document
        logger.info("Extracting text from document...")
        document_text = doc_processor.extract_text(local_path)

        # Fewer than 100 non-whitespace-padded characters is treated as a failed extraction.
        if not document_text or len(document_text.strip()) < 100:
            logger.error("No meaningful text extracted from document")
            logger.info(f"Extracted text length: {len(document_text) if document_text else 0}")
            return False

        logger.info(f"Successfully extracted {len(document_text)} characters of text")

        # Show first 1000 characters of extracted text for debugging
        logger.info("=" * 60)
        logger.info("EXTRACTED TEXT PREVIEW:")
        logger.info("=" * 60)
        logger.info(document_text[:1000])
        logger.info("=" * 60)

        # Step 1: Test expiry date finding
        logger.info("STEP 1: Testing expiry date extraction...")
        step1_result = llm_client._find_expiry_dates(document_text, document_name)

        logger.info("STEP 1 RESULT:")
        logger.info("-" * 40)
        logger.info(step1_result.get('expiry_analysis', 'No analysis available'))
        logger.info("-" * 40)

        # Step 2: Test complete contract analysis
        logger.info("STEP 2: Testing complete contract analysis...")
        step2_result = llm_client._extract_complete_contract_info(
            step1_result, document_text, document_name
        )

        logger.info("STEP 2 RESULT:")
        logger.info("-" * 40)
        if step2_result.get('error'):
            logger.error(f"Step 2 failed: {step2_result.get('error')}")
        else:
            logger.info(f"Contract Type: {step2_result.get('contract_type', 'Unknown')}")
            logger.info(f"Third Parties: {step2_result.get('third_parties', [])}")
            logger.info(f"Start Date: {step2_result.get('start_date', 'Not found')}")
            logger.info(f"End Date: {step2_result.get('end_date', 'Not found')}")
            logger.info(f"Is In Effect: {step2_result.get('is_in_effect', 'Unknown')}")
            logger.info(f"Confidence: {step2_result.get('confidence', 0.0)}")
            logger.info(f"Analysis Notes: {step2_result.get('analysis_notes', 'None')}")
        logger.info("-" * 40)

        # Check if end date was successfully extracted.  A truthy value already
        # rules out None and ''; the literal string 'null' is rejected as well.
        end_date = step2_result.get('end_date')
        if end_date and end_date != 'null':
            logger.info("✓ SUCCESS: End date extracted successfully!")
            logger.info(f"End date found: {end_date}")
            return True

        logger.warning("✗ FAILED: End date not extracted")
        logger.info("Need to improve the prompts further")
        return False

    except Exception as e:
        logger.error(f"Error processing document: {e}")
        return False

    finally:
        # Clean up the downloaded file on every exit path (success, failure
        # or exception).  Previously this code sat after the final returns
        # and could never execute.
        if local_path:
            try:
                os.remove(local_path)
            except FileNotFoundError:
                pass  # nothing was written, so there is nothing to clean up
            except OSError as e:
                logger.warning(f"Could not remove downloaded file {local_path}: {e}")


def test_single_document():
    """Test end date extraction for a specific document.

    Builds the FileCloud, document-processing and LLM components from
    ``Config``, connects to FileCloud, and runs the end-date extraction
    workflow against '241029_VICEBIO LEUKOCARE MTA_executed.pdf'.

    Returns:
        True if an end date was extracted; False if the connection failed,
        the document could not be processed, or any error occurred.
    """
    logger = setup_test_logging()
    logger.info("Starting focused test on single document for end date extraction")

    document_name = "241029_VICEBIO LEUKOCARE MTA_executed.pdf"

    try:
        # Load configuration
        config = Config()

        # Initialize components
        fc_client = FileCloudClient(config.get_section('filecloud'))
        doc_processor = DocumentProcessor(config.get_section('document_processing'))
        llm_client = LLMClient(config.get_section('llm'))

        # Connect to FileCloud
        if not fc_client.connect():
            logger.error("Failed to connect to FileCloud")
            return False

        try:
            logger.info(f"Testing document: {TEST_DOCUMENT_PATH}")
            return _analyze_test_document(
                logger, fc_client, doc_processor, llm_client, document_name
            )
        finally:
            # Disconnect from FileCloud whatever the outcome.  A failure here
            # is logged but must not override the test result.
            try:
                fc_client.disconnect()
            except Exception as e:
                logger.warning(f"Error disconnecting from FileCloud: {e}")

    except Exception as e:
        logger.error(f"Test failed with error: {e}")
        return False

Return Value

Returns a boolean value: True if the end date was successfully extracted from the document (non-null, non-empty value), False if the extraction failed, document was not found, or any error occurred during processing. The function also logs detailed information about each step of the process.

Dependencies

os
sys
json
pathlib
logging

Required Imports

import os
import sys
import json
from pathlib import Path
from config.config import Config
from utils.filecloud_client import FileCloudClient
from utils.document_processor import DocumentProcessor
from utils.llm_client import LLMClient
from utils.logging_utils import setup_logging
from utils.logging_utils import get_logger
import logging

Usage Example

# Ensure all required configuration is set up
# config/config.py should contain filecloud, document_processing, and llm sections
# Define TEST_DOCUMENT_PATH before calling

TEST_DOCUMENT_PATH = '/path/to/test/document'

def setup_test_logging():
    """Enable INFO-level logging and return the logger named after this module."""
    module_logger = logging.getLogger(__name__)
    logging.basicConfig(level=logging.INFO)
    return module_logger

# Run the test and report the boolean outcome on stdout
result = test_single_document()

print(
    'Test passed: End date extracted successfully'
    if result
    else 'Test failed: End date extraction unsuccessful'
)

Best Practices

This function is designed for testing/debugging purposes and should not be used in production code
Ensure FileCloud connection is properly configured before running this test
The function expects a specific document filename ('241029_VICEBIO LEUKOCARE MTA_executed.pdf') - modify the search logic for different documents
Review the logged output to understand each step of the extraction process
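Cleanup of the downloaded file (and the FileCloud disconnect) only happens if that code is reachable on every exit path; in the original version of this function it is placed after the final return statements, so the downloaded file is never removed
Wrap the file cleanup and the disconnect in finally blocks for more robust resource management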
The function returns False on any error, check logs for detailed error information
Requires at least 100 characters of extracted text to proceed with analysis
The two-step LLM analysis (_find_expiry_dates and _extract_complete_contract_info) must be implemented in LLMClient
Monitor LLM API usage and costs when running this test repeatedly

Similar Components

AI-powered semantic similarity - components with related functionality:

function test_end_date_extraction 90.4% similar

Tests end date extraction functionality for contract documents that previously had missing end dates by downloading documents from FileCloud, extracting text, analyzing with LLM, and comparing results.
From: /tf/active/vicechatdev/contract_validity_analyzer/test_missing_end_dates.py
function test_local_document 86.2% similar

Integration test function that validates end date extraction from a local PDF document using document processing and LLM-based analysis.
From: /tf/active/vicechatdev/contract_validity_analyzer/test_local_document.py
function test_simulated_document 80.8% similar

Integration test function that validates end date extraction from a simulated contract document containing an explicit term clause, using a two-step LLM-based analysis process.
From: /tf/active/vicechatdev/contract_validity_analyzer/test_simulated_document.py
function test_ocr_retry_logic 74.5% similar

Tests the OCR retry logic for extracting contract end dates by first attempting normal text extraction, then falling back to OCR-based extraction if the end date is not found.
From: /tf/active/vicechatdev/contract_validity_analyzer/test_ocr_retry.py
function test_with_simulated_content 71.8% similar

Tests LLM-based contract analysis prompts using simulated NDA content containing a term clause to verify extraction of contract dates and metadata.
From: /tf/active/vicechatdev/contract_validity_analyzer/test_local_document.py

← Back to Browse

Assistant

Hi! I can help improve this code. Tell me what you'd like to enhance (e.g., "add error handling", "optimize performance", "improve readability", "add type hints").

Code Comparison

Original Code

                            def test_single_document():
    """Test end date extraction for a specific document.

    Connects to FileCloud, finds and downloads
    '241029_VICEBIO LEUKOCARE MTA_executed.pdf', extracts its text and runs
    the two LLM steps on it, logging each intermediate result.

    Returns:
        True if step 2 reports a truthy ``end_date`` other than 'null';
        False on any failure or exception.
    """
    logger = setup_test_logging()
    logger.info("Starting focused test on single document for end date extraction")
    
    try:
        # Load configuration
        config = Config()
        
        # Initialize components
        fc_client = FileCloudClient(config.get_section('filecloud'))
        doc_processor = DocumentProcessor(config.get_section('document_processing'))
        llm_client = LLMClient(config.get_section('llm'))
        
        # Connect to FileCloud
        if not fc_client.connect():
            logger.error("Failed to connect to FileCloud")
            return False
            
        logger.info(f"Testing document: {TEST_DOCUMENT_PATH}")
        
        try:
            # First, search for the document to get its details
            logger.info("Searching for document...")
            documents = fc_client.search_documents()
            target_doc = None
            
            # Take the first search result whose filename contains the target name.
            for doc in documents:
                if "241029_VICEBIO LEUKOCARE MTA_executed.pdf" in doc['filename']:
                    target_doc = doc
                    logger.info(f"Found document: {doc}")
                    break
            
            if not target_doc:
                logger.error("Document not found in search results")
                return False
            
            # Download the document
            logger.info("Downloading document...")
            local_path = fc_client.download_document(target_doc['full_path'])
            
            if not local_path or not os.path.exists(local_path):
                logger.error("Failed to download document")
                return False
            
            # Extract text from document
            logger.info("Extracting text from document...")
            document_text = doc_processor.extract_text(local_path)
            
            # Fewer than 100 characters (after stripping) counts as a failed extraction.
            if not document_text or len(document_text.strip()) < 100:
                logger.error("No meaningful text extracted from document")
                logger.info(f"Extracted text length: {len(document_text) if document_text else 0}")
                return False
            
            logger.info(f"Successfully extracted {len(document_text)} characters of text")
            
            # Show first 1000 characters of extracted text for debugging
            logger.info("=" * 60)
            logger.info("EXTRACTED TEXT PREVIEW:")
            logger.info("=" * 60)
            logger.info(document_text[:1000])
            logger.info("=" * 60)
            
            # Step 1: Test expiry date finding
            # (calls the underscore-prefixed LLMClient method directly and logs its output)
            logger.info("STEP 1: Testing expiry date extraction...")
            step1_result = llm_client._find_expiry_dates(document_text, "241029_VICEBIO LEUKOCARE MTA_executed.pdf")
            
            logger.info("STEP 1 RESULT:")
            logger.info("-" * 40)
            expiry_analysis = step1_result.get('expiry_analysis', 'No analysis available')
            logger.info(expiry_analysis)
            logger.info("-" * 40)
            
            # Step 2: Test complete contract analysis
            # (the step 1 result is passed in together with the full document text)
            logger.info("STEP 2: Testing complete contract analysis...")
            step2_result = llm_client._extract_complete_contract_info(step1_result, document_text, "241029_VICEBIO LEUKOCARE MTA_executed.pdf")
            
            logger.info("STEP 2 RESULT:")
            logger.info("-" * 40)
            if step2_result.get('error'):
                logger.error(f"Step 2 failed: {step2_result.get('error')}")
            else:
                logger.info(f"Contract Type: {step2_result.get('contract_type', 'Unknown')}")
                logger.info(f"Third Parties: {step2_result.get('third_parties', [])}")
                logger.info(f"Start Date: {step2_result.get('start_date', 'Not found')}")
                logger.info(f"End Date: {step2_result.get('end_date', 'Not found')}")
                logger.info(f"Is In Effect: {step2_result.get('is_in_effect', 'Unknown')}")
                logger.info(f"Confidence: {step2_result.get('confidence', 0.0)}")
                logger.info(f"Analysis Notes: {step2_result.get('analysis_notes', 'None')}")
            logger.info("-" * 40)
            
            # Check if end date was successfully extracted
            # (the literal string 'null' is rejected alongside None and '')
            end_date = step2_result.get('end_date')
            if end_date and end_date not in ['null', None, '']:
                logger.info("✓ SUCCESS: End date extracted successfully!")
                logger.info(f"End date found: {end_date}")
                return True
            else:
                logger.warning("✗ FAILED: End date not extracted")
                logger.info("Need to improve the prompts further")
                return False
            
            # Clean up downloaded file
            # NOTE: unreachable - both branches of the if/else above return,
            # so the downloaded file is never removed by this function.
            try:
                os.remove(local_path)
            except:
                pass
                
        except Exception as e:
            logger.error(f"Error processing document: {e}")
            return False
        
        # Disconnect from FileCloud
        # NOTE: never reached - every path through the inner try block returns,
        # and its except handler returns False as well.
        fc_client.disconnect()
        
    except Exception as e:
        logger.error(f"Test failed with error: {e}")
        return False
                        

Improved Code

🔍 Code Extractor

function test_single_document

Purpose

Source Code

Return Value

Dependencies

Required Imports

Usage Example

Best Practices

Tags

Similar Components

function test_end_date_extraction 90.4% similar

function test_local_document 86.2% similar

function test_simulated_document 80.8% similar

function test_ocr_retry_logic 74.5% similar

function test_with_simulated_content 71.8% similar

function test_single_document

Purpose

Source Code

Return Value

Dependencies

Required Imports

Usage Example

Best Practices

Tags

Similar Components

function test_end_date_extraction 90.4% similar

function test_local_document 86.2% similar

function test_simulated_document 80.8% similar

function test_ocr_retry_logic 74.5% similar

function test_with_simulated_content 71.8% similar

✨ Improve Code: test_single_document

Code Comparison