class ExtensiveSearchManager_v2
Manages extensive search functionality including full document retrieval, summarization, and enhanced context gathering.
File: /tf/active/vicechatdev/data_capture_backup_18072025/OneCo_hybrid_RAG.py
Lines: 3615 - 4471
Complexity: moderate
Purpose
Manages extensive search functionality including full document retrieval, summarization, and enhanced context gathering.
Source Code
class ExtensiveSearchManager:
"""
Manages extensive search functionality including full document retrieval,
summarization, and enhanced context gathering.
"""
def __init__(self, session, chroma_client, api_key="", rag_instance=None):
self.session = session
self.chroma_client = chroma_client
self.api_key = api_key
self.rag_instance = rag_instance # Reference to main RAG instance for usage tracking
self.summarizer = ChatOpenAI(
model="gpt-4o-mini",
temperature=0,
max_tokens=2000,
api_key=api_key
)
# Set up tokenizer for counting
self.tokenizer = tiktoken.get_encoding("cl100k_base")
# Initialize caches
self.document_cache = {}
self.summary_cache = {}
def count_tokens(self, text):
"""Count tokens in text"""
return len(self.tokenizer.encode(text))
def get_full_document(self, chunk_metadata, collection_name):
"""
Retrieve the full document that a chunk belongs to by using Neo4j to find all chunks
from the same document and reconstruct the complete document.
Args:
chunk_metadata: Metadata from the chunk containing bibtex path or document info
collection_name: Name of the ChromaDB collection
Returns:
str: Full document text ordered by chunk sequence or None if not found
"""
# Extract bibtex path or try to find document identifier
bibtex_path = chunk_metadata.get('bibtex', '')
if not bibtex_path:
return None
# Create cache key
cache_key = f"{collection_name}_{bibtex_path}"
if cache_key in self.document_cache:
return self.document_cache[cache_key]
try:
# The bibtex path in metadata should correspond to the document path
# First, try to find the document directly by path
query = """
MATCH (d:Document)
WHERE d.Path = $bibtex_path OR d.Name CONTAINS $doc_name OR toString(d.UID) = $bibtex_path
MATCH (d)-[:CHUNK]->(c)
WHERE c:Text_chunk OR c:Table_chunk
RETURN c.Text AS content, c.UID AS chunk_uid, c.Name AS chunk_name, d.UID AS doc_uid, d.Name AS doc_name
ORDER BY c.Name
"""
# Extract potential document name from bibtex path
doc_name = bibtex_path.split('/')[-1] if '/' in bibtex_path else bibtex_path
results = self.session.run(query, {
"bibtex_path": bibtex_path,
"doc_name": doc_name
})
chunks = []
doc_info = None
for record in results:
if record["content"]:
chunks.append(record["content"])
if not doc_info:
doc_info = {
"doc_uid": record["doc_uid"],
"doc_name": record["doc_name"]
}
# If direct path matching didn't work, try fuzzy matching on document content paths
if not chunks:
# Extract filename without extension for broader matching
if '.' in doc_name:
base_name = doc_name.rsplit('.', 1)[0]
else:
base_name = doc_name
fuzzy_query = """
MATCH (d:Document)-[:CHUNK]->(c)
WHERE (c:Text_chunk OR c:Table_chunk)
AND (d.Name CONTAINS $base_name OR d.Path CONTAINS $base_name
OR any(path_part IN split(d.Path, '/') WHERE path_part CONTAINS $base_name))
RETURN c.Text AS content, c.UID AS chunk_uid, c.Name AS chunk_name,
d.UID AS doc_uid, d.Name AS doc_name, d.Path AS doc_path
ORDER BY d.UID, c.Name
LIMIT 100
"""
results = self.session.run(fuzzy_query, {"base_name": base_name})
# Group by document to find the best match
doc_candidates = {}
for record in results:
doc_uid = record["doc_uid"]
if doc_uid not in doc_candidates:
doc_candidates[doc_uid] = {
"chunks": [],
"doc_name": record["doc_name"],
"doc_path": record["doc_path"]
}
if record["content"]:
doc_candidates[doc_uid]["chunks"].append(record["content"])
# Pick the document with the most chunks (likely the most complete)
if doc_candidates:
best_doc = max(doc_candidates.items(), key=lambda x: len(x[1]["chunks"]))
chunks = best_doc[1]["chunks"]
doc_info = {
"doc_uid": best_doc[0],
"doc_name": best_doc[1]["doc_name"],
"doc_path": best_doc[1]["doc_path"]
}
print(f" 📄 Fuzzy matched document: {doc_info['doc_name']} with {len(chunks)} chunks")
if chunks:
# Concatenate all chunks to reconstruct the full document
full_document = "\n\n".join(chunks)
self.document_cache[cache_key] = full_document
print(f" ✅ Full document retrieved: {len(chunks)} chunks, {len(full_document)} chars")
if doc_info:
print(f" 📄 Document: {doc_info['doc_name']} (UID: {doc_info['doc_uid']})")
return full_document
else:
print(f" ⚠️ No chunks found for document path: {bibtex_path}")
except Exception as e:
print(f" ❌ Error retrieving full document from Neo4j: {e}")
return None
def get_full_document_neo4j(self, chunk_uid):
"""
Retrieve the full document from Neo4j that a chunk belongs to.
Updated to match the actual Neo4j schema from offline_docstore_multi_vice.py
Args:
chunk_uid: UID of the text/table chunk
Returns:
str: Full document text ordered by chunk sequence or None if not found
"""
cache_key = f"neo4j_{chunk_uid}"
if cache_key in self.document_cache:
return self.document_cache[cache_key]
try:
# Query based on the actual schema: Document-[:CHUNK]->Text_chunk/Table_chunk
query = """
MATCH (chunk {UID: $chunk_uid})<-[:CHUNK]-(doc:Document)
MATCH (doc)-[:CHUNK]->(all_chunks)
WHERE all_chunks:Text_chunk OR all_chunks:Table_chunk
RETURN all_chunks.Text AS content, all_chunks.UID AS chunk_uid,
all_chunks.Name AS chunk_name, doc.UID AS doc_uid, doc.Name AS doc_name
ORDER BY all_chunks.Name
"""
results = self.session.run(query, {"chunk_uid": chunk_uid})
chunks = []
doc_info = None
for record in results:
if record["content"]:
chunks.append(record["content"])
if not doc_info:
doc_info = {
"doc_uid": record["doc_uid"],
"doc_name": record["doc_name"]
}
if chunks:
# Concatenate all chunks to reconstruct the full document
full_document = "\n\n".join(chunks)
self.document_cache[cache_key] = full_document
print(f"✅ Neo4j full document retrieved: {len(chunks)} chunks, {len(full_document)} chars from {doc_info['doc_name']}")
return full_document
else:
print(f"⚠️ No Neo4j document found for chunk UID: {chunk_uid}")
except Exception as e:
print(f"❌ Error retrieving full document from Neo4j: {e}")
return None
def summarize_document(self, document_text, query_context, max_tokens=1500):
"""
Summarize a full document with focus on the query context.
Args:
document_text: Full document text to summarize
query_context: The user's query to focus the summary
max_tokens: Maximum tokens for the summary
Returns:
str: Summarized document text
"""
print(f" 🤖 Using small LLM (gpt-4o-mini) for document summarization")
print(f" • Input document length: {self.count_tokens(document_text):,} tokens")
print(f" • Target summary length: {max_tokens:,} tokens")
print(f" • Query-focused: '{query_context[:100]}{'...' if len(query_context) > 100 else ''}'")
# Track usage if RAG instance is available
if self.rag_instance:
self.rag_instance.small_llm_usage["document_summarization"] += 1
self.rag_instance.small_llm_usage["total_calls"] += 1
# Check cache first
cache_key = f"{hash(document_text[:500])}_{hash(query_context)}"
if cache_key in self.summary_cache:
print(f" ✅ Using cached summary")
return self.summary_cache[cache_key]
# If document is already short enough, return as is
if self.count_tokens(document_text) <= max_tokens:
print(f" ✅ Document already under token limit, using full text")
return document_text
# Create summarization prompt
prompt = f"""
You are a document summarization expert. Please summarize the following document with particular focus on information relevant to this query: "{query_context}"
Instructions:
- Maintain all key factual information relevant to the query
- Preserve important technical details, numbers, and citations
- Keep the summary under {max_tokens} tokens
- Structure the summary logically with clear sections if applicable
- Retain any critical background information needed to understand the main points
Document to summarize:
{document_text}
Summary:
"""
try:
print(f" 🔄 Generating query-focused summary...")
response = self.summarizer.invoke(prompt)
summary = response.content.strip()
summary_tokens = self.count_tokens(summary)
print(f" ✅ Summary generated: {summary_tokens:,} tokens ({100*summary_tokens/self.count_tokens(document_text):.1f}% of original)")
# Cache the summary
self.summary_cache[cache_key] = summary
return summary
except Exception as e:
print(f" ❌ Error summarizing document: {e}")
print(f" 🔄 Fallback: truncating document to {max_tokens} tokens")
# Fallback: truncate the document
tokens = self.tokenizer.encode(document_text)
if len(tokens) > max_tokens:
truncated_tokens = tokens[:max_tokens]
return self.tokenizer.decode(truncated_tokens)
return document_text
def remove_duplicate_documents(self, documents_with_metadata, similarity_threshold=0.85):
"""
Remove duplicate documents based on content similarity.
Args:
documents_with_metadata: List of (metadata, document_text) tuples
similarity_threshold: Threshold for considering documents as duplicates
Returns:
List of unique documents
"""
if len(documents_with_metadata) <= 1:
return documents_with_metadata
unique_docs = []
seen_embeddings = []
try:
# Use OpenAI embeddings for similarity comparison
from openai import OpenAI
client = OpenAI(api_key=self.api_key)
for metadata, doc_text in documents_with_metadata:
# Generate embedding for current document
if self.count_tokens(doc_text) > 8192:
# Use first part of document for embedding if too long
embed_text = self.tokenizer.decode(self.tokenizer.encode(doc_text)[:8192])
else:
embed_text = doc_text
response = client.embeddings.create(
model="text-embedding-3-small",
input=embed_text
)
current_embedding = np.array(response.data[0].embedding)
# Normalize embedding
current_embedding = current_embedding / np.linalg.norm(current_embedding)
# Check for similarity with existing documents
is_duplicate = False
for seen_emb in seen_embeddings:
similarity = np.dot(current_embedding, seen_emb)
if similarity >= similarity_threshold:
is_duplicate = True
break
if not is_duplicate:
unique_docs.append((metadata, doc_text))
seen_embeddings.append(current_embedding)
return unique_docs
except Exception as e:
print(f"Error in duplicate removal: {e}")
# Fallback: return all documents
return documents_with_metadata
def create_initial_summary(self, document_text, query_context, target_tokens=2000):
"""
Create an initial comprehensive summary from the first document.
Args:
document_text: Text of the first document
query_context: The user's query for context
target_tokens: Target length for the initial summary
Returns:
str: Initial comprehensive summary
"""
print(f" 🤖 Using small LLM (gpt-4o-mini) for initial comprehensive summary")
print(f" • Creating foundation summary from first document")
# Track usage if RAG instance is available
if self.rag_instance:
self.rag_instance.small_llm_usage["document_summarization"] += 1
self.rag_instance.small_llm_usage["total_calls"] += 1
# If document is short enough, use as foundation
if self.count_tokens(document_text) <= target_tokens:
print(f" ✅ Document under target length, using as foundation")
return f"**Comprehensive Summary: {query_context}**\n\n{document_text}"
prompt = f"""
You are creating the initial comprehensive summary for the user's query. This will be the foundation that gets enriched with additional documents.
User Query: "{query_context}"
Instructions:
- Create a structured, comprehensive summary that directly addresses the user's query
- Organize information into clear sections with headers
- Include all relevant facts, figures, procedures, and findings from this document
- Use a format that can be easily expanded with additional information
- Keep under {target_tokens} tokens but be thorough
- Use markdown formatting for structure
Document to summarize:
{document_text}
Comprehensive Summary:
"""
try:
print(f" 🔄 Generating foundation summary...")
response = self.summarizer.invoke(prompt)
summary = response.content.strip()
summary_tokens = self.count_tokens(summary)
print(f" ✅ Foundation summary created: {summary_tokens:,} tokens")
return summary
except Exception as e:
print(f" ❌ Error creating initial summary: {e}")
# Fallback: truncate document
tokens = self.tokenizer.encode(document_text)
if len(tokens) > target_tokens:
truncated_tokens = tokens[:target_tokens]
return f"**Comprehensive Summary: {query_context}**\n\n" + self.tokenizer.decode(truncated_tokens)
return f"**Comprehensive Summary: {query_context}**\n\n{document_text}"
def enrich_summary(self, existing_summary, new_document, query_context, target_tokens=8000):
"""
Enrich an existing summary with information from a new document.
Args:
existing_summary: Current comprehensive summary
new_document: New document text to integrate
query_context: User's query for context
target_tokens: Target length for enriched summary
Returns:
str: Enriched comprehensive summary
"""
print(f" 🤖 Using small LLM (gpt-4o-mini) for summary enrichment")
print(f" • Integrating new document into existing summary")
# Track usage if RAG instance is available
if self.rag_instance:
self.rag_instance.small_llm_usage["document_summarization"] += 1
self.rag_instance.small_llm_usage["total_calls"] += 1
existing_tokens = self.count_tokens(existing_summary)
new_doc_tokens = self.count_tokens(new_document)
print(f" • Existing summary: {existing_tokens:,} tokens")
print(f" • New document: {new_doc_tokens:,} tokens")
print(f" • Target tokens: {target_tokens:,}")
# Calculate available space for expansion
available_expansion = target_tokens - existing_tokens
print(f" • Available expansion space: {available_expansion:,} tokens")
prompt = f"""
You are enriching a comprehensive summary with new information. Your task is to integrate the new document's relevant information into the existing summary while expanding its scope and detail.
User Query: "{query_context}"
Instructions:
- EXPAND the existing summary by integrating relevant information from the new document
- Add new sections if the new document covers different aspects
- Enrich existing sections with additional details, data, and insights from the new document
- Avoid redundancy - don't repeat information already covered
- Maintain the structured, organized format
- Prioritize information most relevant to the user's query
- TARGET LENGTH: approximately {target_tokens} tokens (you should use most of this space to provide comprehensive coverage)
- Preserve all important details, facts, figures, and specific findings
- When adding information, be specific about sources and context
CURRENT COMPREHENSIVE SUMMARY ({existing_tokens:,} tokens):
{existing_summary}
NEW DOCUMENT TO INTEGRATE ({new_doc_tokens:,} tokens):
{new_document}
ENRICHED COMPREHENSIVE SUMMARY (target: ~{target_tokens:,} tokens):
"""
try:
print(f" 🔄 Enriching summary with new information...")
response = self.summarizer.invoke(prompt)
enriched_summary = response.content.strip()
enriched_tokens = self.count_tokens(enriched_summary)
compression_ratio = (existing_tokens + new_doc_tokens) / enriched_tokens if enriched_tokens > 0 else 1
print(f" ✅ Summary enriched: {enriched_tokens:,} tokens (compression: {compression_ratio:.1f}x)")
return enriched_summary
except Exception as e:
print(f" ❌ Error enriching summary: {e}")
# Fallback: append key information from new document
fallback_addition = f"\n\n**Additional Information:**\n{new_document[:1000]}{'...' if len(new_document) > 1000 else ''}"
combined = existing_summary + fallback_addition
# Truncate if too long
if self.count_tokens(combined) > target_tokens:
tokens = self.tokenizer.encode(combined)
truncated_tokens = tokens[:target_tokens]
return self.tokenizer.decode(truncated_tokens)
return combined
def consolidate_with_large_llm(self, summary, additional_documents, query_context, target_tokens=8000):
"""
Use the large LLM to consolidate a summary with a batch of additional documents.
This is more efficient than incremental enrichment for larger batches.
Args:
summary: Current summary to consolidate
additional_documents: List of document texts to integrate
query_context: User's query for context
target_tokens: Target length for consolidated summary
Returns:
str: Consolidated comprehensive summary
"""
print(f" 🧠 Using large LLM for batch consolidation")
print(f" • Consolidating summary with {len(additional_documents)} documents")
# Track usage if RAG instance is available
if self.rag_instance:
self.rag_instance.large_llm_usage += 1
summary_tokens = self.count_tokens(summary)
docs_text = "\n\n---DOCUMENT SEPARATOR---\n\n".join(additional_documents)
docs_tokens = self.count_tokens(docs_text)
print(f" • Current summary: {summary_tokens:,} tokens")
print(f" • Additional documents: {docs_tokens:,} tokens")
print(f" • Target output: {target_tokens:,} tokens")
# Use the large LLM (gpt-4o) for consolidation
from langchain_openai import ChatOpenAI
large_llm = ChatOpenAI(
model="gpt-4o",
temperature=0,
max_tokens=target_tokens
)
prompt = f"""
You are consolidating a comprehensive research summary with additional documents. Your task is to create an enhanced, well-structured summary that integrates all the information effectively.
User Query: "{query_context}"
Instructions:
- Create a comprehensive, well-organized summary that addresses the user's query
- Integrate information from the current summary and ALL additional documents
- Organize information logically with clear sections and subsections
- Include specific details, data, findings, and insights from all sources
- Avoid redundancy while ensuring completeness
- Maintain scientific accuracy and preserve important technical details
- Target length: approximately {target_tokens} tokens
- Structure the output with clear headings and bullet points where appropriate
CURRENT SUMMARY ({summary_tokens:,} tokens):
{summary}
ADDITIONAL DOCUMENTS TO INTEGRATE ({len(additional_documents)} documents, {docs_tokens:,} tokens):
{docs_text}
CONSOLIDATED COMPREHENSIVE SUMMARY (target: ~{target_tokens:,} tokens):
"""
try:
print(f" 🔄 Consolidating with large LLM...")
response = large_llm.invoke(prompt)
consolidated_summary = response.content.strip()
consolidated_tokens = self.count_tokens(consolidated_summary)
compression_ratio = (summary_tokens + docs_tokens) / consolidated_tokens if consolidated_tokens > 0 else 1
print(f" ✅ Summary consolidated: {consolidated_tokens:,} tokens (compression: {compression_ratio:.1f}x)")
return consolidated_summary
except Exception as e:
print(f" ❌ Error consolidating with large LLM: {e}")
# Fallback to incremental enrichment
current_summary = summary
for doc in additional_documents:
current_summary = self.enrich_summary(current_summary, doc, query_context, target_tokens)
return current_summary
def consolidate_with_large_llm_and_citations(self, summary, additional_documents, document_metadata, query_context, target_tokens=8000, start_block_num=1, use_inline_citations=True):
"""
Use the large LLM to consolidate a summary with a batch of additional documents, with optional citations.
Args:
summary: Current summary to consolidate
additional_documents: List of document texts to integrate
document_metadata: List of metadata for each document
query_context: User's query for context
target_tokens: Target length for consolidated summary
start_block_num: Starting block number for citations
use_inline_citations: Whether to include inline citations (False for extensive search mode)
Returns:
str: Consolidated comprehensive summary with or without inline citations
"""
print(f" 🧠 Using large LLM for batch consolidation {'with citations' if use_inline_citations else 'without citations'}")
print(f" • Consolidating summary with {len(additional_documents)} documents")
# Track usage if RAG instance is available
if self.rag_instance:
self.rag_instance.large_llm_usage += 1
summary_tokens = self.count_tokens(summary) if summary else 0
# Prepare documents - with or without block numbers for citation
if use_inline_citations:
numbered_documents = []
block_nums = []
for i, (doc, metadata) in enumerate(zip(additional_documents, document_metadata)):
# Calculate the actual block number that will be used for this document
# The first block is the comprehensive summary, then individual sources follow
block_num = start_block_num + 1 + i # +1 because comprehensive summary takes the first block
block_nums.append(block_num)
title = metadata.get('title', f"Document {i+1}")
numbered_doc = f"**[block {block_num}] {title}**\n{doc}"
numbered_documents.append(numbered_doc)
else:
# For extensive search mode: simple document preparation without block numbers
numbered_documents = []
block_nums = []
for i, (doc, metadata) in enumerate(zip(additional_documents, document_metadata)):
title = metadata.get('title', f"Document {i+1}")
numbered_doc = f"**{title}**\n{doc}"
numbered_documents.append(numbered_doc)
docs_text = "\n\n---DOCUMENT SEPARATOR---\n\n".join(numbered_documents)
docs_tokens = self.count_tokens(docs_text)
print(f" • Current summary: {summary_tokens:,} tokens")
print(f" • Additional documents: {docs_tokens:,} tokens")
print(f" • Target output: {target_tokens:,} tokens")
if use_inline_citations:
print(f" • Block numbers: {block_nums}")
# Use the large LLM (gpt-4o) for consolidation
from langchain_openai import ChatOpenAI
large_llm = ChatOpenAI(
model="gpt-4o",
temperature=0,
max_tokens=target_tokens
)
current_summary_part = f"\n\nCURRENT SUMMARY ({summary_tokens:,} tokens):\n{summary}" if summary else ""
if use_inline_citations:
# Standard mode with inline citations
prompt = f"""
You are consolidating a comprehensive research summary with additional documents. Your task is to create an enhanced, well-structured summary that integrates all the information effectively WITH PROPER INLINE CITATIONS.
User Query: "{query_context}"
CRITICAL CITATION INSTRUCTIONS:
- You MUST include inline citations using the block numbers provided: [block X]
- When you reference information from a document, immediately cite it: [block X]
- Multiple sources for the same point: [block X, block Y]
- Every factual claim, data point, or specific finding MUST be cited
- Citations should be placed at the end of sentences or claims they support
CONTENT INSTRUCTIONS:
- Create a comprehensive, well-organized summary that addresses the user's query
- Integrate information from the current summary and ALL additional documents
- Organize information logically with clear sections and subsections
- Include specific details, data, findings, and insights from all sources
- Avoid redundancy while ensuring completeness
- Maintain scientific accuracy and preserve important technical details
- Target length: approximately {target_tokens} tokens
- Structure the output with clear headings and bullet points where appropriate{current_summary_part}
ADDITIONAL DOCUMENTS TO INTEGRATE ({len(additional_documents)} documents, {docs_tokens:,} tokens):
{docs_text}
CONSOLIDATED COMPREHENSIVE SUMMARY WITH CITATIONS (target: ~{target_tokens:,} tokens):
"""
else:
# Extensive search mode: Generate citations for tracking but will strip them later
prompt = f"""
You are consolidating a comprehensive research summary with additional documents. Your task is to create an enhanced, well-structured summary that integrates all the information effectively WITH PROPER INLINE CITATIONS for source tracking.
User Query: "{query_context}"
CRITICAL CITATION INSTRUCTIONS:
- You MUST include inline citations using the block numbers provided: [block X]
- When you reference information from a document, immediately cite it: [block X]
- Multiple sources for the same point: [block X, block Y]
- Every factual claim, data point, or specific finding MUST be cited
- Citations should be placed at the end of sentences or claims they support
- These citations will be used for source tracking and then processed for final formatting
CONTENT INSTRUCTIONS:
- Create a comprehensive, well-organized summary that addresses the user's query
- Integrate information from the current summary and ALL additional documents
- Organize information logically with clear sections and subsections
- Include specific details, data, findings, and insights from all sources
- Avoid redundancy while ensuring completeness
- Maintain scientific accuracy and preserve important technical details
- Target length: approximately {target_tokens} tokens
- Structure the output with clear headings and bullet points where appropriate{current_summary_part}
ADDITIONAL DOCUMENTS TO INTEGRATE ({len(additional_documents)} documents, {docs_tokens:,} tokens):
{docs_text}
CONSOLIDATED COMPREHENSIVE SUMMARY WITH CITATIONS (target: ~{target_tokens:,} tokens):
"""
try:
print(f" 🔄 Consolidating with large LLM {'and citations' if use_inline_citations else 'with citations for tracking (will be processed)'}...")
response = large_llm.invoke(prompt)
consolidated_summary = response.content.strip()
# In extensive search mode: extract citations and clean text
if not use_inline_citations:
print(f" 🔧 Processing citations for extensive search mode...")
# Extract all citations from the text for source tracking
import re
citations_found = re.findall(r'\[block \d+\]', consolidated_summary)
unique_citations = list(set(citations_found))
print(f" 📋 Citations found for source tracking: {len(unique_citations)} unique citations")
# Strip all citations from the text for clean reading
clean_summary = re.sub(r'\[block \d+\]', '', consolidated_summary)
# Clean up any double spaces left by citation removal
clean_summary = re.sub(r'\s+', ' ', clean_summary)
# Clean up spacing around punctuation
clean_summary = re.sub(r'\s+([.,;:])', r'\1', clean_summary)
consolidated_summary = clean_summary.strip()
print(f" ✅ Citations extracted and text cleaned for extensive search mode")
# Store citation info for reference building (if needed by calling code)
if hasattr(response, 'citations_extracted'):
response.citations_extracted = unique_citations
else:
# Add citation info as attribute for later use
consolidated_summary = consolidated_summary + f"\n\n<!-- CITATIONS_EXTRACTED: {','.join(unique_citations)} -->"
consolidated_tokens = self.count_tokens(consolidated_summary)
compression_ratio = (summary_tokens + docs_tokens) / consolidated_tokens if consolidated_tokens > 0 else 1
print(f" ✅ Summary consolidated {'with citations' if use_inline_citations else 'and cleaned'}: {consolidated_tokens:,} tokens (compression: {compression_ratio:.1f}x)")
return consolidated_summary
except Exception as e:
print(f" ❌ Error consolidating with large LLM: {e}")
# Fallback to simple concatenation
if use_inline_citations:
# Fallback with basic citations
if summary:
fallback = f"{summary}\n\n**Additional Information:**\n"
else:
fallback = "**Comprehensive Summary:**\n\n"
for i, doc in enumerate(additional_documents):
block_num = start_block_num + i
title = document_metadata[i].get('title', f'Document {i+1}')
fallback += f"\n**From {title} [block {block_num}]:**\n{doc[:500]}{'...' if len(doc) > 500 else ''}\n"
else:
# Fallback without citations (extensive search mode)
if summary:
fallback = f"{summary}\n\n**Additional Information:**\n"
else:
fallback = "**Comprehensive Summary:**\n\n"
for i, doc in enumerate(additional_documents):
title = document_metadata[i].get('title', f'Document {i+1}')
fallback += f"\n**From {title}:**\n{doc[:500]}{'...' if len(doc) > 500 else ''}\n"
return fallback
def process_documents_with_source_tracking(self, documents, query_context, target_summary_tokens=8000, batch_size=10, use_inline_citations=False, disable_citations=False):
"""
Process documents with improved source tracking and batch processing using large LLM.
Args:
documents: List of (metadata, content) tuples
query_context: User's query for context
target_summary_tokens: Target length for final summary
batch_size: Number of documents to process in each batch with large LLM
use_inline_citations: Whether to include inline citations (False for extensive search mode)
disable_citations: Whether to completely disable citation logic (True for extensive search mode)
Returns:
tuple: (comprehensive_summary, source_mapping, individual_summaries)
"""
print(f" 📚 Processing {len(documents)} documents with source tracking")
print(f" • Batch size: {batch_size} documents per large LLM call")
print(f" • Target summary tokens: {target_summary_tokens:,}")
if disable_citations:
print(f" • Citations: Completely disabled (extensive search mode)")
else:
print(f" • Inline citations: {'Enabled' if use_inline_citations else 'Disabled (extensive search mode)'}")
source_mapping = {} # Maps content sections to source documents
individual_summaries = [] # List of individual document summaries
comprehensive_summary = ""
doc_batch = []
doc_metadata_batch = []
documents_processed = 0
for i, (metadata, content) in enumerate(documents):
documents_processed += 1
doc_batch.append(content)
doc_metadata_batch.append(metadata)
# Store individual document info for source tracking
doc_id = f"doc_{i+1}"
individual_summaries.append({
'id': doc_id,
'metadata': metadata,
'content_preview': content[:200] + "..." if len(content) > 200 else content
})
# Process batch when we reach batch_size or end of documents
if len(doc_batch) >= batch_size or i == len(documents) - 1:
print(f" 🔄 Processing batch {(i//batch_size)+1}: documents {i+2-len(doc_batch)} to {i+1}")
if comprehensive_summary == "":
# First batch: create initial comprehensive summary
if len(doc_batch) == 1:
# Single document: use create_initial_summary
comprehensive_summary = self.create_initial_summary(
doc_batch[0],
query_context,
target_tokens=min(target_summary_tokens // 4, 3000)
)
else:
# Multiple documents: use large LLM for batch processing
if disable_citations:
# Use simple consolidation without citations
comprehensive_summary = self.consolidate_with_large_llm(
"",
doc_batch,
query_context,
target_tokens=min(target_summary_tokens // 2, 4000)
)
else:
# Use citation-enabled consolidation
comprehensive_summary = self.consolidate_with_large_llm_and_citations(
"",
doc_batch,
doc_metadata_batch,
query_context,
target_tokens=min(target_summary_tokens // 2, 4000),
start_block_num=self.rag_instance.block_counter if self.rag_instance else 1,
use_inline_citations=use_inline_citations
)
else:
# Subsequent batches: consolidate with existing summary
if disable_citations:
# Use simple consolidation without citations
comprehensive_summary = self.consolidate_with_large_llm(
comprehensive_summary,
doc_batch,
query_context,
target_tokens=target_summary_tokens
)
else:
# Use citation-enabled consolidation
comprehensive_summary = self.consolidate_with_large_llm_and_citations(
comprehensive_summary,
doc_batch,
doc_metadata_batch,
query_context,
target_tokens=target_summary_tokens,
start_block_num=self.rag_instance.block_counter if self.rag_instance else 1,
use_inline_citations=use_inline_citations
)
# Track which documents contributed to current summary section
batch_start = i + 2 - len(doc_batch)
batch_end = i + 1
source_mapping[f"batch_{(i//batch_size)+1}"] = {
'documents': list(range(batch_start, batch_end + 1)),
'metadata': doc_metadata_batch.copy()
}
# Clear batch for next iteration
doc_batch = []
doc_metadata_batch = []
print(f" ✅ Batch processed: {self.count_tokens(comprehensive_summary):,} tokens")
# Check if we've reached a good size
if self.count_tokens(comprehensive_summary) > (target_summary_tokens * 0.9):
print(f" 🎯 Reached target summary size with {documents_processed} documents")
break
print(f" ✅ Document processing complete:")
print(f" • Documents processed: {documents_processed}/{len(documents)}")
print(f" • Final summary: {self.count_tokens(comprehensive_summary):,} tokens")
print(f" • Source batches: {len(source_mapping)}")
return comprehensive_summary, source_mapping, individual_summaries
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| session | - | - | positional-or-keyword |
| chroma_client | - | - | positional-or-keyword |
| api_key | str | "" | positional-or-keyword |
| rag_instance | - | None | positional-or-keyword |
Parameter Details
session: Neo4j session used to run the Cypher queries that locate and reassemble document chunks.
chroma_client: ChromaDB client associated with the vector collections the chunks come from.
api_key: OpenAI API key used by the gpt-4o-mini summarizer and the embedding-based duplicate check.
rag_instance: Optional reference to the parent RAG instance, used for LLM usage tracking and citation block numbering.
Return Value
Instantiating the class returns an ExtensiveSearchManager object; per-method return values are listed under Class Interface below.
Class Interface
Methods
__init__(self, session, chroma_client, api_key="", rag_instance=None)
Purpose: Initialize the manager with a Neo4j session, a ChromaDB client, an OpenAI API key, and an optional reference to the parent RAG instance; also sets up the gpt-4o-mini summarizer, the cl100k_base tokenizer, and the document and summary caches.
Parameters:
session: Neo4j session used for chunk and document queries
chroma_client: ChromaDB client for the vector collections
api_key: OpenAI API key for the summarizer and embeddings (default "")
rag_instance: Optional parent RAG instance used for usage tracking (default None)
Returns: None
count_tokens(self, text)
Purpose: Count tokens in a text string using the cl100k_base tokenizer.
Parameters:
text: Text to tokenize
Returns: int: Number of tokens in the text
get_full_document(self, chunk_metadata, collection_name)
Purpose: Retrieve the full document a chunk belongs to by querying Neo4j for all chunks of the same document (matching on the bibtex path first, then falling back to fuzzy name matching) and concatenating them in chunk-name order.
Parameters:
chunk_metadata: Metadata from the chunk containing the bibtex path or document identifier
collection_name: Name of the ChromaDB collection (used for cache keying)
Returns: str: Full document text ordered by chunk sequence, or None if not found
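A minimal call sketch, assuming manager is an already-initialized ExtensiveSearchManager (see the Usage Example below); the 'bibtex' path and collection name are illustrative placeholders:
chunk_metadata = {"bibtex": "papers/example_report.pdf"}  # illustrative metadata from a ChromaDB hit
full_text = manager.get_full_document(chunk_metadata, collection_name="literature")
if full_text is None:
    print("No matching document found in Neo4j")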
get_full_document_neo4j(self, chunk_uid)
Purpose: Retrieve the full document a chunk belongs to directly from Neo4j by chunk UID, following the Document-[:CHUNK]->Text_chunk/Table_chunk schema from offline_docstore_multi_vice.py.
Parameters:
chunk_uid: UID of the text or table chunk
Returns: str: Full document text ordered by chunk sequence, or None if not found
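A sketch of direct retrieval by chunk UID, assuming manager as above; the UID value is a placeholder:
chunk_uid = "0000-placeholder-uid"  # UID of a Text_chunk or Table_chunk node
document_text = manager.get_full_document_neo4j(chunk_uid)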
summarize_document(self, document_text, query_context, max_tokens=1500)
Purpose: Summarize a full document with gpt-4o-mini, focused on the user's query; returns the text unchanged if it is already under the token limit and falls back to truncation on error.
Parameters:
document_text: Full document text to summarize
query_context: The user's query used to focus the summary
max_tokens: Maximum tokens for the summary (default 1500)
Returns: str: Summarized (or truncated) document text
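A query-focused summarization sketch, assuming manager and full_text as above; the query string is illustrative:
summary = manager.summarize_document(
    document_text=full_text,
    query_context="What factors drive the reported performance differences?",  # illustrative query
    max_tokens=1500,
)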
remove_duplicate_documents(self, documents_with_metadata, similarity_threshold=0.85)
Purpose: Remove near-duplicate documents by embedding each document with text-embedding-3-small and comparing cosine similarity against documents already kept.
Parameters:
documents_with_metadata: List of (metadata, document_text) tuples
similarity_threshold: Cosine similarity above which a document is considered a duplicate (default 0.85)
Returns: List of unique (metadata, document_text) tuples; on error, the input list is returned unchanged
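A deduplication sketch over (metadata, text) tuples, assuming manager as above; note that each document costs one embedding call:
text_a = "..."  # placeholder full document text
docs = [({"title": "Report A"}, text_a), ({"title": "Report A (copy)"}, text_a)]
unique_docs = manager.remove_duplicate_documents(docs, similarity_threshold=0.85)
# near-identical texts are collapsed to the first occurrence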
create_initial_summary(self, document_text, query_context, target_tokens=2000)
Purpose: Create the initial comprehensive summary from the first document; this summary serves as the foundation that later gets enriched with additional documents.
Parameters:
document_text: Text of the first document
query_context: The user's query for context
target_tokens: Target length for the initial summary (default 2000)
Returns: str: Initial comprehensive summary
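A foundation-summary sketch, assuming manager as above and placeholder inputs:
first_doc_text = "..."  # placeholder text of the first retrieved document
user_query = "..."      # placeholder user query
foundation = manager.create_initial_summary(first_doc_text, user_query, target_tokens=2000)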
enrich_summary(self, existing_summary, new_document, query_context, target_tokens=8000)
Purpose: Enrich an existing summary with information from one new document using gpt-4o-mini, expanding the current content rather than replacing it.
Parameters:
existing_summary: Current comprehensive summary
new_document: New document text to integrate
query_context: User's query for context
target_tokens: Target length for the enriched summary (default 8000)
Returns: str: Enriched comprehensive summary
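An incremental enrichment sketch, continuing from the foundation summary above; next_doc_text is a placeholder:
next_doc_text = "..."  # placeholder text of the next retrieved document
foundation = manager.enrich_summary(foundation, next_doc_text, user_query, target_tokens=8000)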
consolidate_with_large_llm(self, summary, additional_documents, query_context, target_tokens=8000)
Purpose: Use the large LLM (gpt-4o) to consolidate the current summary with a batch of additional documents; more efficient than incremental enrichment for larger batches, with incremental enrichment used as the error fallback.
Parameters:
summary: Current summary to consolidate
additional_documents: List of document texts to integrate
query_context: User's query for context
target_tokens: Target length for the consolidated summary (default 8000)
Returns: str: Consolidated comprehensive summary
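A batch consolidation sketch with the large LLM, assuming manager, foundation and user_query as above; the document list is a placeholder:
batch = ["...", "...", "..."]  # placeholder texts of additional documents
consolidated = manager.consolidate_with_large_llm(foundation, batch, user_query, target_tokens=8000)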
consolidate_with_large_llm_and_citations(self, summary, additional_documents, document_metadata, query_context, target_tokens=8000, start_block_num=1, use_inline_citations=True)
Purpose: Use the large LLM (gpt-4o) to consolidate a summary with a batch of documents, either keeping inline [block N] citations or, in extensive search mode, generating them only for source tracking and stripping them from the final text.
Parameters:
summary: Current summary to consolidate (may be an empty string)
additional_documents: List of document texts to integrate
document_metadata: List of metadata dicts, one per document, used for titles and block numbering
query_context: User's query for context
target_tokens: Target length for the consolidated summary (default 8000)
start_block_num: Starting block number for citations (default 1)
use_inline_citations: Whether inline citations are kept in the output; when False they are extracted for tracking and removed (default True)
Returns: str: Consolidated comprehensive summary, with or without inline citations
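A citation-aware consolidation sketch, assuming manager and user_query as above; titles and block numbering are illustrative:
batch = ["...", "..."]  # placeholder document texts
batch_meta = [{"title": "Report A"}, {"title": "Report B"}]
cited = manager.consolidate_with_large_llm_and_citations(
    summary="",                 # empty string starts a fresh summary
    additional_documents=batch,
    document_metadata=batch_meta,
    query_context=user_query,
    target_tokens=8000,
    start_block_num=1,          # block 1 is reserved for the comprehensive summary itself
    use_inline_citations=True,
)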
process_documents_with_source_tracking(self, documents, query_context, target_summary_tokens=8000, batch_size=10, use_inline_citations=False, disable_citations=False)
Purpose: Process a list of documents in batches with the large LLM, building a comprehensive summary while tracking which documents contributed to each batch; stops early once the summary reaches about 90% of the target size.
Parameters:
documents: List of (metadata, content) tuples
query_context: User's query for context
target_summary_tokens: Target length for the final summary (default 8000)
batch_size: Number of documents per large-LLM call (default 10)
use_inline_citations: Whether to keep inline citations (False for extensive search mode)
disable_citations: Whether to bypass citation logic entirely (True for extensive search mode)
Returns: tuple: (comprehensive_summary, source_mapping, individual_summaries)
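A top-level processing sketch in extensive search mode, assuming manager and user_query as above; the document list is a placeholder:
documents = [({"title": "Report A"}, "..."), ({"title": "Report B"}, "...")]  # (metadata, content) tuples
summary, source_mapping, previews = manager.process_documents_with_source_tracking(
    documents,
    user_query,
    target_summary_tokens=8000,
    batch_size=10,
    use_inline_citations=False,
    disable_citations=True,   # extensive search mode: skip citation handling entirely
)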
Required Imports
import re
import numpy as np
import tiktoken
from langchain_openai import ChatOpenAI
from openai import OpenAI
Usage Example
# Example usage:
# manager = ExtensiveSearchManager(session, chroma_client, api_key=api_key, rag_instance=rag)
# full_document = manager.get_full_document(chunk_metadata, collection_name)
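A fuller end-to-end sketch under stated assumptions: the Neo4j session, ChromaDB client and parent RAG instance come from the surrounding application, and the query, paths and collection name are illustrative placeholders.
# Hypothetical wiring; session, chroma_client and rag are created elsewhere in the application
manager = ExtensiveSearchManager(session, chroma_client, api_key="sk-...", rag_instance=rag)

query = "Summarize the reported findings"                                    # illustrative query
chunk_hits = [{"bibtex": "papers/doc1.pdf"}, {"bibtex": "papers/doc2.pdf"}]  # from a vector search

docs = []
for meta in chunk_hits:
    full_text = manager.get_full_document(meta, collection_name="literature")
    if full_text:
        docs.append((meta, full_text))

docs = manager.remove_duplicate_documents(docs)
summary, source_mapping, previews = manager.process_documents_with_source_tracking(
    docs, query, target_summary_tokens=8000, batch_size=10, disable_citations=True
)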
Similar Components
Components with related functionality, ranked by AI-powered semantic similarity:
- class ExtensiveSearchManager_v1 (99.5% similar)
- class ExtensiveSearchManager (98.8% similar)
- function extensive_mode_example (53.0% similar)
- class ReferenceManager_v3 (52.2% similar)
- class ReferenceManager_v4 (51.8% similar)