class QueryBasedExtractor
A class that extracts query-relevant information from documents using a small LLM, designed for the Extensive and Full Reading modes in RAG systems.
File: /tf/active/vicechatdev/docchat/rag_engine.py
Lines: 92-327
Complexity: moderate
Purpose
QueryBasedExtractor performs targeted information extraction from single or multiple documents based on user queries. It uses a small LLM (default: gpt-4o-mini) to extract only the most relevant passages without summarization, maintaining original wording. The class handles token counting, text chunking, document truncation, and consolidation of extractions from multiple sources. It integrates with a parent RAG engine for usage tracking and ensures extracted content fits within token limits.
Source Code
class QueryBasedExtractor:
    """
    Extract relevant information from documents using a small LLM.
    Used in Extensive and Full Reading modes.
    """

    def __init__(self, api_key: str, model_name: str = "gpt-4o-mini",
                 max_output_tokens: int = 2048, parent_rag=None):
        """
        Initialize extractor.

        Args:
            api_key: OpenAI API key
            model_name: Small LLM model for extraction
            max_output_tokens: Maximum tokens in extracted output
            parent_rag: Reference to parent RAG engine for usage tracking
        """
        self.api_key = api_key
        self.model_name = model_name
        self.max_output_tokens = max_output_tokens
        self.tokenizer = tiktoken.get_encoding("cl100k_base")
        self.parent_rag = parent_rag  # For usage tracking

        # Use the shared factory to ensure GPT-5 compatibility for the small LLM as well
        self.llm = get_llm_instance(
            model_name=model_name,
            temperature=0,
            max_tokens=max_output_tokens
        )

    def count_tokens(self, text: str) -> int:
        """Count tokens in text"""
        return len(self.tokenizer.encode(text))

    def chunk_text(self, text: str, max_chunk_tokens: int) -> List[str]:
        """
        Split text into chunks that fit within a token limit.

        Args:
            text: Text to chunk
            max_chunk_tokens: Maximum tokens per chunk

        Returns:
            List of text chunks
        """
        # Try to split on paragraphs first
        paragraphs = text.split('\n\n')
        chunks = []
        current_chunk = []
        current_tokens = 0

        for para in paragraphs:
            para_tokens = self.count_tokens(para)

            # If a single paragraph is too large, split it further
            if para_tokens > max_chunk_tokens:
                # If we have accumulated content, save it
                if current_chunk:
                    chunks.append('\n\n'.join(current_chunk))
                    current_chunk = []
                    current_tokens = 0

                # Split the large paragraph by sentences
                sentences = para.split('. ')
                for sentence in sentences:
                    sent_tokens = self.count_tokens(sentence)
                    if current_tokens + sent_tokens > max_chunk_tokens:
                        if current_chunk:
                            chunks.append('\n\n'.join(current_chunk))
                        current_chunk = [sentence]
                        current_tokens = sent_tokens
                    else:
                        current_chunk.append(sentence)
                        current_tokens += sent_tokens
            else:
                # Normal paragraph
                if current_tokens + para_tokens > max_chunk_tokens:
                    # Save the current chunk and start a new one
                    if current_chunk:
                        chunks.append('\n\n'.join(current_chunk))
                    current_chunk = [para]
                    current_tokens = para_tokens
                else:
                    current_chunk.append(para)
                    current_tokens += para_tokens

        # Add remaining content
        if current_chunk:
            chunks.append('\n\n'.join(current_chunk))

        return chunks

    def extract_from_document(self, document_text: str, query: str) -> str:
        """
        Extract relevant information from a document based on a query.

        Args:
            document_text: Full document text
            query: User query to guide extraction

        Returns:
            Extracted relevant information
        """
        # Track usage if a parent RAG is available
        if self.parent_rag:
            self.parent_rag.small_llm_usage["document_extraction"] += 1
            self.parent_rag.small_llm_usage["total_calls"] += 1

        # If the document is already small enough, return it as-is
        if self.count_tokens(document_text) <= self.max_output_tokens:
            return document_text

        # Calculate the token budget for document text in the extraction prompt
        model_limit = 128000  # context window of the small extraction LLM (gpt-4o-mini)
        reserved_for_output = self.max_output_tokens  # room reserved for the extracted output
        safety_margin = 1000

        # Base prompt template tokens (without the document text)
        base_prompt = f"""You are performing targeted information extraction. Extract ONLY the most relevant
information from the provided document that directly addresses the user's query.

IMPORTANT INSTRUCTIONS:
- DO NOT summarize or paraphrase - extract exact relevant passages
- Maintain original wording and details crucial for answering the query
- Include complete sentences and necessary context
- Extract passages in order of relevance
- If the document doesn't contain relevant information, state that clearly
- Maximum extraction length: {self.max_output_tokens} tokens

USER QUERY:
{query}

DOCUMENT TEXT:
[DOCUMENT_PLACEHOLDER]

EXTRACTED RELEVANT INFORMATION:
"""
        base_tokens = self.count_tokens(base_prompt)

        # Tokens available for the document text
        max_doc_tokens = model_limit - reserved_for_output - safety_margin - base_tokens

        # Truncate the document if needed
        doc_tokens = self.count_tokens(document_text)
        if doc_tokens > max_doc_tokens:
            logger.warning(f"Document too large ({doc_tokens} tokens), truncating to {max_doc_tokens} tokens for extraction")
            # Truncate using the tokenizer
            tokens = self.tokenizer.encode(document_text)
            truncated_tokens = tokens[:max_doc_tokens]
            document_text = self.tokenizer.decode(truncated_tokens)
            document_text += "\n\n[... document truncated due to length ...]"

        # Create the extraction prompt with the size-controlled document
        prompt = f"""You are performing targeted information extraction. Extract ONLY the most relevant
information from the provided document that directly addresses the user's query.

IMPORTANT INSTRUCTIONS:
- DO NOT summarize or paraphrase - extract exact relevant passages
- Maintain original wording and details crucial for answering the query
- Include complete sentences and necessary context
- Extract passages in order of relevance
- If the document doesn't contain relevant information, state that clearly
- Maximum extraction length: {self.max_output_tokens} tokens

USER QUERY:
{query}

DOCUMENT TEXT:
{document_text}

EXTRACTED RELEVANT INFORMATION:
"""
        response = self.llm.invoke(prompt)
        return response.content.strip()

    def extract_from_multiple_documents(self, documents: List[Dict[str, str]],
                                        query: str) -> str:
        """
        Extract and consolidate information from multiple documents.

        Args:
            documents: List of documents with 'text' and 'metadata'
            query: User query

        Returns:
            Consolidated extracted information
        """
        extractions = []

        for i, doc in enumerate(documents):
            logger.info(f"Extracting from document {i+1}/{len(documents)}: {doc.get('file_name', 'Unknown')}")
            extracted = self.extract_from_document(doc['text'], query)
            if extracted and extracted.strip():
                # Add document attribution
                file_name = doc.get('file_name', f'Document {i+1}')
                extractions.append(f"### From: {file_name}\n\n{extracted}")

        # Combine all extractions
        combined = "\n\n---\n\n".join(extractions)

        # If still too long, do a consolidation pass
        if self.count_tokens(combined) > self.max_output_tokens * 2:
            return self._consolidate_extractions(combined, query)

        return combined

    def _consolidate_extractions(self, combined_text: str, query: str) -> str:
        """Consolidate multiple extractions into a coherent summary"""
        prompt = f"""You are consolidating information extracted from multiple documents.
Create a coherent synthesis that includes the most important and relevant information
to answer the user's query, avoiding redundancy.

INSTRUCTIONS:
- Focus on the most relevant information
- Maintain original wording for key facts and details
- Remove redundant information
- Organize information logically
- Keep document attributions when significant
- Maximum length: {self.max_output_tokens * 2} tokens

USER QUERY:
{query}

EXTRACTED INFORMATION FROM MULTIPLE DOCUMENTS:
{combined_text}

CONSOLIDATED INFORMATION:
"""
        response = self.llm.invoke(prompt)
        return response.content.strip()
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| api_key | str | required | positional-or-keyword |
| model_name | str | 'gpt-4o-mini' | positional-or-keyword |
| max_output_tokens | int | 2048 | positional-or-keyword |
| parent_rag | Optional[object] | None | positional-or-keyword |
Parameter Details
api_key: OpenAI API key required for authenticating with the LLM service. Must be a valid API key string.
model_name: Name of the small LLM model to use for extraction. Defaults to 'gpt-4o-mini'. Should be a model identifier compatible with the llm_factory.get_llm_instance function.
max_output_tokens: Maximum number of tokens allowed in the extracted output. Defaults to 2048. Controls the size of extracted information to prevent exceeding model limits.
parent_rag: Optional reference to the parent RAG engine object. Used for tracking small LLM usage statistics. Can be None if usage tracking is not needed.
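The only interface the extractor touches on parent_rag is a small_llm_usage counter dictionary, as the source of extract_from_document shows. A minimal sketch of a compatible stand-in (the SimpleNamespace wrapper is illustrative; in practice the parent is the RAG engine itself):
from types import SimpleNamespace

# Stand-in exposing the one attribute the extractor increments
parent = SimpleNamespace(small_llm_usage={
    "document_extraction": 0,
    "total_calls": 0,
})
extractor = QueryBasedExtractor(api_key='your-openai-api-key', parent_rag=parent)
extractor.extract_from_document('Some document text.', 'any query')
print(parent.small_llm_usage)  # {'document_extraction': 1, 'total_calls': 1}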
Return Value
Instantiation returns a QueryBasedExtractor object configured with the specified LLM and token limits. Key method returns: extract_from_document() returns a string of extracted relevant information; extract_from_multiple_documents() returns a consolidated string combining extractions from all documents; count_tokens() returns an integer token count; chunk_text() returns a list of text chunk strings.
Class Interface
Methods
__init__(self, api_key: str, model_name: str = 'gpt-4o-mini', max_output_tokens: int = 2048, parent_rag=None)
Purpose: Initialize the QueryBasedExtractor with LLM configuration and token limits
Parameters:
- api_key: OpenAI API key for authentication
- model_name: LLM model identifier (default: 'gpt-4o-mini')
- max_output_tokens: Maximum tokens in extracted output (default: 2048)
- parent_rag: Optional parent RAG engine reference for usage tracking
Returns: None (constructor)
count_tokens(self, text: str) -> int
Purpose: Count the number of tokens in a given text string using tiktoken encoding
Parameters:
text: Text string to count tokens for
Returns: Integer count of tokens in the text
chunk_text(self, text: str, max_chunk_tokens: int) -> List[str]
Purpose: Split text into chunks that fit within a specified token limit, preserving paragraph and sentence boundaries
Parameters:
- text: Text to be chunked
- max_chunk_tokens: Maximum number of tokens allowed per chunk
Returns: List of text chunk strings, each within the token limit
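A brief sketch of the chunking behavior, assuming extractor is an initialized QueryBasedExtractor (the sample text is illustrative):
long_para = "This sentence repeats to exceed the chunk budget. " * 300
text = "Short opening paragraph.\n\n" + long_para
for i, chunk in enumerate(extractor.chunk_text(text, max_chunk_tokens=500)):
    # Paragraphs are packed first; the oversized paragraph is re-split on
    # '. ' sentence boundaries. Chunk sizes track the 500-token budget
    # (the '\n\n' join adds a small, uncounted overhead).
    print(i, extractor.count_tokens(chunk))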
extract_from_document(self, document_text: str, query: str) -> str
Purpose: Extract relevant information from a single document based on a user query, maintaining original wording
Parameters:
- document_text: Full text content of the document to extract from
- query: User query that guides what information to extract
Returns: String containing extracted relevant passages from the document
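Two behaviors worth noting, both visible in the source above: documents at or under max_output_tokens are returned verbatim without an LLM call, and the usage counters increment before that short-circuit. A minimal sketch, assuming an initialized extractor:
short_doc = 'A short note that easily fits within 2048 tokens.'
result = extractor.extract_from_document(short_doc, 'any query')
assert result == short_doc  # returned as-is; no extraction prompt was sent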
extract_from_multiple_documents(self, documents: List[Dict[str, str]], query: str) -> str
Purpose: Extract and consolidate information from multiple documents, adding document attribution
Parameters:
- documents: List of document dictionaries, each containing 'text' and optionally 'file_name' keys
- query: User query to guide extraction across all documents
Returns: Consolidated string combining extractions from all documents with source attribution
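The consolidated string joins per-document extractions with '---' separators under '### From:' attribution headers, as the source shows. An illustrative call and the shape of its result:
docs = [
    {'text': 'First source text...', 'file_name': 'report1.pdf'},
    {'text': 'Second source text...', 'file_name': 'report2.pdf'},
]
combined = extractor.extract_from_multiple_documents(docs, 'What are the findings?')
# combined looks like:
#   ### From: report1.pdf
#   <extracted passages>
#   ---
#   ### From: report2.pdf
#   <extracted passages>
# If the result exceeds 2 * max_output_tokens, a consolidation pass runs first.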
_consolidate_extractions(self, combined_text: str, query: str) -> str
Purpose: Private method to consolidate multiple extractions into a coherent synthesis when combined text is too long
Parameters:
- combined_text: Combined text from multiple document extractions
- query: Original user query for context
Returns: Consolidated and deduplicated string of extracted information
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| api_key | str | OpenAI API key used for LLM authentication | instance |
| model_name | str | Name of the LLM model used for extraction (e.g., 'gpt-4o-mini') | instance |
| max_output_tokens | int | Maximum number of tokens allowed in extracted output | instance |
| tokenizer | tiktoken.Encoding | Tiktoken tokenizer using the cl100k_base encoding for token counting | instance |
| parent_rag | Optional[object] | Reference to parent RAG engine for tracking small LLM usage statistics | instance |
| llm | LLM instance | LLM created via the get_llm_instance factory, configured with temperature=0 and max_tokens=max_output_tokens | instance |
Dependencies
tiktoken, langchain_community, logging
Required Imports
import tiktoken
from typing import List, Dict
import logging
from llm_factory import get_llm_instance
Usage Example
# Initialize the extractor
api_key = 'your-openai-api-key'
extractor = QueryBasedExtractor(api_key=api_key, model_name='gpt-4o-mini', max_output_tokens=2048)
# Extract from a single document
document_text = 'Long document content here...'
query = 'What are the main findings about climate change?'
extracted_info = extractor.extract_from_document(document_text, query)
print(extracted_info)
# Extract from multiple documents
documents = [
{'text': 'Document 1 content...', 'file_name': 'report1.pdf'},
{'text': 'Document 2 content...', 'file_name': 'report2.pdf'}
]
consolidated = extractor.extract_from_multiple_documents(documents, query)
print(consolidated)
# Count tokens in text
token_count = extractor.count_tokens('Some text to count')
print(f'Token count: {token_count}')
# Chunk large text
chunks = extractor.chunk_text('Very long text...', max_chunk_tokens=1000)
for i, chunk in enumerate(chunks):
print(f'Chunk {i+1}: {chunk[:100]}...')
Best Practices
- Always provide a valid OpenAI API key during instantiation
- Set max_output_tokens based on your downstream processing needs and model limits
- Use parent_rag parameter if you need to track LLM usage statistics across your application
- The class automatically handles document truncation when content exceeds token limits (the budget arithmetic is sketched after this list)
- For multiple documents, the class adds document attribution (file names) to help track information sources
- The extractor prioritizes exact passage extraction over summarization to maintain accuracy
- Token counting uses tiktoken's cl100k_base encoding, which is compatible with GPT-4 and GPT-3.5 models
- When processing multiple documents, if combined extractions exceed 2x max_output_tokens, automatic consolidation occurs
- The class uses temperature=0 for deterministic extraction results
- Document chunking preserves paragraph boundaries when possible, falling back to sentence-level splitting for large paragraphs
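The truncation budget mentioned above can be reproduced with the default settings; base_tokens below is an illustrative figure, since the real value is measured from the rendered prompt at runtime:
model_limit = 128_000        # assumed context window for gpt-4o-mini
reserved_for_output = 2048   # default max_output_tokens
safety_margin = 1000
base_tokens = 150            # example size of the prompt scaffolding plus query
max_doc_tokens = model_limit - reserved_for_output - safety_margin - base_tokens
print(max_doc_tokens)        # 124802 tokens available for the document text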
Similar Components
AI-powered semantic similarity - components with related functionality:
- class QueryBasedExtractor_v2 (90.7% similar)
- class QueryBasedExtractor_v1 (90.2% similar)
- class RegulatoryExtractor (64.6% similar)
- class QueryParser (62.7% similar)
- class DocumentExtractor (58.1% similar)