class QueryBasedExtractor_v1
A class that performs targeted information extraction from text using LLM-based query-guided extraction, with support for handling long documents through chunking and token management.
File: /tf/active/vicechatdev/vice_ai/hybrid_rag_engine.py
Lines: 86 - 322
Complexity: complex
Purpose
QueryBasedExtractor is designed to extract relevant information from text documents based on user-provided queries. It uses a small LLM (default: gpt-4o-mini) to intelligently extract passages that directly address specific queries without summarization. The class handles token counting, manages API calls with retry logic, and automatically chunks very long texts that exceed model context limits. It's particularly useful for reducing large documents to their most relevant portions while maintaining original wording and context.
Source Code
class QueryBasedExtractor:
    def __init__(self, max_output_tokens=1024, api_key=None, model_name="gpt-4o-mini"):
        """
        Initialize the extractor with configuration for a small LLM.

        Args:
            max_output_tokens: Maximum tokens for the extracted output
            api_key: API key for the LLM service
            model_name: Small LLM model to use
        """
        self.max_output_tokens = max_output_tokens
        self.api_key = api_key
        self.model_name = model_name

        # Set up tiktoken encoder for token counting
        import tiktoken
        self.tokenizer = tiktoken.get_encoding("cl100k_base")

        # Set up OpenAI client if API key is provided
        if api_key:
            import openai
            import os
            os.environ["OPENAI_API_KEY"] = api_key
            self.client = openai.OpenAI(api_key=api_key)
    def count_tokens(self, text):
        """Count tokens in a string."""
        return len(self.tokenizer.encode(text))
    def call_llm(self, prompt):
        """
        Call the LLM with the prompt.

        Args:
            prompt: The formatted prompt for extraction

        Returns:
            Extracted text from the LLM
        """
        from langchain_openai import ChatOpenAI
        import time

        max_retries = 3
        retry_delay = 2

        for attempt in range(max_retries):
            try:
                # Use LangChain's ChatOpenAI for consistency with OneCo_hybrid_RAG
                llm = ChatOpenAI(
                    model=self.model_name,
                    temperature=0,
                    max_tokens=self.max_output_tokens,
                    request_timeout=60  # 60 second timeout
                )
                print(f"   🤖 Calling {self.model_name} (attempt {attempt + 1}/{max_retries})...")
                response = llm.invoke(prompt)
                print(f"   ✅ LLM response received ({len(response.content)} chars)")
                return response.content
            except Exception as e:
                error_msg = str(e)
                print(f"   ⚠️ LLM call failed (attempt {attempt + 1}/{max_retries}): {error_msg[:200]}")
                if attempt < max_retries - 1:
                    print(f"   ⏳ Retrying in {retry_delay} seconds...")
                    time.sleep(retry_delay)
                    retry_delay *= 2  # Exponential backoff
                else:
                    print(f"   ❌ All retry attempts failed")
                    # Return empty string instead of crashing
                    return ""
    def create_extraction_prompt(self, queries, text):
        """
        Create a prompt for targeted information extraction based on queries.

        Args:
            queries: List of queries to guide the extraction
            text: Text to extract from

        Returns:
            Formatted prompt string
        """
        formatted_queries = "\n".join([f"- {q}" for q in queries])

        # Design an extraction-focused prompt based on OneCo_hybrid_RAG style
        prompt = f"""
You are performing targeted information extraction. Given the queries below, extract ONLY the most relevant
passages from the provided text that directly address these queries.

IMPORTANT INSTRUCTIONS:
- DO NOT summarize or paraphrase - extract the exact relevant passages
- Maintain original wording and details crucial for answering the queries
- Include complete sentences and necessary context around key points
- Extract passages in order of relevance to the queries
- If important details are in different parts of the text, include all relevant sections
- Extract ONLY information relevant to the queries
- The extraction MUST be self-contained and make sense on its own
- Maximum output length: {self.max_output_tokens} tokens

QUERIES:
{formatted_queries}

TEXT TO EXTRACT FROM:
{text}

RELEVANT EXTRACTED INFORMATION:
"""
        return prompt
    def extract(self, text, queries):
        """
        Extract relevant information from text based on queries.

        Args:
            text: Text to extract from
            queries: List of queries to guide extraction

        Returns:
            Extracted relevant information
        """
        # Check text length to determine if extraction is needed
        text_tokens = self.count_tokens(text)

        # If text is already under token limit, just return it
        if text_tokens <= self.max_output_tokens:
            print("Text is within token limit, no extraction needed.")
            return text

        # Create extraction prompt
        prompt = self.create_extraction_prompt(queries, text)

        # Check prompt size to ensure it fits in model context
        prompt_tokens = self.count_tokens(prompt)

        # For very large texts that won't fit in model context, we need chunking
        if prompt_tokens > 100000:  # Assuming context limit of a small model
            return self.process_long_text(text, queries)

        # Otherwise, do direct extraction
        print(f"   📝 Extracting information from text ({len(text)} chars, {prompt_tokens} tokens)...")
        print(f"   🎯 Using model: {self.model_name} with max_tokens={self.max_output_tokens}")
        result = self.call_llm(prompt)
        print(f"   ✅ Extraction complete ({len(result)} chars)")
        return result
    def process_long_text(self, text, queries):
        """
        Process very long text by splitting into chunks and extracting from each.

        Args:
            text: Long text to process
            queries: List of queries for extraction

        Returns:
            Combined extraction from all chunks
        """
        # Calculate how much space we need for queries and prompt template
        query_text = "\n".join([f"- {q}" for q in queries])
        prompt_template = self.create_extraction_prompt([], "")
        fixed_tokens = self.count_tokens(prompt_template) + self.count_tokens(query_text)

        # Calculate available space for text in each chunk
        # 100000 is a conservative estimate of the context window for a small
        # model like gpt-4o-mini; adjust based on the actual model being used
        available_tokens = 100000 - fixed_tokens - 100  # 100 token buffer

        # Split text into chunks that fit in context window
        text_tokens = self.tokenizer.encode(text)
        chunks = []
        for i in range(0, len(text_tokens), available_tokens):
            chunk_tokens = text_tokens[i:i + available_tokens]
            chunk_text = self.tokenizer.decode(chunk_tokens)
            chunks.append(chunk_text)

        # Process each chunk and collect extractions
        all_extractions = []
        for i, chunk in enumerate(chunks):
            print(f"Processing chunk {i+1}/{len(chunks)}")

            # Create an extraction prompt for this chunk
            chunk_prompt = f"""
You are performing targeted information extraction. Extract ONLY the most relevant
passages from this text chunk that directly address the queries below.

IMPORTANT CONTEXT:
- This is chunk {i+1} of {len(chunks)} from a larger document
- Extract only information relevant to the queries
- DO NOT summarize - extract exact relevant passages
- Maintain original wording and crucial details
- Maximum extraction length: {self.max_output_tokens // len(chunks)} tokens

QUERIES:
{query_text}

TEXT CHUNK {i+1}/{len(chunks)}:
{chunk}

RELEVANT EXTRACTED INFORMATION:
"""
            extracted = self.call_llm(chunk_prompt)
            if extracted.strip():
                all_extractions.append(extracted.strip())

        # Combine all extractions
        combined = "\n\n".join(all_extractions)

        # If combined extractions are still too long, do a second pass
        if self.count_tokens(combined) > self.max_output_tokens:
            consolidation_prompt = f"""
You are performing final extraction consolidation. You have extracts from different parts
of a document that address the queries below.

Your task is to create a single coherent extract that includes ONLY the most important and
relevant passages to answer the queries, while avoiding redundancy.

IMPORTANT INSTRUCTIONS:
- Focus only on the most relevant information for the queries
- Maintain original wording from the extracts
- Remove redundant information that appears in multiple extracts
- Create a coherent, self-contained extract
- Maximum output length: {self.max_output_tokens} tokens

QUERIES:
{query_text}

EXTRACTS TO CONSOLIDATE:
{combined}

FINAL CONSOLIDATED EXTRACT:
"""
            return self.call_llm(consolidation_prompt)

        return combined
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| bases | - | - | - |
Parameter Details
max_output_tokens: Maximum number of tokens allowed in the extracted output. Default is 1024. This controls the length of the extraction and is used to determine if extraction is needed at all (texts shorter than this are returned as-is).
api_key: OpenAI API key for authentication. If provided, sets up the OpenAI client and environment variable. Can be None if the API key is already set in the environment.
model_name: Name of the OpenAI model to use for extraction. Default is 'gpt-4o-mini'. Should be a model that supports chat completions via LangChain's ChatOpenAI interface.
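As a brief illustration of the api_key behavior described above, the sketch below shows two initialization styles. The key values are placeholders; the second style assumes OPENAI_API_KEY is already exported in the environment, which LangChain's ChatOpenAI reads when no explicit key is configured.
import os

# Option 1: pass the key explicitly; __init__ also exports OPENAI_API_KEY
# and creates the client attribute
extractor = QueryBasedExtractor(api_key="sk-your-key", model_name="gpt-4o-mini")

# Option 2: rely on an already-exported OPENAI_API_KEY; no client attribute is
# created, but call_llm() still works because ChatOpenAI reads the environment variable
os.environ.setdefault("OPENAI_API_KEY", "sk-your-key")
extractor = QueryBasedExtractor(max_output_tokens=2048)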
Return Value
The __init__ method returns a QueryBasedExtractor instance. The extract() method returns a string containing the extracted relevant information from the input text. If the text is already within token limits, it returns the original text. For long texts, it returns consolidated extractions from multiple chunks. The call_llm() method returns the LLM's response as a string, or an empty string if all retries fail.
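A minimal sketch of defensive handling around these return values follows; the document text, queries, and fallback choice are illustrative rather than part of the class.
extractor = QueryBasedExtractor(max_output_tokens=512, api_key="sk-your-key")
document = "..."  # retrieved document text
queries = ["What were the reported side effects?"]

result = extractor.extract(document, queries)

if result == document:
    print("Text was within the token limit and was returned unchanged.")
elif not result.strip():
    # call_llm() returns "" after all retries fail, so an empty result signals
    # an API problem rather than "nothing relevant found"
    result = document
else:
    print(f"Extracted {extractor.count_tokens(result)} tokens of relevant text.")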
Class Interface
Methods
__init__(self, max_output_tokens=1024, api_key=None, model_name='gpt-4o-mini')
Purpose: Initialize the QueryBasedExtractor with LLM configuration and set up tokenizer and OpenAI client
Parameters:
max_output_tokens: Maximum tokens for extracted output (default: 1024)
api_key: OpenAI API key (default: None)
model_name: LLM model name (default: 'gpt-4o-mini')
Returns: None - initializes instance attributes
count_tokens(self, text: str) -> int
Purpose: Count the number of tokens in a given text string using tiktoken encoder
Parameters:
text: String to count tokens in
Returns: Integer count of tokens in the text
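A small sketch using count_tokens() to predict, per passage, whether extract() would be a pass-through or trigger an LLM call; the passages list is illustrative.
extractor = QueryBasedExtractor(max_output_tokens=1024)
passages = ["a short passage...", "a much longer passage..."]

for p in passages:
    n = extractor.count_tokens(p)
    action = "LLM extraction" if n > extractor.max_output_tokens else "returned as-is"
    print(f"{n} tokens -> {action}")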
call_llm(self, prompt: str) -> str
Purpose: Call the LLM with a prompt, implementing retry logic with exponential backoff for robustness
Parameters:
prompt: The formatted prompt string to send to the LLM
Returns: String containing the LLM's response content, or empty string if all retries fail
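The retry pattern used by call_llm() can be shown standalone as a reusable helper; this is a sketch of the same idea (3 attempts, 2s/4s/8s backoff, empty-string fallback), not code taken from the class.
import time

def with_retries(fn, max_retries=3, initial_delay=2):
    delay = initial_delay
    for attempt in range(max_retries):
        try:
            return fn()
        except Exception as exc:
            if attempt == max_retries - 1:
                print(f"All {max_retries} attempts failed: {exc}")
                return ""  # mirror call_llm's empty-string fallback
            time.sleep(delay)
            delay *= 2  # exponential backoff: 2s, 4s, 8s, ...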
create_extraction_prompt(self, queries: list, text: str) -> str
Purpose: Create a formatted prompt for targeted information extraction based on queries and source text
Parameters:
queries: List of query strings to guide the extraction
text: Source text to extract information from
Returns: Formatted prompt string ready to send to the LLM
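A sketch of inspecting the prompt before sending it, mirroring the size check extract() performs internally; the queries and text are illustrative, and the 100,000-token budget is the constant used in the source.
extractor = QueryBasedExtractor(max_output_tokens=1024)
queries = ["Which regulatory bodies are mentioned?"]
text = "..."  # document text

prompt = extractor.create_extraction_prompt(queries, text)
prompt_tokens = extractor.count_tokens(prompt)

if prompt_tokens > 100000:
    print("Prompt exceeds the assumed context budget; extract() would fall back to chunking.")
else:
    print(f"Prompt fits in a single call ({prompt_tokens} tokens).")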
extract(self, text: str, queries: list) -> str
Purpose: Main extraction method that extracts relevant information from text based on queries, automatically handling short texts, normal texts, and very long texts requiring chunking
Parameters:
text: Source text to extract information from
queries: List of query strings to guide extraction
Returns: String containing extracted relevant information that addresses the queries
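A short sketch of using extract() to compress several retrieved chunks into one query-focused passage before answer generation; the chunk contents and queries are illustrative.
retrieved_chunks = ["...chunk 1...", "...chunk 2...", "...chunk 3..."]
queries = ["What dosage was tested?", "What adverse events were observed?"]

extractor = QueryBasedExtractor(max_output_tokens=1024, model_name="gpt-4o-mini")
focused_context = extractor.extract("\n\n".join(retrieved_chunks), queries)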
process_long_text(self, text: str, queries: list) -> str
Purpose: Process very long texts that exceed model context limits by splitting into chunks, extracting from each chunk, and consolidating results
Parameters:
text: Long source text that exceeds context window
queries: List of query strings for extraction guidance
Returns: String containing consolidated extraction from all chunks, potentially with a second consolidation pass if needed
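The sketch below reproduces the chunking arithmetic that process_long_text() applies, which is useful for estimating how many LLM calls a large document will trigger; the document and queries are illustrative, and the 100,000-token budget and 100-token buffer mirror the constants in the source.
extractor = QueryBasedExtractor(max_output_tokens=1024)
queries = ["What are the key findings?"]
document = "..."  # very large document text

query_text = "\n".join(f"- {q}" for q in queries)
fixed = extractor.count_tokens(extractor.create_extraction_prompt([], "")) \
        + extractor.count_tokens(query_text)
available = 100000 - fixed - 100  # same budget and buffer as the source

doc_tokens = extractor.count_tokens(document)
n_chunks = max(1, -(-doc_tokens // available))  # ceiling division
print(f"{n_chunks} chunk(s), ~{extractor.max_output_tokens // n_chunks} output tokens requested per chunk")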
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| max_output_tokens | int | Maximum number of tokens allowed in the extracted output | instance |
| api_key | str or None | OpenAI API key for authentication | instance |
| model_name | str | Name of the OpenAI model to use for extraction | instance |
| tokenizer | tiktoken.Encoding | Tiktoken encoder instance (cl100k_base) for counting tokens in text | instance |
| client | openai.OpenAI | OpenAI client instance, only created if api_key is provided | instance |
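A small sketch illustrating that the client attribute exists only when api_key was passed to the constructor; the key value is a placeholder.
extractor = QueryBasedExtractor()  # no api_key
print(hasattr(extractor, "client"))  # False

extractor = QueryBasedExtractor(api_key="sk-your-key")
print(hasattr(extractor, "client"))  # True: an openai.OpenAI instance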
Dependencies
tiktoken, openai, langchain_openai, time
Required Imports
import tiktoken
import openai
import os
from langchain_openai import ChatOpenAI
import time
Conditional/Optional Imports
These imports are only needed under specific conditions:
import tiktoken
Condition: imported in __init__ when instance is created
Required (conditional)

import openai
Condition: imported in __init__ only if api_key is provided
Optional

import os
Condition: imported in __init__ only if api_key is provided
Optional

from langchain_openai import ChatOpenAI
Condition: imported in call_llm method when LLM is invoked
Required (conditional)

import time
Condition: imported in call_llm method for retry delays
Required (conditional)

Usage Example
# Basic usage
extractor = QueryBasedExtractor(
max_output_tokens=1024,
api_key='your-openai-api-key',
model_name='gpt-4o-mini'
)
# Define queries to guide extraction
queries = [
'What are the main findings of the study?',
'What methodology was used?',
'What are the conclusions?'
]
# Extract relevant information from a long document
long_text = '''Your long document text here...'''
extracted_info = extractor.extract(long_text, queries)
print(extracted_info)
# Count tokens in text
token_count = extractor.count_tokens(long_text)
print(f'Token count: {token_count}')
# For very long documents, the class automatically handles chunking
very_long_text = '''Extremely long document...'''
extracted = extractor.extract(very_long_text, queries)
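# Follow-up sketch: compare token counts before and after extraction to see how
# much the document was reduced; this continues the variables from the example above.
before = extractor.count_tokens(very_long_text)
after = extractor.count_tokens(extracted)
print(f"Reduced from {before} to {after} tokens ({after / before:.1%} of the original)")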
Best Practices
- Always provide an API key either through the constructor or as an environment variable before calling extract()
- The class automatically determines if extraction is needed based on token count - texts under max_output_tokens are returned unchanged
- For very long documents (>100k tokens), the class automatically chunks the text and processes each chunk separately
- The call_llm method implements exponential backoff retry logic (3 attempts) to handle transient API failures
- Use specific, focused queries for better extraction results - vague queries may result in less relevant extractions
- The extractor preserves original wording rather than summarizing, making it suitable for maintaining factual accuracy
- Token counting uses tiktoken's cl100k_base encoding, which is compatible with GPT-4 and GPT-3.5 models
- The class sets temperature=0 for deterministic extraction results
- For production use, consider monitoring token usage, as extraction can consume significant tokens for large documents (see the sketch after this list)
- The 60-second timeout per LLM call may need adjustment for very large extractions
- When processing long texts, the class divides max_output_tokens among chunks, then consolidates if needed
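A minimal sketch of the token-usage monitoring suggested above: logging input and extracted token counts across a batch of documents. The documents and queries are illustrative placeholders.
documents = ["...document 1...", "...document 2..."]
queries = ["What are the contractual obligations?"]
extractor = QueryBasedExtractor(max_output_tokens=1024)

total_in = total_out = 0
for doc in documents:
    result = extractor.extract(doc, queries)
    total_in += extractor.count_tokens(doc)
    total_out += extractor.count_tokens(result)

print(f"Input tokens: {total_in}, extracted tokens: {total_out}")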
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- class QueryBasedExtractor_v2 (97.9% similar)
- class QueryBasedExtractor (90.2% similar)
- class QueryParser (63.7% similar)
- class DocumentExtractor (60.4% similar)
- class RegulatoryExtractor (59.3% similar)