class VersionComparisonService
A service class that compares two versions of a document using LLM-based analysis, implementing smart segmentation and chunking to handle large documents efficiently.
/tf/active/vicechatdev/CDocs/utils/version_comparison.py
Lines: 14 - 485
Complexity: complex
Purpose
This class provides comprehensive document version comparison by segmenting large documents into manageable chunks, aligning corresponding segments between versions with similarity matching, and generating detailed analysis reports with an LLM. It handles documents of varying sizes by splitting them at natural boundaries (headings, paragraphs, sentences) while respecting token limits, then produces both segment-level and executive-level summaries of changes, covering additions, deletions, and modifications.
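To make the chunking idea concrete, here is a minimal standalone sketch of the greedy packing step (illustrative only, not the class's exact algorithm; it splits at blank lines and assumes tiktoken is installed):

import re
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")

def pack_sections(doc: str, max_tokens: int = 8000) -> list:
    # Split at paragraph boundaries, then greedily pack sections under the token budget
    sections = re.split(r'\n{2,}', doc)
    chunks, current = [], ""
    for section in sections:
        candidate = current + "\n\n" + section if current else section
        if current and len(enc.encode(candidate)) > max_tokens:
            chunks.append(current)
            current = section
        else:
            current = candidate
    if current:
        chunks.append(current)
    return chunks

The real implementation below additionally tries heading boundaries first and falls back to sentence-level splitting for oversized sections.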
Source Code
class VersionComparisonService:
    """
    Service to compare two versions of a document using LLM-based analysis.
    Implements smart segmentation and chunking for large documents.
    """

    def __init__(self,
                 model_name: str = "gpt-4o",
                 temperature: float = 0,
                 api_key: Optional[str] = None):
        """
        Initialize the comparison service.
        Args:
            model_name: Name of the LLM model to use
            temperature: Temperature setting for the LLM
            api_key: OpenAI API key (optional, defaults to env variable)
        """
        # Set up LLM
        self.model_name = model_name
        self.temperature = temperature
        # Use provided API key or get from environment
        if api_key:
            os.environ["OPENAI_API_KEY"] = api_key
        # Set up tokenizer for counting
        self.tokenizer = tiktoken.get_encoding("cl100k_base")
        # Create LLM instance
        self.llm = ChatOpenAI(
            model=model_name,
            temperature=temperature
        )
        # Define chunk sizes
        self.max_chunk_size = 8000  # Max tokens for comparison
        self.max_context_size = 15000  # Max tokens for full context
        # Define prompt templates
        self.segment_comparison_template = """
You are a document comparison expert analyzing two versions of the same document.
# Version Information
Version A: {version_a_number}
Version B: {version_b_number}
Document Title: {document_title}
# Document Segments
## Segment from Version A:
```
{segment_a}
```
## Segment from Version B:
```
{segment_b}
```
# Analysis Instructions:
1. Identify and summarize the differences between these segments
2. Note any content that was added, removed, or modified
3. Categorize changes as: style changes, content additions, content removals, or content modifications
4. Highlight important substantive changes vs. minor formatting changes
5. Be specific about exactly what changed and how it changed
# Output Format:
Provide your analysis in markdown format with these sections:
1. **Summary of Changes** - Brief overview of the key differences
2. **Detailed Changes** - Specific differences with examples
3. **Impact Assessment** - How these changes affect the document's meaning or purpose
Focus on being precise and concise.
"""
        self.overall_summary_template = """
You are a document version comparison expert. You've analyzed multiple segments of two different document versions and now need to create a comprehensive executive summary.
# Version Information
Version A: {version_a_number}
Version B: {version_b_number}
Document Title: {document_title}
# Segment Analysis Results
{segment_results}
# Analysis Instructions:
Create a comprehensive executive summary of the differences between Version A and Version B based on the segment analyses provided.
1. Synthesize the segment analyses into a coherent overview of document changes
2. Categorize and quantify the types of changes (additions, deletions, modifications)
3. Highlight the most significant substantive changes
4. Identify patterns in the changes that reveal the overall purpose of the revision
5. Note any sections that remained unchanged, if mentioned in the analyses
# Output Format:
Provide your summary in markdown format with these sections:
1. **Executive Summary** - Brief overview of key changes (2-3 paragraphs)
2. **Major Changes** - List of the most significant changes
3. **Change Statistics** - Approximate quantification of change types
4. **Change Patterns** - Overall patterns and purpose of the revision
5. **Unchanged Elements** - Any significant parts that remained consistent
Be specific, analytical, and focus on the business impact of these changes.
"""

    def count_tokens(self, text: str) -> int:
        """Count tokens in a string."""
        return len(self.tokenizer.encode(text))

    def compare_documents(self,
                          doc_a: str,
                          doc_b: str,
                          version_a_number: str,
                          version_b_number: str,
                          document_title: str = "Document") -> str:
        """
        Compare two document versions and generate a comprehensive analysis.
        Args:
            doc_a: Text content of the first document version
            doc_b: Text content of the second document version
            version_a_number: Version number/identifier for first document
            version_b_number: Version number/identifier for second document
            document_title: Title of the document
        Returns:
            Markdown-formatted comparison report
        """
        # Check if we're comparing identical documents
        if doc_a == doc_b:
            return "## No Changes Detected\n\nThe two document versions are identical."
        # Segment the documents into comparable chunks
        segments_a = self._segment_document(doc_a)
        segments_b = self._segment_document(doc_b)
        # Calculate similarity matrix for optimal segment alignment
        aligned_segments = self._align_segments(segments_a, segments_b)
        # Compare each pair of aligned segments
        segment_results = []
        logger.info(f"Comparing {len(aligned_segments)} segment pairs...")
        for i, (segment_a_idx, segment_b_idx) in enumerate(aligned_segments):
            segment_a = segments_a[segment_a_idx] if segment_a_idx is not None else ""
            segment_b = segments_b[segment_b_idx] if segment_b_idx is not None else ""
            # Skip if both segments are empty
            if not segment_a and not segment_b:
                continue
            # Log progress
            logger.info(f"Comparing segment pair {i+1}/{len(aligned_segments)}")
            # Analyze this segment pair
            segment_result = self._compare_segments(
                segment_a,
                segment_b,
                version_a_number,
                version_b_number,
                document_title
            )
            segment_results.append(segment_result)
        # Generate overall summary
        if not segment_results:
            return "## Comparison Error\n\nNo comparable segments were found between the two document versions."
        # Generate final summary from all segment results
        return self._generate_overall_summary(
            segment_results,
            version_a_number,
            version_b_number,
            document_title
        )

    def _segment_document(self, doc: str) -> List[str]:
        """
        Segment a document into manageable chunks for comparison.
        Uses natural text boundaries (paragraphs, sections) for divisions
        while ensuring chunks don't exceed token limits.
        Args:
            doc: Document text to segment
        Returns:
            List of text segments
        """
        # Handle empty document
        if not doc or doc.strip() == "":
            return [""]
        # Clean the document text (normalize line endings)
        clean_doc = doc.replace('\r\n', '\n').replace('\r', '\n')
        # First try to split by section headings (markdown headings or numbered headings)
        heading_pattern = r'(?:\n|^)(?:#{1,6} |\d+\.\d* )'
        section_splits = re.split(f'({heading_pattern})', clean_doc)
        # If we got meaningful sections
        if len(section_splits) > 2:
            # Recombine heading markers with their content
            sections = []
            for i in range(0, len(section_splits) - 1, 2):
                if i + 1 < len(section_splits):
                    sections.append(section_splits[i] + section_splits[i + 1])
                else:
                    sections.append(section_splits[i])
            # Add any trailing content
            if len(section_splits) % 2 == 1:
                sections.append(section_splits[-1])
        else:
            # Fall back to paragraph splitting
            sections = re.split(r'\n{2,}', clean_doc)
        # Now combine sections into chunks that don't exceed token limits
        chunks = []
        current_chunk = ""
        current_tokens = 0
        for section in sections:
            section_tokens = self.count_tokens(section)
            # If a single section exceeds our limit, split it further
            if section_tokens > self.max_chunk_size:
                # Add the current chunk if it's not empty
                if current_chunk:
                    chunks.append(current_chunk)
                    current_chunk = ""
                    current_tokens = 0
                # Split long section by sentences
                sentences = re.split(r'(?<=[.!?])\s+', section)
                for sentence in sentences:
                    sentence_tokens = self.count_tokens(sentence)
                    # If adding this sentence would exceed the limit, start a new chunk
                    if current_tokens + sentence_tokens > self.max_chunk_size:
                        if current_chunk:
                            chunks.append(current_chunk)
                            current_chunk = sentence
                            current_tokens = sentence_tokens
                        else:
                            # If a single sentence is too long, force-chunk it
                            chunks.append(sentence)
                    else:
                        if current_chunk:
                            current_chunk += " " + sentence
                        else:
                            current_chunk = sentence
                        current_tokens += sentence_tokens
            else:
                # If adding this section would exceed the limit, start a new chunk
                if current_tokens + section_tokens > self.max_chunk_size:
                    chunks.append(current_chunk)
                    current_chunk = section
                    current_tokens = section_tokens
                else:
                    if current_chunk:
                        current_chunk += "\n\n" + section
                    else:
                        current_chunk = section
                    current_tokens += section_tokens
        # Add the final chunk if there is one
        if current_chunk:
            chunks.append(current_chunk)
        return chunks

    def _align_segments(self, segments_a: List[str], segments_b: List[str]) -> List[Tuple[Optional[int], Optional[int]]]:
        """
        Align segments from two documents based on similarity.
        Args:
            segments_a: Segments from document A
            segments_b: Segments from document B
        Returns:
            List of tuples with aligned segment indices (None for gaps)
        """
        # If either list is empty, handle specially
        if not segments_a:
            return [(None, i) for i in range(len(segments_b))]
        if not segments_b:
            return [(i, None) for i in range(len(segments_a))]
        # Calculate similarity matrix
        similarity_matrix = []
        for segment_a in segments_a:
            row = []
            for segment_b in segments_b:
                similarity = SequenceMatcher(None, segment_a, segment_b).ratio()
                row.append(similarity)
            similarity_matrix.append(row)
        # Create optimal alignment using greedy approach
        aligned_pairs = []
        used_a = set()
        used_b = set()
        # First pass: Get high-confidence matches
        for _ in range(min(len(segments_a), len(segments_b))):
            best_similarity = 0.15  # Minimum similarity threshold
            best_pair = None
            for i in range(len(segments_a)):
                if i in used_a:
                    continue
                for j in range(len(segments_b)):
                    if j in used_b:
                        continue
                    if similarity_matrix[i][j] > best_similarity:
                        best_similarity = similarity_matrix[i][j]
                        best_pair = (i, j)
            if best_pair:
                aligned_pairs.append(best_pair)
                used_a.add(best_pair[0])
                used_b.add(best_pair[1])
            else:
                break
        # Second pass: Handle unmatched segments
        # Preserve order of segments for unmatched parts
        all_aligned_pairs = []
        # Sort aligned pairs by the index in document A
        aligned_pairs.sort()
        # Process the aligned pairs in order
        last_a = -1
        last_b = -1
        for a_idx, b_idx in aligned_pairs:
            # Add any missing segments from A
            for i in range(last_a + 1, a_idx):
                all_aligned_pairs.append((i, None))
            # Add any missing segments from B
            for j in range(last_b + 1, b_idx):
                all_aligned_pairs.append((None, j))
            # Add the matched pair
            all_aligned_pairs.append((a_idx, b_idx))
            last_a = a_idx
            last_b = b_idx
        # Add any trailing segments from A
        for i in range(last_a + 1, len(segments_a)):
            all_aligned_pairs.append((i, None))
        # Add any trailing segments from B
        for j in range(last_b + 1, len(segments_b)):
            all_aligned_pairs.append((None, j))
        return all_aligned_pairs

    def _compare_segments(self,
                          segment_a: str,
                          segment_b: str,
                          version_a_number: str,
                          version_b_number: str,
                          document_title: str) -> str:
        """
        Compare two document segments using LLM analysis.
        Args:
            segment_a: Text segment from version A
            segment_b: Text segment from version B
            version_a_number: Version identifier for version A
            version_b_number: Version identifier for version B
            document_title: Title of the document
        Returns:
            LLM analysis of the differences
        """
        # Skip comparison if segments are identical
        if segment_a == segment_b:
            return f"## Segment Comparison\n\nNo changes detected in this segment."
        # Handle segments that exist in only one version
        if not segment_a:
            return f"## New Content Added\n\nThis segment exists only in Version {version_b_number}."
        if not segment_b:
            return f"## Content Removed\n\nThis segment existed in Version {version_a_number} but was removed."
        # Prepare the prompt
        prompt = ChatPromptTemplate.from_template(self.segment_comparison_template)
        # Format the prompt with segment data
        formatted_prompt = prompt.format(
            segment_a=segment_a,
            segment_b=segment_b,
            version_a_number=version_a_number,
            version_b_number=version_b_number,
            document_title=document_title
        )
        # Get LLM response
        try:
            response = self.llm.invoke(formatted_prompt)
            return response.content
        except Exception as e:
            logger.error(f"Error comparing segments: {e}")
            logger.error(traceback.format_exc())
            return f"## Error Comparing Segments\n\nAn error occurred: {str(e)}"

    def _generate_overall_summary(self,
                                  segment_results: List[str],
                                  version_a_number: str,
                                  version_b_number: str,
                                  document_title: str) -> str:
        """
        Generate an overall summary from individual segment comparisons.
        Args:
            segment_results: Results from segment comparisons
            version_a_number: Version identifier for version A
            version_b_number: Version identifier for version B
            document_title: Title of the document
        Returns:
            Overall summary of document changes
        """
        # Combine segment results with section dividers
        combined_results = "\n\n---\n\n".join(segment_results)
        # Check if we need to summarize (if the combined results are too long)
        if self.count_tokens(combined_results) > self.max_context_size:
            # If combined results are too long, extract key points for each segment
            segment_summaries = []
            for i, result in enumerate(segment_results):
                # Extract just the summary section to reduce length
                summary_match = re.search(r'(?:## |#\s)Summary[^\n]*\n(.*?)(?:(?:## |#\s)|$)',
                                          result, re.DOTALL)
                if summary_match:
                    segment_summaries.append(f"Segment {i+1} Summary: {summary_match.group(1).strip()}")
                else:
                    # If no summary section, take the first 100 words
                    words = result.split()[:100]
                    segment_summaries.append(f"Segment {i+1}: {' '.join(words)}...")
            combined_results = "\n\n".join(segment_summaries)
        # Prepare the prompt for overall summary
        prompt = ChatPromptTemplate.from_template(self.overall_summary_template)
        # Format the prompt with segment results
        formatted_prompt = prompt.format(
            segment_results=combined_results,
            version_a_number=version_a_number,
            version_b_number=version_b_number,
            document_title=document_title
        )
        # Get LLM response
        try:
            response = self.llm.invoke(formatted_prompt)
            return response.content
        except Exception as e:
            logger.error(f"Error generating overall summary: {e}")
            logger.error(traceback.format_exc())
            return f"## Error Generating Summary\n\nAn error occurred: {str(e)}\n\n## Individual Segment Results\n\n{combined_results}"
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| bases | - | - | - |
Parameter Details
model_name: Name of the OpenAI LLM model to use for analysis. Defaults to 'gpt-4o'. Common values include 'gpt-4', 'gpt-4-turbo', 'gpt-3.5-turbo'.
temperature: Temperature setting for LLM responses controlling randomness. Defaults to 0 for deterministic, consistent outputs. Range is 0.0 (deterministic) to 2.0 (very random).
api_key: Optional OpenAI API key. If provided, sets the OPENAI_API_KEY environment variable. If None, expects the key to already be set in environment variables.
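Both ways of supplying the key are sketched below; the module name is assumed from the documented source path and may differ in your deployment:

import os
from version_comparison import VersionComparisonService  # assumed module name

# Option 1: rely on an already-exported environment variable
os.environ.setdefault('OPENAI_API_KEY', 'your-api-key-here')
service = VersionComparisonService()  # defaults: gpt-4o, temperature=0

# Option 2: pass the key explicitly; __init__ writes it to OPENAI_API_KEY
service = VersionComparisonService(model_name='gpt-4', temperature=0,
                                   api_key='your-api-key-here')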
Return Value
Instantiation returns a VersionComparisonService object configured with the specified LLM model and settings. The main compare_documents method returns a markdown-formatted string containing a comprehensive comparison report with executive summary, major changes, change statistics, patterns, and unchanged elements. Individual methods return:
- count_tokens: int (token count)
- _segment_document: List[str] (document segments)
- _align_segments: List[Tuple[Optional[int], Optional[int]]] (aligned segment indices)
- _compare_segments: str (markdown analysis)
- _generate_overall_summary: str (markdown summary)
Class Interface
Methods
__init__(self, model_name: str = 'gpt-4o', temperature: float = 0, api_key: Optional[str] = None)
Purpose: Initialize the comparison service with LLM configuration, tokenizer, and prompt templates
Parameters:
- model_name: Name of the OpenAI model to use (default: 'gpt-4o')
- temperature: LLM temperature setting for response randomness (default: 0)
- api_key: Optional OpenAI API key to set in environment
Returns: None - initializes the instance
count_tokens(self, text: str) -> int
Purpose: Count the number of tokens in a given text string using the cl100k_base encoding
Parameters:
text: The text string to count tokens for
Returns: Integer count of tokens in the text
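For example (assuming OPENAI_API_KEY is set so the service can initialize):

service = VersionComparisonService()
n = service.count_tokens("The quick brown fox jumps over the lazy dog.")
print(n)  # token count under the cl100k_base encoding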
compare_documents(self, doc_a: str, doc_b: str, version_a_number: str, version_b_number: str, document_title: str = 'Document') -> str
Purpose: Main public method to compare two document versions and generate a comprehensive analysis report
Parameters:
- doc_a: Text content of the first document version
- doc_b: Text content of the second document version
- version_a_number: Version identifier for the first document (e.g., '1.0', 'Draft 1')
- version_b_number: Version identifier for the second document (e.g., '2.0', 'Final')
- document_title: Title of the document being compared (default: 'Document')
Returns: Markdown-formatted string containing comprehensive comparison report with executive summary, major changes, statistics, and patterns
_segment_document(self, doc: str) -> List[str]
Purpose: Private method to segment a document into manageable chunks using natural text boundaries while respecting token limits
Parameters:
doc: Document text to segment
Returns: List of text segments, each respecting the max_chunk_size token limit
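Although private, the method can be called directly to sanity-check chunking on your own documents (illustrative; assumes a `service` instance and a loaded `doc_v1` string):

segments = service._segment_document(doc_v1)
for i, seg in enumerate(segments):
    # Chunks stay within max_chunk_size, except force-chunked oversize sentences
    print(f"segment {i}: {service.count_tokens(seg)} tokens")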
_align_segments(self, segments_a: List[str], segments_b: List[str]) -> List[Tuple[Optional[int], Optional[int]]]
Purpose: Private method to align segments from two documents based on similarity matching using a greedy algorithm
Parameters:
- segments_a: List of segments from document A
- segments_b: List of segments from document B
Returns: List of tuples containing aligned segment indices, where None indicates a segment exists in only one version
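The similarity scores behind the alignment come from difflib's SequenceMatcher; a toy illustration (segment texts are invented, and the call assumes a `service` instance):

from difflib import SequenceMatcher

a = ["Introduction text.", "Methods overview.", "Results summary."]
b = ["Introduction text, lightly revised.", "Results summary.", "New appendix."]
print(SequenceMatcher(None, a[0], b[0]).ratio())  # well above the 0.15 threshold

pairs = service._align_segments(a, b)
print(pairs)  # tuples of (index_in_a, index_in_b); None marks a segment with no counterpart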
_compare_segments(self, segment_a: str, segment_b: str, version_a_number: str, version_b_number: str, document_title: str) -> str
Purpose: Private method to compare two document segments using LLM analysis and generate detailed change description
Parameters:
- segment_a: Text segment from version A
- segment_b: Text segment from version B
- version_a_number: Version identifier for version A
- version_b_number: Version identifier for version B
- document_title: Title of the document
Returns: Markdown-formatted string with LLM analysis of differences between the segments
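One behavior worth knowing: identical or one-sided segments short-circuit without an LLM call (illustrative; assumes a `service` instance):

print(service._compare_segments('Same text.', 'Same text.', '1.0', '2.0', 'Doc'))
# '## Segment Comparison\n\nNo changes detected in this segment.'
print(service._compare_segments('', 'Added text.', '1.0', '2.0', 'Doc'))
# '## New Content Added\n\nThis segment exists only in Version 2.0.'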
_generate_overall_summary(self, segment_results: List[str], version_a_number: str, version_b_number: str, document_title: str) -> str
Purpose: Private method to generate an executive summary from individual segment comparison results
Parameters:
- segment_results: List of markdown strings from individual segment comparisons
- version_a_number: Version identifier for version A
- version_b_number: Version identifier for version B
- document_title: Title of the document
Returns: Markdown-formatted string with comprehensive executive summary synthesizing all segment analyses
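When the combined segment results exceed max_context_size, the method falls back to extracting each result's Summary section with a regex. A quick demonstration of that extraction (sample text invented):

import re

result = "## Summary of Changes\nReworded the introduction.\n## Detailed Changes\n..."
m = re.search(r'(?:## |#\s)Summary[^\n]*\n(.*?)(?:(?:## |#\s)|$)', result, re.DOTALL)
print(m.group(1).strip())  # 'Reworded the introduction.'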
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| model_name | str | Name of the OpenAI LLM model being used | instance |
| temperature | float | Temperature setting for LLM responses | instance |
| tokenizer | tiktoken.Encoding | Tokenizer instance using cl100k_base encoding for counting tokens | instance |
| llm | ChatOpenAI | LangChain ChatOpenAI instance configured with the specified model and temperature | instance |
| max_chunk_size | int | Maximum token count for individual segment comparisons (default: 8000) | instance |
| max_context_size | int | Maximum token count for full context when generating overall summary (default: 15000) | instance |
| segment_comparison_template | str | Prompt template string for comparing individual document segments | instance |
| overall_summary_template | str | Prompt template string for generating executive summary from segment results | instance |
Dependencies
os, re, difflib, logging, tiktoken, traceback, typing, langchain_openai, langchain
Required Imports
import os
import re
import difflib
import logging
import tiktoken
import traceback
from typing import List, Dict, Any, Tuple, Optional
from difflib import SequenceMatcher
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
Usage Example
import os
from version_comparison import VersionComparisonService  # module name per the documented source path

# Set API key
os.environ['OPENAI_API_KEY'] = 'your-api-key-here'

# Initialize the service
service = VersionComparisonService(
    model_name='gpt-4o',
    temperature=0
)

# Load document versions
with open('doc_v1.txt', 'r') as f:
    doc_v1 = f.read()
with open('doc_v2.txt', 'r') as f:
    doc_v2 = f.read()

# Compare documents
comparison_report = service.compare_documents(
    doc_a=doc_v1,
    doc_b=doc_v2,
    version_a_number='1.0',
    version_b_number='2.0',
    document_title='Product Requirements Document'
)

# Output the report
print(comparison_report)

# Or save to file
with open('comparison_report.md', 'w') as f:
    f.write(comparison_report)
Best Practices
- Always set OPENAI_API_KEY before instantiation or pass it as a parameter to avoid runtime errors
- Use temperature=0 for consistent, deterministic comparison results across multiple runs
- For very large documents (>100k tokens), expect multiple API calls and longer processing times
- The service is stateless after initialization - you can reuse the same instance for multiple document comparisons
- Provide meaningful version numbers and document titles as they appear in the generated reports
- Handle the returned markdown string appropriately - save to file, render in UI, or convert to other formats
- Monitor token usage and API costs when comparing very large documents as each segment pair requires an LLM call
- The service handles identical documents gracefully by returning early without API calls
- Segment alignment uses similarity matching - documents with completely restructured content may produce less intuitive alignments
- Error handling is built-in but check for error messages in the returned markdown if API calls fail
- The max_chunk_size (8000 tokens) and max_context_size (15000 tokens) are tuned for GPT-4-class models and may need adjustment for other models; see the sketch after this list
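A hedged sketch combining the last few points: shrink the token budgets for a smaller model and check the returned markdown for embedded error headings (attribute and heading names come from the class above; doc_a and doc_b are assumed loaded as in the Usage Example):

service = VersionComparisonService(model_name='gpt-3.5-turbo')  # assumes OPENAI_API_KEY is set
service.max_chunk_size = 3000    # smaller budgets for a smaller context window
service.max_context_size = 6000

report = service.compare_documents(doc_a, doc_b, '1.0', '2.0', 'Spec')
if '## Comparison Error' in report or '## Error' in report:
    # heuristic check: failures are reported inside the returned markdown, not raised
    print('Comparison had problems; inspect the report for details')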
Similar Components
AI-powered semantic similarity - components with related functionality:
- class DocumentService (54.5% similar)
- function compare_document_versions (54.2% similar)
- class DataAnalysisService (52.9% similar)
- class DocumentVersion (52.6% similar)
- class DocumentVersion_v1 (50.6% similar)