ReferenceManager - Code Extractor

class ReferenceManager

Maturity: 51

Manages document references for inline citation and bibliography generation in a RAG (Retrieval-Augmented Generation) system.

File:
/tf/active/vicechatdev/fixed_project_victoria_generator.py

Lines:
49 - 217

Complexity:
moderate

Purpose

The ReferenceManager class provides a comprehensive system for tracking, citing, and formatting document references. It assigns unique reference numbers to documents, prevents duplicate references, generates inline citations, and creates formatted bibliographies with metadata. This is essential for maintaining academic-style citations in AI-generated content that draws from multiple source documents.

Source Code

class ReferenceManager:
    """
    Manages document references for inline citation and bibliography generation.

    Every distinct ``doc_id`` receives a sequential reference number starting
    at 1. Adding the same ``doc_id`` again returns the number it already has,
    so a document is never listed twice in the bibliography.
    """

    def __init__(self) -> None:
        # Maps reference number -> {'doc_id', 'source', 'preview', 'metadata'}
        self.references = {}
        # Next reference number to hand out
        self.reference_counter = 1
        # doc_ids that have been registered through add_document()
        self.used_documents = set()

    def _find_reference(self, doc_id: str):
        """Return the reference number registered for ``doc_id``, or None."""
        # Scan the public dict itself (rather than a side index) so entries
        # inserted directly into ``self.references`` are still found.
        for ref_num, ref_info in self.references.items():
            if ref_info['doc_id'] == doc_id:
                return ref_num
        return None

    @staticmethod
    def _first_present(metadata: Dict[str, Any], keys, default: Any) -> Any:
        """Return the first value among ``keys`` that is neither None nor ''."""
        for key in keys:
            value = metadata.get(key)
            if value is not None and value != '':
                return value
        return default

    def add_document(self, doc_id: str, content: str, metadata: Dict[str, Any]) -> int:
        """
        Add a document to the reference system and return its reference number.

        Args:
            doc_id: Unique document identifier
            content: Document content; ``None`` is treated as empty text
            metadata: Document metadata; ``None`` is treated as an empty dict.
                The dict is stored as-is (not copied).

        Returns:
            Reference number for inline citation. If ``doc_id`` was already
            added, its existing number is returned and nothing is changed.
        """
        existing = self._find_reference(doc_id)
        if existing is not None:
            return existing

        if metadata is None:
            metadata = {}
        text = "" if content is None else str(content)

        # str.split() with no argument splits on any whitespace run, so this
        # both flattens newlines/tabs and collapses repeated spaces.
        clean_content = ' '.join(text.split())
        preview = clean_content[:250] + ("..." if len(clean_content) > 250 else "")

        ref_num = self.reference_counter
        self.references[ref_num] = {
            'doc_id': doc_id,
            'source': self.format_source(metadata),
            'preview': preview,
            'metadata': metadata
        }
        self.reference_counter += 1
        self.used_documents.add(doc_id)

        return ref_num

    def format_source(self, metadata: Dict[str, Any]) -> str:
        """
        Format source information for display.

        Produces ``"<title>" by <author> (<type>) dated <date> [ID: <id>]``.
        Title, author and date are omitted when empty or ``'unknown'``; the
        type defaults to ``Document`` and the ID to ``unknown``.

        Each field is read from its primary key and falls back to an
        alternative key (``document_type``, ``document_id``, ``filename``,
        ``creator``, ``created_date``) when the primary value is missing,
        ``None`` or an empty string.
        """
        metadata = metadata or {}
        doc_type = self._first_present(metadata, ('type', 'document_type'), 'Document')
        doc_id = self._first_present(metadata, ('id', 'document_id'), 'unknown')
        title = self._first_present(metadata, ('title', 'filename'), '')
        author = self._first_present(metadata, ('author', 'creator'), '')
        date = self._first_present(metadata, ('date', 'created_date'), '')

        source_parts = []

        if title and title != 'unknown':
            source_parts.append(f'"{title}"')

        if author and author != 'unknown':
            source_parts.append(f"by {author}")

        # Type and ID are always present thanks to their defaults
        source_parts.append(f"({doc_type})")

        if date and date != 'unknown':
            source_parts.append(f"dated {date}")

        source_parts.append(f"[ID: {doc_id}]")

        return " ".join(source_parts)

    def get_citation(self, doc_id: str) -> str:
        """
        Get inline citation for a document.

        Args:
            doc_id: Document identifier

        Returns:
            Inline citation string like [1], or "[?]" when ``doc_id`` has not
            been added.
        """
        ref_num = self._find_reference(doc_id)
        if ref_num is None:
            return "[?]"  # Should not happen if add_document was called first
        return f"[{ref_num}]"

    @staticmethod
    def _metadata_lines(metadata: Dict[str, Any]) -> list:
        """Build the optional per-reference detail lines (without indentation)."""
        items = []

        doc_type = metadata.get('document_type')
        if doc_type and doc_type != 'Document':
            items.append(f"*Document type:* {doc_type}")

        for key, label in (
            ('date', 'Date'),
            ('source', 'Source location'),
            ('author', 'Author'),
        ):
            value = metadata.get(key)
            if value and value != 'unknown':
                items.append(f"*{label}:* {value}")

        if metadata.get('category'):
            items.append(f"*Category:* {metadata['category']}")

        # Test against None rather than truthiness so a score of 0 is still
        # reported, and skip values float() cannot parse instead of aborting
        # the whole bibliography.
        raw_score = metadata.get('relevance_score')
        if raw_score is not None:
            try:
                score = float(raw_score)
            except (TypeError, ValueError):
                score = None
            if score is not None:
                items.append(f"*Relevance:* {score:.2f}")

        summary = metadata.get('summary')
        if isinstance(summary, str) and len(summary) > 10:
            shortened = summary[:200] + ("..." if len(summary) > 200 else "")
            items.append(f"*Summary:* {shortened}")

        return items

    def generate_bibliography(self) -> str:
        """
        Generate a formatted bibliography section.

        Entries are listed in ascending reference-number order. Each entry has
        the formatted source line, an indented content preview (if any) and
        indented metadata detail lines, followed by a blank line. A footer
        reports the total number of references.

        Returns:
            Formatted bibliography in markdown
        """
        if not self.references:
            return "\n## References\n\nNo references available.\n"

        bibliography = ["\n## References\n"]
        bibliography.append("*The following documents were referenced in generating the warranty disclosures:*\n")

        for ref_num in sorted(self.references):
            ref_info = self.references[ref_num]
            preview = ref_info['preview']
            metadata = ref_info['metadata'] or {}

            bibliography.append(f"**[{ref_num}]** {ref_info['source']}")

            # Previews created by add_document() are already single-line and
            # short; the re-cleaning guards entries placed in self.references
            # by other code.
            if preview and preview.strip():
                clean_preview = preview.replace('\n', ' ').replace('\r', ' ')
                if len(clean_preview) > 300:
                    clean_preview = clean_preview[:297] + "..."
                bibliography.append(f"    *Content preview:* {clean_preview}")

            bibliography.extend(f"    {item}" for item in self._metadata_lines(metadata))

            # Blank line between references
            bibliography.append("")

        bibliography.append("---")
        bibliography.append(f"*Total references: {len(self.references)}*")
        bibliography.append("")

        return "\n".join(bibliography)

    def clear(self) -> None:
        """Clear all references and restart numbering at 1."""
        self.references = {}
        self.reference_counter = 1
        self.used_documents = set()

Parameters

Name	Type	Default	Kind
`bases`	-	-

Parameter Details

__init__: No parameters required. Initializes an empty reference management system with a counter starting at 1 and empty tracking structures.

Return Value

Instantiation returns a ReferenceManager object. Key method returns: add_document() returns an integer reference number for inline citation; get_citation() returns a formatted citation string like '[1]'; generate_bibliography() returns a markdown-formatted string containing all references with metadata; clear() returns None.

Class Interface

Methods

`__init__(self)`

Purpose: Initialize a new ReferenceManager instance with empty reference tracking structures

Returns: None (constructor)

`add_document(self, doc_id: str, content: str, metadata: Dict[str, Any]) -> int`

Purpose: Add a document to the reference system and return its reference number. Prevents duplicates by checking if doc_id already exists.

Parameters:

doc_id: Unique document identifier string used to track and prevent duplicate references
content: Full document content string, which will be cleaned and truncated to create a preview
metadata: Dictionary containing document metadata such as type, title, author, date, source, category, relevance_score, and summary

Returns: Integer reference number (starting from 1) that can be used for inline citations. Returns existing reference number if document was already added.

`format_source(self, metadata: Dict[str, Any]) -> str`

Purpose: Format source information from metadata into a human-readable citation string

Parameters:

metadata: Dictionary containing document metadata fields like type, id, title, author, date, filename, creator, created_date

Returns: Formatted string combining title, author, document type, date, and ID in a readable citation format

`get_citation(self, doc_id: str) -> str`

Purpose: Get inline citation string for a previously added document

Parameters:

doc_id: Document identifier string that was used when calling add_document()

Returns: Formatted citation string like '[1]' or '[?]' if document was not found (should not happen if add_document was called first)

`generate_bibliography(self) -> str`

Purpose: Generate a complete formatted bibliography section with all referenced documents and their metadata

Returns: Markdown-formatted string containing a References section with numbered entries, source information, content previews, metadata details, and a footer with total reference count

`clear(self)`

Purpose: Clear all references and reset the reference counter to start fresh

Returns: None

Attributes

Name	Type	Description	Scope
`references`	Dict[int, Dict[str, Any]]	Dictionary mapping reference numbers (int) to reference information dictionaries containing 'doc_id', 'source', 'preview', and 'metadata' keys	instance
`reference_counter`	int	Counter tracking the next reference number to assign, starts at 1 and increments for each new unique document	instance
`used_documents`	set	Set of document IDs (strings) that have been referenced, used to track which documents have been added to the reference system	instance

Dependencies

typing

Required Imports

from typing import Dict, Any

Usage Example

from typing import Dict, Any

# Create one manager for the whole generation session
ref_manager = ReferenceManager()

# Describe the two source documents
metadata1 = {
    'type': 'Technical Manual',
    'id': 'doc_001',
    'title': 'Product Warranty Guide',
    'author': 'Engineering Team',
    'date': '2024-01-15',
}
metadata2 = {
    'type': 'Policy Document',
    'id': 'doc_002',
    'title': 'Warranty Terms',
    'date': '2024-02-01',
}

# Register each document; the integer returned is its citation number
ref_num1 = ref_manager.add_document(
    'doc_001',
    'This product comes with a 2-year warranty covering manufacturing defects...',
    metadata1,
)
ref_num2 = ref_manager.add_document(
    'doc_002',
    'Extended warranty options are available for purchase...',
    metadata2,
)

# Look up the inline citation markers by document id
citation1 = ref_manager.get_citation('doc_001')  # Returns '[1]'
citation2 = ref_manager.get_citation('doc_002')  # Returns '[2]'

# Render the markdown bibliography and show it
bibliography = ref_manager.generate_bibliography()
print(bibliography)

# Reset the manager before starting another document
ref_manager.clear()

Best Practices

Always call add_document() before get_citation() to ensure the document is registered in the reference system
Use the same doc_id consistently for the same document to prevent duplicate references
Calling add_document() returns the reference number, which can be used immediately for inline citations
The class automatically prevents duplicate references by checking existing doc_ids
Call clear() when starting a new document or session to reset the reference counter
The reference counter starts at 1 and increments for each new unique document
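Tracking and de-duplication use the doc_id argument passed to add_document(), not the metadata. Metadata should still include 'id' or 'document_id' so the formatted source shows the document ID (otherwise it reads '[ID: unknown]'); other fields like 'title', 'author', 'date' enhance bibliography formatting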
Content previews are automatically truncated to 250 characters with ellipsis
The generate_bibliography() method returns markdown-formatted text suitable for appending to generated documents
State is maintained across multiple add_document() calls, so instantiate once per document generation session

Similar Components

AI-powered semantic similarity - components with related functionality:

class ReferenceManager_v1 94.7% similar

Manages document references for inline citation and bibliography generation, tracking documents and generating formatted citations and bibliographies.
From: /tf/active/vicechatdev/improved_project_victoria_generator.py
class ReferenceManager_v2 74.7% similar

Manages extraction and formatting of references for LLM chat responses. Handles both file references and BibTeX citations, formatting them according to various academic citation styles.
From: /tf/active/vicechatdev/OneCo_hybrid_RAG copy.py
class ReferenceManager_v3 74.6% similar

Manages extraction and formatting of references for LLM chat responses. Handles both file references and BibTeX citations, formatting them according to various academic citation styles.
From: /tf/active/vicechatdev/OneCo_hybrid_RAG_old.py
class ReferenceManager_v4 74.4% similar

Manages extraction and formatting of references for LLM chat responses. Handles both file references and BibTeX citations, formatting them according to various academic citation styles.
From: /tf/active/vicechatdev/OneCo_hybrid_RAG.py
function parse_references_section 52.9% similar

Parses a formatted references section string and extracts structured data including reference numbers, sources, and content previews using regular expressions.
From: /tf/active/vicechatdev/improved_convert_disclosures_to_table.py

← Back to Browse

Assistant

Hi! I can help improve this code. Tell me what you'd like to enhance (e.g., "add error handling", "optimize performance", "improve readability", "add type hints").

Code Comparison

Original Code

                            class ReferenceManager:
    """
    Manages document references for inline citation and bibliography generation.

    Reference numbers are handed out sequentially starting at 1. Adding a
    doc_id that is already registered returns its existing number.
    """
    
    def __init__(self):
        self.references = {}  # reference number -> {'doc_id', 'source', 'preview', 'metadata'}
        self.reference_counter = 1  # next reference number to assign
        self.used_documents = set()  # doc_ids registered through add_document()
    
    def add_document(self, doc_id: str, content: str, metadata: Dict[str, Any]) -> int:
        """
        Add a document to the reference system and return its reference number.
        
        Args:
            doc_id: Unique document identifier
            content: Document content; used only to build a short preview
            metadata: Document metadata; stored as-is (not copied) and also
                passed to format_source() to build the source string
            
        Returns:
            Reference number for inline citation. If doc_id was added before,
            the existing number is returned and nothing is modified.
        """
        # Linear scan: return the existing number if this doc_id is known
        for ref_num, ref_info in self.references.items():
            if ref_info['doc_id'] == doc_id:
                return ref_num
        
        # Create new reference
        ref_num = self.reference_counter
        
        # Flatten newlines, carriage returns and tabs to spaces
        clean_content = content.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
        # Collapse runs of whitespace into single spaces
        clean_content = ' '.join(clean_content.split())
        # Keep the first 250 characters; append "..." only when text was cut
        preview = clean_content[:250] + ("..." if len(clean_content) > 250 else "")
        
        self.references[ref_num] = {
            'doc_id': doc_id,
            'source': self.format_source(metadata),
            'preview': preview,
            'metadata': metadata
        }
        self.reference_counter += 1
        self.used_documents.add(doc_id)
        
        return ref_num
    
    def format_source(self, metadata: Dict[str, Any]) -> str:
        """
        Format source information for display.

        Builds '"<title>" by <author> (<type>) dated <date> [ID: <id>]'.
        Title, author and date are left out when empty or 'unknown'; type and
        ID are always included (defaults 'Document' and 'unknown').
        """
        # Each field has a primary key and a fallback key. The fallback is
        # used only when the primary key is absent from the dict, not when it
        # is present with an empty or None value.
        doc_type = metadata.get('type', metadata.get('document_type', 'Document'))
        doc_id = metadata.get('id', metadata.get('document_id', 'unknown'))
        title = metadata.get('title', metadata.get('filename', ''))
        author = metadata.get('author', metadata.get('creator', ''))
        date = metadata.get('date', metadata.get('created_date', ''))
        
        # Pieces are collected in display order and joined with spaces
        source_parts = []
        
        # Title, wrapped in double quotes
        if title and title != 'unknown':
            source_parts.append(f'"{title}"')
        
        # Author
        if author and author != 'unknown':
            source_parts.append(f"by {author}")
        
        # Document type (always present)
        source_parts.append(f"({doc_type})")
        
        # Date
        if date and date != 'unknown':
            source_parts.append(f"dated {date}")
        
        # Document ID taken from the metadata (always present)
        source_parts.append(f"[ID: {doc_id}]")
        
        return " ".join(source_parts)
    
    def get_citation(self, doc_id: str) -> str:
        """
        Get inline citation for a document.
        
        Args:
            doc_id: Document identifier
            
        Returns:
            Inline citation string like [1], or "[?]" when doc_id is not
            registered
        """
        for ref_num, ref_info in self.references.items():
            if ref_info['doc_id'] == doc_id:
                return f"[{ref_num}]"
        return "[?]"  # Should not happen if add_document was called first
    
    def generate_bibliography(self) -> str:
        """
        Generate a formatted bibliography section.

        Entries appear in ascending reference-number order, each followed by
        a blank line, and the section ends with a total-count footer.
        
        Returns:
            Formatted bibliography in markdown
        """
        # With no references, return a fixed placeholder section
        if not self.references:
            return "\n## References\n\nNo references available.\n"
        
        bibliography = ["\n## References\n"]
        bibliography.append("*The following documents were referenced in generating the warranty disclosures:*\n")
        
        for ref_num in sorted(self.references.keys()):
            ref_info = self.references[ref_num]
            source = ref_info['source']
            preview = ref_info['preview']
            metadata = ref_info['metadata']
            
            # Main entry line: bold reference number followed by the source
            bibliography.append(f"**[{ref_num}]** {source}")
            
            # Indented content preview, skipped when empty or whitespace-only
            if preview and preview.strip():
                clean_preview = preview.replace('\n', ' ').replace('\r', ' ')
                # Cap at 300 characters (297 plus "...")
                if len(clean_preview) > 300:
                    clean_preview = clean_preview[:297] + "..."
                bibliography.append(f"    *Content preview:* {clean_preview}")
            
            # Optional detail lines, emitted in the order checked below
            metadata_items = []
            
            # Only the 'document_type' key is consulted here, not 'type'
            if metadata.get('document_type') and metadata['document_type'] != 'Document':
                metadata_items.append(f"*Document type:* {metadata['document_type']}")
            
            if metadata.get('date') and metadata['date'] != 'unknown':
                metadata_items.append(f"*Date:* {metadata['date']}")
            
            if metadata.get('source') and metadata['source'] != 'unknown':
                metadata_items.append(f"*Source location:* {metadata['source']}")
            
            if metadata.get('author') and metadata['author'] != 'unknown':
                metadata_items.append(f"*Author:* {metadata['author']}")
            
            if metadata.get('category'):
                metadata_items.append(f"*Category:* {metadata['category']}")
            
            # Falsy scores (e.g. 0) are skipped by this test; float() raises
            # if the value is not numeric
            if metadata.get('relevance_score'):
                score = float(metadata['relevance_score'])
                metadata_items.append(f"*Relevance:* {score:.2f}")
            
            # Summaries of 10 characters or fewer are ignored; longer ones are
            # cut to 200 characters plus "..."
            if metadata.get('summary') and len(metadata['summary']) > 10:
                summary = metadata['summary'][:200] + ("..." if len(metadata['summary']) > 200 else "")
                metadata_items.append(f"*Summary:* {summary}")
            
            # Indent each detail line under its entry
            for item in metadata_items:
                bibliography.append(f"    {item}")
            
            # Blank line between references
            bibliography.append("")
        
        # Footer: horizontal rule and total count
        bibliography.append("---")
        bibliography.append(f"*Total references: {len(self.references)}*")
        bibliography.append("")
        
        return "\n".join(bibliography)
    
    def clear(self):
        """Clear all references and reset the counter to 1."""
        self.references = {}
        self.reference_counter = 1
        self.used_documents = set()
                        

Improved Code

🔍 Code Extractor

class ReferenceManager

Purpose

Source Code

Parameters

Parameter Details

Return Value

Class Interface

Methods

`__init__(self)`

`add_document(self, doc_id: str, content: str, metadata: Dict[str, Any]) -> int`

`format_source(self, metadata: Dict[str, Any]) -> str`

`get_citation(self, doc_id: str) -> str`

`generate_bibliography(self) -> str`

`clear(self)`

Attributes

Dependencies

Required Imports

Usage Example

Best Practices

Tags

Similar Components

class ReferenceManager_v1 94.7% similar

class ReferenceManager_v2 74.7% similar

class ReferenceManager_v3 74.6% similar

class ReferenceManager_v4 74.4% similar

function parse_references_section 52.9% similar

class ReferenceManager

Purpose

Source Code

Parameters

Parameter Details

Return Value

Class Interface

Methods

__init__(self)

add_document(self, doc_id: str, content: str, metadata: Dict[str, Any]) -> int

format_source(self, metadata: Dict[str, Any]) -> str

get_citation(self, doc_id: str) -> str

generate_bibliography(self) -> str

clear(self)

Attributes

Dependencies

Required Imports

Usage Example

Best Practices

Tags

Similar Components

class ReferenceManager_v1 94.7% similar

class ReferenceManager_v2 74.7% similar

class ReferenceManager_v3 74.6% similar

class ReferenceManager_v4 74.4% similar

function parse_references_section 52.9% similar

✨ Improve Code: ReferenceManager

Code Comparison

`__init__(self)`

`add_document(self, doc_id: str, content: str, metadata: Dict[str, Any]) -> int`

`format_source(self, metadata: Dict[str, Any]) -> str`

`get_citation(self, doc_id: str) -> str`

`generate_bibliography(self) -> str`

`clear(self)`