DocumentComparator - Code Extractor

class DocumentComparator

Maturity: 41

A class that compares reMarkable cloud documents to analyze and identify structural differences between them, particularly useful for debugging document upload issues.

File:
/tf/active/vicechatdev/e-ink-llm/cloudtest/compare_documents.py

Lines:
13 - 235

Complexity:
complex

Purpose

DocumentComparator provides comprehensive analysis and comparison of reMarkable cloud documents by examining their root.docSchema entries, document schemas, components (metadata, content, pagedata), and structural properties. It's designed to help developers understand differences between documents created through the official app versus custom uploads, enabling debugging of document structure issues.

Source Code

class DocumentComparator:
    """Compare reMarkable cloud documents to find structural differences.

    A document is located in root.docSchema by its UUID, its own schema is
    downloaded and split into components (keyed by file extension), and the
    metadata, content and pagedata components are fetched and printed.
    ``compare_documents`` then reports where two analysed documents differ.

    Attributes:
        session: Authenticated HTTP session returned by
            ``RemarkableAuth.get_authenticated_session()``.
    """

    # Host serving the sync endpoints used by every request below.
    BASE_URL = "https://eu.tectonic.remarkable.com"

    # Seconds to wait on each HTTP request. Without a timeout a stalled
    # connection would block the tool indefinitely. Override on the class or
    # instance if large downloads need longer.
    REQUEST_TIMEOUT = 30

    # Documents compared when compare_documents() is called without arguments.
    DEFAULT_INVOICE_UUID = "cf2a3833-4a8f-4004-ab8d-8dc3c5f561bc"  # Poulpharm invoice
    DEFAULT_UPLOAD_UUID = "206f5df3-07c2-4341-8afd-2b7362aefa91"

    # A schema line must have at least this many ':'-separated fields.
    _MIN_FIELDS = 5

    def __init__(self):
        """Authenticate with reMarkable and keep the resulting session.

        Raises:
            RuntimeError: If no authenticated session could be obtained.
        """
        auth = RemarkableAuth()
        self.session = auth.get_authenticated_session()

        if not self.session:
            raise RuntimeError("Failed to authenticate with reMarkable")

        print("🔄 Document Comparator Initialized")

    def _get(self, path):
        """GET ``BASE_URL + path`` and return the response, raising on HTTP errors."""
        response = self.session.get(
            f"{self.BASE_URL}{path}", timeout=self.REQUEST_TIMEOUT
        )
        response.raise_for_status()
        return response

    def _fetch_file(self, file_hash):
        """Download the stored file addressed by ``file_hash``."""
        return self._get(f"/sync/v3/files/{file_hash}")

    @classmethod
    def _parse_schema_line(cls, line, name_key):
        """Split a ``hash:flags:name:type:size`` line into a dict.

        The third field is stored under ``name_key``. Returns None when the
        line has fewer than five fields.
        """
        parts = line.split(':')
        if len(parts) < cls._MIN_FIELDS:
            return None
        return {
            'hash': parts[0],
            'flags': parts[1],
            name_key: parts[2],
            'type': parts[3],
            'size': parts[4],
        }

    @classmethod
    def _find_root_entry(cls, root_content, doc_uuid):
        """Return the root.docSchema entry whose UUID field equals ``doc_uuid``.

        The comparison is exact and on the parsed UUID field only, so an empty
        or partial UUID can no longer match an unrelated entry the way a
        substring test on the raw line would.
        """
        lines = root_content.strip().split('\n')
        for line in lines[1:]:  # Skip version header
            entry = cls._parse_schema_line(line, 'uuid')
            if entry is not None and entry['uuid'] == doc_uuid:
                entry['full_line'] = line
                return entry
        return None

    @staticmethod
    def _print_payload(title, response, show_type=False):
        """Print the size and a short preview of a downloaded component."""
        text = response.text
        print(f"\n📄 {title}:")
        # response.content is the raw body, so this is a real byte count
        # (len(text) would count decoded characters instead).
        print(f"   Size: {len(response.content)} bytes")
        if show_type:
            print(f"   Type: {type(text)}")
        if len(text) < 200:
            print(f"   Data: {text}")
        else:
            print(f"   Preview: {text[:100]}...")

    def get_root_info(self) -> "tuple[dict, str]":
        """Get current root.docSchema info.

        Returns:
            ``(root_data, root_content)``: the parsed JSON answer of the root
            endpoint and the text of the file addressed by its ``'hash'``.
        """
        root_data = self._get("/sync/v4/root").json()
        root_content = self._fetch_file(root_data['hash']).text
        return root_data, root_content

    def analyze_document(self, doc_uuid: str, doc_name: str) -> "dict | None":
        """Analyze a single document structure and print what was found.

        Args:
            doc_uuid: Full UUID of the document; must equal the UUID field of
                its root.docSchema entry.
            doc_name: Human-readable label used in the console output.

        Returns:
            A dict with keys ``'entry'`` (parsed root entry), ``'schema_lines'``
            (lines of the document schema), ``'components'`` (dict keyed by
            file extension) and ``'metadata'`` (parsed metadata JSON, or None
            when the document has no metadata component). Returns None when
            the document is not found in root.docSchema.
        """
        print(f"\n📄 Analyzing {doc_name} ({doc_uuid[:8]}...)")
        print("=" * 50)

        _, root_content = self.get_root_info()
        doc_entry = self._find_root_entry(root_content, doc_uuid)

        if not doc_entry:
            print("❌ Document not found in root.docSchema")
            return None

        print("📋 ROOT.DOCSCHEMA ENTRY:")
        print(f"   Hash: {doc_entry['hash']}")
        print(f"   Flags: {doc_entry['flags']}")
        print(f"   UUID: {doc_entry['uuid']}")
        print(f"   Type: {doc_entry['type']}")
        print(f"   Size: {doc_entry['size']}")
        print(f"   Full line: {doc_entry['full_line']}")

        # Get document schema
        doc_lines = self._fetch_file(doc_entry['hash']).text.strip().split('\n')

        print("\n📄 DOCUMENT SCHEMA:")
        print(f"   Version: {doc_lines[0]}")
        print(f"   Components: {len(doc_lines) - 1}")

        # Components are keyed by file extension, so when several files share
        # an extension only the last one is kept.
        components = {}
        for i, line in enumerate(doc_lines[1:], 1):
            component = self._parse_schema_line(line, 'filename')
            if component is None:
                continue
            component['line'] = line
            component_type = component['filename'].split('.')[-1]  # Get extension
            components[component_type] = component
            print(f"   Component {i}: {component_type} ({component['size']} bytes)")

        # Analyze metadata
        metadata = None
        if 'metadata' in components:
            metadata_response = self._fetch_file(components['metadata']['hash'])
            metadata = json.loads(metadata_response.text)

            print("\n📝 METADATA:")
            for key, value in sorted(metadata.items()):
                print(f"   {key}: {value}")

        # Analyze content if exists
        if 'content' in components:
            self._print_payload(
                "CONTENT",
                self._fetch_file(components['content']['hash']),
                show_type=True,
            )

        # Analyze pagedata if exists
        if 'pagedata' in components:
            self._print_payload(
                "PAGEDATA", self._fetch_file(components['pagedata']['hash'])
            )

        return {
            'entry': doc_entry,
            'schema_lines': doc_lines,
            'components': components,
            'metadata': metadata,
        }

    def compare_documents(
        self,
        invoice_uuid: str = DEFAULT_INVOICE_UUID,
        upload_uuid: str = DEFAULT_UPLOAD_UUID,
        invoice_name: str = "Poulpharm Invoice (Real App)",
        upload_name: str = "Our Upload",
    ) -> "tuple[dict, dict] | None":
        """Compare a reference document against an uploaded one.

        Called without arguments it compares the two default documents
        (``DEFAULT_INVOICE_UUID`` vs ``DEFAULT_UPLOAD_UUID``).

        Args:
            invoice_uuid: UUID of the reference document.
            upload_uuid: UUID of the document to check against the reference.
            invoice_name: Console label for the reference document.
            upload_name: Console label for the checked document.

        Returns:
            ``(invoice_data, upload_data)`` as returned by
            ``analyze_document``, or None if either document could not be
            analyzed.
        """
        print("🔍 Comparing Real App Document vs Our Upload")
        print("=" * 60)

        invoice_data = self.analyze_document(invoice_uuid, invoice_name)
        upload_data = self.analyze_document(upload_uuid, upload_name)

        if not invoice_data or not upload_data:
            print("❌ Could not analyze both documents")
            return None

        # Compare key differences
        print("\n🔍 KEY DIFFERENCES:")
        print("=" * 30)

        # Compare root entry flags
        invoice_flags = invoice_data['entry']['flags']
        upload_flags = upload_data['entry']['flags']
        if invoice_flags != upload_flags:
            print("📋 ROOT FLAGS DIFFER:")
            print(f"   Invoice: {invoice_flags}")
            print(f"   Upload:  {upload_flags}")

        # Compare document types
        invoice_type = invoice_data['entry']['type']
        upload_type = upload_data['entry']['type']
        if invoice_type != upload_type:
            print("📋 DOCUMENT TYPE DIFFERS:")
            print(f"   Invoice: {invoice_type}")
            print(f"   Upload:  {upload_type}")

        # Compare components
        invoice_components = set(invoice_data['components'])
        upload_components = set(upload_data['components'])

        missing_in_upload = invoice_components - upload_components
        extra_in_upload = upload_components - invoice_components

        if missing_in_upload:
            print(f"📄 COMPONENTS MISSING IN UPLOAD: {missing_in_upload}")
        if extra_in_upload:
            print(f"📄 EXTRA COMPONENTS IN UPLOAD: {extra_in_upload}")

        # Compare metadata keys
        invoice_meta = invoice_data['metadata']
        upload_meta = upload_data['metadata']
        if invoice_meta and upload_meta:
            invoice_keys = set(invoice_meta)
            upload_keys = set(upload_meta)

            missing_keys = invoice_keys - upload_keys
            extra_keys = upload_keys - invoice_keys

            if missing_keys:
                print(f"📝 METADATA KEYS MISSING IN UPLOAD: {missing_keys}")
            if extra_keys:
                print(f"📝 EXTRA METADATA KEYS IN UPLOAD: {extra_keys}")

            # Compare specific metadata values (sorted so that the report
            # order is the same on every run)
            for key in sorted(invoice_keys & upload_keys):
                if invoice_meta[key] != upload_meta[key]:
                    print(f"📝 METADATA DIFFERS for '{key}':")
                    print(f"   Invoice: {invoice_meta[key]}")
                    print(f"   Upload:  {upload_meta[key]}")

        # Compare component flags and types
        for component in sorted(invoice_components & upload_components):
            invoice_comp = invoice_data['components'][component]
            upload_comp = upload_data['components'][component]

            if invoice_comp['flags'] != upload_comp['flags']:
                print(f"📄 COMPONENT FLAGS DIFFER for {component}:")
                print(f"   Invoice: {invoice_comp['flags']}")
                print(f"   Upload:  {upload_comp['flags']}")

            if invoice_comp['type'] != upload_comp['type']:
                print(f"📄 COMPONENT TYPE DIFFERS for {component}:")
                print(f"   Invoice: {invoice_comp['type']}")
                print(f"   Upload:  {upload_comp['type']}")

        print("\n💡 ANALYSIS COMPLETE")
        return invoice_data, upload_data

Parameters

Name	Type	Default	Kind
`bases`	-	-

Parameter Details

__init__: No parameters required. The constructor automatically initializes authentication with reMarkable cloud services using RemarkableAuth and establishes an authenticated session. Raises RuntimeError if authentication fails.

Return Value

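Instantiation returns a DocumentComparator object with an authenticated session. The analyze_document method returns a dictionary containing the document's root entry data, schema lines, components dictionary, and parsed metadata (the 'metadata' value is None when the document has no metadata component); it returns None instead of a dictionary when the document is not found in root.docSchema. The compare_documents method returns a tuple of (invoice_data, upload_data) dictionaries with full analysis results for both documents, or None if either document cannot be analyzed.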

Class Interface

Methods

`__init__(self)`

Purpose: Initialize the DocumentComparator with an authenticated reMarkable session

Returns: None - initializes the instance with self.session attribute

`get_root_info(self) -> tuple`

Purpose: Retrieve the current root.docSchema information from reMarkable cloud

Returns: Tuple of (root_data: dict, root_content: str) where root_data contains the root document metadata and root_content is the raw schema content

`analyze_document(self, doc_uuid: str, doc_name: str) -> dict | None`

Purpose: Perform comprehensive analysis of a single document's structure, components, and metadata

Parameters:

doc_uuid: The UUID of the document to analyze (full UUID string)
doc_name: Human-readable name for the document (used in console output)

Returns: Dictionary with keys 'entry' (root entry data), 'schema_lines' (document schema lines), 'components' (dict of component types to their data), and 'metadata' (parsed metadata or None). Returns None if document not found.

`compare_documents(self) -> tuple | None`

Purpose: Compare two hardcoded documents (Poulpharm invoice vs uploaded document) to identify structural differences

Returns: Tuple of (invoice_data: dict, upload_data: dict) containing full analysis results for both documents, or None if either document cannot be analyzed

Attributes

Name	Type	Description	Scope
`session`	requests.Session	Authenticated HTTP session object for making API requests to reMarkable cloud services. Initialized in __init__ via RemarkableAuth.get_authenticated_session()	instance

Dependencies

json
requests
auth

Required Imports

import json
from auth import RemarkableAuth

Usage Example

# Initialize the comparator (authenticates; raises RuntimeError on failure)
comparator = DocumentComparator()

# Analyze a single document
doc_uuid = 'cf2a3833-4a8f-4004-ab8d-8dc3c5f561bc'
result = comparator.analyze_document(doc_uuid, 'My Document')
if result:
    print(f"Components: {result['components'].keys()}")
    print(f"Metadata: {result['metadata']}")

# Compare two documents (uses hardcoded UUIDs).
# compare_documents() returns None when either document cannot be analyzed,
# so check the result before unpacking it.
comparison = comparator.compare_documents()
if comparison is not None:
    invoice_data, upload_data = comparison

# Get current root info
root_data, root_content = comparator.get_root_info()

Best Practices

Always instantiate within a try-except block to handle authentication failures gracefully
The class maintains an authenticated session as instance state - reuse the same instance for multiple operations to avoid re-authentication
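analyze_document() fetches a fresh copy of root.docSchema via get_root_info() on every call, so each analysis already reflects the latest cloud state; call get_root_info() directly only when you need the raw root data yourself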
The compare_documents() method uses hardcoded UUIDs - modify these in the source or extend the method to accept parameters
Network errors may occur during API calls - implement retry logic or error handling around method calls
Large documents may take time to download and analyze - consider adding timeouts for production use
The session object is stateful and may expire - implement session refresh logic for long-running processes
Document UUIDs must exist in the reMarkable cloud; non-existent UUIDs will return None from analyze_document, and compare_documents then returns None as well, so check its result before unpacking it

Similar Components

AI-powered semantic similarity - components with related functionality:

function main_v113 71.6% similar

Analyzes and compares .content files for PDF documents stored in reMarkable cloud storage, identifying differences between working and non-working documents.
From: /tf/active/vicechatdev/e-ink-llm/cloudtest/analyze_content_files.py
class RealAppUploadAnalyzer 70.8% similar

Analyzes documents uploaded by the real reMarkable app by fetching and examining their structure, metadata, and components from the reMarkable cloud sync service.
From: /tf/active/vicechatdev/e-ink-llm/cloudtest/test_real_app_upload.py
function main_v94 67.5% similar

Entry point function that compares real versus uploaded documents using DocumentComparator and displays the comparison results with formatted output.
From: /tf/active/vicechatdev/e-ink-llm/cloudtest/compare_documents.py
class FolderDebugger 65.6% similar

A debugging utility class for analyzing and troubleshooting folder structure and visibility issues in the reMarkable cloud sync system.
From: /tf/active/vicechatdev/e-ink-llm/cloudtest/debug_gpt_in_folder.py
function verify_document_status 64.3% similar

Verifies the current status and metadata of a specific test document in the reMarkable cloud sync system by querying the sync API endpoints and analyzing the document's location and properties.
From: /tf/active/vicechatdev/e-ink-llm/cloudtest/verify_document_status.py

← Back to Browse

Assistant

Hi! I can help improve this code. Tell me what you'd like to enhance (e.g., "add error handling", "optimize performance", "improve readability", "add type hints").

Code Comparison

Original Code

                            class DocumentComparator:
    """Compare documents to find structural differences.

    Looks a document up in root.docSchema, downloads its schema and its
    metadata/content/pagedata components, prints what it finds, and reports
    the differences between two analysed documents.
    """
    
    def __init__(self):
        """Obtain an authenticated session via RemarkableAuth.

        Raises:
            RuntimeError: If get_authenticated_session() returns a falsy value.
        """
        # Load auth session
        auth = RemarkableAuth()
        self.session = auth.get_authenticated_session()
        
        if not self.session:
            raise RuntimeError("Failed to authenticate with reMarkable")
        
        print("🔄 Document Comparator Initialized")
    
    def get_root_info(self):
        """Get current root.docSchema info.

        Returns:
            Tuple (root_data, root_content): the parsed JSON answer of the
            root endpoint and the text of the file addressed by its 'hash'.
        """
        root_response = self.session.get("https://eu.tectonic.remarkable.com/sync/v4/root")
        root_response.raise_for_status()
        root_data = root_response.json()
        
        # Get root content
        root_content_response = self.session.get(f"https://eu.tectonic.remarkable.com/sync/v3/files/{root_data['hash']}")
        root_content_response.raise_for_status()
        root_content = root_content_response.text
        
        return root_data, root_content
    
    def analyze_document(self, doc_uuid: str, doc_name: str):
        """Analyze a single document structure.

        Args:
            doc_uuid: UUID used to find the document's line in root.docSchema.
            doc_name: Label used only in the printed output.

        Returns:
            Dict with keys 'entry', 'schema_lines', 'components' and
            'metadata' (None when there is no metadata component), or None
            when no matching root entry is found.
        """
        print(f"\n📄 Analyzing {doc_name} ({doc_uuid[:8]}...)")
        print("=" * 50)
        
        # Get root info
        root_data, root_content = self.get_root_info()
        
        # Find document in root
        lines = root_content.strip().split('\n')
        doc_entry = None
        
        for line in lines[1:]:  # Skip version header
            # NOTE(review): this is a substring test on the whole line, so a
            # partial or empty doc_uuid also matches; the first matching line
            # with at least five fields wins.
            if doc_uuid in line:
                parts = line.split(':')
                if len(parts) >= 5:
                    doc_entry = {
                        'hash': parts[0],
                        'flags': parts[1],
                        'uuid': parts[2],
                        'type': parts[3],
                        'size': parts[4],
                        'full_line': line
                    }
                    break
        
        if not doc_entry:
            print(f"❌ Document not found in root.docSchema")
            return None
        
        print(f"📋 ROOT.DOCSCHEMA ENTRY:")
        print(f"   Hash: {doc_entry['hash']}")
        print(f"   Flags: {doc_entry['flags']}")
        print(f"   UUID: {doc_entry['uuid']}")
        print(f"   Type: {doc_entry['type']}")
        print(f"   Size: {doc_entry['size']}")
        print(f"   Full line: {doc_entry['full_line']}")
        
        # Get document schema
        doc_response = self.session.get(f"https://eu.tectonic.remarkable.com/sync/v3/files/{doc_entry['hash']}")
        doc_response.raise_for_status()
        doc_content = doc_response.text
        doc_lines = doc_content.strip().split('\n')
        
        print(f"\n📄 DOCUMENT SCHEMA:")
        print(f"   Version: {doc_lines[0]}")
        print(f"   Components: {len(doc_lines) - 1}")
        
        # Components are keyed by file extension, so a later line with the
        # same extension overwrites an earlier one.
        components = {}
        for i, line in enumerate(doc_lines[1:], 1):
            parts = line.split(':')
            if len(parts) >= 5:
                component_type = parts[2].split('.')[-1]  # Get extension
                components[component_type] = {
                    'hash': parts[0],
                    'flags': parts[1],
                    'filename': parts[2],
                    'type': parts[3],
                    'size': parts[4],
                    'line': line
                }
                print(f"   Component {i}: {component_type} ({parts[4]} bytes)")
        
        # Analyze metadata
        if 'metadata' in components:
            metadata_hash = components['metadata']['hash']
            metadata_response = self.session.get(f"https://eu.tectonic.remarkable.com/sync/v3/files/{metadata_hash}")
            metadata_response.raise_for_status()
            metadata = json.loads(metadata_response.text)
            
            print(f"\n📝 METADATA:")
            for key, value in sorted(metadata.items()):
                print(f"   {key}: {value}")
        
        # Analyze content if exists
        if 'content' in components:
            content_hash = components['content']['hash']
            content_response = self.session.get(f"https://eu.tectonic.remarkable.com/sync/v3/files/{content_hash}")
            content_response.raise_for_status()
            content_data = content_response.text
            
            print(f"\n📄 CONTENT:")
            # NOTE(review): content_data is decoded text, so len() counts
            # characters even though the label says bytes.
            print(f"   Size: {len(content_data)} bytes")
            print(f"   Type: {type(content_data)}")
            if len(content_data) < 200:
                print(f"   Data: {content_data}")
            else:
                print(f"   Preview: {content_data[:100]}...")
        
        # Analyze pagedata if exists
        if 'pagedata' in components:
            pagedata_hash = components['pagedata']['hash']
            pagedata_response = self.session.get(f"https://eu.tectonic.remarkable.com/sync/v3/files/{pagedata_hash}")
            pagedata_response.raise_for_status()
            pagedata_data = pagedata_response.text
            
            print(f"\n📄 PAGEDATA:")
            print(f"   Size: {len(pagedata_data)} bytes")
            if len(pagedata_data) < 200:
                print(f"   Data: {pagedata_data}")
            else:
                print(f"   Preview: {pagedata_data[:100]}...")
        
        # `metadata` is only bound when a metadata component exists; the
        # conditional expression below avoids reading it otherwise.
        return {
            'entry': doc_entry,
            'schema_lines': doc_lines,
            'components': components,
            'metadata': metadata if 'metadata' in components else None
        }
    
    def compare_documents(self):
        """Compare the real invoice vs our uploaded document.

        Both UUIDs are hard-coded in the method body.

        Returns:
            Tuple (invoice_data, upload_data) of analyze_document results,
            or None when either document could not be analyzed.
        """
        print(f"🔍 Comparing Real App Document vs Our Upload")
        print("=" * 60)
        
        # Analyze invoice document (real app) - Poulpharm invoice
        invoice_uuid = "cf2a3833-4a8f-4004-ab8d-8dc3c5f561bc"  # Poulpharm invoice
        invoice_data = self.analyze_document(invoice_uuid, "Poulpharm Invoice (Real App)")
        
        # Analyze our uploaded document
        upload_uuid = "206f5df3-07c2-4341-8afd-2b7362aefa91"
        upload_data = self.analyze_document(upload_uuid, "Our Upload")
        
        if not invoice_data or not upload_data:
            print(f"❌ Could not analyze both documents")
            return
        
        # Compare key differences
        print(f"\n🔍 KEY DIFFERENCES:")
        print("=" * 30)
        
        # Compare root entry flags
        invoice_flags = invoice_data['entry']['flags']
        upload_flags = upload_data['entry']['flags']
        if invoice_flags != upload_flags:
            print(f"📋 ROOT FLAGS DIFFER:")
            print(f"   Invoice: {invoice_flags}")
            print(f"   Upload:  {upload_flags}")
        
        # Compare document types
        invoice_type = invoice_data['entry']['type']
        upload_type = upload_data['entry']['type']
        if invoice_type != upload_type:
            print(f"📋 DOCUMENT TYPE DIFFERS:")
            print(f"   Invoice: {invoice_type}")
            print(f"   Upload:  {upload_type}")
        
        # Compare components
        invoice_components = set(invoice_data['components'].keys())
        upload_components = set(upload_data['components'].keys())
        
        missing_in_upload = invoice_components - upload_components
        extra_in_upload = upload_components - invoice_components
        
        if missing_in_upload:
            print(f"📄 COMPONENTS MISSING IN UPLOAD: {missing_in_upload}")
        if extra_in_upload:
            print(f"📄 EXTRA COMPONENTS IN UPLOAD: {extra_in_upload}")
        
        # Compare metadata keys (skipped when either document has no or
        # empty metadata)
        if invoice_data['metadata'] and upload_data['metadata']:
            invoice_keys = set(invoice_data['metadata'].keys())
            upload_keys = set(upload_data['metadata'].keys())
            
            missing_keys = invoice_keys - upload_keys
            extra_keys = upload_keys - invoice_keys
            
            if missing_keys:
                print(f"📝 METADATA KEYS MISSING IN UPLOAD: {missing_keys}")
            if extra_keys:
                print(f"📝 EXTRA METADATA KEYS IN UPLOAD: {extra_keys}")
            
            # Compare specific metadata values
            common_keys = invoice_keys & upload_keys
            for key in common_keys:
                if invoice_data['metadata'][key] != upload_data['metadata'][key]:
                    print(f"📝 METADATA DIFFERS for '{key}':")
                    print(f"   Invoice: {invoice_data['metadata'][key]}")
                    print(f"   Upload:  {upload_data['metadata'][key]}")
        
        # Compare component flags and types (sizes are not compared)
        for component in invoice_components & upload_components:
            invoice_comp = invoice_data['components'][component]
            upload_comp = upload_data['components'][component]
            
            if invoice_comp['flags'] != upload_comp['flags']:
                print(f"📄 COMPONENT FLAGS DIFFER for {component}:")
                print(f"   Invoice: {invoice_comp['flags']}")
                print(f"   Upload:  {upload_comp['flags']}")
            
            if invoice_comp['type'] != upload_comp['type']:
                print(f"📄 COMPONENT TYPE DIFFERS for {component}:")
                print(f"   Invoice: {invoice_comp['type']}")
                print(f"   Upload:  {upload_comp['type']}")
        
        print(f"\n💡 ANALYSIS COMPLETE")
        return invoice_data, upload_data
                        

Improved Code

🔍 Code Extractor

class DocumentComparator

Purpose

Source Code

Parameters

Parameter Details

Return Value

Class Interface

Methods

`__init__(self)`

`get_root_info(self) -> tuple`

`analyze_document(self, doc_uuid: str, doc_name: str) -> dict | None`

`compare_documents(self) -> tuple | None`

Attributes

Dependencies

Required Imports

Usage Example

Best Practices

Tags

Similar Components

function main_v113 71.6% similar

class RealAppUploadAnalyzer 70.8% similar

function main_v94 67.5% similar

class FolderDebugger 65.6% similar

function verify_document_status 64.3% similar

class DocumentComparator

Purpose

Source Code

Parameters

Parameter Details

Return Value

Class Interface

Methods

__init__(self)

get_root_info(self) -> tuple

analyze_document(self, doc_uuid: str, doc_name: str) -> dict | None

compare_documents(self) -> tuple | None

Attributes

Dependencies

Required Imports

Usage Example

Best Practices

Tags

Similar Components

function main_v113 71.6% similar

class RealAppUploadAnalyzer 70.8% similar

function main_v94 67.5% similar

class FolderDebugger 65.6% similar

function verify_document_status 64.3% similar

✨ Improve Code: DocumentComparator

Code Comparison

`__init__(self)`

`get_root_info(self) -> tuple`

`analyze_document(self, doc_uuid: str, doc_name: str) -> dict | None`

`compare_documents(self) -> tuple | None`