class RealAppUploadAnalyzer
Analyzes documents uploaded by the real reMarkable app by fetching and examining their structure, metadata, and components from the reMarkable cloud sync service.
File: /tf/active/vicechatdev/e-ink-llm/cloudtest/test_real_app_upload.py
Lines: 71 - 381
Complexity: complex
Purpose
This class provides comprehensive analysis of documents stored in the reMarkable cloud infrastructure. It authenticates with the reMarkable sync service, retrieves document schemas, fetches individual components (PDF files, metadata, content, pagedata), validates size consistency, and provides detailed reporting on document structure. The analyzer is designed to understand how the reMarkable app structures and stores documents in the cloud, including the relationship between root.docSchema entries, document schemas, and their constituent components.
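The colon-separated entry format used by root.docSchema and by per-document docSchemas is not formally specified here; the field positions are inferred from how the class indexes each split line (hash first, UUID or component name third, node type fourth, size fifth). A minimal, hedged sketch of that parsing, using an invented sample entry:
# Sketch only: parse one docSchema entry the way the analyzer does.
# The field layout (hash : ? : name-or-uuid : node_type : size) is inferred from
# the source below; the sample line is invented for illustration.
def parse_schema_entry(line: str):
    parts = line.split(':')
    if len(parts) < 5:
        return None
    return {
        'hash': parts[0],        # content hash used to fetch the file
        'name_or_uuid': parts[2],# document UUID in root.docSchema, component name in a document docSchema
        'node_type': parts[3],   # node type flag
        'size': int(parts[4]),   # size in bytes claimed by the schema
    }

print(parse_schema_entry("deadbeef:0:4f2a1c3e-0000-0000-0000-000000000000:80000000:42"))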
Source Code
class RealAppUploadAnalyzer:
    """Analyzes documents uploaded by the real reMarkable app"""

    def __init__(self):
        self.base_dir = Path(__file__).parent

        # Load auth session
        from auth import RemarkableAuth
        auth = RemarkableAuth()
        self.session = auth.get_authenticated_session()
        if not self.session:
            raise RuntimeError("Failed to authenticate with reMarkable")

        print("Real App Upload Analyzer Initialized")
    def analyze_real_app_document(self, target_name: str = "Pylontech force H3 datasheet") -> dict:
        """Analyze a document uploaded by the real reMarkable app"""
        print(f"Analyzing Real App Document: '{target_name}'")
        print("=" * 60)

        try:
            # Step 1: Get current root.docSchema from server
            print(f"\nStep 1: Fetching current root.docSchema...")
            root_response = self.session.get("https://eu.tectonic.remarkable.com/sync/v4/root")
            root_response.raise_for_status()
            root_data = root_response.json()
            current_root_hash = root_data['hash']
            print(f"Root hash: {current_root_hash}")
            print(f"Generation: {root_data.get('generation')}")

            # Step 2: Fetch root.docSchema content
            print(f"\nStep 2: Fetching root.docSchema content...")
            root_content_response = self.session.get(f"https://eu.tectonic.remarkable.com/sync/v3/files/{current_root_hash}")
            root_content_response.raise_for_status()
            root_content = root_content_response.text
            print(f"Root.docSchema size: {len(root_content)} bytes")

            lines = root_content.strip().split('\n')
            print(f"Root.docSchema entries ({len(lines) - 1} total):")
            for i, line in enumerate(lines):
                if i == 0:
                    print(f"   Version: {line}")
                else:
                    print(f"   Entry {i}: {line}")

            # Step 3: Look for our target document
            print(f"\nStep 3: Looking for document containing '{target_name}'...")
            target_documents = []
            lines = root_content.strip().split('\n')

            for line in lines[1:]:  # Skip version header
                if ':' in line:
                    parts = line.split(':')
                    if len(parts) >= 5:
                        doc_hash = parts[0]
                        doc_uuid = parts[2]
                        node_type = parts[3]
                        size = parts[4]

                        # Fetch the document's docSchema to check metadata
                        try:
                            doc_response = self.session.get(f"https://eu.tectonic.remarkable.com/sync/v3/files/{doc_hash}")
                            doc_response.raise_for_status()
                            doc_schema = doc_response.text

                            # Look for metadata component
                            doc_lines = doc_schema.strip().split('\n')
                            for doc_line in doc_lines[1:]:
                                if '.metadata' in doc_line and ':' in doc_line:
                                    metadata_parts = doc_line.split(':')
                                    if len(metadata_parts) >= 3:
                                        metadata_hash = metadata_parts[0]

                                        # Fetch metadata content
                                        try:
                                            metadata_response = self.session.get(f"https://eu.tectonic.remarkable.com/sync/v3/files/{metadata_hash}")
                                            metadata_response.raise_for_status()
                                            metadata = json.loads(metadata_response.text)

                                            doc_name = metadata.get('visibleName', '')
                                            if target_name.lower() in doc_name.lower():
                                                target_documents.append({
                                                    'hash': doc_hash,
                                                    'uuid': doc_uuid,
                                                    'size': size,
                                                    'node_type': node_type,
                                                    'name': doc_name,
                                                    'line': line,
                                                    'metadata': metadata,
                                                    'docschema_size': len(doc_schema)
                                                })
                                                print(f"FOUND: '{doc_name}' (UUID: {doc_uuid[:8]}...)")
                                                print(f"   Root entry: {line}")
                                                print(f"   DocSchema size: {len(doc_schema)} bytes")
                                                print(f"   Root claimed size: {size} bytes")
                                                print(f"   Size match: {'YES' if str(len(doc_schema)) == size else 'NO'}")
                                                break
                                        except:
                                            continue
                                    break
                        except:
                            continue

            if not target_documents:
                print(f"Document containing '{target_name}' not found")
                return {'success': False, 'error': 'Document not found'}
            # Step 4: Analyze the first matching document in detail
            target_doc = target_documents[0]
            print(f"\nStep 4: Deep Analysis of '{target_doc['name']}'")
            print(f"   Document UUID: {target_doc['uuid']}")
            print(f"   Document hash: {target_doc['hash']}")
            print(f"   Node type: {target_doc['node_type']}")
            print(f"   Root.docSchema size claim: {target_doc['size']} bytes")
            print(f"   Actual docSchema size: {target_doc['docschema_size']} bytes")
            print(f"   Size consistency: {'CORRECT' if str(target_doc['docschema_size']) == target_doc['size'] else 'MISMATCH'}")

            # Step 5: Fetch and analyze the document's docSchema
            print(f"\nStep 5: Analyzing document's docSchema structure...")
            doc_response = self.session.get(f"https://eu.tectonic.remarkable.com/sync/v3/files/{target_doc['hash']}")
            doc_response.raise_for_status()
            doc_schema_content = doc_response.text

            lines = doc_schema_content.strip().split('\n')
            print(f"Full docSchema content ({len(doc_schema_content)} bytes):")
            for i, line in enumerate(lines):
                print(f"   Line {i}: {line}")

            # Step 6: Analyze each component
            print(f"\nStep 6: Analyzing each document component...")
            lines = doc_schema_content.strip().split('\n')
            version = lines[0]
            print(f"DocSchema version: {version}")

            components = {}
            component_sizes = []

            for i, line in enumerate(lines[1:], 1):
                if ':' in line:
                    parts = line.split(':')
                    if len(parts) >= 5:
                        comp_hash = parts[0]
                        comp_name = parts[2]
                        comp_size = int(parts[4])
                        component_sizes.append(comp_size)

                        print(f"\n   Component {i}: {comp_name}")
                        print(f"      Hash: {comp_hash}")
                        print(f"      Expected size: {comp_size}")

                        try:
                            comp_response = self.session.get(f"https://eu.tectonic.remarkable.com/sync/v3/files/{comp_hash}")
                            comp_response.raise_for_status()
                            actual_size = len(comp_response.content)
                            print(f"      Actual size: {actual_size} bytes")
                            print(f"      Size match: {'YES' if actual_size == comp_size else 'NO'}")

                            components[comp_name] = {
                                'hash': comp_hash,
                                'expected_size': comp_size,
                                'actual_size': actual_size,
                                'content': comp_response.content
                            }

                            # Component-specific analysis
                            if comp_name.endswith('.pdf'):
                                print(f"      PDF content preview: {comp_response.content[:50]}")
                                if comp_response.content.startswith(b'%PDF'):
                                    print(f"      Valid PDF header")
                                    # Try to determine PDF size/pages
                                    pdf_size_mb = len(comp_response.content) / (1024 * 1024)
                                    print(f"      PDF file size: {pdf_size_mb:.2f} MB")
                                else:
                                    print(f"      Invalid PDF header")
                            elif comp_name.endswith('.metadata'):
                                try:
                                    metadata_json = json.loads(comp_response.text)
                                    print(f"      Valid JSON metadata")
                                    print(f"      Name: {metadata_json.get('visibleName', 'N/A')}")
                                    print(f"      Parent: {metadata_json.get('parent', 'root')}")
                                    print(f"      Type: {metadata_json.get('type', 'N/A')}")
                                    print(f"      Created: {metadata_json.get('createdTime', 'N/A')}")
                                    print(f"      Modified: {metadata_json.get('lastModified', 'N/A')}")

                                    # Check for content_data
                                    if 'content_data' in comp_response.text:
                                        print(f"      Has content_data field")
                                        try:
                                            # Try to extract content_data
                                            import re
                                            content_data_match = re.search(r'"content_data":\s*"([^"]*)"', comp_response.text)
                                            if content_data_match:
                                                content_data_str = content_data_match.group(1)
                                                print(f"      Content data: {content_data_str[:100]}...")
                                        except:
                                            pass

                                    print(f"      Full metadata JSON:")
                                    for key, value in metadata_json.items():
                                        print(f"         {key}: {value}")
                                except Exception as json_e:
                                    print(f"      Invalid JSON: {json_e}")
                            elif comp_name.endswith('.content'):
                                print(f"      Content preview: {comp_response.text[:100]}...")
                            elif comp_name.endswith('.pagedata'):
                                if comp_size == 0:
                                    print(f"      Empty pagedata (as expected for PDFs)")
                                else:
                                    print(f"      Pagedata preview: {comp_response.text[:100]}...")
                        except Exception as e:
                            print(f"      Component error: {e}")
                            components[comp_name] = {
                                'hash': comp_hash,
                                'expected_size': comp_size,
                                'error': str(e)
                            }
            # Step 7: Final analysis and comparison
            print(f"\nStep 7: Final Analysis")
            print("=" * 50)

            total_component_size = sum(component_sizes)
            actual_docschema_size = len(doc_schema_content)
            claimed_size = int(target_doc['size'])

            print(f"Size Analysis Results:")
            print(f"   Document name: {target_doc['name']}")
            print(f"   Root.docSchema claimed size: {claimed_size} bytes")
            print(f"   Actual docSchema size: {actual_docschema_size} bytes")
            print(f"   Sum of component sizes: {total_component_size} bytes")
            print(f"   ")
            print(f"   Key Findings:")
            print(f"   - Root size claim matches actual docSchema: {'YES' if claimed_size == actual_docschema_size else 'NO'}")
            print(f"   - Root size claim matches component sum: {'YES' if claimed_size == total_component_size else 'NO'}")
            print(f"   - DocSchema size matches component sum: {'YES' if actual_docschema_size == total_component_size else 'NO'}")

            # Determine the correct pattern
            if claimed_size == actual_docschema_size:
                print(f"   CONCLUSION: Root.docSchema stores the actual docSchema file size")
            elif claimed_size == total_component_size:
                print(f"   CONCLUSION: Root.docSchema stores the sum of component sizes")
            else:
                print(f"   CONCLUSION: Unclear pattern - sizes don't match expected relationships")

            print(f"\nComponent Breakdown:")
            for name, details in components.items():
                if 'error' not in details:
                    print(f"   {name}: {details['actual_size']} bytes")
                else:
                    print(f"   {name}: {details['error']}")

            return {
                'success': True,
                'document_name': target_doc['name'],
                'document_uuid': target_doc['uuid'],
                'document_hash': target_doc['hash'],
                'root_size_claim': claimed_size,
                'actual_docschema_size': actual_docschema_size,
                'total_component_size': total_component_size,
                'size_claim_matches_docschema': claimed_size == actual_docschema_size,
                'size_claim_matches_components': claimed_size == total_component_size,
                'docschema_matches_components': actual_docschema_size == total_component_size,
                'components': components,
                'metadata': target_doc['metadata']
            }

        except Exception as e:
            print(f"Analysis failed: {e}")
            return {'success': False, 'error': str(e)}
    def save_analysis_results(self, results: dict) -> Path:
        """Save analysis results to file"""
        results_dir = self.base_dir / "test_results" / "real_app_analysis"
        results_dir.mkdir(parents=True, exist_ok=True)

        timestamp = int(time.time())
        results_file = results_dir / f"real_app_analysis_{timestamp}.json"

        with open(results_file, 'w') as f:
            json.dump(results, f, indent=2, default=str)

        print(f"\nAnalysis results saved to: {results_file}")
        return results_file

    def save_raw_logs(self) -> Path:
        """Save raw HTTP logs"""
        global raw_logs
        if not raw_logs:
            return None

        logs_dir = self.base_dir / "test_results" / "real_app_analysis"
        logs_dir.mkdir(parents=True, exist_ok=True)

        timestamp = int(time.time())
        log_file = logs_dir / f"real_app_requests_{timestamp}.json"

        with open(log_file, 'w') as f:
            json.dump(raw_logs, f, indent=2, default=str)

        print(f"Raw HTTP logs saved to: {log_file}")
        return log_file
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| bases | - | - | - |
Parameter Details
__init__: No parameters required. The constructor automatically initializes the base directory path and authenticates with the reMarkable service using the RemarkableAuth class. Raises RuntimeError if authentication fails.
Return Value
The class instantiation returns a RealAppUploadAnalyzer object. The main method analyze_real_app_document() returns a dictionary with keys: 'success' (bool), 'document_name', 'document_uuid', 'document_hash', 'root_size_claim', 'actual_docschema_size', 'total_component_size', size comparison booleans, 'components' (dict of component details), and 'metadata'. On failure, returns {'success': False, 'error': error_message}. The save methods return Path objects pointing to saved files.
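For orientation, a successful result has roughly the shape sketched below; every value shown is a placeholder rather than real output, and component keys follow whatever names appear in the document's docSchema (typically "<uuid>.pdf", "<uuid>.metadata", and so on).
# Placeholder illustration of the result structure (not real data)
example_result = {
    'success': True,
    'document_name': 'Pylontech force H3 datasheet',
    'document_uuid': '4f2a1c3e-0000-0000-0000-000000000000',
    'document_hash': '<hash of the document docSchema>',
    'root_size_claim': 1234,
    'actual_docschema_size': 1234,
    'total_component_size': 567890,
    'size_claim_matches_docschema': True,
    'size_claim_matches_components': False,
    'docschema_matches_components': False,
    'components': {
        'example.pdf': {'hash': '<hash>', 'expected_size': 567890, 'actual_size': 567890, 'content': b'%PDF...'},
    },
    'metadata': {'visibleName': 'Pylontech force H3 datasheet'},
}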
Class Interface
Methods
__init__(self)
Purpose: Initializes the analyzer, sets up base directory, authenticates with reMarkable service, and creates an authenticated session
Returns: None - raises RuntimeError if authentication fails
analyze_real_app_document(self, target_name: str = 'Pylontech force H3 datasheet') -> dict
Purpose: Performs comprehensive analysis of a document uploaded by the reMarkable app, including fetching root schema, document schema, all components, and validating size consistency
Parameters:
target_name: Name or partial name of the document to analyze (case-insensitive substring match). Defaults to 'Pylontech force H3 datasheet'
Returns: Dictionary containing analysis results with keys: success (bool), document_name, document_uuid, document_hash, root_size_claim, actual_docschema_size, total_component_size, size comparison booleans, components (dict), and metadata. Returns {'success': False, 'error': str} on failure
save_analysis_results(self, results: dict) -> Path
Purpose: Saves analysis results to a JSON file in the test_results/real_app_analysis directory with timestamp
Parameters:
results: Dictionary containing analysis results from analyze_real_app_document method
Returns: Path object pointing to the saved JSON file
save_raw_logs(self) -> Path
Purpose: Saves raw HTTP request/response logs to a JSON file if the global raw_logs variable is populated
Returns: Path object pointing to the saved log file, or None if no logs are available
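Because save_raw_logs() returns None when no logs were captured, callers that want to record the log path should guard for that case. A small illustrative check (assumes analyzer is an existing RealAppUploadAnalyzer instance):
# Guard against the None return when no raw logs were captured
log_file = analyzer.save_raw_logs()
if log_file is None:
    print("No raw HTTP logs were captured for this run")
else:
    print(f"Raw logs written to {log_file}")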
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| base_dir | Path | Base directory path where the class file is located, used for resolving relative paths when saving results | instance |
| session | requests.Session | Authenticated HTTP session for requests to the reMarkable cloud API, obtained from RemarkableAuth | instance |
Dependencies
pathlib, json, time, requests, re
Required Imports
import os
import json
import time
from pathlib import Path
import requests
import re
Conditional/Optional Imports
These imports are only needed under specific conditions:
from auth import RemarkableAuth
Condition: Required for authentication with reMarkable service - must be available in the same directory or Python path
Required (conditional)
Usage Example
# Instantiate the analyzer (requires authentication)
analyzer = RealAppUploadAnalyzer()

# Analyze a specific document by name
results = analyzer.analyze_real_app_document(target_name="Pylontech force H3 datasheet")

if results['success']:
    print(f"Document: {results['document_name']}")
    print(f"UUID: {results['document_uuid']}")
    print(f"Size matches: {results['size_claim_matches_docschema']}")

    # Save results to file
    results_file = analyzer.save_analysis_results(results)
    print(f"Results saved to: {results_file}")

    # Optionally save raw HTTP logs
    log_file = analyzer.save_raw_logs()
else:
    print(f"Analysis failed: {results['error']}")
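The example above assumes the auth module is importable from the same directory. Judging from __init__, the analyzer only relies on two things from RemarkableAuth: a no-argument constructor and a get_authenticated_session() method that returns a requests.Session (or None on failure). A minimal stand-in for local experimentation might look like the sketch below; the token handling is entirely hypothetical and not the real RemarkableAuth implementation.
import requests

class FakeRemarkableAuth:
    """Hypothetical stand-in mirroring the interface the analyzer expects."""
    def __init__(self, token: str = "dummy-token"):
        self.token = token

    def get_authenticated_session(self):
        # Return a session carrying a bearer token, or None to simulate an
        # authentication failure (the analyzer then raises RuntimeError).
        session = requests.Session()
        session.headers["Authorization"] = f"Bearer {self.token}"
        return session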
Best Practices
- Always instantiate within a try-except block to handle authentication failures gracefully
- The analyzer makes multiple HTTP requests to the reMarkable API - be mindful of rate limiting
- Ensure the RemarkableAuth module is properly configured before instantiation
- The analyze_real_app_document method performs extensive network operations and may take time for large documents
- Results should be saved using save_analysis_results() for persistence and later review
- The target_name parameter in analyze_real_app_document uses case-insensitive substring matching
- Component analysis includes validation of PDF headers, JSON metadata parsing, and size verification
- The class maintains a session object throughout its lifetime - reuse the same instance for multiple analyses (see the sketch after this list)
- Raw HTTP logs can be saved using save_raw_logs() if the global raw_logs variable is populated
- The analyzer expects documents to follow the reMarkable cloud storage structure with .metadata, .content, .pdf, and .pagedata components
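Tying several of these recommendations together, here is a hedged sketch of reusing one analyzer instance across multiple documents, with authentication failures handled up front; the document names are examples only.
import time

document_names = ["Pylontech force H3 datasheet", "Quarterly report"]  # example names

try:
    analyzer = RealAppUploadAnalyzer()
except RuntimeError as exc:
    print(f"Could not authenticate: {exc}")
else:
    for name in document_names:
        results = analyzer.analyze_real_app_document(target_name=name)
        if results['success']:
            analyzer.save_analysis_results(results)
        else:
            print(f"Skipping '{name}': {results['error']}")
        time.sleep(1)  # crude pause between documents to stay gentle on the API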
Similar Components
AI-powered semantic similarity - components with related functionality:
- function analyze_pylontech_document (74.3% similar)
- function main_v62 (74.2% similar)
- function main_v113 (72.3% similar)
- class FixedUploadTest (71.1% similar)
- class DocumentComparator (70.8% similar)