class SyncDiagnostics
A diagnostic class that analyzes and reports on synchronization issues between SharePoint and FileCloud, identifying missing files and root causes of sync failures.
/tf/active/vicechatdev/SPFCsync/deep_diagnostics.py
20 - 264
complex
Purpose
SyncDiagnostics provides comprehensive analysis of SharePoint to FileCloud synchronization operations. It retrieves documents from both systems, compares them, identifies discrepancies, analyzes root causes of missing files (such as API pagination limits, authentication issues, error handling problems), and provides actionable recommendations for fixing sync issues. This class is designed for troubleshooting and auditing sync operations.
Source Code
class SyncDiagnostics:
    """Diagnose synchronization discrepancies between SharePoint and FileCloud.

    Retrieves document listings from both systems, compares counts, identifies
    likely root causes of missing files (API pagination limits, bulk download
    failures, date parsing problems, ...) and prints actionable
    recommendations to stdout. Read-only: never modifies data in either
    system, so it is safe to run against production for troubleshooting.
    """

    def __init__(self):
        """Set up logging and build SharePoint/FileCloud clients from Config."""
        Config.setup_logging()
        self.logger = logging.getLogger(__name__)
        # Initialize clients
        self.sp_client = SharePointGraphClient(
            Config.SHAREPOINT_SITE_URL,
            Config.AZURE_CLIENT_ID,
            Config.AZURE_CLIENT_SECRET
        )
        self.fc_client = FileCloudClient(
            Config.FILECLOUD_SERVER_URL,
            Config.FILECLOUD_USERNAME,
            Config.FILECLOUD_PASSWORD
        )

    def analyze_missing_files(self):
        """Comprehensive analysis of missing files and root causes.

        Orchestrates the full diagnostic pipeline (retrieval, comparison,
        root-cause analysis, recommendations) and prints a report to stdout.

        Returns:
            dict: {'sharepoint_docs': list, 'filecloud_structure': dict,
            'comparison': dict} containing the raw data from each stage.
        """
        print("=" * 80)
        print("SHAREPOINT TO FILECLOUD SYNC - ROOT CAUSE ANALYSIS")
        print("=" * 80)
        # 1. Analyze SharePoint document retrieval
        print("\n1. ANALYZING SHAREPOINT DOCUMENT RETRIEVAL")
        print("-" * 50)
        sp_docs = self._analyze_sharepoint_retrieval()
        # 2. Analyze FileCloud structure
        print("\n2. ANALYZING FILECLOUD STRUCTURE")
        print("-" * 50)
        fc_structure = self._analyze_filecloud_structure()
        # 3. Compare documents
        print("\n3. COMPARING DOCUMENT LISTS")
        print("-" * 50)
        comparison = self._compare_documents(sp_docs, fc_structure)
        # 4. Identify potential causes
        print("\n4. ROOT CAUSE ANALYSIS")
        print("-" * 50)
        self._identify_root_causes(sp_docs, fc_structure, comparison)
        # 5. Provide recommendations
        print("\n5. RECOMMENDATIONS AND FIXES")
        print("-" * 50)
        self._provide_recommendations()
        return {
            'sharepoint_docs': sp_docs,
            'filecloud_structure': fc_structure,
            'comparison': comparison
        }

    def _analyze_sharepoint_retrieval(self):
        """Analyze SharePoint document retrieval for potential issues.

        Returns:
            list: document dicts from SharePoint, or an empty list on error.
        """
        try:
            print("📊 Retrieving documents from SharePoint...")
            docs = self.sp_client.get_all_documents("/")
            print(f"✅ Retrieved {len(docs)} documents from SharePoint")
            # Aggregate document properties for the report.
            file_types = {}
            folder_distribution = {}
            date_issues = []
            size_distribution = {'small': 0, 'medium': 0, 'large': 0, 'huge': 0}
            for doc in docs:
                # File type analysis
                file_type = doc.get('file_type', 'unknown')
                file_types[file_type] = file_types.get(file_type, 0) + 1
                # Folder distribution
                folder = doc.get('folder_path', '/')
                folder_distribution[folder] = folder_distribution.get(folder, 0) + 1
                # Date analysis: empty or year-only 'modified' values are suspect.
                modified = doc.get('modified', '')
                if not modified or len(modified) == 4:  # Year-only dates
                    date_issues.append(doc['name'])
                # Size distribution buckets.
                size = doc.get('size', 0)
                if size < 1024 * 1024:  # < 1MB
                    size_distribution['small'] += 1
                elif size < 10 * 1024 * 1024:  # < 10MB
                    size_distribution['medium'] += 1
                elif size < 100 * 1024 * 1024:  # < 100MB
                    size_distribution['large'] += 1
                else:  # >= 100MB
                    size_distribution['huge'] += 1
            print(f"📁 Folder distribution: {len(folder_distribution)} folders")
            print(f"📄 File types: {dict(sorted(file_types.items(), key=lambda x: x[1], reverse=True)[:10])}")
            print(f"📅 Date issues: {len(date_issues)} files with problematic dates")
            print(f"📏 Size distribution: {size_distribution}")
            if date_issues:
                print(f"⚠️ Files with date issues (first 10): {date_issues[:10]}")
            return docs
        except Exception as e:
            # Best-effort diagnostics: report the failure and return an empty
            # list so the later stages can still run.
            print(f"❌ Error retrieving SharePoint documents: {e}")
            return []

    def _analyze_filecloud_structure(self):
        """Analyze FileCloud structure and count files.

        Returns:
            dict: {'file_count': int, 'base_path': str}.
        """
        # Bug fix: base_path was previously assigned inside the try block, so
        # a failure before/at that assignment made the except handler raise
        # NameError while building its return value. Bind it up front.
        base_path = ''
        try:
            print("📊 Analyzing FileCloud structure...")
            # Get base path info
            base_path = Config.FILECLOUD_BASE_PATH
            print(f"🗂️ Base path: {base_path}")
            # Count files in FileCloud (this is a simplified count)
            # In practice, you'd need to implement a recursive file counter for FileCloud
            fc_files = self._count_filecloud_files(base_path)
            print(f"✅ Found approximately {fc_files} files in FileCloud")
            return {'file_count': fc_files, 'base_path': base_path}
        except Exception as e:
            print(f"❌ Error analyzing FileCloud: {e}")
            return {'file_count': 0, 'base_path': base_path}

    def _count_filecloud_files(self, path):
        """Simple file counter for FileCloud (placeholder implementation).

        Args:
            path: FileCloud path to probe for existence.

        Returns:
            int: 1 if a file exists at ``path``, 0 otherwise (including on
            any client error).
        """
        # This is a simplified implementation.
        # In practice, you'd need to recursively traverse FileCloud directories.
        try:
            file_info = self.fc_client.get_file_info(path)
            return 1 if file_info else 0
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # are no longer swallowed; any client error counts as "no file".
            return 0

    def _compare_documents(self, sp_docs, fc_structure):
        """Compare SharePoint and FileCloud document counts.

        Args:
            sp_docs: list of SharePoint document dicts.
            fc_structure: dict produced by _analyze_filecloud_structure.

        Returns:
            dict: 'sharepoint_count', 'filecloud_count', 'missing_count'
            and 'percentage_missing'.
        """
        sp_count = len(sp_docs)
        fc_count = fc_structure.get('file_count', 0)
        missing_count = sp_count - fc_count
        # Guard against division by zero when SharePoint returned no documents.
        percentage_missing = (missing_count / sp_count * 100) if sp_count > 0 else 0
        print(f"📊 SharePoint documents: {sp_count}")
        print(f"📊 FileCloud files: {fc_count}")
        print(f"📊 Missing files: {missing_count} ({percentage_missing:.1f}%)")
        return {
            'sharepoint_count': sp_count,
            'filecloud_count': fc_count,
            'missing_count': missing_count,
            'percentage_missing': percentage_missing
        }

    def _identify_root_causes(self, sp_docs, fc_structure, comparison):
        """Identify potential root causes for missing files.

        Args:
            sp_docs: list of SharePoint document dicts.
            fc_structure: FileCloud structure dict (kept for interface
                symmetry with the other stages; not read here).
            comparison: dict produced by _compare_documents.

        Returns:
            list[str]: cause identifiers such as 'API_PAGINATION_LIMIT'.
        """
        print("🔍 POTENTIAL ROOT CAUSES IDENTIFIED:")
        causes_found = []
        # 1. Check for Microsoft Graph API pagination limits
        if len(sp_docs) >= 5000:
            causes_found.append("API_PAGINATION_LIMIT")
            print("❌ CRITICAL: Microsoft Graph API Pagination Issue")
            print(" - The app retrieves documents without proper pagination")
            print(" - Graph API typically returns max 200-5000 items per request")
            print(" - Large SharePoint sites may have documents truncated")
        # 2. Check for authentication token expiration
        if comparison['missing_count'] > 1000:
            causes_found.append("BULK_DOWNLOAD_FAILURES")
            print("❌ CRITICAL: Bulk Download Failures")
            print(" - Large numbers of files are failing to download")
            print(" - This could be due to authentication token expiration")
            print(" - Or download URL caching issues")
        # 3. Error handling is always flagged: a known weakness of the app.
        causes_found.append("ERROR_HANDLING")
        print("⚠️ WARNING: Error Handling Issues")
        print(" - The app continues after download failures")
        print(" - Individual file failures don't stop the sync")
        print(" - Error statistics show successful completion despite failures")
        # 4. Check for file size limits
        if any(doc.get('size', 0) > 100 * 1024 * 1024 for doc in sp_docs):
            causes_found.append("FILE_SIZE_LIMITS")
            print("⚠️ WARNING: Large File Handling")
            print(" - Some files are very large (>100MB)")
            print(" - May cause timeout or memory issues")
        # 5. Concurrent access is always flagged: a known behavior of the app.
        causes_found.append("CONCURRENT_ACCESS")
        print("⚠️ WARNING: Concurrent Access Pattern")
        print(" - App retrieves document list multiple times during sync")
        print(" - This can cause inconsistencies if documents change")
        # 6. Check for date parsing issues (year-only 'modified' values).
        date_problem_files = [doc for doc in sp_docs if len(doc.get('modified', '')) == 4]
        if date_problem_files:
            causes_found.append("DATE_PARSING")
            print(f"⚠️ WARNING: Date Parsing Issues ({len(date_problem_files)} files)")
            print(" - Some files have invalid date formats (year-only)")
            print(" - This may cause comparison failures")
        return causes_found

    def _provide_recommendations(self):
        """Print specific recommendations to fix the identified issues."""
        print("🔧 RECOMMENDED FIXES:")
        print("\n1. FIX MICROSOFT GRAPH API PAGINATION")
        print(" - Implement proper pagination in _get_documents_recursive")
        print(" - Use @odata.nextLink to retrieve all pages")
        print(" - Add progress tracking for large document sets")
        print("\n2. IMPROVE ERROR HANDLING AND RECOVERY")
        print(" - Stop sync on critical errors (auth failures)")
        print(" - Implement retry logic for failed downloads")
        print(" - Add file-level success/failure tracking")
        print("\n3. FIX DOCUMENT CACHING ISSUE")
        print(" - Cache document list at start of sync")
        print(" - Don't retrieve document list multiple times")
        print(" - Use cached list for all download operations")
        print("\n4. ADD COMPREHENSIVE VALIDATION")
        print(" - Verify each file was actually uploaded to FileCloud")
        print(" - Compare file sizes and checksums")
        print(" - Generate detailed sync reports")
        print("\n5. IMPLEMENT INCREMENTAL SYNC")
        print(" - Track last successful sync timestamp")
        print(" - Only sync files modified since last run")
        print(" - Reduce load on both SharePoint and FileCloud")
        print("\n6. ADD MONITORING AND ALERTING")
        print(" - Alert when error rates exceed threshold")
        print(" - Monitor sync completion rates")
        print(" - Track missing file counts over time")
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| bases | - | - | - |
Parameter Details
No constructor parameters: The __init__ method takes no parameters. It automatically initializes logging via Config.setup_logging() and creates client instances for SharePoint and FileCloud using configuration values from the Config class.
Return Value
The class instantiation returns a SyncDiagnostics object. The main method analyze_missing_files() returns a dictionary with keys: 'sharepoint_docs' (list of SharePoint documents), 'filecloud_structure' (dict with file count and base path), and 'comparison' (dict with count statistics and missing file analysis).
Class Interface
Methods
__init__(self)
Purpose: Initializes the SyncDiagnostics instance by setting up logging and creating SharePoint and FileCloud client instances
Returns: None - initializes instance attributes logger, sp_client, and fc_client
analyze_missing_files(self) -> dict
Purpose: Main entry point that performs comprehensive analysis of missing files between SharePoint and FileCloud, including retrieval, comparison, root cause analysis, and recommendations
Returns: Dictionary with keys 'sharepoint_docs' (list of document dicts), 'filecloud_structure' (dict with file_count and base_path), and 'comparison' (dict with sharepoint_count, filecloud_count, missing_count, percentage_missing)
_analyze_sharepoint_retrieval(self) -> list
Purpose: Retrieves all documents from SharePoint and analyzes their properties including file types, folder distribution, date issues, and size distribution
Returns: List of document dictionaries from SharePoint, or empty list on error. Each document contains properties like name, file_type, folder_path, modified, size
_analyze_filecloud_structure(self) -> dict
Purpose: Analyzes FileCloud structure and counts files in the configured base path
Returns: Dictionary with keys 'file_count' (int) and 'base_path' (str)
_count_filecloud_files(self, path: str) -> int
Purpose: Simple file counter for FileCloud at a given path (placeholder implementation that checks if path exists)
Parameters:
path: FileCloud path to check for file existence
Returns: Integer count: 1 if file exists at path, 0 otherwise
_compare_documents(self, sp_docs: list, fc_structure: dict) -> dict
Purpose: Compares document counts between SharePoint and FileCloud to calculate missing files and percentages
Parameters:
sp_docs: List of SharePoint documents from _analyze_sharepoint_retrieval. fc_structure: FileCloud structure dict from _analyze_filecloud_structure.
Returns: Dictionary with keys 'sharepoint_count', 'filecloud_count', 'missing_count', and 'percentage_missing'
_identify_root_causes(self, sp_docs: list, fc_structure: dict, comparison: dict) -> list
Purpose: Analyzes data to identify potential root causes of sync failures such as API pagination limits, bulk download failures, error handling issues, file size limits, concurrent access problems, and date parsing issues
Parameters:
sp_docs: List of SharePoint documents. fc_structure: FileCloud structure dictionary. comparison: Comparison results dictionary.
Returns: List of cause identifiers (strings) such as 'API_PAGINATION_LIMIT', 'BULK_DOWNLOAD_FAILURES', 'ERROR_HANDLING', etc.
_provide_recommendations(self)
Purpose: Prints detailed recommendations for fixing identified sync issues including pagination fixes, error handling improvements, caching solutions, validation, incremental sync, and monitoring
Returns: None - prints recommendations to stdout
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| logger | logging.Logger | Logger instance for the SyncDiagnostics class, initialized with __name__ | instance |
| sp_client | SharePointGraphClient | Client instance for interacting with SharePoint via Microsoft Graph API, initialized with site URL and Azure credentials from Config | instance |
| fc_client | FileCloudClient | Client instance for interacting with FileCloud, initialized with server URL and credentials from Config | instance |
Dependencies
logging, sys, os, datetime, json, traceback, sharepoint_graph_client, filecloud_client, config
Required Imports
import sys
import os
import logging
from datetime import datetime
import json
import traceback
from sharepoint_graph_client import SharePointGraphClient
from filecloud_client import FileCloudClient
from config import Config
Usage Example
from sync_diagnostics import SyncDiagnostics
from config import Config
# Ensure Config is properly set up with required credentials
# Config.SHAREPOINT_SITE_URL = 'https://yoursite.sharepoint.com/sites/yoursite'
# Config.AZURE_CLIENT_ID = 'your-client-id'
# etc.
# Create diagnostics instance
diagnostics = SyncDiagnostics()
# Run comprehensive analysis
results = diagnostics.analyze_missing_files()
# Access results
print(f"SharePoint documents found: {len(results['sharepoint_docs'])}")
print(f"FileCloud files found: {results['filecloud_structure']['file_count']}")
print(f"Missing files: {results['comparison']['missing_count']}")
print(f"Percentage missing: {results['comparison']['percentage_missing']:.1f}%")
Best Practices
- Instantiate SyncDiagnostics only after ensuring all Config values are properly set with valid credentials
- Run analyze_missing_files() as the primary entry point for diagnostics - it orchestrates all analysis steps
- The class prints extensive diagnostic output to stdout, so capture or redirect output if needed for logging
- This class is read-only and does not modify SharePoint or FileCloud data - safe for production diagnostics
- The analysis can be time-consuming for large document sets due to API calls to both systems
- Private methods (_analyze_*, _compare_*, etc.) are internal and should not be called directly
- Results dictionary from analyze_missing_files() contains raw data for further programmatic analysis
- The class creates new client instances on each instantiation - consider connection pooling for repeated use
- Error handling is built-in but errors are logged and printed rather than raised, allowing partial analysis completion
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
-
function main_v16 81.0% similar
-
class SharePointFileCloudSync 73.6% similar
-
function analyze_logs 68.6% similar
-
function test_filecloud_integration 68.2% similar
-
function dry_run_test 67.7% similar