function validate_and_fix_document_permissions
Validates and optionally fixes document sharing permissions for controlled documents in a Neo4j database, processing documents in configurable batches with detailed progress tracking and error handling.
File: /tf/active/vicechatdev/CDocs/utils/sharing_validator.py
Lines: 22 - 170
Complexity: complex
Purpose
This function performs a comprehensive audit of document sharing permissions across a document management system. It queries controlled documents from a Neo4j graph database, validates their permission settings, and can automatically fix any permission issues found. It is designed for maintenance operations, data integrity checks, and permission reconciliation tasks. It supports filtering by document status and batch processing for performance, and it returns detailed statistics about the run, including counts of issues found, fixes applied, and errors, along with processing speed.
Source Code
def validate_and_fix_document_permissions(
    batch_size: int = 50,
    status_filter: Optional[List[str]] = None,
    fix_issues: bool = True
) -> Dict[str, Any]:
    """
    Validate document sharing permissions and optionally fix any issues found.

    Parameters
    ----------
    batch_size : int, optional
        Number of documents to process in each batch (default: 50)
    status_filter : List[str], optional
        Only process documents with these status codes (default: None = all documents)
    fix_issues : bool, optional
        If True, attempt to fix any permission issues found (default: True)

    Returns
    -------
    Dict[str, Any]
        Summary of validation results
    """
    start_time = time.time()
    logger.info("Starting document sharing permissions validation")

    # Prepare status filter for query
    status_filter_query = ""
    params = {"limit": batch_size, "skip": 0}
    if status_filter:
        status_list = ", ".join([f"'{s}'" for s in status_filter])
        status_filter_query = f"WHERE doc.status IN [{status_list}]"

    # Build query to get documents with current versions
    query = f"""
    MATCH (doc:ControlledDocument)-[:CURRENT_VERSION]->(version:DocumentVersion)
    {status_filter_query}
    RETURN doc, version
    ORDER BY doc.docNumber
    SKIP $skip LIMIT $limit
    """

    # Validation statistics
    stats = {
        "total_documents": 0,
        "processed_documents": 0,
        "permission_issues": 0,
        "fixed_issues": 0,
        "errors": 0,
        "skipped": 0
    }

    # Get total document count for progress tracking
    count_query = f"""
    MATCH (doc:ControlledDocument)-[:CURRENT_VERSION]->(version:DocumentVersion)
    {status_filter_query}
    RETURN count(doc) as count
    """
    count_result = db.run_query(count_query, {})
    if count_result and 'count' in count_result[0]:
        stats["total_documents"] = count_result[0]['count']
    logger.info(f"Found {stats['total_documents']} documents to validate")

    # Process documents in batches
    has_more = True
    skip = 0
    while has_more:
        params["skip"] = skip
        results = db.run_query(query, params)
        if not results or len(results) == 0:
            has_more = False
            break

        logger.info(f"Processing batch of {len(results)} documents (skipped: {skip})")

        # Process each document in the batch
        for result in results:
            doc_data = result.get('doc', {})
            version_data = result.get('version', {})
            try:
                # Create model instances
                document = ControlledDocument(data=doc_data)

                # Track progress
                stats["processed_documents"] += 1

                # Log progress periodically
                if stats["processed_documents"] % 50 == 0:
                    elapsed = time.time() - start_time
                    logger.info(
                        f"Progress: {stats['processed_documents']}/{stats['total_documents']} "
                        f"documents processed in {elapsed:.2f}s"
                    )

                # Validate and fix permissions
                if fix_issues:
                    try:
                        # Use the share controller to check and fix permissions
                        logger.debug(f"Checking permissions for document {document.doc_number}")
                        result = manage_document_permissions(document)
                        if not result.get('success', False):
                            stats["permission_issues"] += 1
                            logger.warning(
                                f"Permission issue with document {document.doc_number}: "
                                f"{result.get('message', 'Unknown error')}"
                            )
                        elif result.get('details') and len(result.get('details', [])) > 0:
                            # Count as fixed if any permissions were modified
                            stats["fixed_issues"] += 1
                            logger.info(f"Fixed permissions for document {document.doc_number}")
                    except Exception as perm_err:
                        stats["errors"] += 1
                        logger.error(f"Error fixing permissions for document {document.uid}: {str(perm_err)}")
                else:
                    # Just validate without fixing
                    # (In a real implementation, you would add validation logic here)
                    pass
            except Exception as e:
                stats["errors"] += 1
                logger.error(f"Error processing document {doc_data.get('UID')}: {str(e)}")

        # Move to the next batch
        skip += len(results)

        # Check if we should continue (stop if we've processed all docs)
        if len(results) < batch_size:
            has_more = False

    # Calculate elapsed time
    elapsed_time = time.time() - start_time

    # Complete the statistics
    stats["elapsed_time"] = f"{elapsed_time:.2f}s"
    stats["documents_per_second"] = stats["processed_documents"] / elapsed_time if elapsed_time > 0 else 0

    logger.info(f"Completed document sharing permissions validation in {elapsed_time:.2f} seconds")
    logger.info(f"Processed {stats['processed_documents']} documents")
    logger.info(f"Found {stats['permission_issues']} permission issues")
    logger.info(f"Fixed {stats['fixed_issues']} permission issues")
    logger.info(f"Encountered {stats['errors']} errors")

    return stats
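Note that when fix_issues is False, the else branch above is only a placeholder, so a dry run currently exercises the query and model construction but does not report permission issues. A minimal sketch of a read-only check that could fill that branch is shown below; it assumes a hypothetical check_permissions(document) helper returning a dict with 'valid' and 'issues' keys, which is not the existing share controller API and would need to be adapted.
# Hypothetical validation-only branch (sketch, not the actual implementation)
check = check_permissions(document)  # assumed helper, adapt to the real API
if not check.get('valid', True):
    stats["permission_issues"] += 1
    logger.warning(
        f"Permission issue (dry run) with document {document.doc_number}: "
        f"{check.get('issues', [])}"
    )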
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| batch_size | int | 50 | positional_or_keyword |
| status_filter | Optional[List[str]] | None | positional_or_keyword |
| fix_issues | bool | True | positional_or_keyword |
Parameter Details
batch_size: Controls how many documents are processed in each database query iteration. Default is 50. Larger values improve performance but increase memory usage. Smaller values are better for systems with limited resources or when processing very large document sets. Must be a positive integer.
status_filter: Optional list of document status codes (strings) to filter which documents are processed. If None (default), all documents are processed regardless of status. Common status values might include 'draft', 'approved', 'archived', etc. Only documents matching one of the provided statuses will be validated.
fix_issues: Boolean flag determining whether the function should attempt to repair permission issues automatically. If True (default), the function calls manage_document_permissions to fix problems. If False, the function only validates and reports issues without making changes, useful for dry-run audits.
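Note that status_filter values are interpolated directly into the Cypher query string in the current implementation, which is fine for trusted, hard-coded status lists. If the filter ever comes from user input, passing it as a Cypher list parameter is safer. The sketch below shows the idea, assuming db.run_query forwards the params dict to the Neo4j driver the same way it does for $skip and $limit (the $statuses name is illustrative, not existing code).
# Sketch: pass the status filter as a list parameter instead of
# interpolating quoted strings into the query text.
status_filter_query = "WHERE doc.status IN $statuses" if status_filter else ""
params = {"limit": batch_size, "skip": 0}
if status_filter:
    params["statuses"] = status_filter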
Return Value
Type: Dict[str, Any]
Returns a dictionary of validation statistics with the following keys:
- 'total_documents' (int): total count of documents matching the filter
- 'processed_documents' (int): number of documents successfully processed
- 'permission_issues' (int): count of documents with permission problems
- 'fixed_issues' (int): count of permissions that were successfully repaired
- 'errors' (int): count of processing errors
- 'skipped' (int): count of skipped documents
- 'elapsed_time' (str): formatted elapsed time, e.g. '45.23s'
- 'documents_per_second' (float): processing throughput
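The returned dictionary is convenient to feed into monitoring or a post-run check. A short sketch of consuming it (the checks shown are illustrative, not part of the function):
# Sketch: acting on the returned statistics after a run
results = validate_and_fix_document_permissions()
if results["errors"] > 0:
    logger.warning(f"Validation completed with {results['errors']} errors")
if results["permission_issues"] > 0:
    logger.warning(
        f"{results['permission_issues']} documents still have unresolved permission issues"
    )
logger.info(f"Fixed {results['fixed_issues']} documents at {results['documents_per_second']:.1f} docs/sec")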
Dependencies
logging, time, typing, datetime, CDocs.models.document, CDocs.models.user_extensions, CDocs.controllers.share_controller, CDocs.db.schema_manager, CDocs.db
Required Imports
import logging
import time
from typing import Dict, List, Any, Optional
from CDocs.models.document import ControlledDocument
from CDocs.controllers.share_controller import manage_document_permissions
from CDocs import db
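The function body also relies on a module-level logger that is not created by these imports. A minimal sketch of the assumed setup (the actual sharing_validator.py may configure logging differently):
# Assumed module-level logger setup (standard logging pattern)
logger = logging.getLogger(__name__)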
Usage Example
# Basic usage - validate and fix all documents
results = validate_and_fix_document_permissions()
print(f"Processed {results['processed_documents']} documents")
print(f"Fixed {results['fixed_issues']} permission issues")
# Validate only approved documents without fixing
results = validate_and_fix_document_permissions(
batch_size=100,
status_filter=['approved', 'published'],
fix_issues=False
)
print(f"Found {results['permission_issues']} issues in approved documents")
# Process large dataset with smaller batches
results = validate_and_fix_document_permissions(
batch_size=25,
fix_issues=True
)
print(f"Processing rate: {results['documents_per_second']:.2f} docs/sec")
print(f"Total time: {results['elapsed_time']}")
Best Practices
- Always test with fix_issues=False first to understand the scope of permission issues before making changes (see the dry-run sketch after this list)
- Use appropriate batch_size based on available memory and database performance - start with default 50 and adjust as needed
- Monitor the returned statistics, especially 'errors' count, to identify systemic issues
- Run during off-peak hours for large document sets to minimize impact on system performance
- Ensure proper logging configuration to capture detailed progress and error information
- Use status_filter to process documents incrementally by status rather than all at once for very large datasets
- Verify database backup exists before running with fix_issues=True on production data
- The function uses SKIP/LIMIT pagination, which can become slow on very large datasets because skipped rows are still produced and discarded; consider adding an index on ControlledDocument.docNumber to support the ORDER BY
- Check that manage_document_permissions function is properly configured and tested before relying on automatic fixes
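A dry-run-first workflow along the lines of the first bullet might look like the sketch below. Keep in mind that with the current source the fix_issues=False branch is a placeholder, so the dry run mainly surfaces query and document-loading errors; the decision point shown is illustrative, and fixes should only proceed after manual review and a verified backup.
# Sketch: audit first, then fix
audit = validate_and_fix_document_permissions(fix_issues=False)
print(f"Dry run processed {audit['processed_documents']} documents "
      f"with {audit['errors']} errors")

if audit["errors"] == 0:  # illustrative gate; review the logs before fixing
    repair = validate_and_fix_document_permissions(
        status_filter=['approved'],  # process incrementally by status
        fix_issues=True
    )
    print(f"Fixed {repair['fixed_issues']} permission issues")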
Similar Components
Components with related functionality (AI-powered semantic similarity):
- function check_document_permissions_on_startup (73.5% similar)
- function check_document_permissions_on_startup_v1 (62.7% similar)
- function manage_document_permissions (60.7% similar)
- function get_document_permissions (59.1% similar)
- function validate_schema (58.3% similar)