class DocumentProcessor
A comprehensive document processing class that converts documents to PDF, adds audit trails, applies security features (watermarks, signatures, hashing), and optionally converts to PDF/A format with document protection.
/tf/active/vicechatdev/document_auditor/src/document_processor.py
16 - 175
complex
Purpose
DocumentProcessor serves as the main orchestrator for processing documents with complete audit trails and security features. It coordinates multiple specialized components to convert documents to PDF, generate audit pages with signature images, merge documents, apply watermarks, convert to PDF/A compliance formats, generate and embed cryptographic hashes for integrity verification, and apply document protection with encryption. This class is designed for scenarios with document compliance, security, and audit trail requirements, such as regulatory submissions, legal documents, or enterprise document management systems.
Source Code
class DocumentProcessor:
    """Main class for processing documents with audit trails and security features.

    The processor wires together the converter, audit page generator, merger,
    watermarker, PDF/A converter, hash generator, signature manager and
    document protector, and runs them in a fixed order in
    :meth:`process_document`.

    Attributes:
        _last_document_hash: Hash generated during the most recent successful
            run of :meth:`process_document`, or None.
        _last_owner_password: Owner password returned by the document
            protector during the most recent run with ``finalize=True``,
            or None.
    """

    # Class-level defaults so both attributes can always be read, even before
    # the first run or after a run with finalize=False.
    _last_document_hash = None
    _last_owner_password = None

    def __init__(self, config_path=None):
        """Initialize the document processor with optional configuration.

        Args:
            config_path (str, optional): Path to a JSON configuration file.
                When given and present on disk it is loaded into
                ``self.config``; otherwise ``self.config`` stays empty.
        """
        self.logger = logging.getLogger(__name__)

        # Load configuration if provided
        self.config = {}
        if config_path:
            if os.path.exists(config_path):
                with open(config_path, 'r', encoding='utf-8') as f:
                    self.config = json.load(f)
            else:
                # Keep the original fallback to defaults, but make it visible.
                self.logger.warning(
                    f"Configuration file not found, using defaults: {config_path}"
                )

        # Initialize components
        self.converter = DocumentConverter()
        self.audit_generator = AuditPageGenerator()
        self.merger = DocumentMerger()
        self.hash_generator = HashGenerator()
        self.watermarker = Watermarker()
        self.signature_manager = SignatureManager(
            signatures_dir=self.config.get('signatures_directory', 'signatures')
        )
        self.pdfa_converter = PDFAConverter()
        self.document_protector = DocumentProtector()

    def process_document(self, original_doc_path, json_path, output_path,
                         watermark_image=None, include_signatures=True,
                         convert_to_pdfa=True, compliance_level='2b',
                         finalize=True):
        """
        Process document with audit data, adding security features.

        Steps, in order: convert to PDF, generate the audit page, merge,
        optionally watermark, convert to PDF/A (or copy to ``output_path``),
        generate/embed/verify the hash, and optionally protect the document.

        Args:
            original_doc_path (str): Path to the original document
            json_path (str): Path to JSON file with audit data
            output_path (str): Path where final document will be saved
            watermark_image (str, optional): Path to watermark image
            include_signatures (bool): Whether to include signature images
            convert_to_pdfa (bool): Whether to convert to PDF/A format
            compliance_level (str): PDF/A compliance level ('1b', '2b', '3b')
            finalize (bool): Whether to protect the document from further editing

        Returns:
            str: Path to the final document (``output_path``)

        Raises:
            Any exception raised while reading ``json_path`` or by one of the
            pipeline components propagates to the caller; temporary files are
            removed in either case.
        """
        self.logger.info(f"Processing document: {original_doc_path}")

        # Reset per-run state so a failed or non-finalized run never exposes
        # the hash or password of a previous document.
        self._last_document_hash = None
        self._last_owner_password = None

        # Load JSON data (JSON interchange text is UTF-8)
        with open(json_path, 'r', encoding='utf-8') as f:
            json_data = json.load(f)

        # Create temporary file paths; microseconds plus the process id keep
        # names distinct for runs that start within the same second.
        timestamp = datetime.now().strftime('%Y%m%d%H%M%S%f')
        run_tag = f"{timestamp}_{os.getpid()}"
        temp_dir = os.path.dirname(output_path) or '.'
        temp_pdf_path = os.path.join(temp_dir, f"temp_document_{run_tag}.pdf")
        temp_audit_path = os.path.join(temp_dir, f"temp_audit_{run_tag}.pdf")
        temp_merged_path = os.path.join(temp_dir, f"temp_merged_{run_tag}.pdf")
        temp_watermarked_path = os.path.join(temp_dir, f"temp_watermarked_{run_tag}.pdf")

        try:
            # Step 1: Convert original document to PDF if needed
            self.logger.info("Converting document to PDF")
            pdf_path = self.converter.convert_to_pdf(original_doc_path, temp_pdf_path)

            # Step 2: Generate audit page with signatures if requested
            self.logger.info("Generating audit page")
            if include_signatures:
                # Get signature paths for each reviewer and approver
                self._add_signature_paths_to_json(json_data)
            audit_path = self.audit_generator.generate_audit_page(json_data, temp_audit_path)

            # Step 3: Merge PDFs
            self.logger.info("Merging documents")
            merged_path = self.merger.merge_pdfs(pdf_path, audit_path, temp_merged_path)

            # Step 4: Add watermark if provided
            current_pdf = merged_path
            if watermark_image and os.path.exists(watermark_image):
                self.logger.info("Adding watermark")
                current_pdf = self.watermarker.add_watermark(
                    merged_path,
                    watermark_image,
                    temp_watermarked_path,
                    scale=0.2,  # Make the watermark 20% of original size
                    opacity=0.1,  # 10% opacity (very subtle)
                    position="center"  # Center of the page
                )
            elif watermark_image:
                # A watermark was requested but cannot be applied; say so
                # instead of silently producing an unmarked document.
                self.logger.warning(
                    f"Watermark image not found, skipping watermark: {watermark_image}"
                )

            # Step 5: Convert to PDF/A if requested
            if convert_to_pdfa:
                self.logger.info(f"Converting to PDF/A-{compliance_level}")
                final_pdf = self.pdfa_converter.convert_to_pdfa(
                    current_pdf, output_path, compliance_level
                )
            else:
                # Just copy to the final output location
                shutil.copy(current_pdf, output_path)
                final_pdf = output_path

            # Step 6: Generate and embed document hash as the VERY LAST step
            self.logger.info("Generating document hash")
            hash_value = self.hash_generator.generate_hash(final_pdf)
            self.logger.info(f"Generated document hash: {hash_value}")

            # Embed the hash into the document
            self.hash_generator.embed_hash(final_pdf, hash_value)

            # Store the hash for later verification
            self._last_document_hash = hash_value

            # Verify the hash immediately to ensure it's working
            if not self.hash_generator.verify_hash(final_pdf):
                self.logger.warning(
                    "Internal hash verification failed - this may cause issues later"
                )

            # Step 7: Apply document protection
            if finalize:
                self.logger.info("Applying document protection")
                # NOTE(review): the returned path is only logged, never
                # returned to the caller; presumably the protector writes in
                # place - confirm against DocumentProtector.
                protected_path, owner_password = self.document_protector.protect_document(
                    pdf_path=final_pdf,
                    allow_printing=True,
                    allow_copying=True,
                    encryption_level=2  # 128-bit encryption
                )

                # Store the password for potential retrieval
                self._last_owner_password = owner_password

                # Never write the password itself to the log: log files are
                # typically readable by far more people than the document owner.
                self.logger.info(
                    f"Document protected: {protected_path} "
                    "(owner password available via _last_owner_password)"
                )
                self.logger.info(
                    "IMPORTANT: Store this password securely for future administrative access"
                )

            self.logger.info(f"Final document created at: {output_path}")
            return output_path
        finally:
            # Clean up temporary files
            self._remove_temp_files(
                [temp_pdf_path, temp_audit_path, temp_merged_path, temp_watermarked_path]
            )

    def _remove_temp_files(self, paths):
        """Delete each existing path, logging (not raising) on failure."""
        for path in paths:
            if os.path.exists(path):
                try:
                    os.remove(path)
                except OSError as e:
                    self.logger.warning(f"Failed to remove temporary file {path}: {e}")

    def _add_signature_paths_to_json(self, json_data):
        """Add signature image paths to the JSON data.

        For every entry in ``json_data['reviews']`` with a truthy
        ``reviewer_name`` and every entry in ``json_data['approvals']`` with a
        truthy ``approver_name``, sets ``signature_path`` to the value returned
        by the signature manager, or None when that value is falsy. Entries
        without a name are left untouched.

        Args:
            json_data (dict): Audit data; modified in place.

        Returns:
            dict: The same ``json_data`` object.
        """
        sections = (
            ('reviews', 'reviewer_name'),      # reviewers
            ('approvals', 'approver_name'),    # approvers
        )
        for section, name_key in sections:
            for entry in json_data.get(section, []):
                person = entry.get(name_key)
                if person:
                    sig_path = self.signature_manager.get_signature_path(person)
                    entry['signature_path'] = sig_path or None
        return json_data
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
bases |
- | - |
Parameter Details
config_path: Optional path to a JSON configuration file. If provided and the file exists, configuration settings are loaded from it. The config can specify 'signatures_directory' for signature image storage location. If None or file doesn't exist, default configuration is used with empty config dictionary.
Return Value
Instantiating DocumentProcessor returns a DocumentProcessor instance (the __init__ method itself returns None). The main process_document method returns a string containing the path to the final processed document. Instance attributes _last_document_hash and _last_owner_password store the hash value and owner password from the most recent document processing operation.
Class Interface
Methods
__init__(self, config_path=None)
Purpose: Initialize the DocumentProcessor with optional configuration and instantiate all required component objects
Parameters:
config_path: Optional string path to JSON configuration file containing settings like 'signatures_directory'
Returns: None - initializes the instance with logger, config, and all component objects (converter, audit_generator, merger, hash_generator, watermarker, signature_manager, pdfa_converter, document_protector)
process_document(self, original_doc_path, json_path, output_path, watermark_image=None, include_signatures=True, convert_to_pdfa=True, compliance_level='2b', finalize=True) -> str
Purpose: Main method to process a document through the complete pipeline: conversion, audit page generation, merging, watermarking, PDF/A conversion, hash generation, and document protection
Parameters:
original_doc_path: String path to the original document file to be processed (can be various formats like DOCX, etc.)
json_path: String path to JSON file containing audit trail data with reviews and approvals
output_path: String path where the final processed document will be saved
watermark_image: Optional string path to watermark image file. If None or file doesn't exist, watermarking is skipped
include_signatures: Boolean flag to include signature images in the audit page (default: True)
convert_to_pdfa: Boolean flag to convert final document to PDF/A format (default: True)
compliance_level: String specifying PDF/A compliance level: '1b', '2b', or '3b' (default: '2b')
finalize: Boolean flag to apply document protection with encryption (default: True)
Returns: String containing the path to the final processed document (same as output_path parameter). Also sets instance attributes _last_document_hash and _last_owner_password
_add_signature_paths_to_json(self, json_data) -> dict
Purpose: Private helper method to add signature image paths to the JSON audit data for reviewers and approvers
Parameters:
json_data: Dictionary containing audit data with 'reviews' and 'approvals' arrays, each containing reviewer_name or approver_name fields
Returns: Modified dictionary with 'signature_path' fields added to each review and approval entry, containing paths to signature images or None if not found
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| logger | logging.Logger | Logger instance for recording processing steps, warnings, and errors throughout document processing | instance |
| config | dict | Configuration dictionary loaded from config_path JSON file, or empty dict if no config provided. Can contain settings like 'signatures_directory' | instance |
| converter | DocumentConverter | Component instance responsible for converting various document formats to PDF | instance |
| audit_generator | AuditPageGenerator | Component instance for generating audit trail pages from JSON data with signatures | instance |
| merger | DocumentMerger | Component instance for merging the original PDF with the audit page PDF | instance |
| hash_generator | HashGenerator | Component instance for generating cryptographic hashes of documents and embedding them for integrity verification | instance |
| watermarker | Watermarker | Component instance for applying watermark images to PDF documents with configurable scale, opacity, and position | instance |
| signature_manager | SignatureManager | Component instance for managing and retrieving signature image files for reviewers and approvers | instance |
| pdfa_converter | PDFAConverter | Component instance for converting PDFs to PDF/A compliance formats (1b, 2b, 3b) | instance |
| document_protector | DocumentProtector | Component instance for applying encryption and access restrictions to PDF documents | instance |
| _last_document_hash | str | Stores the cryptographic hash value of the most recently processed document. Set during process_document execution. Should be stored externally for later verification | instance |
| _last_owner_password | str | Stores the owner password generated for the most recently protected document. Set when finalize=True. Must be stored securely for administrative access | instance |
Dependencies
os, json, logging, datetime, shutil, document_converter, audit_page_generator, document_merger, security.hash_generator, security.watermark, security.signature_manager, utils.pdf_utils, security.document_protection
Required Imports
import os
import json
import logging
from datetime import datetime
import shutil
from document_converter import DocumentConverter
from audit_page_generator import AuditPageGenerator
from document_merger import DocumentMerger
from security.hash_generator import HashGenerator
from security.watermark import Watermarker
from security.signature_manager import SignatureManager
from utils.pdf_utils import PDFAConverter
from security.document_protection import DocumentProtector
Usage Example
# Basic usage: default configuration, no watermark; the remaining options
# keep their defaults (signatures included, PDF/A-2b conversion, protection on)
processor = DocumentProcessor()
output_path = processor.process_document(
original_doc_path='document.docx',
json_path='audit_data.json',
output_path='final_document.pdf'
)
# Advanced usage with all features
# config.json may define 'signatures_directory' for the signature manager
processor = DocumentProcessor(config_path='config.json')
output_path = processor.process_document(
original_doc_path='report.docx',
json_path='audit_trail.json',
output_path='final_report.pdf',
watermark_image='company_logo.png',
include_signatures=True,
convert_to_pdfa=True,
compliance_level='2b',
finalize=True
)
# process_document stores the generated hash on the instance, and the owner
# password as well when finalize=True.
print(f'Document hash: {processor._last_document_hash}')
# Printed here for demonstration only; in real use hand the password to a
# secret store instead of writing it to stdout.
print(f'Owner password: {processor._last_owner_password}')
Best Practices
- Always provide valid paths for original_doc_path, json_path, and output_path when calling process_document
- Store the _last_owner_password securely after document protection - it's required for administrative access to protected documents
- The _last_document_hash should be stored externally for later verification of document integrity
- Ensure the output directory has write permissions as the class creates multiple temporary files during processing
- The JSON audit data file must contain properly structured 'reviews' and 'approvals' arrays with reviewer_name and approver_name fields
- Watermark images should exist at the specified path before processing, or set watermark_image=None to skip watermarking
- Call process_document only once per document - the class maintains state (_last_document_hash, _last_owner_password) that gets overwritten
- Temporary files are automatically cleaned up in the finally block, but ensure sufficient disk space during processing
- The finalize parameter should be set to True for production documents to apply encryption and protection
- PDF/A compliance levels '1b', '2b', or '3b' should be chosen based on your archival requirements
- The class uses a logger, so configure logging before instantiation to capture processing details and warnings
- Signature images must be available in the signatures directory (configured via config file or default 'signatures' folder)
- The process follows a strict order: convert → audit page → merge → watermark → PDF/A → hash → protect. Do not attempt to modify this order.
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
-
function main_v48 72.7% similar
-
class DocumentProcessor_v2 72.2% similar
-
class DocumentProcessor_v1 72.0% similar
-
class ControlledDocumentConverter 71.7% similar
-
function test_document_processing 67.6% similar