class ControlledDocumentConverter
A comprehensive document converter class that transforms controlled documents into archived PDFs with signature pages, audit trails, hash-based integrity verification, and PDF/A compliance for long-term archival.
/tf/active/vicechatdev/CDocs/utils/document_converter.py
29 - 325
complex
Purpose
This class orchestrates the complete workflow for converting controlled documents (Word, PowerPoint, etc.) into secure, auditable, archival-quality PDFs. It handles document conversion, generates signature pages with approver information, creates audit trail reports, merges multiple PDFs, embeds cryptographic hashes for integrity verification, converts to PDF/A format for archival compliance, and optionally adds watermarks for obsolete documents. The class is designed for regulated environments requiring document control, traceability, and long-term preservation.
Source Code
class ControlledDocumentConverter:
    """Convert controlled documents into archived PDFs with an audit trail.

    The converter wires together the document-to-PDF converter, the PDF
    generator (signature page and audit report), the PDF manipulator
    (watermarks), the hash generator and the PDF/A converter, and exposes
    ``create_archived_pdf`` as the end-to-end workflow.
    """

    def __init__(self, temp_dir: Optional[str] = None,
                 signature_dir: Optional[str] = None):
        """
        Initialize the document converter

        Parameters
        ----------
        temp_dir : str, optional
            Directory for temporary files. When omitted, a fresh directory is
            created with ``tempfile.mkdtemp()``. The directory itself is never
            removed by this class.
        signature_dir : str, optional
            Directory containing signature image files. A missing directory
            only produces a warning.
        """
        self.temp_dir = temp_dir or tempfile.mkdtemp()
        os.makedirs(self.temp_dir, exist_ok=True)

        self.signature_dir = signature_dir
        if signature_dir and not os.path.exists(signature_dir):
            logger.warning(f"Signature directory does not exist: {signature_dir}")

        # Initialize components
        self.document_converter = DocumentConverter()

        # Initialize PDF utilities with safe initialization
        try:
            self.pdf_generator = PDFGenerator()
        except KeyError as e:
            # Handle style already defined error
            logger.warning(f"Style definition error in PDFGenerator initialization: {e}")
            # Use a fallback approach to get a PDF generator without re-registering styles
            from CDocs.utils.pdf_utils import get_pdf_generator
            self.pdf_generator = get_pdf_generator(skip_styles=True)

        self.pdf_manipulator = PDFManipulator()
        # NOTE(review): audit_generator is created but not used by any method
        # of this class; kept because external callers may read the attribute.
        self.audit_generator = AuditPageGenerator()
        self.hash_generator = HashGenerator()
        self.pdfa_converter = PDFAConverter()

        logger.debug(f"Initialized ControlledDocumentConverter with temp_dir: {self.temp_dir}")

    def convert_to_pdf(self, input_path: str, output_path: str) -> str:
        """
        Convert a document to PDF using document_auditor

        Delegates to ``self.document_converter.convert_to_pdf``; any exception
        is logged and re-raised unchanged.

        Parameters
        ----------
        input_path : str
            Path to the input document
        output_path : str
            Path where the PDF should be saved

        Returns
        -------
        str
            Whatever the underlying converter returns (presumably the path to
            the converted PDF -- confirm against DocumentConverter).
        """
        logger.info(f"Converting document to PDF: {input_path}")
        try:
            return self.document_converter.convert_to_pdf(input_path, output_path)
        except Exception as e:
            logger.error(f"Error converting document to PDF: {str(e)}")
            raise

    def generate_signature_page(self,
                                output_path: str,
                                doc_number: str,
                                title: str,
                                revision: str,
                                approvers: List[Dict[str, str]],
                                approved_date: str) -> str:
        """
        Generate a signature page for the document

        Delegates to ``self.pdf_generator.generate_certificate_page`` and
        passes ``self.signature_dir`` along; any exception is logged and
        re-raised unchanged.

        Parameters
        ----------
        output_path : str
            Path where the signature page should be saved
        doc_number : str
            Document number or identifier
        title : str
            Document title
        revision : str
            Document revision or version
        approvers : List[Dict[str, str]]
            List of approvers with name, role, date and signature info
        approved_date : str
            Date when the document was approved

        Returns
        -------
        str
            Path to the generated signature page
        """
        logger.info(f"Generating signature page for document: {doc_number}")
        try:
            return self.pdf_generator.generate_certificate_page(
                output_path=output_path,
                doc_number=doc_number,
                title=title,
                revision=revision,
                approvers=approvers,
                approved_date=approved_date,
                signature_dir=self.signature_dir
            )
        except Exception as e:
            logger.error(f"Error generating signature page: {str(e)}")
            raise

    def generate_audit_trail(self,
                             output_path: str,
                             doc_number: str,
                             title: str,
                             revision: str,
                             audit_data: List[Dict[str, Any]],
                             audit_date: str) -> str:
        """
        Generate an audit trail page for the document

        Delegates to ``self.pdf_generator.generate_audit_report`` with the
        auditor fixed to ``"CDocs System"``; any exception is logged and
        re-raised unchanged.

        Parameters
        ----------
        output_path : str
            Path where the audit trail should be saved
        doc_number : str
            Document number or identifier
        title : str
            Document title
        revision : str
            Document revision or version
        audit_data : List[Dict[str, Any]]
            Audit trail data with timestamps, users, and actions
        audit_date : str
            Date when the audit report was generated

        Returns
        -------
        str
            Path to the generated audit trail
        """
        logger.info(f"Generating audit trail for document: {doc_number}")
        try:
            return self.pdf_generator.generate_audit_report(
                output_path=output_path,
                doc_number=doc_number,
                title=title,
                revision=revision,
                audit_data=audit_data,
                audit_date=audit_date,
                auditor="CDocs System"
            )
        except Exception as e:
            logger.error(f"Error generating audit trail: {str(e)}")
            raise

    def create_archived_pdf(self,
                            input_path: str,
                            output_path: str,
                            document_data: Dict[str, Any],
                            audit_data: List[Dict[str, Any]]) -> str:
        """
        Create an archived PDF with signature page and audit trail

        Steps: convert the input to PDF, generate the signature page and the
        audit trail, merge the three PDFs, embed an integrity hash, convert
        to PDF/A ("2b"), add an "OBSOLETE" watermark when
        ``document_data['status'] == 'obsolete'``, then copy the result to
        ``output_path``. All intermediate files live in a per-call working
        directory that is removed afterwards.

        Parameters
        ----------
        input_path : str
            Path to the input document (docx, pptx, etc.)
        output_path : str
            Path where the archived PDF should be saved
        document_data : Dict[str, Any]
            Document metadata; the keys read are 'title', 'doc_number',
            'version', 'approved_date' and 'status'.
        audit_data : List[Dict[str, Any]]
            Audit trail data with timestamps, users, and actions

        Returns
        -------
        str
            ``output_path``

        Raises
        ------
        Exception
            Any error from the underlying steps is logged and re-raised.
        """
        logger.info(f"Creating archived PDF for document: {document_data.get('title', '')}")

        # Use a unique working directory per call. A name derived only from
        # the input basename would be shared by concurrent conversions of
        # same-named files, and one call's cleanup would delete the other's
        # intermediate files.
        os.makedirs(self.temp_dir, exist_ok=True)
        process_dir = tempfile.mkdtemp(
            prefix=f"archive_{os.path.basename(input_path)}_",
            dir=self.temp_dir,
        )

        doc_number = document_data.get('doc_number', '')
        title = document_data.get('title', '')
        revision = document_data.get('version', '')
        # Computed once so both generated pages agree even across midnight.
        today = datetime.datetime.now().strftime('%Y-%m-%d')

        try:
            # Step 1: Convert the document to PDF
            doc_pdf_path = os.path.join(process_dir, "document.pdf")
            self.convert_to_pdf(input_path, doc_pdf_path)

            # Step 2: Generate signature page from the approvals in the audit data
            signature_path = os.path.join(process_dir, "signatures.pdf")
            approvers = self._extract_approvers_from_audit(audit_data)
            approved_date = document_data.get('approved_date', today)
            self.generate_signature_page(
                output_path=signature_path,
                doc_number=doc_number,
                title=title,
                revision=revision,
                approvers=approvers,
                approved_date=approved_date
            )

            # Step 3: Generate audit trail
            audit_path = os.path.join(process_dir, "audit_trail.pdf")
            self.generate_audit_trail(
                output_path=audit_path,
                doc_number=doc_number,
                title=title,
                revision=revision,
                audit_data=audit_data,
                audit_date=today
            )

            # Step 4: Merge PDFs (document first, then signatures, then audit trail)
            merged_pdf_path = os.path.join(process_dir, "merged.pdf")
            merge_pdfs(
                input_paths=[doc_pdf_path, signature_path, audit_path],
                output_path=merged_pdf_path
            )

            # Step 5: Add document security and convert to PDF/A for archiving
            secured_pdf_path = os.path.join(process_dir, "secured.pdf")
            # NOTE(review): the return value of compute_hash is discarded;
            # presumably the generator keeps it for embed_hash -- confirm.
            self.hash_generator.compute_hash(merged_pdf_path)
            self.hash_generator.embed_hash(merged_pdf_path, secured_pdf_path)

            archived_pdf_path = os.path.join(process_dir, "archived.pdf")
            self.pdfa_converter.convert_to_pdfa(secured_pdf_path, archived_pdf_path, "2b")

            # Step 6: Add watermark if the document is obsolete
            final_pdf_path = archived_pdf_path
            if document_data.get('status') == 'obsolete':
                watermarked_pdf_path = os.path.join(process_dir, "watermarked.pdf")
                self.pdf_manipulator.add_watermark(
                    input_path=archived_pdf_path,
                    output_path=watermarked_pdf_path,
                    watermark_text="OBSOLETE",
                    opacity=0.4,
                    color="red"
                )
                final_pdf_path = watermarked_pdf_path

            # Copy to final destination
            shutil.copy(final_pdf_path, output_path)

            logger.info(f"Successfully created archived PDF: {output_path}")
            return output_path

        except Exception as e:
            logger.error(f"Error creating archived PDF: {str(e)}")
            raise

        finally:
            # Best-effort cleanup: a failure to delete temporary files must
            # not mask the real outcome, but only filesystem errors are
            # tolerated (a bare ``except`` would also swallow
            # KeyboardInterrupt/SystemExit).
            try:
                shutil.rmtree(process_dir)
            except OSError as cleanup_error:
                logger.warning(
                    f"Could not remove temporary directory {process_dir}: {cleanup_error}"
                )

    @staticmethod
    def _date_part(timestamp: Any) -> str:
        """Return the date portion of an audit timestamp as a string.

        ``None`` yields ``''``; ``datetime.date``/``datetime.datetime`` values
        are formatted as ``YYYY-MM-DD``; anything else is converted to ``str``
        and cut at the first space.
        """
        if timestamp is None:
            return ''
        if isinstance(timestamp, datetime.date):  # also matches datetime.datetime
            return timestamp.strftime('%Y-%m-%d')
        return str(timestamp).split(' ')[0]

    def _extract_approvers_from_audit(self, audit_data: List[Dict[str, Any]]) -> List[Dict[str, str]]:
        """
        Extract approver information from audit data

        Only entries whose 'action' equals 'approve' are used. The name falls
        back from 'user_name' to 'user' to 'Unknown', and the role defaults to
        'Approver'.

        Parameters
        ----------
        audit_data : List[Dict[str, Any]]
            Audit trail data with approval information; ``None`` is treated
            as an empty list.

        Returns
        -------
        List[Dict[str, str]]
            List of approvers ('name', 'role', 'date') formatted for the
            signature page, in audit order.
        """
        return [
            {
                'name': entry.get('user_name', entry.get('user', 'Unknown')),
                'role': entry.get('role', 'Approver'),
                'date': self._date_part(entry.get('timestamp', '')),
            }
            for entry in (audit_data or [])
            if entry.get('action') == 'approve'
        ]
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
bases |
- | - |
Parameter Details
temp_dir: Optional directory path for storing temporary files during conversion processes. If not provided, a system temporary directory is created automatically. This directory is used for intermediate files during PDF generation, merging, and conversion operations.
signature_dir: Optional directory path containing signature image files for approvers. These images are embedded in the signature page. If the directory doesn't exist, a warning is logged but initialization continues. Can be None if signature images are not required.
Return Value
The constructor returns an instance of ControlledDocumentConverter with initialized components for document conversion, PDF generation, manipulation, audit trail creation, hash generation, and PDF/A conversion. Key methods return file paths (str) to the generated or converted documents. The create_archived_pdf method returns the path to the final archived PDF containing the original document, signature page, and audit trail.
Class Interface
Methods
__init__(self, temp_dir: Optional[str] = None, signature_dir: Optional[str] = None)
Purpose: Initialize the document converter with temporary and signature directories, and set up all required components (PDF generator, manipulator, audit generator, hash generator, PDF/A converter)
Parameters:
temp_dir: Optional directory for temporary files; creates system temp dir if not provided
signature_dir: Optional directory containing signature image files for embedding in signature pages
Returns: None (constructor)
convert_to_pdf(self, input_path: str, output_path: str) -> str
Purpose: Convert a document (Word, PowerPoint, Excel, etc.) to PDF format using the document converter
Parameters:
input_path: Path to the input document file to convert
output_path: Path where the converted PDF should be saved
Returns: Path to the converted PDF file (same as output_path)
generate_signature_page(self, output_path: str, doc_number: str, title: str, revision: str, approvers: List[Dict[str, str]], approved_date: str) -> str
Purpose: Generate a formatted signature page PDF containing document metadata and approver information with optional signature images
Parameters:
output_path: Path where the signature page PDF should be saved
doc_number: Document number or identifier for the controlled document
title: Full title of the document
revision: Document revision or version number
approvers: List of dictionaries with keys 'name', 'role', 'date' for each approver
approved_date: Date when the document was officially approved (YYYY-MM-DD format)
Returns: Path to the generated signature page PDF
generate_audit_trail(self, output_path: str, doc_number: str, title: str, revision: str, audit_data: List[Dict[str, Any]], audit_date: str) -> str
Purpose: Generate an audit trail report PDF documenting all actions, timestamps, and users involved in the document lifecycle
Parameters:
output_path: Path where the audit trail PDF should be saved
doc_number: Document number or identifier
title: Document title
revision: Document revision or version
audit_data: List of audit entries with keys like 'action', 'user', 'timestamp', 'details'
audit_date: Date when the audit report was generated (YYYY-MM-DD format)
Returns: Path to the generated audit trail PDF
create_archived_pdf(self, input_path: str, output_path: str, document_data: Dict[str, Any], audit_data: List[Dict[str, Any]]) -> str
Purpose: Orchestrate the complete workflow to create an archived PDF: convert document, generate signature page, create audit trail, merge all PDFs, embed hash, convert to PDF/A, and optionally add watermark for obsolete documents
Parameters:
input_path: Path to the input document file (docx, pptx, xlsx, etc.)
output_path: Path where the final archived PDF should be saved
document_data: Dictionary with keys: 'doc_number', 'title', 'version', 'approved_date', 'status' (optional)
audit_data: List of audit entries documenting document lifecycle; entries with action='approve' are used for signature page
Returns: Path to the created archived PDF (same as output_path)
_extract_approvers_from_audit(self, audit_data: List[Dict[str, Any]]) -> List[Dict[str, str]]
Purpose: Internal helper method to extract and format approver information from audit trail data for use in signature page generation
Parameters:
audit_data: List of audit entries; entries with action='approve' are extracted
Returns: List of dictionaries with keys 'name', 'role', 'date' formatted for signature page
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| temp_dir | str | Directory path for storing temporary files during conversion processes | instance |
| signature_dir | Optional[str] | Directory path containing signature image files for embedding in signature pages | instance |
| document_converter | DocumentConverter | Component for converting various document formats to PDF | instance |
| pdf_generator | PDFGenerator | Component for generating PDF pages including signature and audit pages | instance |
| pdf_manipulator | PDFManipulator | Component for manipulating PDFs including adding watermarks | instance |
| audit_generator | AuditPageGenerator | Component for generating audit trail pages | instance |
| hash_generator | HashGenerator | Component for computing and embedding cryptographic hashes for document integrity verification | instance |
| pdfa_converter | PDFAConverter | Component for converting PDFs to PDF/A format for long-term archival compliance | instance |
Dependencies
os, logging, tempfile, shutil, sys, typing, datetime, CDocs.utils.pdf_utils, CDocs.document_auditor.src.document_converter, CDocs.document_auditor.src.audit_page_generator, CDocs.document_auditor.src.security.hash_generator, CDocs.document_auditor.src.utils.pdf_utils
Required Imports
import os
import logging
import tempfile
import shutil
import sys
from typing import Dict, Any, Optional, List, Tuple
import datetime
from CDocs.utils.pdf_utils import PDFGenerator, PDFManipulator, merge_pdfs
from CDocs.document_auditor.src.document_converter import DocumentConverter
from CDocs.document_auditor.src.audit_page_generator import AuditPageGenerator
from CDocs.document_auditor.src.security.hash_generator import HashGenerator
from CDocs.document_auditor.src.utils.pdf_utils import PDFAConverter
Conditional/Optional Imports
These imports are only needed under specific conditions:
from CDocs.utils.pdf_utils import get_pdf_generator
Condition: only if PDFGenerator initialization fails with KeyError (style already defined error)
Optional
Usage Example
import os
import datetime
# The class lives in CDocs/utils/document_converter.py, so import it from
# that package path.
from CDocs.utils.document_converter import ControlledDocumentConverter

# Initialize converter with optional directories
converter = ControlledDocumentConverter(
    temp_dir='/tmp/doc_conversion',
    signature_dir='/path/to/signatures'
)

# Simple document to PDF conversion
converter.convert_to_pdf(
    input_path='document.docx',
    output_path='document.pdf'
)

# Generate signature page
approvers = [
    {'name': 'John Doe', 'role': 'Manager', 'date': '2024-01-15'},
    {'name': 'Jane Smith', 'role': 'Director', 'date': '2024-01-16'}
]
converter.generate_signature_page(
    output_path='signatures.pdf',
    doc_number='DOC-001',
    title='Quality Manual',
    revision='1.0',
    approvers=approvers,
    approved_date='2024-01-16'
)

# Create complete archived PDF with audit trail.
# Only audit entries with action == 'approve' appear on the signature page.
document_data = {
    'doc_number': 'DOC-001',
    'title': 'Quality Manual',
    'version': '1.0',
    'approved_date': '2024-01-16',
    'status': 'active'
}
audit_data = [
    {'action': 'create', 'user': 'jdoe', 'user_name': 'John Doe', 'timestamp': '2024-01-10 10:00:00'},
    {'action': 'approve', 'user': 'jsmith', 'user_name': 'Jane Smith', 'role': 'Director', 'timestamp': '2024-01-16 14:30:00'}
]
archived_pdf = converter.create_archived_pdf(
    input_path='document.docx',
    output_path='archived_document.pdf',
    document_data=document_data,
    audit_data=audit_data
)
Best Practices
- Always provide a temp_dir with sufficient disk space for large document conversions
- Ensure signature_dir contains properly named signature image files matching approver identifiers
- Call methods in the correct order: convert_to_pdf before merging, generate pages before merging
- Use create_archived_pdf for complete workflow rather than calling individual methods manually
- Handle exceptions from all methods as they propagate errors from underlying conversion tools
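- Per-call working directories are removed automatically by create_archived_pdf; the top-level temp_dir itself is never deleted by the class, so remove it yourself once the converter is no longer needed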
- Provide complete audit_data with 'approve' actions to populate signature pages correctly
- Set document status to 'obsolete' in document_data to automatically apply watermarks
- Ensure input documents are in supported formats (docx, pptx, xlsx, etc.)
- The class creates and manages temporary directories automatically; avoid manual cleanup during processing
- PDF/A conversion may fail if input PDFs contain unsupported features; validate source documents
- Hash embedding provides integrity verification but doesn't encrypt the document
- Signature images should be in common formats (PNG, JPG) and appropriately sized
- Audit data timestamps should be in 'YYYY-MM-DD HH:MM:SS' format for proper parsing
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
-
class DocumentConverter 73.6% similar
-
class PDFAConverter 73.2% similar
-
class DocumentProcessor 71.7% similar
-
class PDFConverter 70.2% similar
-
class DocumentConverter_v1 68.3% similar