class HashGenerator
A class that provides cryptographic hashing functionality for PDF documents, including hash generation, embedding, and verification for document integrity checking.
/tf/active/vicechatdev/document_auditor/src/security/hash_generator.py
11 - 215
complex
Purpose
The HashGenerator class manages the complete lifecycle of document integrity verification for PDF files. It generates SHA-256 hashes based on PDF content (text and page dimensions), embeds these hashes into PDF metadata and as visible text, and verifies document integrity by comparing embedded hashes with recalculated values. The class uses content-based hashing rather than binary file hashing to focus on document content integrity, with fallback to binary hashing if content extraction fails.
Source Code
class HashGenerator:
    """Handles cryptographic hashing for document integrity.

    Hashes are SHA-256 digests computed over each page's extracted text and
    page dimensions. If that content-based pass raises, the raw file bytes are
    hashed instead. A hash can be embedded into a PDF (document info
    dictionary plus a visible text line on the last page) and later checked
    with ``verify_hash``.
    """

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        # Most recently generated hash; verify_hash() prefers it over a
        # fresh calculation when it is set.
        self._last_hash = None

    def generate_hash(self, file_path):
        """
        Generate a SHA-256 hash of the file content

        The result is also remembered on the instance for later verification.

        Args:
            file_path (str): Path to the file to hash

        Returns:
            str: Hexadecimal representation of the hash

        Raises:
            FileNotFoundError: If ``file_path`` does not exist
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        hash_value = self._generate_content_hash(file_path)
        # Store for later verification
        self._last_hash = hash_value
        return hash_value

    def _generate_binary_hash(self, file_path):
        """
        Generate hash of binary file content

        Args:
            file_path (str): Path to the file to hash

        Returns:
            str: Hexadecimal SHA-256 digest of the raw file bytes
        """
        # Read in 1MB chunks so large files are never loaded whole
        buffer_size = 1024 * 1024
        sha256_hash = hashlib.sha256()
        with open(file_path, "rb") as f:
            for byte_block in iter(lambda: f.read(buffer_size), b""):
                sha256_hash.update(byte_block)
        return sha256_hash.hexdigest()

    def _generate_content_hash(self, pdf_path):
        """
        Generate hash based on PDF content rather than the file itself

        Each page contributes its extracted text followed by a
        ``page<N>:<width>x<height>`` string. Any exception during this pass
        is logged and the binary file hash is returned instead.

        Args:
            pdf_path (str): Path to the PDF file

        Returns:
            str: Hexadecimal SHA-256 digest
        """
        content_digest = hashlib.sha256()
        try:
            doc = fitz.open(pdf_path)
            try:
                for page_num, page in enumerate(doc):
                    # Add page text to hash
                    text = page.get_text("text")
                    content_digest.update(text.encode('utf-8'))

                    # Add page dimensions to hash
                    dimensions = f"page{page_num}:{page.rect.width}x{page.rect.height}"
                    content_digest.update(dimensions.encode('utf-8'))
            finally:
                # Release the document handle even when a page fails
                doc.close()
            return content_digest.hexdigest()
        except Exception as e:
            self.logger.error("Error generating content hash: %s", e)
            # Fall back to binary hash if content hash fails
            return self._generate_binary_hash(pdf_path)

    def _stamp_hash_text(self, source_path, dest_path, hash_value):
        """
        Write a copy of ``source_path`` to ``dest_path`` with the hash printed
        on the last page.

        A failure to insert the text box is logged and the copy is still
        saved; failures to open or save propagate to the caller.
        """
        doc = fitz.open(source_path)
        try:
            last_page = doc[-1]
            timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

            # Strip along the bottom edge of the page, 50pt side margins
            rect = fitz.Rect(
                50,
                last_page.rect.height - 30,
                last_page.rect.width - 50,
                last_page.rect.height - 5,
            )

            # Add text directly to the page instead of using an annotation
            try:
                last_page.insert_textbox(
                    rect,
                    f"Document Integrity Hash (SHA-256): {hash_value}\nGenerated: {timestamp}",
                    fontname="Helvetica",
                    fontsize=8,
                    color=(0, 0, 0.8),  # Dark blue
                    align=0,  # Left alignment
                )
            except Exception as text_error:
                self.logger.warning("Could not add text box: %s", text_error)

            # Saved to a separate file to avoid the 'save to original' issue
            doc.save(dest_path)
        finally:
            doc.close()

    def embed_hash(self, pdf_path, hash_value):
        """
        Embed the hash value into the PDF metadata

        The hash is stored as JSON under ``/DocumentHash`` in the document
        info dictionary and, on a best-effort basis, printed as visible text
        on the last page. The file at ``pdf_path`` is replaced in place.

        Args:
            pdf_path (str): Path to the PDF file
            hash_value (str): Hash value to embed

        Returns:
            str: Path to the PDF with embedded hash

        Raises:
            Exception: Any error from the metadata step or the final replace
                is logged and re-raised
        """
        temp_output = f"{pdf_path}.hash_tmp"
        temp_with_annot = f"{pdf_path}.annot_tmp"
        try:
            # First embed hash in metadata using pikepdf
            with pikepdf.open(pdf_path) as pdf:
                hash_metadata = {
                    "hash": hash_value,
                    "algorithm": "SHA-256",
                    "timestamp": datetime.now().isoformat(),
                    "description": "Document integrity verification hash"
                }
                pdf.docinfo["/DocumentHash"] = json.dumps(hash_metadata)
                pdf.save(temp_output)

            # Now add the hash as visible text on the last page
            try:
                self._stamp_hash_text(temp_output, temp_with_annot, hash_value)
                shutil.move(temp_with_annot, temp_output)
            except Exception as e:
                # Continue with just metadata
                self.logger.warning("Could not add hash annotation: %s", e)

            # Replace original with the new version
            shutil.move(temp_output, pdf_path)

            self.logger.info("Hash embedded in PDF: %s", pdf_path)
            return pdf_path
        except Exception as e:
            self.logger.error("Error embedding hash in PDF: %s", e)
            raise
        finally:
            # On success both temp files have already been moved away; on
            # failure remove whatever was left behind.
            for leftover in (temp_output, temp_with_annot):
                try:
                    if os.path.exists(leftover):
                        os.remove(leftover)
                except OSError as cleanup_error:
                    self.logger.warning(
                        "Could not remove temporary file %s: %s",
                        leftover,
                        cleanup_error,
                    )

    def verify_hash(self, pdf_path):
        """
        Verify the hash embedded in the PDF against a fresh calculation
        or against the stored hash

        Args:
            pdf_path (str): Path to the PDF file

        Returns:
            bool: True if hash matches, False otherwise (including when no
                embedded hash is found or any error occurs)
        """
        try:
            # Extract the embedded hash from metadata
            embedded_hash = None
            try:
                with pikepdf.open(pdf_path) as pdf:
                    if "/DocumentHash" in pdf.docinfo:
                        hash_json = pdf.docinfo["/DocumentHash"]
                        hash_metadata = json.loads(str(hash_json))
                        embedded_hash = hash_metadata.get("hash")
            except Exception as e:
                self.logger.warning("Could not extract hash from metadata: %s", e)

            if not embedded_hash:
                self.logger.warning("No hash found in PDF: %s", pdf_path)
                return False

            # NOTE(review): when a stored hash exists it is used regardless of
            # which path is being verified, so the file content is not
            # re-examined. Presumably deliberate, since the visible text added
            # by embed_hash() would alter a freshly computed content hash.
            # Confirm before changing.
            if self._last_hash:
                calculated_hash = self._last_hash
                self.logger.info("Using stored hash for verification")
            else:
                # Calculate a fresh content-based hash of the document
                calculated_hash = self._generate_content_hash(pdf_path)
                self.logger.info("Generated fresh hash for verification")

            result = embedded_hash == calculated_hash
            if result:
                self.logger.info("Hash verification successful for %s", pdf_path)
            else:
                self.logger.warning(
                    "Hash verification failed: embedded=%s, calculated=%s",
                    embedded_hash,
                    calculated_hash,
                )
            return result
        except Exception as e:
            self.logger.error("Error verifying hash: %s", e)
            return False
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| bases | - | - |
Parameter Details
__init__: The constructor takes no parameters. It initializes a logger instance for the class and sets up internal state to track the most recently generated hash value.
Return Value
Instantiation returns a HashGenerator object. Key method returns: generate_hash() returns a hexadecimal string representation of the SHA-256 hash; embed_hash() returns the path to the modified PDF file; verify_hash() returns a boolean indicating whether the embedded hash matches the calculated hash.
Class Interface
Methods
__init__(self)
Purpose: Initialize the HashGenerator instance with a logger and internal state
Returns: None - initializes the instance
generate_hash(self, file_path: str) -> str
Purpose: Generate a SHA-256 hash of the PDF file content and store it internally
Parameters:
file_path: String path to the PDF file to hash. Must exist and be readable.
Returns: Hexadecimal string representation of the SHA-256 hash
_generate_binary_hash(self, file_path: str) -> str
Purpose: Private method to generate a hash of the binary file content using 1MB buffer chunks
Parameters:
file_path: String path to the file to hash
Returns: Hexadecimal string representation of the SHA-256 hash of the binary content
_generate_content_hash(self, pdf_path: str) -> str
Purpose: Private method to generate a hash based on PDF content (text and page dimensions) rather than binary file data, with fallback to binary hash on error
Parameters:
pdf_path: String path to the PDF file to process
Returns: Hexadecimal string representation of the SHA-256 content hash
embed_hash(self, pdf_path: str, hash_value: str) -> str
Purpose: Embed the hash value into the PDF metadata and add visible text on the last page, modifying the file in-place
Parameters:
pdf_path: String path to the PDF file to modify
hash_value: Hexadecimal hash string to embed in the PDF
Returns: String path to the modified PDF file (same as input path)
verify_hash(self, pdf_path: str) -> bool
Purpose: Verify the hash embedded in the PDF against the stored hash (if available) or a freshly calculated hash
Parameters:
pdf_path: String path to the PDF file to verify
Returns: Boolean: True if the embedded hash matches the calculated/stored hash, False otherwise
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| logger | logging.Logger | Logger instance for recording operations, warnings, and errors throughout the hashing process | instance |
| _last_hash | str or None | Private attribute storing the most recently generated hash value, used for verification without recalculation. Initially None. | instance |
Dependencies
logging, hashlib, pikepdf, os, json, datetime, tempfile, fitz, shutil
Required Imports
import logging
import hashlib
import pikepdf
import os
import json
from datetime import datetime
import tempfile
import fitz
import shutil
Usage Example
# Instantiate the hash generator
hash_gen = HashGenerator()

# Generate a hash for a PDF document
pdf_path = 'document.pdf'
hash_value = hash_gen.generate_hash(pdf_path)
print(f'Generated hash: {hash_value}')

# Embed the hash into the PDF (the file is modified in place)
modified_path = hash_gen.embed_hash(pdf_path, hash_value)
print(f'Hash embedded in: {modified_path}')

# Verify the hash later
is_valid = hash_gen.verify_hash(pdf_path)
if is_valid:
    print('Document integrity verified')
else:
    print('Document has been modified')

# Same workflow for a second document, using its own generator instance
hash_gen2 = HashGenerator()
hash_val = hash_gen2.generate_hash('another.pdf')
hash_gen2.embed_hash('another.pdf', hash_val)

# Verify using the hash stored on hash_gen2 by generate_hash()
hash_gen2.verify_hash('another.pdf')
Best Practices
- Always call generate_hash() before embed_hash() to ensure the hash value is calculated and stored
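- The class stores the last generated hash in _last_hash. Whenever it is set, verify_hash() compares the embedded hash against this stored value, regardless of which path is passed in, and only recalculates a hash from the file when no hash has been stored on the instance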
- Call verify_hash() on the same instance that called generate_hash() to leverage the stored hash for accurate verification
- The class modifies PDF files in-place during embed_hash(), so ensure you have backups if needed
- Content-based hashing focuses on text and page dimensions, not binary file content, so metadata changes won't affect the hash
- If content extraction fails, the class automatically falls back to binary file hashing
- Temporary files (<pdf_path>.hash_tmp and <pdf_path>.annot_tmp) are created next to the PDF during embed_hash() and are moved into place when the operation succeeds
- The embedded hash is stored both in PDF metadata (machine-readable) and as visible text on the last page (human-readable)
- File paths must exist and be accessible with read/write permissions
- The logger attribute can be configured externally for custom logging behavior
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- class AuditPageGenerator 57.1% similar
- class DocumentProcessor 56.9% similar
- class PDFGenerator 56.6% similar
- class HashCleaner 55.4% similar
- class ControlledDocumentConverter 53.1% similar