class PDFManipulator
Manipulates existing PDF documents. This class provides methods to add watermarks, merge PDFs, extract pages, and perform other manipulation operations.
/tf/active/vicechatdev/CDocs/utils/pdf_utils.py
1159 - 1740
moderate
Purpose
Manipulates existing PDF documents. This class provides methods to add watermarks, merge PDFs, extract pages, and perform other manipulation operations.
Source Code
class PDFManipulator:
    """
    Manipulates existing PDF documents

    This class provides methods to add watermarks, merge PDFs,
    extract pages, and perform other manipulation operations.

    Every method works on file paths. The methods that produce a document
    write it to the caller-supplied ``output_path`` and return that path;
    ``check_pdf_a_compliance`` returns a result dictionary instead.
    A missing input file raises ``FileNotFoundError``; any other failure
    inside an operation is logged and re-raised as ``RuntimeError``.

    The page-level operations use PyMuPDF (``fitz``); the PDF/A helpers
    use ``pikepdf``.
    """
def __init__(self, temp_dir: Optional[str] = None):
"""
Initialize the PDF manipulator
Parameters
----------
temp_dir : str, optional
Directory for temporary files. If not provided, a system temp directory is used.
"""
self.temp_dir = temp_dir if temp_dir else tempfile.mkdtemp()
os.makedirs(self.temp_dir, exist_ok=True)
def add_watermark(self, input_path: str, output_path: str, watermark_text: str,
                  opacity: float = 0.3, color: str = "gray") -> str:
    """
    Add a diagonal text watermark to every page of a PDF

    The text is drawn in Helvetica, sized to one tenth of the shorter page
    side, centred horizontally on the page centre and rotated by 45 degrees
    around that centre.

    Parameters
    ----------
    input_path : str
        Path to the input PDF
    output_path : str
        Path where the watermarked PDF will be saved
    watermark_text : str
        Text to use as watermark
    opacity : float, optional
        Opacity of the watermark (0.0 to 1.0); values outside that range
        are clamped
    color : str, optional
        Color of the watermark: "red", "blue" or "green" (case-insensitive).
        Any other value falls back to gray.

    Returns
    -------
    str
        Path to the watermarked PDF

    Raises
    ------
    FileNotFoundError
        If the input file doesn't exist
    RuntimeError
        If the operation fails
    """
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Input file not found: {input_path}")

    try:
        # Create output directory if it doesn't exist
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Named colors supported by this method; unknown names become gray
        palette = {
            "red": (1, 0, 0),
            "blue": (0, 0, 1),
            "green": (0, 0.5, 0),
        }
        rgb = palette.get(color.lower(), (0.5, 0.5, 0.5))
        fill_opacity = max(0.0, min(1.0, opacity))

        # The context manager closes the document even if drawing/saving fails
        with fitz.open(input_path) as doc:
            for page in doc:
                rect = page.rect
                font_size = min(rect.width, rect.height) / 10
                center = rect.center

                # Shift the start point left by half the text width so the
                # text is centred on the page rather than starting there
                text_width = fitz.get_text_length(
                    watermark_text, fontname="helv", fontsize=font_size
                )
                start = fitz.Point(center.x - text_width / 2, center.y)

                # insert_text only accepts rotate in multiples of 90 and has
                # no "alpha" keyword: a 45 degree rotation needs `morph`, and
                # transparency is controlled through `fill_opacity`
                page.insert_text(
                    start,
                    watermark_text,
                    fontsize=font_size,
                    fontname="helv",
                    color=rgb,
                    fill_opacity=fill_opacity,
                    morph=(center, fitz.Matrix(45)),
                )

            # Save the watermarked PDF
            doc.save(output_path)

        logger.info(f"Added watermark to PDF: {output_path}")
        return output_path

    except Exception as e:
        logger.error(f"Error adding watermark to PDF: {str(e)}")
        raise RuntimeError(f"Failed to add watermark: {str(e)}") from e
# This continues the implementation of /CDocs/utils/pdf_utils.py
def merge_pdfs(self, input_paths: List[str], output_path: str) -> str:
    """
    Merge multiple PDF files into a single document

    The inputs are appended in the order given.

    Parameters
    ----------
    input_paths : List[str]
        List of paths to input PDFs
    output_path : str
        Path where the merged PDF will be saved

    Returns
    -------
    str
        Path to the merged PDF

    Raises
    ------
    FileNotFoundError
        If any input file doesn't exist
    RuntimeError
        If the operation fails (including an empty ``input_paths``)
    """
    # Check if input files exist
    for input_path in input_paths:
        if not os.path.exists(input_path):
            raise FileNotFoundError(f"Input file not found: {input_path}")

    try:
        if not input_paths:
            # Fail with a clear message instead of an obscure save error
            raise ValueError("No input PDFs provided")

        # Create output directory if it doesn't exist
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # The context manager closes the merged document even when an
        # input cannot be read or the save fails
        with fitz.open() as merged_doc:
            for input_path in input_paths:
                with fitz.open(input_path) as pdf:
                    merged_doc.insert_pdf(pdf)

            # Save the merged PDF
            merged_doc.save(output_path)

        logger.info(f"Merged {len(input_paths)} PDFs into: {output_path}")
        return output_path

    except Exception as e:
        logger.error(f"Error merging PDFs: {str(e)}")
        raise RuntimeError(f"Failed to merge PDFs: {str(e)}") from e
def extract_pages(self, input_path: str, output_path: str,
                  pages: Union[List[int], range]) -> str:
    """
    Extract specific pages from a PDF

    Pages are copied in the order given; a page number listed more than
    once is copied more than once. Out-of-range page numbers are skipped
    with a warning.

    Parameters
    ----------
    input_path : str
        Path to the input PDF
    output_path : str
        Path where the extracted PDF will be saved
    pages : Union[List[int], range]
        List or range of page numbers to extract (0-based)

    Returns
    -------
    str
        Path to the extracted PDF

    Raises
    ------
    FileNotFoundError
        If the input file doesn't exist
    RuntimeError
        If the operation fails (including when none of the requested
        pages exist in the input)
    """
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Input file not found: {input_path}")

    try:
        # Create output directory if it doesn't exist
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Both documents are closed by the context managers even on failure
        with fitz.open(input_path) as doc, fitz.open() as extracted_doc:
            page_count = doc.page_count
            extracted = 0

            for page_num in pages:
                if 0 <= page_num < page_count:
                    extracted_doc.insert_pdf(doc, from_page=page_num, to_page=page_num)
                    extracted += 1
                else:
                    logger.warning(f"Page {page_num} out of range (0-{page_count-1}), skipping")

            if extracted == 0:
                # Nothing to write: report it clearly rather than letting
                # the save of an empty document fail
                raise ValueError("No valid pages to extract")

            # Save the extracted PDF
            extracted_doc.save(output_path)

        # Report the pages actually written, not the number requested
        logger.info(f"Extracted {extracted} pages to: {output_path}")
        return output_path

    except Exception as e:
        logger.error(f"Error extracting pages: {str(e)}")
        raise RuntimeError(f"Failed to extract pages: {str(e)}") from e
def add_header_footer(self, input_path: str, output_path: str,
                      header_text: Optional[str] = None,
                      footer_text: Optional[str] = None,
                      include_page_numbers: bool = True) -> str:
    """
    Add header and/or footer to every page of a PDF

    Text is drawn in 10pt Helvetica, horizontally centred. The header
    baseline sits 20 points below the top edge and the footer baseline
    20 points above the bottom edge. When both ``footer_text`` and page
    numbers are requested they are joined as
    ``"<footer_text> | Page X of N"``.

    Parameters
    ----------
    input_path : str
        Path to the input PDF
    output_path : str
        Path where the modified PDF will be saved
    header_text : str, optional
        Text to add as header
    footer_text : str, optional
        Text to add as footer
    include_page_numbers : bool, optional
        Whether to include page numbers in the footer

    Returns
    -------
    str
        Path to the modified PDF

    Raises
    ------
    FileNotFoundError
        If the input file doesn't exist
    RuntimeError
        If the operation fails
    """
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Input file not found: {input_path}")

    font_name = "helv"
    font_size = 10
    margin = 20

    def draw_centered(page, text, baseline_y):
        # insert_text has no alignment option, so centre manually by
        # measuring the rendered width of the text
        text_width = fitz.get_text_length(text, fontname=font_name, fontsize=font_size)
        left = max(0, (page.rect.width - text_width) / 2)
        page.insert_text(
            fitz.Point(left, baseline_y),
            text,
            fontsize=font_size,
            fontname=font_name,
        )

    try:
        # Create output directory if it doesn't exist
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # The context manager closes the document even if drawing/saving fails
        with fitz.open(input_path) as doc:
            total_pages = doc.page_count

            for page_num, page in enumerate(doc):
                rect = page.rect

                # Add header if specified
                if header_text:
                    draw_centered(page, header_text, margin)

                # Add footer if specified
                if footer_text or include_page_numbers:
                    text = footer_text or ""
                    if include_page_numbers:
                        page_info = f"Page {page_num + 1} of {total_pages}"
                        text = f"{text} | {page_info}" if footer_text else page_info
                    draw_centered(page, text, rect.height - margin)

            # Save the modified PDF
            doc.save(output_path)

        logger.info(f"Added header/footer to PDF: {output_path}")
        return output_path

    except Exception as e:
        logger.error(f"Error adding header/footer: {str(e)}")
        raise RuntimeError(f"Failed to add header/footer: {str(e)}") from e
def add_stamps(self, input_path: str, output_path: str,
               stamps: List[Dict[str, Any]]) -> str:
    """
    Add stamps (text or image overlays) to specific pages

    Stamps that reference a non-existent page, a missing image file or an
    unknown type are skipped with a warning; the remaining stamps are
    still applied.

    Parameters
    ----------
    input_path : str
        Path to the input PDF
    output_path : str
        Path where the stamped PDF will be saved
    stamps : List[Dict[str, Any]]
        List of stamp definitions, each with:
        - 'page': Page number (0-based, default 0)
        - 'type': 'text' or 'image' (default 'text')
        - 'content': Text or path to image
        - 'x', 'y': Position coordinates (default 100, 100)
        - 'rotation': Rotation angle in degrees (optional). Text accepts
          any angle and is rotated around its insertion point; images only
          support multiples of 90, other values are ignored with a warning.
        - 'opacity': Opacity value (optional). Applied to text stamps
          only; image stamps are always inserted fully opaque.
        - Text only: 'fontsize' (default 12), 'fontname' (default 'helv'),
          'color' (default black)
        - Image only: 'width', 'height' (default 100 each)

    Returns
    -------
    str
        Path to the stamped PDF

    Raises
    ------
    FileNotFoundError
        If the input file doesn't exist
    RuntimeError
        If the operation fails
    """
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Input file not found: {input_path}")

    try:
        # Create output directory if it doesn't exist
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # The context manager closes the document even if stamping/saving fails
        with fitz.open(input_path) as doc:
            page_count = doc.page_count

            for stamp in stamps:
                page_num = stamp.get('page', 0)
                stamp_type = stamp.get('type', 'text')
                content = stamp.get('content', '')
                x_pos = stamp.get('x', 100)
                y_pos = stamp.get('y', 100)
                rotation = stamp.get('rotation', 0)
                opacity = stamp.get('opacity', 1.0)

                # Check if page exists
                if page_num < 0 or page_num >= page_count:
                    logger.warning(f"Page {page_num} out of range (0-{page_count-1}), skipping stamp")
                    continue

                page = doc[page_num]

                if stamp_type == 'text':
                    anchor = fitz.Point(x_pos, y_pos)
                    # insert_text's own `rotate` only accepts multiples of
                    # 90 and it has no "alpha" keyword: arbitrary angles go
                    # through `morph`, transparency through `fill_opacity`
                    morph = (anchor, fitz.Matrix(rotation)) if rotation else None
                    page.insert_text(
                        anchor,
                        content,
                        fontsize=stamp.get('fontsize', 12),
                        fontname=stamp.get('fontname', 'helv'),
                        color=stamp.get('color', (0, 0, 0)),  # Default: black
                        fill_opacity=opacity,
                        morph=morph,
                    )

                elif stamp_type == 'image':
                    if not os.path.exists(content):
                        logger.warning(f"Image file not found: {content}, skipping stamp")
                        continue

                    width = stamp.get('width', 100)
                    height = stamp.get('height', 100)
                    rect = fitz.Rect(x_pos, y_pos, x_pos + width, y_pos + height)

                    # Rotating the target rectangle would rotate it about
                    # the page origin and move the stamp elsewhere; let
                    # insert_image rotate the picture inside the rectangle
                    if rotation % 90 == 0:
                        image_rotation = int(rotation) % 360
                    else:
                        logger.warning(
                            f"Image rotation {rotation} is not a multiple of 90, "
                            "inserting image unrotated"
                        )
                        image_rotation = 0

                    if opacity != 1.0:
                        # insert_image has no working opacity control
                        logger.warning("Opacity is not supported for image stamps, ignoring")

                    page.insert_image(rect, filename=content, rotate=image_rotation)

                else:
                    logger.warning(f"Unknown stamp type: {stamp_type}, skipping")

            # Save the stamped PDF
            doc.save(output_path)

        logger.info(f"Added stamps to PDF: {output_path}")
        return output_path

    except Exception as e:
        logger.error(f"Error adding stamps: {str(e)}")
        raise RuntimeError(f"Failed to add stamps: {str(e)}") from e
def make_pdf_a_compliant(self, input_path: str, output_path: str,
                         conformance_level: str = "3B") -> str:
    """
    Tag a PDF as PDF/A for archival purposes

    Writes the PDF/A identification (``pdfaid:part`` and
    ``pdfaid:conformance``) into the document's XMP metadata, adds an
    ``OutputIntents`` entry to the catalog and saves a linearized copy.

    NOTE: this only declares PDF/A conformance. It does not embed fonts,
    attach an ICC profile or otherwise rewrite page content, so the result
    should still be checked with a real PDF/A validator.

    Parameters
    ----------
    input_path : str
        Path to the input PDF
    output_path : str
        Path where the PDF/A will be saved
    conformance_level : str, optional
        PDF/A conformance level ('1B', '2B', '3B'). Any other value is
        replaced by '3B' with a warning.

    Returns
    -------
    str
        Path to the PDF/A compliant document

    Raises
    ------
    FileNotFoundError
        If the input file doesn't exist
    RuntimeError
        If the operation fails
    """
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Input file not found: {input_path}")

    # Validate conformance level
    valid_levels = ('1B', '2B', '3B')
    if conformance_level not in valid_levels:
        logger.warning(f"Invalid conformance level: {conformance_level}, using '3B'")
        conformance_level = '3B'

    # e.g. "3B" -> part "3", conformance "B"
    part, conformance = conformance_level[0], conformance_level[1]

    try:
        # Create output directory if it doesn't exist
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with pikepdf.open(input_path) as pdf:
            # PdfMetadata is a mapping keyed by prefixed XMP names.
            # set_pikepdf_as_editor=False keeps our own pdf:Producer value
            # from being overwritten when the metadata block is closed.
            with pdf.open_metadata(set_pikepdf_as_editor=False) as meta:
                meta['pdfaid:part'] = part
                meta['pdfaid:conformance'] = conformance
                meta['pdf:Producer'] = "CDocs PDF Utilities"

            # Declare the output intent; /S has to be a PDF name object,
            # not a string
            pdf.Root.OutputIntents = pikepdf.Array([
                pikepdf.Dictionary({
                    "/S": pikepdf.Name("/GTS_PDFA1"),
                    "/OutputConditionIdentifier": "sRGB",
                    "/RegistryName": "http://www.color.org"
                })
            ])

            # PDF/A-1 is based on PDF 1.4, which predates object streams
            if part == '1':
                stream_mode = pikepdf.ObjectStreamMode.disable
            else:
                stream_mode = pikepdf.ObjectStreamMode.generate

            # Save as PDF/A
            pdf.save(
                output_path,
                object_stream_mode=stream_mode,
                linearize=True
            )

        logger.info(f"Converted PDF to PDF/A-{conformance_level}: {output_path}")
        return output_path

    except Exception as e:
        logger.error(f"Error converting to PDF/A: {str(e)}")
        raise RuntimeError(f"Failed to convert to PDF/A: {str(e)}") from e
def check_pdf_a_compliance(self, input_path: str) -> Dict[str, Any]:
    """
    Check if a PDF declares PDF/A compliance

    This is a metadata-level check only: it looks for the PDF/A
    identification (``pdfaid:part`` / ``pdfaid:conformance``) in the XMP
    metadata and for an ``OutputIntents`` entry in the document catalog.
    It does not validate fonts, color spaces or page content.

    Parameters
    ----------
    input_path : str
        Path to the PDF to check

    Returns
    -------
    Dict[str, Any]
        Results of compliance check with keys:
        - 'compliant': bool
        - 'level': str or None (e.g. "3B", taken from the metadata)
        - 'issues': List of issues if not compliant

    Raises
    ------
    FileNotFoundError
        If the input file doesn't exist
    RuntimeError
        If the operation fails
    """
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Input file not found: {input_path}")

    try:
        results: Dict[str, Any] = {
            'compliant': False,
            'level': None,
            'issues': []
        }

        # Open PDF with pikepdf to examine metadata
        with pikepdf.open(input_path) as pdf:
            # Without an XMP packet there is nothing further to inspect
            if pdf.Root.get('/Metadata') is None:
                results['issues'].append("Missing XMP metadata")
                return results

            with pdf.open_metadata() as meta:
                try:
                    # PdfMetadata behaves like a mapping keyed by prefixed
                    # XMP names; .get() returns None for absent entries
                    part = meta.get('pdfaid:part')
                    conformance = meta.get('pdfaid:conformance')

                    if part and conformance:
                        results['compliant'] = True
                        results['level'] = f"{part}{conformance}"
                    else:
                        results['issues'].append("PDF/A identifiers not found in metadata")
                except Exception as e:
                    results['issues'].append(f"Error reading PDF/A metadata: {str(e)}")

            # Check for OutputIntents (required for PDF/A); its absence
            # overrides a positive metadata result
            if '/OutputIntents' not in pdf.Root:
                results['issues'].append("Missing OutputIntents (required for PDF/A)")
                results['compliant'] = False

        logger.info(f"PDF/A compliance check for {input_path}: {results['compliant']}")
        return results

    except Exception as e:
        logger.error(f"Error checking PDF/A compliance: {str(e)}")
        raise RuntimeError(f"Failed to check PDF/A compliance: {str(e)}") from e
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
bases |
- | - |
Parameter Details
bases: Parameter of type
Return Value
Returns unspecified type
Class Interface
Methods
__init__(self, temp_dir)
Purpose: Initialize the PDF manipulator Parameters ---------- temp_dir : str, optional Directory for temporary files. If not provided, a system temp directory is used.
Parameters:
temp_dir: Type: Optional[str]
Returns: None
add_watermark(self, input_path, output_path, watermark_text, opacity, color) -> str
Purpose: Add a watermark to every page of a PDF Parameters ---------- input_path : str Path to the input PDF output_path : str Path where the watermarked PDF will be saved watermark_text : str Text to use as watermark opacity : float, optional Opacity of the watermark (0.0 to 1.0) color : str, optional Color of the watermark Returns ------- str Path to the watermarked PDF Raises ------ FileNotFoundError If the input file doesn't exist RuntimeError If the operation fails
Parameters:
input_path: Type: str; output_path: Type: str; watermark_text: Type: str; opacity: Type: float; color: Type: str
Returns: Returns str
merge_pdfs(self, input_paths, output_path) -> str
Purpose: Merge multiple PDF files into a single document Parameters ---------- input_paths : List[str] List of paths to input PDFs output_path : str Path where the merged PDF will be saved Returns ------- str Path to the merged PDF Raises ------ FileNotFoundError If any input file doesn't exist RuntimeError If the operation fails
Parameters:
input_paths: Type: List[str]; output_path: Type: str
Returns: Returns str
extract_pages(self, input_path, output_path, pages) -> str
Purpose: Extract specific pages from a PDF Parameters ---------- input_path : str Path to the input PDF output_path : str Path where the extracted PDF will be saved pages : Union[List[int], range] List or range of page numbers to extract (0-based) Returns ------- str Path to the extracted PDF Raises ------ FileNotFoundError If the input file doesn't exist RuntimeError If the operation fails
Parameters:
input_path: Type: str; output_path: Type: str; pages: Type: Union[List[int], range]
Returns: Returns str
add_header_footer(self, input_path, output_path, header_text, footer_text, include_page_numbers) -> str
Purpose: Add header and/or footer to every page of a PDF Parameters ---------- input_path : str Path to the input PDF output_path : str Path where the modified PDF will be saved header_text : str, optional Text to add as header footer_text : str, optional Text to add as footer include_page_numbers : bool, optional Whether to include page numbers in the footer Returns ------- str Path to the modified PDF Raises ------ FileNotFoundError If the input file doesn't exist RuntimeError If the operation fails
Parameters:
input_path: Type: str; output_path: Type: str; header_text: Type: Optional[str]; footer_text: Type: Optional[str]; include_page_numbers: Type: bool
Returns: Returns str
add_stamps(self, input_path, output_path, stamps) -> str
Purpose: Add stamps (text or image overlays) to specific pages Parameters ---------- input_path : str Path to the input PDF output_path : str Path where the stamped PDF will be saved stamps : List[Dict[str, Any]] List of stamp definitions, each with: - 'page': Page number (0-based) - 'type': 'text' or 'image' - 'content': Text or path to image - 'x', 'y': Position coordinates - 'rotation': Rotation angle (optional) - 'opacity': Opacity value (optional) - Additional type-specific options Returns ------- str Path to the stamped PDF Raises ------ FileNotFoundError If the input file doesn't exist RuntimeError If the operation fails
Parameters:
input_path: Type: str; output_path: Type: str; stamps: Type: List[Dict[str, Any]]
Returns: Returns str
make_pdf_a_compliant(self, input_path, output_path, conformance_level) -> str
Purpose: Convert PDF to PDF/A format for archival purposes Parameters ---------- input_path : str Path to the input PDF output_path : str Path where the PDF/A will be saved conformance_level : str, optional PDF/A conformance level ('1B', '2B', '3B') Returns ------- str Path to the PDF/A compliant document Raises ------ FileNotFoundError If the input file doesn't exist RuntimeError If the operation fails
Parameters:
input_path: Type: str; output_path: Type: str; conformance_level: Type: str
Returns: Returns str
check_pdf_a_compliance(self, input_path) -> Dict[str, Any]
Purpose: Check if a PDF is PDF/A compliant Parameters ---------- input_path : str Path to the PDF to check Returns ------- Dict[str, Any] Results of compliance check with keys: - 'compliant': bool - 'level': str or None - 'issues': List of issues if not compliant Raises ------ FileNotFoundError If the input file doesn't exist RuntimeError If the operation fails
Parameters:
input_path: Type: str
Returns: Returns Dict[str, Any]
Required Imports
import os
import io
import logging
import tempfile
import shutil
Usage Example
# Example usage:
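# manipulator = PDFManipulator()  # optionally: PDFManipulator(temp_dir="/path/to/tmp")
# result = manipulator.merge_pdfs(["a.pdf", "b.pdf"], "merged.pdf")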
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
-
class Watermarker 65.9% similar
-
class DocumentMerger 60.3% similar
-
function add_watermark 60.1% similar
-
class DocumentProcessor 58.5% similar
-
class PDFGenerator 57.8% similar