class PowerPointProcessor
A class that processes PowerPoint (.pptx) presentations to extract text content and tables, converting tables to markdown format and organizing content by slides.
/tf/active/vicechatdev/leexi/enhanced_meeting_minutes_generator.py
63 - 211
moderate
Purpose
PowerPointProcessor is designed to parse PowerPoint presentations and extract structured content including text from shapes and tables from slides. It separates text and table content into distinct chunks, each associated with slide metadata (slide number and title). Tables are converted to markdown format for easy consumption. The class handles file validation, error recovery, and provides detailed logging throughout the extraction process. It's particularly useful for document processing pipelines, content indexing systems, or any application that needs to extract and structure PowerPoint content programmatically.
Source Code
class PowerPointProcessor:
    """Process PowerPoint presentations to extract text and table content.

    Text and tables are returned as separate chunk lists, each chunk labelled
    with the slide number and slide title it came from. Tables are rendered
    as markdown.

    The class relies on the module-level names ``PPTX_AVAILABLE``, ``logger``
    and ``pptx`` being defined by the enclosing module.
    """

    # Integer value compared against ``placeholder_format.type`` to recognise
    # a title placeholder (same value the original code compared with).
    _TITLE_PLACEHOLDER_TYPE = 1

    # Line-break characters that may appear inside a table cell; each one is
    # flattened to a space so a cell always stays on one markdown row.
    _CELL_BREAKS = str.maketrans({"\n": " ", "\r": " ", "\v": " "})

    def __init__(self, temp_dir=None):
        """Initialize the PowerPoint processor.

        Args:
            temp_dir: Optional directory for temporary files. When omitted a
                fresh directory is created with ``tempfile.mkdtemp()``. The
                directory is created if missing and is never removed by this
                class.
        """
        if not PPTX_AVAILABLE:
            logger.warning("python-pptx library not available. PowerPoint processing will be limited.")
        self.temp_dir = Path(temp_dir) if temp_dir else Path(tempfile.mkdtemp())
        os.makedirs(self.temp_dir, exist_ok=True)

    def _is_valid_file(self, file_path):
        """Check if a file appears to be valid and processable.

        A file passes when it exists, is non-empty and its first bytes can be
        read. The content itself is not inspected.

        Returns:
            True when the file can be read, False otherwise (including when
            any exception is raised while checking).
        """
        try:
            path = Path(file_path)
            if not path.exists() or path.stat().st_size == 0:
                logger.warning(f"File doesn't exist or is empty: {file_path}")
                return False
            with open(file_path, 'rb') as f:
                header = f.read(16)
            if not header:
                logger.warning(f"File appears unreadable: {file_path}")
                return False
            return True
        except Exception as e:
            logger.error(f"Error checking file validity: {file_path} - {str(e)}")
            return False

    def _table_to_markdown(self, table_data):
        """Convert a 2D array to a markdown table.

        Args:
            table_data: Sequence of rows; the first row is used as the header.

        Returns:
            The markdown table, one newline-terminated line per row, or
            ``"| |"`` when there is no data or the header row is empty.
        """
        if not table_data or not table_data[0]:
            return "| |"

        def render(cells):
            # A literal pipe inside a cell would be read as a column
            # separator, so escape it.
            escaped = (str(cell).replace("|", "\\|") for cell in cells)
            return "| " + " | ".join(escaped) + " |\n"

        header = table_data[0]
        lines = [render(header), render(["---"] * len(header))]
        lines.extend(render(row) for row in table_data[1:])
        return "".join(lines)

    def _process_powerpoint_table(self, table):
        """Extract table data from a PowerPoint table and render it as markdown.

        Args:
            table: Object exposing ``rows``, each row exposing ``cells``, each
                cell exposing ``text`` (a python-pptx table matches this).

        Returns:
            The markdown table, or ``"Error processing table"`` if anything
            fails while reading the table.
        """
        try:
            table_data = [
                [cell.text.strip().translate(self._CELL_BREAKS) for cell in row.cells]
                for row in table.rows
            ]
            return self._table_to_markdown(table_data)
        except Exception as e:
            logger.error(f"Error processing PowerPoint table: {str(e)}")
            return "Error processing table"

    def _find_slide_title(self, shapes, slide_number):
        """Return ``(title, title_shape)`` for a slide.

        The title is the stripped text of the first title placeholder that has
        non-blank text. When none is found, or when inspecting the shapes
        fails, the result is ``("Slide N", None)``.
        """
        default_title = f"Slide {slide_number}"
        try:
            for shape in shapes:
                # python-pptx raises ValueError from ``placeholder_format`` on
                # shapes that are not placeholders, so test ``is_placeholder``
                # first instead of probing the attribute.
                if not getattr(shape, "is_placeholder", False):
                    continue
                if shape.placeholder_format.type != self._TITLE_PLACEHOLDER_TYPE:
                    continue
                text = getattr(shape, "text", None)
                if text and text.strip():
                    return text.strip(), shape
        except Exception as slide_err:
            logger.warning(f"Error getting slide title: {str(slide_err)}")
        return default_title, None

    def _extract_slide_chunks(self, slide, slide_number):
        """Extract the text chunk and the table chunks of one slide.

        Returns:
            ``(text_chunk, table_chunks)`` where ``text_chunk`` is
            ``[parent_text, combined_text, uuid]`` or ``None`` when the slide
            has no body text, and ``table_chunks`` is a list of
            ``[parent_text, markdown, "", uuid]``.
        """
        # Snapshot the shapes once: iterating ``slide.shapes`` repeatedly can
        # yield fresh proxy objects, which would defeat identity checks below.
        shapes = list(slide.shapes)
        slide_title, title_shape = self._find_slide_title(shapes, slide_number)

        # Identify tables first so they are excluded from text extraction.
        tables = []
        try:
            tables = [shape for shape in shapes if getattr(shape, "has_table", False)]
        except Exception as table_err:
            logger.warning(f"Error identifying tables in slide {slide_number}: {str(table_err)}")
        table_ids = {id(shape) for shape in tables}

        # Collect text from every shape except the title shape and tables.
        text_content = []
        for shape in shapes:
            try:
                if shape is title_shape or id(shape) in table_ids:
                    continue
                text = getattr(shape, "text", None)
                if text and text.strip():
                    text_content.append(text.strip())
            except Exception as shape_err:
                logger.warning(f"Error processing shape in slide {slide_number}: {str(shape_err)}")

        # All body text of a slide becomes a single chunk.
        text_chunk = None
        combined_text = "\n".join(text_content)
        if combined_text.strip():
            parent_text = f"Slide {slide_number}: {slide_title}"
            text_chunk = [parent_text, combined_text, str(uuid4())]

        # Each table becomes its own chunk.
        table_chunks = []
        for shape in tables:
            try:
                if hasattr(shape, "table"):
                    table_markdown = self._process_powerpoint_table(shape.table)
                    parent_text = f"Slide {slide_number}: {slide_title} - Table"
                    table_chunks.append([parent_text, table_markdown, "", str(uuid4())])
            except Exception as table_process_err:
                logger.warning(f"Error processing table in slide {slide_number}: {str(table_process_err)}")

        return text_chunk, table_chunks

    def process_powerpoint(self, file_path):
        """Process a PowerPoint presentation to extract text and table content.

        Args:
            file_path: Path of the presentation to read.

        Returns:
            ``{"text_chunks": [...], "table_chunks": [...]}`` on success. Both
            lists are empty when python-pptx is unavailable. ``None`` when the
            file is invalid, cannot be opened, or processing fails.
        """
        logger.info(f"Processing PowerPoint: {file_path}")
        if not PPTX_AVAILABLE:
            logger.error("python-pptx library not available. Cannot process PowerPoint files.")
            return {"text_chunks": [], "table_chunks": []}
        if not self._is_valid_file(file_path):
            logger.error(f"Invalid or corrupted file, skipping: {file_path}")
            return None
        try:
            text_chunks = []
            table_chunks = []
            # Try to open the presentation
            try:
                presentation = pptx.Presentation(file_path)
            except Exception as e:
                logger.error(f"Error opening PowerPoint with python-pptx: {str(e)}")
                return None
            # Slides are numbered from 1 in chunk labels and log messages.
            for slide_number, slide in enumerate(presentation.slides, start=1):
                text_chunk, slide_tables = self._extract_slide_chunks(slide, slide_number)
                if text_chunk is not None:
                    text_chunks.append(text_chunk)
                table_chunks.extend(slide_tables)
            return {"text_chunks": text_chunks, "table_chunks": table_chunks}
        except Exception as e:
            logger.error(f"Error processing PowerPoint {file_path}: {str(e)}")
            return None
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
bases |
- | - |
Parameter Details
temp_dir: Optional path to a temporary directory for file operations. If not provided, a new temporary directory is created using tempfile.mkdtemp(). The directory is created if it doesn't exist. Type: str or Path-like object, Default: None
Return Value
Instantiation returns a PowerPointProcessor object. The main method process_powerpoint() returns a dictionary with keys 'text_chunks' and 'table_chunks'. Each entry of text_chunks is a list [parent_text, content, uuid], where parent_text is 'Slide N: Title', content is the combined text of the slide, and uuid is a unique identifier. Each entry of table_chunks is a list [parent_text, markdown_table, empty_string, uuid], where parent_text is 'Slide N: Title - Table'. If python-pptx is not available, the dictionary is returned with both lists empty. Returns None if the file is invalid, cannot be opened, or processing fails.
Class Interface
Methods
__init__(self, temp_dir=None)
Purpose: Initialize the PowerPoint processor with an optional temporary directory
Parameters:
temp_dir: Optional path to temporary directory for file operations. If None, creates a new temp directory
Returns: None (constructor)
_is_valid_file(self, file_path) -> bool
Purpose: Check if a file exists, is readable, and appears to be valid for processing
Parameters:
file_path: Path to the file to validate (str or Path-like)
Returns: Boolean indicating whether the file is valid and processable
_table_to_markdown(self, table_data) -> str
Purpose: Convert a 2D array representing table data into markdown table format
Parameters:
table_data: 2D list where first row is headers and subsequent rows are data
Returns: String containing markdown-formatted table with headers, separator, and data rows
_process_powerpoint_table(self, table) -> str
Purpose: Extract data from a PowerPoint table object and convert it to markdown format
Parameters:
table: A python-pptx Table object from a shape
Returns: Markdown-formatted string representation of the table, or 'Error processing table' on failure
process_powerpoint(self, file_path) -> dict | None
Purpose: Main method to process a PowerPoint file and extract all text and table content organized by slides
Parameters:
file_path: Path to the PowerPoint (.pptx) file to process
Returns: Dictionary with keys 'text_chunks' (list of [parent_text, content, uuid]) and 'table_chunks' (list of [parent_text, markdown, '', uuid]). Returns None if processing fails or file is invalid
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| temp_dir | Path | Path object pointing to the temporary directory used for file operations during processing | instance |
Dependencies
python-pptx, pathlib, uuid, tempfile, os, logging
Required Imports
import os
from pathlib import Path
from uuid import uuid4
import tempfile
import logging
Conditional/Optional Imports
These imports are only needed under specific conditions:
import pptx
Condition: Required for PowerPoint processing functionality. The class checks PPTX_AVAILABLE flag and logs warnings if not available, but will return empty results or None
Required (conditional)
Usage Example
from pathlib import Path
import logging
import pptx

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Set availability flag
# NOTE: the class reads `logger` and `PPTX_AVAILABLE` as module-level names,
# so these definitions only take effect when the class lives in this module.
PPTX_AVAILABLE = True

# Instantiate the processor
processor = PowerPointProcessor(temp_dir='/tmp/ppt_processing')

# Process a PowerPoint file
result = processor.process_powerpoint('presentation.pptx')
if result:
    # Access text chunks: each is [parent_text, content, uuid]
    for parent, text, uid in result['text_chunks']:
        print(f'{parent}:')
        print(f' Content: {text[:100]}...')
        print(f' ID: {uid}')
    # Access table chunks: each is [parent_text, markdown, '', uuid]
    for parent, markdown, _, uid in result['table_chunks']:
        print(f'{parent}:')
        print(f' Table:\n{markdown}')
        print(f' ID: {uid}')
else:
    # None (failure) and a dictionary cannot be told apart by truthiness alone
    # only when the dictionary is empty; the returned dictionary always has
    # both keys, so it is truthy.
    print('Failed to process PowerPoint file')

# Check if a file is valid before processing
# (process_powerpoint() performs the same check itself.)
if processor._is_valid_file('presentation.pptx'):
    result = processor.process_powerpoint('presentation.pptx')
Best Practices
- Always check if PPTX_AVAILABLE is True before instantiating if python-pptx availability is uncertain
- Handle None return values from process_powerpoint() as they indicate processing failures
- The temp_dir is created during initialization but not automatically cleaned up - manage cleanup externally if needed
- File validation is performed automatically via _is_valid_file() before processing
- Text and table chunks are processed separately - tables are excluded from text extraction to avoid duplication
- Each chunk includes a UUID for unique identification and tracking
- The class is stateless between process_powerpoint() calls - safe to reuse for multiple files
- Slide titles are automatically detected from placeholder shapes; falls back to 'Slide N' if not found
- Tables are converted to markdown format with headers and separators for easy rendering
- Error handling is comprehensive with logging at each stage - check logs for detailed failure information
- Private methods (_is_valid_file, _table_to_markdown, _process_powerpoint_table) are internal helpers and should not be called directly in most cases
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
-
function test_pptx_file 58.8% similar
-
class DocumentExtractor 58.7% similar
-
class DocumentProcessor_v2 58.3% similar
-
class DocumentProcessor_v1 58.0% similar
-
function main_v62 56.1% similar