class DocumentExtractor
A document text extraction class that supports multiple file formats including Word, PowerPoint, PDF, and plain text files, with automatic format detection and conversion capabilities.
/tf/active/vicechatdev/leexi/document_extractor.py
40 - 256
complex
Purpose
DocumentExtractor provides a unified interface for extracting text content from various document formats. It automatically detects file types based on extensions, handles format-specific extraction using appropriate libraries (python-docx, python-pptx, pdfplumber/PyPDF2), and can convert documents to PDF using LibreOffice when needed. The class is designed for meeting minutes generation but can be used for any document text extraction workflow. It manages temporary files for conversions and provides robust error handling with logging.
Source Code
class DocumentExtractor:
    """Extract plain text from Word, PowerPoint, PDF and plain-text files.

    The file type is chosen from the file extension alone. ``.docx`` files
    are read with python-docx (when ``HAS_PYTHON_DOCX`` is set) and
    ``.pptx``/``.pptm`` files with python-pptx; every other Word or
    PowerPoint extension is converted to PDF with LibreOffice and the text
    is then read from that PDF. Extraction failures are logged and reported
    as ``None`` rather than raised.

    The class relies on module-level names defined alongside it: ``logger``,
    the ``HAS_PDFPLUMBER`` / ``HAS_PYPDF2`` / ``HAS_PYTHON_DOCX`` flags and
    the optional ``docx``, ``pptx`` and ``PyPDF2`` modules.
    """

    # Supported file extensions by type (lowercase, including the dot)
    WORD_EXTENSIONS = ['.doc', '.docx', '.docm', '.dot', '.dotx', '.dotm', '.rtf']
    PPT_EXTENSIONS = ['.ppt', '.pptx', '.pptm', '.pot', '.potx', '.potm', '.pps', '.ppsx']
    PDF_EXTENSIONS = ['.pdf']
    TEXT_EXTENSIONS = ['.txt', '.md']

    def __init__(self, temp_dir=None):
        """Initialize the document extractor.

        Args:
            temp_dir: Directory used for intermediate files (converted
                PDFs). When falsy, a fresh directory is created with
                ``tempfile.mkdtemp()``. The directory is created if it does
                not exist yet; it is never removed by this class.
        """
        self.temp_dir = Path(temp_dir) if temp_dir else Path(tempfile.mkdtemp())
        os.makedirs(self.temp_dir, exist_ok=True)

    def _get_file_extension(self, file_path):
        """Return the lowercase file extension of *file_path*, dot included."""
        return Path(file_path).suffix.lower()

    def _get_file_type(self, file_path):
        """Classify *file_path* by extension.

        Returns:
            One of ``"word"``, ``"powerpoint"``, ``"pdf"``, ``"text"`` or
            ``"unknown"``.
        """
        ext = self._get_file_extension(file_path)
        if ext in self.WORD_EXTENSIONS:
            return "word"
        if ext in self.PPT_EXTENSIONS:
            return "powerpoint"
        if ext in self.PDF_EXTENSIONS:
            return "pdf"
        if ext in self.TEXT_EXTENSIONS:
            return "text"
        return "unknown"

    def _convert_to_pdf_libreoffice(self, input_file, timeout=120):
        """Convert a document to PDF using LibreOffice in headless mode.

        Args:
            input_file: Path of the document to convert.
            timeout: Maximum number of seconds to wait for LibreOffice.

        Returns:
            Path of the generated PDF (``<temp_dir>/<input stem>.pdf``), or
            ``None`` when the conversion fails or times out.
        """
        input_path = Path(input_file)
        output_pdf = self.temp_dir / f"{input_path.stem}.pdf"

        try:
            # Absolute paths to avoid directory issues
            abs_input = input_path.absolute()
            abs_output_dir = self.temp_dir.absolute()

            # A PDF left behind by an earlier conversion of a file with the
            # same stem would satisfy the exists() check below even if this
            # conversion produced nothing, so remove it first. Never delete
            # the input itself should it happen to be that very file.
            if output_pdf.exists() and output_pdf.resolve() != abs_input.resolve():
                output_pdf.unlink()

            logger.info("Converting %s to PDF using LibreOffice", abs_input)

            # Use LibreOffice for conversion
            cmd = [
                "libreoffice", "--headless", "--convert-to", "pdf",
                "--outdir", str(abs_output_dir), str(abs_input)
            ]
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)

            if result.returncode == 0 and output_pdf.exists():
                logger.info("Successfully converted to PDF: %s", output_pdf)
                return output_pdf

            logger.error("LibreOffice conversion failed: %s", result.stderr)
            return None
        except subprocess.TimeoutExpired:
            logger.error("LibreOffice conversion timed out for %s", input_file)
            return None
        except Exception as e:
            logger.error("Error during PDF conversion: %s", e)
            return None

    def _extract_via_pdf_conversion(self, file_path):
        """Convert *file_path* to PDF with LibreOffice and extract that PDF's text.

        Returns:
            The extracted text, or ``None`` when either the conversion or
            the PDF extraction fails.
        """
        pdf_path = self._convert_to_pdf_libreoffice(file_path)
        if pdf_path:
            return self._extract_text_from_pdf(pdf_path)
        return None

    def _extract_text_from_pdf(self, file_path):
        """Extract text from a PDF using pdfplumber, else PyPDF2.

        pdfplumber is used when ``HAS_PDFPLUMBER`` is set; otherwise PyPDF2
        is used when ``HAS_PYPDF2`` is set. Pages without text are skipped.

        Returns:
            Page texts joined by blank lines, each prefixed with a
            ``=== Page N ===`` header, or ``None`` when no library is
            available, no page yields text, or an error occurs.
        """
        text_content = []

        try:
            if HAS_PDFPLUMBER:
                import pdfplumber
                with pdfplumber.open(file_path) as pdf:
                    for page_num, page in enumerate(pdf.pages, start=1):
                        page_text = page.extract_text()
                        if page_text:
                            text_content.append(f"=== Page {page_num} ===\n{page_text}")
            elif HAS_PYPDF2:
                with open(file_path, 'rb') as file:
                    pdf_reader = PyPDF2.PdfReader(file)
                    for page_num, page in enumerate(pdf_reader.pages, start=1):
                        page_text = page.extract_text()
                        if page_text:
                            text_content.append(f"=== Page {page_num} ===\n{page_text}")
            else:
                logger.warning("No PDF processing library available")
                return None
        except Exception as e:
            logger.error("Error extracting text from PDF %s: %s", file_path, e)
            return None

        return "\n\n".join(text_content) if text_content else None

    def _extract_text_from_word(self, file_path):
        """Extract text from a Word document.

        ``.docx`` files are read with python-docx when ``HAS_PYTHON_DOCX``
        is set: non-blank paragraphs first, then every table with one line
        per row and cells separated by ``" | "``. All other cases go
        through the LibreOffice PDF conversion.

        Returns:
            The extracted text, or ``None`` when nothing could be extracted.
        """
        try:
            if HAS_PYTHON_DOCX and self._get_file_extension(file_path) == '.docx':
                # Use python-docx for .docx files
                doc = docx.Document(file_path)
                text_content = [
                    paragraph.text
                    for paragraph in doc.paragraphs
                    if paragraph.text.strip()
                ]

                # Extract text from tables
                for table in doc.tables:
                    table_text = [
                        " | ".join(cell.text.strip() for cell in row.cells)
                        for row in table.rows
                    ]
                    text_content.append("\n".join(table_text))

                return "\n\n".join(text_content) if text_content else None

            # Fall back to LibreOffice conversion for .doc and other formats
            return self._extract_via_pdf_conversion(file_path)
        except Exception as e:
            logger.error("Error extracting text from Word document %s: %s", file_path, e)
            return None

    def _extract_text_from_powerpoint(self, file_path):
        """Extract text from a PowerPoint presentation.

        ``.pptx`` and ``.pptm`` files are read with python-pptx, collecting
        the text of every shape that exposes a non-empty ``text`` attribute.
        All other extensions go through the LibreOffice PDF conversion.

        Returns:
            Slide texts joined by blank lines, each prefixed with a
            ``=== Slide N ===`` header, or ``None`` when nothing could be
            extracted.
        """
        try:
            if self._get_file_extension(file_path) not in ('.pptx', '.pptm'):
                # Fall back to LibreOffice conversion for .ppt files
                return self._extract_via_pdf_conversion(file_path)

            # Use python-pptx for .pptx files
            text_content = []
            presentation = pptx.Presentation(file_path)
            for slide_num, slide in enumerate(presentation.slides, start=1):
                # Extract text from shapes
                slide_text = [
                    shape.text
                    for shape in slide.shapes
                    if hasattr(shape, "text") and shape.text
                ]
                if slide_text:
                    text_content.append(
                        f"=== Slide {slide_num} ===\n" + "\n".join(slide_text)
                    )

            return "\n\n".join(text_content) if text_content else None
        except Exception as e:
            logger.error("Error extracting text from PowerPoint %s: %s", file_path, e)
            return None

    def _extract_text_from_text_file(self, file_path):
        """Read a plain-text file, trying UTF-8 first and Latin-1 second.

        The UTF-8 attempt uses the ``utf-8-sig`` codec so that a leading
        byte-order mark is dropped instead of leaking into the result as
        ``\\ufeff``; files without a BOM are read exactly as with ``utf-8``.

        Returns:
            The file content, or ``None`` when the file cannot be read.
        """
        try:
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                return f.read()
        except UnicodeDecodeError:
            # Not valid UTF-8: Latin-1 maps every byte, so this decode
            # cannot fail; only I/O errors remain possible.
            try:
                with open(file_path, 'r', encoding='latin-1') as f:
                    return f.read()
            except Exception as e:
                logger.error("Error reading text file %s: %s", file_path, e)
                return None
        except Exception as e:
            logger.error("Error extracting text from %s: %s", file_path, e)
            return None

    def extract_text(self, file_path):
        """Extract text from a document based on its type.

        Args:
            file_path: Path of the document (``str`` or ``Path``).

        Returns:
            The extracted text, or ``None`` when the file does not exist,
            its extension is unsupported, or extraction fails.
        """
        file_path = Path(file_path)

        if not file_path.exists():
            logger.error("File does not exist: %s", file_path)
            return None

        file_type = self._get_file_type(file_path)
        logger.info("Processing %s document: %s", file_type, file_path)

        try:
            if file_type == "text":
                return self._extract_text_from_text_file(file_path)
            if file_type == "word":
                return self._extract_text_from_word(file_path)
            if file_type == "powerpoint":
                return self._extract_text_from_powerpoint(file_path)
            if file_type == "pdf":
                return self._extract_text_from_pdf(file_path)

            logger.warning("Unsupported file type: %s", file_path)
            return None
        except Exception as e:
            logger.error("Error processing document %s: %s", file_path, e)
            return None

    def get_supported_extensions(self):
        """Return a new list of all supported extensions (text, Word, PowerPoint, PDF)."""
        return (self.TEXT_EXTENSIONS + self.WORD_EXTENSIONS +
                self.PPT_EXTENSIONS + self.PDF_EXTENSIONS)

    def is_supported_file(self, file_path):
        """Return True when the extension of *file_path* is supported."""
        ext = self._get_file_extension(file_path)
        return ext in self.get_supported_extensions()
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
bases |
- | - |
Parameter Details
temp_dir: Optional path to a temporary directory for storing intermediate files (e.g., PDF conversions). If None, a new temporary directory is created using tempfile.mkdtemp(). The directory is created if it doesn't exist. Type: str or Path-like object.
Return Value
Instantiation returns a DocumentExtractor object. The main extract_text() method returns a string containing the extracted text content from the document, or None if extraction fails. Text from multi-page documents includes page/slide separators. Other methods return: file type strings, boolean values for support checks, lists of extensions, or Path objects for converted files.
Class Interface
Methods
__init__(self, temp_dir=None)
Purpose: Initialize the DocumentExtractor with an optional temporary directory for file conversions
Parameters:
temp_dir: Optional path to temporary directory; if None, creates a new temp directory
Returns: None (constructor)
_get_file_extension(self, file_path) -> str
Purpose: Extract the lowercase file extension including the dot from a file path
Parameters:
file_path: Path to the file (str or Path object)
Returns: Lowercase file extension string including the dot (e.g., '.docx', '.pdf')
_get_file_type(self, file_path) -> str
Purpose: Determine the document type category based on file extension
Parameters:
file_path: Path to the file (str or Path object)
Returns: String indicating file type: 'word', 'powerpoint', 'pdf', 'text', or 'unknown'
_convert_to_pdf_libreoffice(self, input_file) -> Path | None
Purpose: Convert a document to PDF format using LibreOffice command-line interface
Parameters:
input_file: Path to the input document to convert
Returns: Path object pointing to the converted PDF file, or None if conversion fails
_extract_text_from_pdf(self, file_path) -> str | None
Purpose: Extract text content from a PDF file using pdfplumber or PyPDF2
Parameters:
file_path: Path to the PDF file
Returns: Extracted text with page separators ('=== Page N ==='), or None if extraction fails
_extract_text_from_word(self, file_path) -> str | None
Purpose: Extract text from Word documents, handling both .docx (via python-docx) and legacy formats (via LibreOffice conversion)
Parameters:
file_path: Path to the Word document
Returns: Extracted text including paragraphs and table content, or None if extraction fails
_extract_text_from_powerpoint(self, file_path) -> str | None
Purpose: Extract text from PowerPoint presentations, handling .pptx and .pptm (via python-pptx) and all other PowerPoint formats (via LibreOffice conversion to PDF)
Parameters:
file_path: Path to the PowerPoint file
Returns: Extracted text with slide separators ('=== Slide N ==='), or None if extraction fails
_extract_text_from_text_file(self, file_path) -> str | None
Purpose: Read and return content from plain text files with encoding fallback
Parameters:
file_path: Path to the text file
Returns: File content as string, or None if reading fails. Tries UTF-8 first, then Latin-1
extract_text(self, file_path) -> str | None
Purpose: Main public method to extract text from any supported document format by auto-detecting type and routing to appropriate handler
Parameters:
file_path: Path to the document file (str or Path object)
Returns: Extracted text content as string, or None if file doesn't exist, is unsupported, or extraction fails
get_supported_extensions(self) -> list
Purpose: Get a complete list of all file extensions supported by the extractor
Returns: List of lowercase extension strings including dots (e.g., ['.txt', '.docx', '.pdf', ...])
is_supported_file(self, file_path) -> bool
Purpose: Check whether a file's extension is supported for text extraction
Parameters:
file_path: Path to the file to check
Returns: True if the file extension is supported, False otherwise
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| WORD_EXTENSIONS | list[str] | Class variable containing supported Word document extensions: ['.doc', '.docx', '.docm', '.dot', '.dotx', '.dotm', '.rtf'] | class |
| PPT_EXTENSIONS | list[str] | Class variable containing supported PowerPoint extensions: ['.ppt', '.pptx', '.pptm', '.pot', '.potx', '.potm', '.pps', '.ppsx'] | class |
| PDF_EXTENSIONS | list[str] | Class variable containing supported PDF extensions: ['.pdf'] | class |
| TEXT_EXTENSIONS | list[str] | Class variable containing supported plain text extensions: ['.txt', '.md'] | class |
| temp_dir | Path | Instance variable storing the Path object for the temporary directory used for file conversions and intermediate storage | instance |
Dependencies
os, tempfile, subprocess, logging, pathlib, uuid, python-pptx, openpyxl, python-docx, PyPDF2, pdfplumber
Required Imports
import os
import tempfile
import subprocess
import logging
from pathlib import Path
from uuid import uuid4
Conditional/Optional Imports
These imports are only needed under specific conditions:
import pptx
Condition: Required for extracting text from .pptx and .pptm PowerPoint files
Required (conditional)
import docx
Condition: Required for extracting text from .docx Word files
Required (conditional)
import pdfplumber
Condition: Preferred library for PDF text extraction (checked via HAS_PDFPLUMBER flag)
Optional
import PyPDF2
Condition: Fallback library for PDF text extraction if pdfplumber is not available (checked via HAS_PYPDF2 flag)
Optional
import openpyxl
Condition: Imported in source but not actively used in the class methods
Optional
Usage Example
import logging
from pathlib import Path

# Configure logging so the extractor's log messages are visible
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Flags telling the extractor which optional libraries are available
HAS_PDFPLUMBER = True
HAS_PYPDF2 = True
HAS_PYTHON_DOCX = True

# Build an extractor that keeps intermediate files in a fixed directory
extractor = DocumentExtractor(temp_dir='/tmp/doc_extraction')

# Single document: confirm the format is supported, then extract
if extractor.is_supported_file('meeting_notes.docx'):
    extracted = extractor.extract_text('meeting_notes.docx')
    if extracted:
        print(f"Extracted {len(extracted)} characters")
        print(extracted[:500])  # Print first 500 chars

# List every extension the extractor can handle
extensions = extractor.get_supported_extensions()
print(f"Supported formats: {extensions}")

# Batch: skip unsupported files, report each successful extraction
candidates = ['report.pdf', 'presentation.pptx', 'notes.txt']
for candidate in candidates:
    if not extractor.is_supported_file(candidate):
        continue
    result = extractor.extract_text(candidate)
    if result:
        print(f"Successfully extracted from {candidate}")
Best Practices
- Always check if a file is supported using is_supported_file() before attempting extraction
- Handle None return values from extract_text() as they indicate extraction failures
- Ensure LibreOffice is installed for legacy format support (.doc, .ppt, etc.)
- The temp_dir will accumulate converted PDF files; implement cleanup if processing many documents
- Check module-level flags (HAS_PDFPLUMBER, HAS_PYPDF2, HAS_PYTHON_DOCX) before instantiation to ensure required libraries are available
- For production use, implement proper cleanup of temp_dir after processing
- The class uses subprocess calls to LibreOffice with a 120-second timeout; ensure this is sufficient for large documents
- Text extraction quality varies by format; PDF extraction may lose formatting and structure
- For .docx files, both paragraphs and tables are extracted; table cells are separated by ' | '
- PowerPoint slides are numbered and separated with '=== Slide N ===' markers
- The class logs extensively; configure logging appropriately for your use case
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- function test_document_extractor 72.9% similar
- class DocumentProcessor_v2 68.7% similar
- class DocumentProcessor_v1 68.6% similar
- class PDFTextExtractor 67.3% similar
- class DocumentConverter 65.2% similar