DocumentExtractor - Code Extractor

class DocumentExtractor

Maturity: 48

A document text extraction class that supports multiple file formats including Word, PowerPoint, PDF, and plain text files, with automatic format detection and conversion capabilities.

File:
/tf/active/vicechatdev/leexi/document_extractor.py

Lines:
40 - 256

Complexity:
complex

Purpose

DocumentExtractor provides a unified interface for extracting text content from various document formats. It automatically detects file types based on extensions, handles format-specific extraction using appropriate libraries (python-docx, python-pptx, pdfplumber/PyPDF2), and can convert documents to PDF using LibreOffice when needed. The class is designed for meeting minutes generation but can be used for any document text extraction workflow. It manages temporary files for conversions and provides robust error handling with logging.

Source Code

class DocumentExtractor:
    """Extract plain text from Word, PowerPoint, PDF and plain-text files.

    The handler is chosen from the file extension alone. ``.docx`` files are
    read with python-docx and ``.pptx``/``.pptm`` files with python-pptx; every
    other Word/PowerPoint extension is converted to PDF with LibreOffice and
    then read like a PDF. Converted PDFs are written to ``temp_dir`` and are
    not removed by this class.

    Extraction failures are logged and reported by returning ``None``.

    The class relies on names defined at module level next to it: ``logger``,
    the availability flags ``HAS_PDFPLUMBER``, ``HAS_PYPDF2`` and
    ``HAS_PYTHON_DOCX``, and the ``docx``, ``pptx`` and ``PyPDF2`` modules.
    """

    # Supported file extensions by type (lowercase, including the dot)
    WORD_EXTENSIONS = ['.doc', '.docx', '.docm', '.dot', '.dotx', '.dotm', '.rtf']
    PPT_EXTENSIONS = ['.ppt', '.pptx', '.pptm', '.pot', '.potx', '.potm', '.pps', '.ppsx']
    PDF_EXTENSIONS = ['.pdf']
    TEXT_EXTENSIONS = ['.txt', '.md']

    def __init__(self, temp_dir=None):
        """Set up the working directory used for LibreOffice conversions.

        Args:
            temp_dir: Directory for intermediate PDF files. When falsy, a new
                directory is created with ``tempfile.mkdtemp()``. The
                directory is created if it does not exist yet.
        """
        self.temp_dir = Path(temp_dir) if temp_dir else Path(tempfile.mkdtemp())
        os.makedirs(self.temp_dir, exist_ok=True)

    def _get_file_extension(self, file_path):
        """Return the lowercase extension of ``file_path``, including the dot."""
        return Path(file_path).suffix.lower()

    def _get_file_type(self, file_path):
        """Classify ``file_path`` by extension.

        Returns:
            One of ``"word"``, ``"powerpoint"``, ``"pdf"``, ``"text"`` or
            ``"unknown"``.
        """
        ext = self._get_file_extension(file_path)

        if ext in self.WORD_EXTENSIONS:
            return "word"
        if ext in self.PPT_EXTENSIONS:
            return "powerpoint"
        if ext in self.PDF_EXTENSIONS:
            return "pdf"
        if ext in self.TEXT_EXTENSIONS:
            return "text"
        return "unknown"

    def _convert_to_pdf_libreoffice(self, input_file):
        """Convert a document to PDF with the ``libreoffice`` command.

        The PDF is written to ``temp_dir`` under the input's stem.

        Args:
            input_file: Path of the document to convert.

        Returns:
            Path of the generated PDF, or ``None`` when the conversion fails,
            times out (120 seconds) or raises.
        """
        input_path = Path(input_file)
        output_pdf = self.temp_dir / f"{input_path.stem}.pdf"

        try:
            # Absolute paths to avoid directory issues
            abs_input = input_path.absolute()
            abs_output_dir = self.temp_dir.absolute()

            # Success is judged below by the output file existing, so a PDF
            # left behind by an earlier conversion of a same-named document
            # would make a failed run look successful and return stale text.
            # Remove it first, but never delete the input itself.
            if output_pdf.exists() and not output_pdf.samefile(abs_input):
                output_pdf.unlink()

            logger.info(f"Converting {abs_input} to PDF using LibreOffice")

            # Argument list (no shell) so paths with spaces are passed verbatim
            cmd = [
                "libreoffice", "--headless", "--convert-to", "pdf",
                "--outdir", str(abs_output_dir), str(abs_input)
            ]

            result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)

            if result.returncode == 0 and output_pdf.exists():
                logger.info(f"Successfully converted to PDF: {output_pdf}")
                return output_pdf

            logger.error(f"LibreOffice conversion failed: {result.stderr}")
            return None

        except subprocess.TimeoutExpired:
            logger.error(f"LibreOffice conversion timed out for {input_file}")
            return None
        except Exception as e:
            logger.error(f"Error during PDF conversion: {str(e)}")
            return None

    def _extract_text_from_pdf(self, file_path):
        """Extract text from a PDF with pdfplumber, else PyPDF2.

        pdfplumber is used when ``HAS_PDFPLUMBER`` is truthy; otherwise PyPDF2
        is used when ``HAS_PYPDF2`` is truthy. Pages that yield no text are
        skipped; the others are prefixed with ``=== Page N ===`` (1-based).

        Returns:
            The page texts joined by blank lines, or ``None`` when no library
            is available, no page yields text, or extraction raises.
        """
        text_content = []

        try:
            if HAS_PDFPLUMBER:
                import pdfplumber
                with pdfplumber.open(file_path) as pdf:
                    for page_num, page in enumerate(pdf.pages):
                        page_text = page.extract_text()
                        if page_text:
                            text_content.append(f"=== Page {page_num + 1} ===\n{page_text}")
            elif HAS_PYPDF2:
                with open(file_path, 'rb') as file:
                    pdf_reader = PyPDF2.PdfReader(file)
                    for page_num, page in enumerate(pdf_reader.pages):
                        page_text = page.extract_text()
                        if page_text:
                            text_content.append(f"=== Page {page_num + 1} ===\n{page_text}")
            else:
                logger.warning("No PDF processing library available")
                return None

        except Exception as e:
            logger.error(f"Error extracting text from PDF {file_path}: {str(e)}")
            return None

        return "\n\n".join(text_content) if text_content else None

    def _extract_text_from_word(self, file_path):
        """Extract text from a Word document.

        ``.docx`` files are read with python-docx when ``HAS_PYTHON_DOCX`` is
        truthy: non-blank paragraphs first, then every table with cells joined
        by ``" | "`` and rows by newlines. Any other case goes through a
        LibreOffice PDF conversion followed by PDF extraction.

        Returns:
            The extracted text, or ``None`` when nothing could be extracted.
        """
        try:
            if HAS_PYTHON_DOCX and self._get_file_extension(file_path) in ['.docx']:
                # Use python-docx for .docx files
                doc = docx.Document(file_path)
                text_content = [
                    paragraph.text
                    for paragraph in doc.paragraphs
                    if paragraph.text.strip()
                ]

                # Extract text from tables
                for table in doc.tables:
                    table_text = [
                        " | ".join(cell.text.strip() for cell in row.cells)
                        for row in table.rows
                    ]
                    # A table without rows contributes nothing; appending its
                    # empty string would turn an otherwise empty document into
                    # "" instead of None.
                    if table_text:
                        text_content.append("\n".join(table_text))

                return "\n\n".join(text_content) if text_content else None

            # Fall back to LibreOffice conversion for .doc and other formats
            pdf_path = self._convert_to_pdf_libreoffice(file_path)
            if pdf_path:
                return self._extract_text_from_pdf(pdf_path)
            return None

        except Exception as e:
            logger.error(f"Error extracting text from Word document {file_path}: {str(e)}")
            return None

    def _extract_text_from_powerpoint(self, file_path):
        """Extract text from a PowerPoint presentation.

        ``.pptx`` and ``.pptm`` files are read with python-pptx: the text of
        every shape exposing a non-empty ``text`` attribute is collected per
        slide, and slides with text are prefixed with ``=== Slide N ===``
        (1-based). Other extensions go through a LibreOffice PDF conversion
        followed by PDF extraction.

        Returns:
            The extracted text, or ``None`` when nothing could be extracted.
        """
        try:
            if self._get_file_extension(file_path) not in ['.pptx', '.pptm']:
                # Fall back to LibreOffice conversion for .ppt files
                pdf_path = self._convert_to_pdf_libreoffice(file_path)
                if pdf_path:
                    return self._extract_text_from_pdf(pdf_path)
                return None

            # Use python-pptx for .pptx files
            presentation = pptx.Presentation(file_path)
            text_content = []

            for i, slide in enumerate(presentation.slides):
                slide_title = f"Slide {i + 1}"

                # Extract text from shapes
                slide_text = [
                    shape.text
                    for shape in slide.shapes
                    if hasattr(shape, "text") and shape.text
                ]

                if slide_text:
                    slide_content = f"=== {slide_title} ===\n" + "\n".join(slide_text)
                    text_content.append(slide_content)

            return "\n\n".join(text_content) if text_content else None

        except Exception as e:
            logger.error(f"Error extracting text from PowerPoint {file_path}: {str(e)}")
            return None

    def _extract_text_from_text_file(self, file_path):
        """Read a plain-text file as UTF-8, retrying as Latin-1 on decode errors.

        Returns:
            The file content, or ``None`` when the file cannot be read.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        except UnicodeDecodeError:
            try:
                with open(file_path, 'r', encoding='latin-1') as f:
                    return f.read()
            except Exception as e:
                logger.error(f"Error reading text file {file_path}: {str(e)}")
                return None
        except Exception as e:
            logger.error(f"Error extracting text from {file_path}: {str(e)}")
            return None

    def extract_text(self, file_path):
        """Extract text from a document, choosing the handler by extension.

        Args:
            file_path: Path of the document (``str`` or ``Path``).

        Returns:
            The extracted text, or ``None`` when the file does not exist, its
            extension is unsupported, or extraction fails.
        """
        file_path = Path(file_path)

        if not file_path.exists():
            logger.error(f"File does not exist: {file_path}")
            return None

        file_type = self._get_file_type(file_path)
        logger.info(f"Processing {file_type} document: {file_path}")

        try:
            if file_type == "text":
                return self._extract_text_from_text_file(file_path)
            if file_type == "word":
                return self._extract_text_from_word(file_path)
            if file_type == "powerpoint":
                return self._extract_text_from_powerpoint(file_path)
            if file_type == "pdf":
                return self._extract_text_from_pdf(file_path)

            logger.warning(f"Unsupported file type: {file_path}")
            return None

        except Exception as e:
            logger.error(f"Error processing document {file_path}: {str(e)}")
            return None

    def get_supported_extensions(self):
        """Return all supported extensions: text, Word, PowerPoint, then PDF."""
        return (self.TEXT_EXTENSIONS + self.WORD_EXTENSIONS +
                self.PPT_EXTENSIONS + self.PDF_EXTENSIONS)

    def is_supported_file(self, file_path):
        """Return ``True`` when the extension of ``file_path`` is supported."""
        ext = self._get_file_extension(file_path)
        return ext in self.get_supported_extensions()

Parameters

Name	Type	Default	Kind
`bases`	-	-	-
`temp_dir`	str | Path | None	None	positional or keyword

Parameter Details

temp_dir: Optional path to a temporary directory for storing intermediate files (e.g., PDF conversions). If None, a new temporary directory is created using tempfile.mkdtemp(). The directory is created if it doesn't exist. Type: str or Path-like object.

Return Value

Instantiation returns a DocumentExtractor object. The main extract_text() method returns a string containing the extracted text content from the document, or None if extraction fails. Text from multi-page documents includes page/slide separators. Other methods return: file type strings, boolean values for support checks, lists of extensions, or Path objects for converted files.

Class Interface

Methods

`__init__(self, temp_dir=None)`

Purpose: Initialize the DocumentExtractor with an optional temporary directory for file conversions

Parameters:

temp_dir: Optional path to temporary directory; if None, creates a new temp directory

Returns: None (constructor)

`_get_file_extension(self, file_path) -> str`

Purpose: Extract the lowercase file extension including the dot from a file path

Parameters:

file_path: Path to the file (str or Path object)

Returns: Lowercase file extension string including the dot (e.g., '.docx', '.pdf')

`_get_file_type(self, file_path) -> str`

Purpose: Determine the document type category based on file extension

Parameters:

file_path: Path to the file (str or Path object)

Returns: String indicating file type: 'word', 'powerpoint', 'pdf', 'text', or 'unknown'

`_convert_to_pdf_libreoffice(self, input_file) -> Path | None`

Purpose: Convert a document to PDF format using LibreOffice command-line interface

Parameters:

input_file: Path to the input document to convert

Returns: Path object pointing to the converted PDF file, or None if conversion fails

`_extract_text_from_pdf(self, file_path) -> str | None`

Purpose: Extract text content from a PDF file using pdfplumber or PyPDF2

Parameters:

file_path: Path to the PDF file

Returns: Extracted text with page separators ('=== Page N ==='), or None if extraction fails

`_extract_text_from_word(self, file_path) -> str | None`

Purpose: Extract text from Word documents, handling both .docx (via python-docx) and legacy formats (via LibreOffice conversion)

Parameters:

file_path: Path to the Word document

Returns: Extracted text including paragraphs and table content, or None if extraction fails

`_extract_text_from_powerpoint(self, file_path) -> str | None`

Purpose: Extract text from PowerPoint presentations, handling .pptx (via python-pptx) and legacy formats (via LibreOffice)

Parameters:

file_path: Path to the PowerPoint file

Returns: Extracted text with slide separators ('=== Slide N ==='), or None if extraction fails

`_extract_text_from_text_file(self, file_path) -> str | None`

Purpose: Read and return content from plain text files with encoding fallback

Parameters:

file_path: Path to the text file

Returns: File content as string, or None if reading fails. Tries UTF-8 first, then Latin-1

`extract_text(self, file_path) -> str | None`

Purpose: Main public method to extract text from any supported document format by auto-detecting type and routing to appropriate handler

Parameters:

file_path: Path to the document file (str or Path object)

Returns: Extracted text content as string, or None if file doesn't exist, is unsupported, or extraction fails

`get_supported_extensions(self) -> list`

Purpose: Get a complete list of all file extensions supported by the extractor

Returns: List of lowercase extension strings including dots (e.g., ['.txt', '.docx', '.pdf', ...])

`is_supported_file(self, file_path) -> bool`

Purpose: Check whether a file's extension is supported for text extraction

Parameters:

file_path: Path to the file to check

Returns: True if the file extension is supported, False otherwise

Attributes

Name	Type	Description	Scope
`WORD_EXTENSIONS`	list[str]	Class variable containing supported Word document extensions: ['.doc', '.docx', '.docm', '.dot', '.dotx', '.dotm', '.rtf']	class
`PPT_EXTENSIONS`	list[str]	Class variable containing supported PowerPoint extensions: ['.ppt', '.pptx', '.pptm', '.pot', '.potx', '.potm', '.pps', '.ppsx']	class
`PDF_EXTENSIONS`	list[str]	Class variable containing supported PDF extensions: ['.pdf']	class
`TEXT_EXTENSIONS`	list[str]	Class variable containing supported plain text extensions: ['.txt', '.md']	class
`temp_dir`	Path	Instance variable storing the Path object for the temporary directory used for file conversions and intermediate storage	instance

Dependencies

os
tempfile
subprocess
logging
pathlib
uuid
python-pptx
openpyxl
python-docx
PyPDF2
pdfplumber

Required Imports

import os
import tempfile
import subprocess
import logging
from pathlib import Path
from uuid import uuid4

Conditional/Optional Imports

These imports are only needed under specific conditions:

import pptx

Condition: Required for extracting text from .pptx and .pptm PowerPoint files

Required (conditional)

import docx

Condition: Required for extracting text from .docx Word files

Required (conditional)

import pdfplumber

Condition: Preferred library for PDF text extraction (checked via HAS_PDFPLUMBER flag)

Optional

import PyPDF2

Condition: Fallback library for PDF text extraction if pdfplumber is not available (checked via HAS_PYPDF2 flag)

Optional

import openpyxl

Condition: Imported in source but not actively used in the class methods

Optional

Usage Example

import logging
from pathlib import Path

# NOTE(review): this snippet never imports DocumentExtractor. The class reads
# `logger` and the HAS_* flags below as bare module-level names, so these
# definitions only take effect when they live in the same module as the class
# - confirm how the class is meant to be brought into scope here.

# Setup logger
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Define library availability flags
# NOTE(review): presumably these should reflect whether pdfplumber, PyPDF2 and
# python-docx are actually importable rather than being hard-coded - verify.
HAS_PDFPLUMBER = True
HAS_PYPDF2 = True
HAS_PYTHON_DOCX = True

# Create extractor instance
# Converted PDFs are written to this directory.
extractor = DocumentExtractor(temp_dir='/tmp/doc_extraction')

# Check if file is supported
# (is_supported_file only looks at the extension, not at the file itself)
if extractor.is_supported_file('meeting_notes.docx'):
    # Extract text from document
    text = extractor.extract_text('meeting_notes.docx')
    # extract_text returns None when the file is missing or extraction fails
    if text:
        print(f"Extracted {len(text)} characters")
        print(text[:500])  # Print first 500 chars

# Get all supported extensions
supported = extractor.get_supported_extensions()
print(f"Supported formats: {supported}")

# Process multiple files
files = ['report.pdf', 'presentation.pptx', 'notes.txt']
for file_path in files:
    if extractor.is_supported_file(file_path):
        content = extractor.extract_text(file_path)
        if content:
            print(f"Successfully extracted from {file_path}")

Best Practices

Always check if a file is supported using is_supported_file() before attempting extraction
Handle None return values from extract_text() as they indicate extraction failures
Ensure LibreOffice is installed for legacy format support (.doc, .ppt, etc.)
The temp_dir will accumulate converted PDF files; implement cleanup if processing many documents
Check module-level flags (HAS_PDFPLUMBER, HAS_PYPDF2, HAS_PYTHON_DOCX) before instantiation to ensure required libraries are available
For production use, implement proper cleanup of temp_dir after processing
The class uses subprocess calls to LibreOffice with a 120-second timeout; ensure this is sufficient for large documents
Text extraction quality varies by format; PDF extraction may lose formatting and structure
For .docx files, both paragraphs and tables are extracted; table cells are separated by ' | '
PowerPoint slides are numbered and separated with '=== Slide N ===' markers
The class logs extensively; configure logging appropriately for your use case

Similar Components

AI-powered semantic similarity - components with related functionality:

function test_document_extractor 72.9% similar

A test function that validates the DocumentExtractor class by testing file type support detection, text extraction from various document formats, and error handling.
From: /tf/active/vicechatdev/leexi/test_document_extractor.py
class DocumentProcessor_v2 68.7% similar

A document processing class that extracts text from PDF and Word documents using llmsherpa as the primary method with fallback support for PyPDF2, pdfplumber, and python-docx.
From: /tf/active/vicechatdev/contract_validity_analyzer/utils/document_processor_old.py
class DocumentProcessor_v1 68.6% similar

A document processing class that extracts text from PDF and Word documents using llmsherpa as the primary method with fallback support for PyPDF2, pdfplumber, and python-docx.
From: /tf/active/vicechatdev/contract_validity_analyzer/utils/document_processor_new.py
class PDFTextExtractor 67.3% similar

A class for extracting text, images, and structured content from PDF documents with layout preservation capabilities.
From: /tf/active/vicechatdev/CDocs/utils/pdf_utils.py
class DocumentConverter 65.2% similar

A class that converts various document formats (Word, Excel, PowerPoint, OpenDocument, Visio) to PDF using LibreOffice's headless conversion capabilities, with support for parallel processing and directory structure preservation.
From: /tf/active/vicechatdev/pdfconverter.py

← Back to Browse

Assistant

Hi! I can help improve this code. Tell me what you'd like to enhance (e.g., "add error handling", "optimize performance", "improve readability", "add type hints").

Code Comparison

Original Code

                            class DocumentExtractor:
    """Simplified document extractor for meeting minutes generator.

    Picks an extraction strategy from the file extension: plain-text files are
    read directly, ``.docx`` is read with python-docx, ``.pptx``/``.pptm`` with
    python-pptx, PDFs with pdfplumber or PyPDF2, and the remaining Word and
    PowerPoint extensions are converted to PDF with LibreOffice first.

    Relies on module-level names defined outside this class: ``logger``, the
    ``HAS_PDFPLUMBER`` / ``HAS_PYPDF2`` / ``HAS_PYTHON_DOCX`` flags, and the
    ``docx``, ``pptx`` and ``PyPDF2`` modules.
    """
    
    # Supported file extensions by type (lowercase, including the dot)
    WORD_EXTENSIONS = ['.doc', '.docx', '.docm', '.dot', '.dotx', '.dotm', '.rtf']
    PPT_EXTENSIONS = ['.ppt', '.pptx', '.pptm', '.pot', '.potx', '.potm', '.pps', '.ppsx']
    PDF_EXTENSIONS = ['.pdf']
    TEXT_EXTENSIONS = ['.txt', '.md']
    
    def __init__(self, temp_dir=None):
        """Initialize the document extractor.

        Args:
            temp_dir: Directory for intermediate PDF files. When falsy, a new
                directory is created with ``tempfile.mkdtemp()``.
        """
        self.temp_dir = Path(temp_dir) if temp_dir else Path(tempfile.mkdtemp())
        # Also covers a caller-supplied directory that does not exist yet
        os.makedirs(self.temp_dir, exist_ok=True)
    
    def _get_file_extension(self, file_path):
        """Get lowercase file extension including the dot"""
        return Path(file_path).suffix.lower()
    
    def _get_file_type(self, file_path):
        """Determine file type based on extension.

        Returns:
            ``"word"``, ``"powerpoint"``, ``"pdf"``, ``"text"`` or ``"unknown"``.
        """
        ext = self._get_file_extension(file_path)
        
        if ext in self.WORD_EXTENSIONS:
            return "word"
        elif ext in self.PPT_EXTENSIONS:
            return "powerpoint"
        elif ext in self.PDF_EXTENSIONS:
            return "pdf"
        elif ext in self.TEXT_EXTENSIONS:
            return "text"
        else:
            return "unknown"
    
    def _convert_to_pdf_libreoffice(self, input_file):
        """Convert a document to PDF using LibreOffice.

        The PDF is expected in ``temp_dir`` under the input's stem.

        Returns:
            Path of the PDF, or ``None`` when the command fails, times out
            (120 seconds) or raises.

        NOTE(review): the success check only tests that the target file
        exists, so a PDF left by an earlier conversion of a same-named
        document would also satisfy it - confirm whether stale output can
        occur in practice.
        """
        input_path = Path(input_file)
        output_pdf = self.temp_dir / f"{input_path.stem}.pdf"
        
        try:
            # Absolute paths to avoid directory issues
            abs_input = input_path.absolute()
            abs_output_dir = self.temp_dir.absolute()
            
            logger.info(f"Converting {abs_input} to PDF using LibreOffice")
            
            # Use LibreOffice for conversion
            cmd = [
                "libreoffice", "--headless", "--convert-to", "pdf",
                "--outdir", str(abs_output_dir), str(abs_input)
            ]
            
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
            
            if result.returncode == 0 and output_pdf.exists():
                logger.info(f"Successfully converted to PDF: {output_pdf}")
                return output_pdf
            else:
                logger.error(f"LibreOffice conversion failed: {result.stderr}")
                return None
                
        except subprocess.TimeoutExpired:
            logger.error(f"LibreOffice conversion timed out for {input_file}")
            return None
        except Exception as e:
            logger.error(f"Error during PDF conversion: {str(e)}")
            return None
    
    def _extract_text_from_pdf(self, file_path):
        """Extract text from PDF using available libraries.

        pdfplumber is used when ``HAS_PDFPLUMBER`` is truthy, otherwise PyPDF2
        when ``HAS_PYPDF2`` is truthy. Pages without text are skipped; the
        others are prefixed with ``=== Page N ===`` (1-based).

        Returns:
            Page texts joined by blank lines, or ``None`` when no library is
            available, no page yields text, or extraction raises.
        """
        text_content = []
        
        try:
            if HAS_PDFPLUMBER:
                import pdfplumber
                with pdfplumber.open(file_path) as pdf:
                    for page_num, page in enumerate(pdf.pages):
                        page_text = page.extract_text()
                        if page_text:
                            text_content.append(f"=== Page {page_num + 1} ===\n{page_text}")
            elif HAS_PYPDF2:
                with open(file_path, 'rb') as file:
                    pdf_reader = PyPDF2.PdfReader(file)
                    for page_num, page in enumerate(pdf_reader.pages):
                        page_text = page.extract_text()
                        if page_text:
                            text_content.append(f"=== Page {page_num + 1} ===\n{page_text}")
            else:
                logger.warning("No PDF processing library available")
                return None
                
        except Exception as e:
            logger.error(f"Error extracting text from PDF {file_path}: {str(e)}")
            return None
        
        return "\n\n".join(text_content) if text_content else None
    
    def _extract_text_from_word(self, file_path):
        """Extract text from Word documents.

        ``.docx`` is read with python-docx when ``HAS_PYTHON_DOCX`` is truthy:
        non-blank paragraphs first, then every table with cells joined by
        ``" | "`` and rows by newlines. Everything else is converted to PDF
        with LibreOffice and read as a PDF.

        Returns:
            The extracted text, or ``None`` when nothing could be extracted.
        """
        try:
            if HAS_PYTHON_DOCX and self._get_file_extension(file_path) in ['.docx']:
                # Use python-docx for .docx files
                doc = docx.Document(file_path)
                text_content = []
                
                for paragraph in doc.paragraphs:
                    if paragraph.text.strip():
                        text_content.append(paragraph.text)
                
                # Extract text from tables
                for table in doc.tables:
                    table_text = []
                    for row in table.rows:
                        row_text = []
                        for cell in row.cells:
                            row_text.append(cell.text.strip())
                        table_text.append(" | ".join(row_text))
                    # Appended even when the table produced no rows
                    text_content.append("\n".join(table_text))
                
                return "\n\n".join(text_content) if text_content else None
            else:
                # Fall back to LibreOffice conversion for .doc and other formats
                pdf_path = self._convert_to_pdf_libreoffice(file_path)
                if pdf_path:
                    return self._extract_text_from_pdf(pdf_path)
                return None
                
        except Exception as e:
            logger.error(f"Error extracting text from Word document {file_path}: {str(e)}")
            return None
    
    def _extract_text_from_powerpoint(self, file_path):
        """Extract text from PowerPoint presentations.

        ``.pptx``/``.pptm`` are read with python-pptx; slides that contain
        text are prefixed with ``=== Slide N ===`` (1-based). Other extensions
        are converted to PDF with LibreOffice and read as a PDF.

        Returns:
            The extracted text, or ``None`` when nothing could be extracted.
        """
        try:
            text_content = []
            
            if self._get_file_extension(file_path) in ['.pptx', '.pptm']:
                # Use python-pptx for .pptx files
                presentation = pptx.Presentation(file_path)
                
                for i, slide in enumerate(presentation.slides):
                    slide_title = f"Slide {i + 1}"
                    slide_text = []
                    
                    # Extract text from shapes
                    for shape in slide.shapes:
                        # Only shapes exposing a non-empty `text` attribute
                        if hasattr(shape, "text") and shape.text:
                            slide_text.append(shape.text)
                    
                    if slide_text:
                        slide_content = f"=== {slide_title} ===\n" + "\n".join(slide_text)
                        text_content.append(slide_content)
            else:
                # Fall back to LibreOffice conversion for .ppt files
                pdf_path = self._convert_to_pdf_libreoffice(file_path)
                if pdf_path:
                    return self._extract_text_from_pdf(pdf_path)
                return None
            
            return "\n\n".join(text_content) if text_content else None
            
        except Exception as e:
            logger.error(f"Error extracting text from PowerPoint {file_path}: {str(e)}")
            return None
    
    def _extract_text_from_text_file(self, file_path):
        """Extract text from plain text files.

        Reads the file as UTF-8 and retries as Latin-1 on a decode error.

        Returns:
            The file content, or ``None`` when the file cannot be read.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        except UnicodeDecodeError:
            try:
                with open(file_path, 'r', encoding='latin-1') as f:
                    return f.read()
            except Exception as e:
                logger.error(f"Error reading text file {file_path}: {str(e)}")
                return None
        except Exception as e:
            logger.error(f"Error extracting text from {file_path}: {str(e)}")
            return None
    
    def extract_text(self, file_path):
        """Extract text from a document based on its type.

        Args:
            file_path: Path of the document (``str`` or ``Path``).

        Returns:
            The extracted text, or ``None`` when the file does not exist, its
            extension is unsupported, or extraction fails.
        """
        file_path = Path(file_path)
        
        if not file_path.exists():
            logger.error(f"File does not exist: {file_path}")
            return None
        
        file_type = self._get_file_type(file_path)
        logger.info(f"Processing {file_type} document: {file_path}")
        
        try:
            if file_type == "text":
                return self._extract_text_from_text_file(file_path)
            elif file_type == "word":
                return self._extract_text_from_word(file_path)
            elif file_type == "powerpoint":
                return self._extract_text_from_powerpoint(file_path)
            elif file_type == "pdf":
                return self._extract_text_from_pdf(file_path)
            else:
                logger.warning(f"Unsupported file type: {file_path}")
                return None
                
        except Exception as e:
            logger.error(f"Error processing document {file_path}: {str(e)}")
            return None
    
    def get_supported_extensions(self):
        """Get list of all supported file extensions (text, Word, PowerPoint, PDF)"""
        return (self.TEXT_EXTENSIONS + self.WORD_EXTENSIONS + 
                self.PPT_EXTENSIONS + self.PDF_EXTENSIONS)
    
    def is_supported_file(self, file_path):
        """Check if a file type is supported, judging by extension only"""
        ext = self._get_file_extension(file_path)
        return ext in self.get_supported_extensions()
                        

Improved Code

🔍 Code Extractor

class DocumentExtractor

Purpose

Source Code

Parameters

Parameter Details

Return Value

Class Interface

Methods

`__init__(self, temp_dir=None)`

`_get_file_extension(self, file_path) -> str`

`_get_file_type(self, file_path) -> str`

`_convert_to_pdf_libreoffice(self, input_file) -> Path | None`

`_extract_text_from_pdf(self, file_path) -> str | None`

`_extract_text_from_word(self, file_path) -> str | None`

`_extract_text_from_powerpoint(self, file_path) -> str | None`

`_extract_text_from_text_file(self, file_path) -> str | None`

`extract_text(self, file_path) -> str | None`

`get_supported_extensions(self) -> list`

`is_supported_file(self, file_path) -> bool`

Attributes

Dependencies

Required Imports

Conditional/Optional Imports

Usage Example

Best Practices

Tags

Similar Components

function test_document_extractor 72.9% similar

class DocumentProcessor_v2 68.7% similar

class DocumentProcessor_v1 68.6% similar

class PDFTextExtractor 67.3% similar

class DocumentConverter 65.2% similar

class DocumentExtractor

Purpose

Source Code

Parameters

Parameter Details

Return Value

Class Interface

Methods

__init__(self, temp_dir=None)

_get_file_extension(self, file_path) -> str

_get_file_type(self, file_path) -> str

_convert_to_pdf_libreoffice(self, input_file) -> Path | None

_extract_text_from_pdf(self, file_path) -> str | None

_extract_text_from_word(self, file_path) -> str | None

_extract_text_from_powerpoint(self, file_path) -> str | None

_extract_text_from_text_file(self, file_path) -> str | None

extract_text(self, file_path) -> str | None

get_supported_extensions(self) -> list

is_supported_file(self, file_path) -> bool

Attributes

Dependencies

Required Imports

Conditional/Optional Imports

Usage Example

Best Practices

Tags

Similar Components

function test_document_extractor 72.9% similar

class DocumentProcessor_v2 68.7% similar

class DocumentProcessor_v1 68.6% similar

class PDFTextExtractor 67.3% similar

class DocumentConverter 65.2% similar

✨ Improve Code: DocumentExtractor

Code Comparison

`__init__(self, temp_dir=None)`

`_get_file_extension(self, file_path) -> str`

`_get_file_type(self, file_path) -> str`

`_convert_to_pdf_libreoffice(self, input_file) -> Path | None`

`_extract_text_from_pdf(self, file_path) -> str | None`

`_extract_text_from_word(self, file_path) -> str | None`

`_extract_text_from_powerpoint(self, file_path) -> str | None`

`_extract_text_from_text_file(self, file_path) -> str | None`

`extract_text(self, file_path) -> str | None`

`get_supported_extensions(self) -> list`

`is_supported_file(self, file_path) -> bool`