class PDFTextExtractor
A class for extracting text, images, and structured content from PDF documents with layout preservation capabilities.
/tf/active/vicechatdev/CDocs/utils/pdf_utils.py
1743 - 2097
complex
Purpose
PDFTextExtractor provides comprehensive PDF content extraction functionality, including plain text extraction with optional layout preservation, structured content extraction with section/paragraph detection, and Markdown conversion. It uses PyMuPDF (fitz) for PDF processing and can identify document structure including headings, paragraphs, images, and metadata. The class is designed to handle complex PDF layouts and provide output suitable for further processing or analysis.
Source Code
class PDFTextExtractor:
"""
Extract text and content from PDF documents
This class provides methods to extract text, images, and
structured content from PDF documents, with options for
layout analysis similar to llmsherpa's LayoutPDFReader.
"""
def __init__(self, temp_dir: Optional[str] = None):
"""
Initialize the PDF text extractor
Parameters
----------
temp_dir : str, optional
Directory for temporary files. If not provided, a system temp directory is used.
"""
self.temp_dir = temp_dir if temp_dir else tempfile.mkdtemp()
os.makedirs(self.temp_dir, exist_ok=True)
def extract_text(self, input_path: str, maintain_layout: bool = True) -> str:
"""
Extract text from a PDF document
Parameters
----------
input_path : str
Path to the PDF document
maintain_layout : bool, optional
Whether to maintain page layout (otherwise, text is flattened)
Returns
-------
str
Extracted text
Raises
------
FileNotFoundError
If the input file doesn't exist
RuntimeError
If the extraction fails
"""
if not os.path.exists(input_path):
raise FileNotFoundError(f"Input file not found: {input_path}")
try:
# Open the PDF with PyMuPDF
doc = fitz.open(input_path)
text_parts = []
# Process each page
for page_num, page in enumerate(doc):
if maintain_layout:
# Get text with layout preservation
text = page.get_text("dict")
# Process blocks to maintain layout
page_text = []
for block in text.get("blocks", []):
if block.get("type") == 0: # Text block
for line in block.get("lines", []):
line_text = " ".join(span.get("text", "") for span in line.get("spans", []))
if line_text.strip():
page_text.append(line_text)
# Join lines with newlines
page_content = "\n".join(page_text)
else:
# Get plain text (flattened)
page_content = page.get_text("text")
# Add page separator if there are multiple pages
if page_num > 0:
text_parts.append("\n\n--- Page {} ---\n\n".format(page_num + 1))
else:
text_parts.append("--- Page 1 ---\n\n")
text_parts.append(page_content)
doc.close()
# Join all text parts
extracted_text = "".join(text_parts)
logger.info(f"Extracted text from {input_path} ({len(extracted_text)} characters)")
return extracted_text
except Exception as e:
logger.error(f"Error extracting text: {str(e)}")
raise RuntimeError(f"Failed to extract text: {str(e)}")
def extract_structured_content(self, input_path: str) -> Dict[str, Any]:
"""
Extract structured content from a PDF document
Parameters
----------
input_path : str
Path to the PDF document
Returns
-------
Dict[str, Any]
Structured content with sections, paragraphs, tables, etc.
Raises
------
FileNotFoundError
If the input file doesn't exist
RuntimeError
If the extraction fails
"""
if not os.path.exists(input_path):
raise FileNotFoundError(f"Input file not found: {input_path}")
try:
# Open the PDF with PyMuPDF
doc = fitz.open(input_path)
# Initialize structured content
content = {
'title': None,
'pages': [],
'metadata': {}
}
# Extract document metadata
metadata = doc.metadata
if metadata:
content['metadata'] = {
'title': metadata.get('title', ''),
'author': metadata.get('author', ''),
'subject': metadata.get('subject', ''),
'keywords': metadata.get('keywords', ''),
'creator': metadata.get('creator', ''),
'producer': metadata.get('producer', ''),
'creationDate': metadata.get('creationDate', ''),
'modDate': metadata.get('modDate', '')
}
# Try to guess document title from metadata or first page
content['title'] = metadata.get('title', '')
if not content['title'] and doc.page_count > 0:
# Extract first page text and try to find a title
first_page = doc[0]
first_page_text = first_page.get_text("dict")
# Look for large font text at the top of the page
if "blocks" in first_page_text:
for block in first_page_text["blocks"]:
if block.get("type") == 0: # Text block
for line in block.get("lines", []):
for span in line.get("spans", []):
# If this is a large font and near the top of the page
if span.get("size", 0) > 14 and span.get("origin", [0, 0])[1] < 100:
potential_title = span.get("text", "").strip()
if potential_title and len(potential_title) > 5:
content['title'] = potential_title
break
if content['title']:
break
if content['title']:
break
# Process each page
for page_num, page in enumerate(doc):
page_content = {
'number': page_num + 1,
'sections': [],
'tables': [],
'images': []
}
# Extract text with layout information
page_text = page.get_text("dict")
# Track current section
current_section = {
'heading': '',
'level': 0,
'paragraphs': []
}
# Process blocks
for block in page_text.get("blocks", []):
if block.get("type") == 0: # Text block
block_text = []
is_heading = False
heading_level = 0
for line in block.get("lines", []):
line_text = []
largest_font = 0
for span in line.get("spans", []):
text = span.get("text", "").strip()
font_size = span.get("size", 0)
is_bold = span.get("font", "").lower().find("bold") >= 0
# Track largest font in the line
if font_size > largest_font:
largest_font = font_size
line_text.append(text)
# Join span text to form line
full_line_text = " ".join(line_text).strip()
if full_line_text:
block_text.append(full_line_text)
# Check if this might be a heading
if full_line_text and largest_font > 12:
is_heading = True
# Estimate heading level based on font size
if largest_font >= 18:
heading_level = 1
elif largest_font >= 16:
heading_level = 2
elif largest_font >= 14:
heading_level = 3
else:
heading_level = 4
# Join lines to form block text
full_block_text = "\n".join(block_text).strip()
if full_block_text:
if is_heading:
# If we have content in the current section, add it
if current_section['paragraphs']:
page_content['sections'].append(current_section)
# Start a new section
current_section = {
'heading': full_block_text,
'level': heading_level,
'paragraphs': []
}
else:
# Add paragraph to current section
current_section['paragraphs'].append(full_block_text)
elif block.get("type") == 1: # Image block
# Extract image information
image_info = {
'bbox': block.get("bbox", [0, 0, 0, 0]),
'width': block.get("width", 0),
'height': block.get("height", 0)
}
page_content['images'].append(image_info)
# Add final section if it has content
if current_section['paragraphs'] or current_section['heading']:
page_content['sections'].append(current_section)
# Add page to content
content['pages'].append(page_content)
doc.close()
logger.info(f"Extracted structured content from {input_path}")
return content
except Exception as e:
logger.error(f"Error extracting structured content: {str(e)}")
raise RuntimeError(f"Failed to extract structured content: {str(e)}")
def extract_to_markdown(self, input_path: str, output_path: str) -> str:
"""
Extract PDF content and convert to Markdown
Parameters
----------
input_path : str
Path to the PDF document
output_path : str
Path where the Markdown file will be saved
Returns
-------
str
Path to the generated Markdown file
Raises
------
FileNotFoundError
If the input file doesn't exist
RuntimeError
If the extraction fails
"""
if not os.path.exists(input_path):
raise FileNotFoundError(f"Input file not found: {input_path}")
try:
# Create output directory if it doesn't exist
output_dir = os.path.dirname(output_path)
if output_dir and not os.path.exists(output_dir):
os.makedirs(output_dir, exist_ok=True)
# Extract structured content
content = self.extract_structured_content(input_path)
# Convert to Markdown
markdown_lines = []
# Add title if available
if content['title']:
markdown_lines.append(f"# {content['title']}\n")
# Add metadata if available
if content['metadata']:
markdown_lines.append("## Document Information\n")
for key, value in content['metadata'].items():
if value:
markdown_lines.append(f"- **{key}**: {value}")
markdown_lines.append("\n")
# Process pages
for page in content['pages']:
markdown_lines.append(f"## Page {page['number']}\n")
# Process sections
for section in page['sections']:
heading = section['heading']
level = section['level']
# Add section heading
if heading:
# Ensure heading level is between 1-6
heading_level = min(max(level, 1), 6)
markdown_lines.append(f"{'#' * heading_level} {heading}\n")
# Add paragraphs
for paragraph in section['paragraphs']:
markdown_lines.append(f"{paragraph}\n\n")
# Note image presence
if page['images']:
markdown_lines.append(f"*This page contains {len(page['images'])} image(s)*\n\n")
markdown_lines.append("---\n\n")
# Write Markdown to file
with open(output_path, 'w', encoding='utf-8') as f:
f.write("\n".join(markdown_lines))
logger.info(f"Extracted content to Markdown: {output_path}")
return output_path
except Exception as e:
logger.error(f"Error converting to Markdown: {str(e)}")
raise RuntimeError(f"Failed to convert to Markdown: {str(e)}")
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| bases | - | - | - |
Parameter Details
temp_dir: Optional directory path for storing temporary files during processing. If not provided, a system temporary directory is automatically created using tempfile.mkdtemp(). The directory is created if it doesn't exist.
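A minimal sketch of both initialization modes, assuming PDFTextExtractor is importable from the module path shown above (the import line and directory prefix are illustrative):
import tempfile
from CDocs.utils.pdf_utils import PDFTextExtractor  # import path assumed from the file location above
# Default: a fresh system temp directory is created via tempfile.mkdtemp()
default_extractor = PDFTextExtractor()
print(default_extractor.temp_dir)
# Explicit: reuse a directory you manage yourself (created if it does not already exist)
managed_dir = tempfile.mkdtemp(prefix="pdf_extract_")  # hypothetical location
managed_extractor = PDFTextExtractor(temp_dir=managed_dir)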
Return Value
Instantiation returns a PDFTextExtractor object. The extract_text() method returns a string containing the extracted text with page separators. The extract_structured_content() method returns a dictionary with keys 'title', 'pages', and 'metadata', where 'pages' contains lists of sections, tables, and images per page. The extract_to_markdown() method returns the file path string of the generated Markdown file.
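As an illustration, the dictionary returned by extract_structured_content() has roughly this shape (a sketch based on the source above, with illustrative values, not literal output):
structured = {
    'title': 'Detected or metadata title, possibly empty',
    'metadata': {
        'title': '', 'author': '', 'subject': '', 'keywords': '',
        'creator': '', 'producer': '', 'creationDate': '', 'modDate': ''
    },
    'pages': [
        {
            'number': 1,
            'sections': [
                {'heading': 'Section heading (empty for leading body text)', 'level': 1,
                 'paragraphs': ['First paragraph text', 'Second paragraph text']}
            ],
            'tables': [],  # present in the schema but not populated by the current implementation
            'images': [{'bbox': [72.0, 144.0, 360.0, 288.0], 'width': 288, 'height': 144}]
        }
    ]
}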
Class Interface
Methods
__init__(self, temp_dir: Optional[str] = None)
Purpose: Initialize the PDF text extractor with an optional temporary directory for file processing
Parameters:
temp_dir: Optional directory path for temporary files. If None, creates a system temp directory
Returns: None (constructor)
extract_text(self, input_path: str, maintain_layout: bool = True) -> str
Purpose: Extract text from a PDF document with optional layout preservation
Parameters:
input_path: Path to the PDF document to extract text from
maintain_layout: If True, preserves page layout structure; if False, returns flattened text
Returns: String containing extracted text with page separators (e.g., '--- Page 1 ---')
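A short sketch of both layout modes, assuming the import path above and a placeholder document.pdf; the page markers written by the method can be used to split the result back into pages:
import re
from CDocs.utils.pdf_utils import PDFTextExtractor  # import path assumed
extractor = PDFTextExtractor()
layout_text = extractor.extract_text('document.pdf', maintain_layout=True)
flat_text = extractor.extract_text('document.pdf', maintain_layout=False)
# Split on the '--- Page N ---' separators the method inserts
pages = [p for p in re.split(r'\n*--- Page \d+ ---\n*', layout_text) if p.strip()]
print(f"Recovered {len(pages)} page(s)")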
extract_structured_content(self, input_path: str) -> Dict[str, Any]
Purpose: Extract structured content from a PDF including sections, paragraphs, tables, images, and metadata
Parameters:
input_path: Path to the PDF document to extract structured content from
Returns: Dictionary with keys 'title' (str), 'pages' (list of page dicts), and 'metadata' (dict). Each page contains 'number', 'sections', 'tables', and 'images' lists. Sections have 'heading', 'level', and 'paragraphs'.
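For example, the detected headings can be turned into a simple table of contents (a sketch, assuming the import path above and a placeholder input path):
from CDocs.utils.pdf_utils import PDFTextExtractor  # import path assumed
extractor = PDFTextExtractor()
structured = extractor.extract_structured_content('document.pdf')  # placeholder path
toc = []
for page in structured['pages']:
    for section in page['sections']:
        if section['heading']:
            toc.append((page['number'], section['level'], section['heading']))
for page_number, level, heading in toc:
    print(f"{'  ' * (level - 1)}{heading} (page {page_number})")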
extract_to_markdown(self, input_path: str, output_path: str) -> str
Purpose: Extract PDF content and convert it to Markdown format, saving to a file
Parameters:
input_path: Path to the PDF document to convert
output_path: Path where the Markdown file will be saved (directory created if needed)
Returns: String path to the generated Markdown file
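A minimal call sketch; the nested output directory is created automatically (import path and file paths are placeholders):
import os
from CDocs.utils.pdf_utils import PDFTextExtractor  # import path assumed
extractor = PDFTextExtractor()
md_path = extractor.extract_to_markdown('document.pdf', 'exports/reports/document.md')
print(f"Wrote {os.path.getsize(md_path)} bytes to {md_path}")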
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| temp_dir | str | Directory path for storing temporary files during PDF processing operations | instance |
Dependencies
os, tempfile, fitz (PyMuPDF), logging
Required Imports
import os
import tempfile
import fitz  # PyMuPDF
import logging
Usage Example
import logging
# Import path assumed from the module location above; adjust to your package layout
from CDocs.utils.pdf_utils import PDFTextExtractor
# Configure logging so extraction progress messages are visible
logging.basicConfig(level=logging.INFO)
# Instantiate the extractor with an explicit temp directory
extractor = PDFTextExtractor(temp_dir='/tmp/pdf_processing')
# Extract plain text with layout preservation
text = extractor.extract_text('document.pdf', maintain_layout=True)
print(f'Extracted {len(text)} characters')
# Extract structured content
structured = extractor.extract_structured_content('document.pdf')
print(f"Title: {structured['title']}")
print(f"Pages: {len(structured['pages'])}")
for page in structured['pages']:
print(f"Page {page['number']}: {len(page['sections'])} sections")
# Convert to Markdown
markdown_path = extractor.extract_to_markdown('document.pdf', 'output.md')
print(f'Markdown saved to: {markdown_path}')
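The example above assumes extraction succeeds. Since the methods raise FileNotFoundError for missing inputs and RuntimeError when PyMuPDF fails, a defensive variant (continuing from the example above, with a placeholder path) looks like this:
try:
    text = extractor.extract_text('possibly_missing.pdf')  # placeholder path
except FileNotFoundError:
    logging.warning("Input PDF does not exist; skipping")
    text = ""
except RuntimeError as exc:
    logging.error("Extraction failed: %s", exc)
    text = ""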
Best Practices
- Always check that input PDF files exist before calling extraction methods (or catch FileNotFoundError)
- Handle RuntimeError exceptions that may be raised during extraction failures
- The temp_dir is created during initialization but not automatically cleaned up; remove it manually when processing is finished (see the cleanup sketch after this list)
- For large PDFs, extraction methods may consume significant memory; process in batches if needed
- The maintain_layout parameter in extract_text() affects output format: True preserves layout structure, False flattens text
- Structured content extraction uses heuristics (font size, position) to detect headings; results may vary based on PDF structure
- Heading level detection is based on font size: >= 18 pt maps to level 1, >= 16 pt to level 2, >= 14 pt to level 3, and anything larger than 12 pt but under 14 pt to level 4
- The extract_to_markdown() method creates output directories automatically if they don't exist
- Image extraction only captures metadata (bbox, dimensions), not actual image data
- Document title is extracted from metadata first, then from first page large text if metadata is unavailable
- All methods open and close the PDF document within their scope; no persistent file handles are maintained
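As noted in the cleanup bullet above, the class never deletes its temp_dir. A sketch of explicit cleanup (the import path, directory prefix, and input path are assumptions):
import shutil
import tempfile
from CDocs.utils.pdf_utils import PDFTextExtractor  # import path assumed
work_dir = tempfile.mkdtemp(prefix="pdf_extract_")  # hypothetical location
extractor = PDFTextExtractor(temp_dir=work_dir)
try:
    text = extractor.extract_text('document.pdf')  # placeholder path
finally:
    # PDFTextExtractor does not remove temp_dir itself, so delete it once processing is done
    shutil.rmtree(work_dir, ignore_errors=True)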
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- class DocumentExtractor (67.3% similar)
- class DocumentProcessor_v2 (63.1% similar)
- class DocumentProcessor_v1 (62.6% similar)
- class RegulatoryExtractor (61.0% similar)
- function extract_text_from_pdf (58.6% similar)