class RegulatoryExtractor
A class for extracting structured metadata from regulatory guideline PDF documents using LLM-based analysis and storing the results in an Excel tracking spreadsheet.
File: /tf/active/vicechatdev/reg_extractor.py
Lines: 11 - 383
Complexity: complex
Purpose
RegulatoryExtractor automates the process of analyzing regulatory guideline PDFs (from agencies like FDA, EMA, MHRA, TGA) to extract key metadata such as title, reference number, jurisdiction, adoption date, effective date, and summary. It handles both text-based and scanned PDFs (via OCR), uses OpenAI's GPT models for intelligent data extraction, and maintains a deduplicated Excel tracking table. The class manages token limits for large documents and generates Filecloud URLs for document references.
Source Code
class RegulatoryExtractor:
    """
    Class for extracting structured data from regulatory guideline PDFs
    and storing results in an Excel tracking table.
    """
    def __init__(self,
                 api_key: Optional[str] = None,
                 model: str = "gpt-4o",
                 excel_path: str = "regulatory_tracking.xlsx"):
        """
        Initialize the RegulatoryExtractor.

        Args:
            api_key: OpenAI API key (if None, looks for OPENAI_API_KEY env variable)
            model: LLM model to use for extraction
            excel_path: Path to the Excel tracking file
        """
        load_dotenv()  # Load environment variables
        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
        if not self.api_key:
            raise ValueError("API key must be provided or set as OPENAI_API_KEY environment variable")
        self.model = model
        self.excel_path = excel_path
        self.client = OpenAI(api_key=self.api_key)
        self.logger = logging.getLogger(__name__)

        # Configure logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        )
    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """
        Extract text content from a PDF file with OCR fallback for scanned documents.
        Uses tiktoken for precise token counting.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            Extracted text content
        """
        self.logger.info(f"Extracting text from {pdf_path}")
        try:
            # Initialize tiktoken encoding for token counting
            encoding = tiktoken.get_encoding("cl100k_base")

            # First try standard text extraction
            text = ""
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page_num in range(len(pdf_reader.pages)):
                    page = pdf_reader.pages[page_num]
                    text += page.extract_text() + "\n"

            # Check if the text extraction yielded minimal text
            # Use tiktoken to count tokens per page for more accuracy
            tokens = len(encoding.encode(text))
            avg_tokens_per_page = tokens / len(pdf_reader.pages) if pdf_reader.pages else 0
            if avg_tokens_per_page < 50:  # Very low token count indicates possible scan
                self.logger.info(f"Minimal text extracted ({avg_tokens_per_page:.2f} tokens/page), trying OCR fallback")
                text = self._extract_text_with_ocr(pdf_path)
                tokens = len(encoding.encode(text))

            # Check token limit and truncate if necessary
            if tokens > 70000:  # Reduced from 100k to ensure we stay within model limits
                self.logger.warning(f"PDF text is very long ({tokens} tokens), truncating...")
                # Calculate how many tokens to keep from beginning and end
                half_tokens = 35000  # Half of our target size
                # Extract beginning tokens (up to half_tokens)
                beginning_text = encoding.decode(encoding.encode(text)[:half_tokens])
                # Extract ending tokens
                end_text = encoding.decode(encoding.encode(text)[-half_tokens:])
                # Combine with a note about truncation
                text = beginning_text + "\n\n...[content truncated]...\n\n" + end_text
                # Verify the final token count
                final_tokens = len(encoding.encode(text))
                self.logger.info(f"Truncated text to {final_tokens} tokens")

            return text
        except Exception as e:
            self.logger.error(f"Error extracting text: {str(e)}")
            # Try OCR as fallback for any extraction errors
            try:
                self.logger.info(f"Extraction error, trying OCR fallback for {pdf_path}")
                return self._extract_text_with_ocr(pdf_path)
            except Exception as ocr_err:
                self.logger.error(f"OCR fallback also failed: {str(ocr_err)}")
                raise
    def _extract_text_with_ocr(self, pdf_path: str) -> str:
        """
        Extract text from PDF using OCR via llmsherpa API.
        Based on logic from the offline_docstore_multi_vice module.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            OCR extracted text
        """
        self.logger.info(f"Using OCR to extract text from {pdf_path}")
        try:
            # Use llmsherpa for OCR-based extraction
            llmsherpa_api_url = "http://llmsherpa:5001/api/parseDocument?renderFormat=all&useNewIndentParser=yes&applyOcr=yes"

            # Import llmsherpa if available
            try:
                from llmsherpa.readers import LayoutPDFReader
                pdf_reader = LayoutPDFReader(llmsherpa_api_url)
            except ImportError:
                self.logger.error("llmsherpa module not found. Install with: pip install llmsherpa")
                raise ImportError("llmsherpa module required for OCR processing")

            # Extract text using llmsherpa
            doc = pdf_reader.read_pdf(pdf_path)

            # Combine all text chunks with formatting improvements
            all_text = []
            text_chunk_interim = ""
            min_chunk_len = 4000  # Similar to offline_docstore_multi_vice
            for chunk in doc.chunks():
                if hasattr(chunk, 'to_text'):
                    # Clean and normalize the text
                    clean_text = chunk.to_text().replace("- ", "").replace("\n", " ")
                    text_chunk_interim = clean_text if text_chunk_interim == "" else text_chunk_interim + "\n" + clean_text
                    # Add chunk when it reaches minimum length
                    if len(text_chunk_interim) > min_chunk_len:
                        all_text.append(text_chunk_interim)
                        text_chunk_interim = ""

            # Add any remaining text
            if text_chunk_interim:
                all_text.append(text_chunk_interim)

            # Join all text with line breaks
            combined_text = "\n\n".join(all_text)

            # Count tokens in the extracted text
            encoding = tiktoken.get_encoding("cl100k_base")
            tokens = len(encoding.encode(combined_text))
            self.logger.info(f"Successfully extracted {len(combined_text)} chars / {tokens} tokens using OCR")

            return combined_text
        except Exception as e:
            self.logger.error(f"OCR extraction error: {str(e)}")
            raise
    def extract_guideline_data(self, text: str) -> Dict[str, Any]:
        """
        Use LLM to extract structured data from guideline text.

        Args:
            text: Text content from the guideline

        Returns:
            Dictionary containing extracted fields
        """
        self.logger.info("Extracting structured data using LLM")

        prompt = """
        You are a specialized data extraction assistant for regulatory documentation. Analyze the provided regulatory guideline document and extract the following specific fields:

        1. Title: The official name of the guideline
        2. Reference Number: Any identifying code, number, or reference ID
        3. Jurisdiction: Whether it originates from EU, US, AU, UK, or other regulatory body
        4. Adoption Date: When the guideline was officially adopted/published
        5. Effective Date: When the guideline comes into force/effect
        6. Summary: A brief summary of the guideline content, maximum 1 paragraph long

        Return ONLY a valid JSON object with these fields. If any information is unclear or missing, use null for that field. Format dates as YYYY-MM-DD when possible.

        Example response format:
        {
            "title": "Full title of the guideline",
            "referenceNumber": "GUID-2023-01",
            "jurisdiction": "EU",
            "adoptionDate": "2023-05-15",
            "effectiveDate": "2024-01-01",
            "summary": "Brief summary of the guideline content."
        }

        Important:
        - Look for dates with context like "adopted on", "published on", "comes into effect on", "effective from"
        - Jurisdiction may be indicated by mentions of agencies (FDA, EMA, MHRA, TGA)
        - If dates appear in different formats, standardize to YYYY-MM-DD

        Here is the document text:
        """ + text

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are a precise data extraction tool for regulatory documents."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.0,  # Use low temperature for precise extraction
            )
            content = response.choices[0].message.content

            # Try to extract just the JSON part from the response
            # (in case the model includes other text)
            try:
                # Find the first { and the last }
                start_idx = content.find('{')
                end_idx = content.rfind('}') + 1
                if start_idx >= 0 and end_idx > 0:
                    json_str = content[start_idx:end_idx]
                    data = json.loads(json_str)
                else:
                    # If no JSON delimiters found, try parsing the whole response
                    data = json.loads(content)
                return data
            except json.JSONDecodeError:
                self.logger.error(f"Failed to parse JSON from response: {content}")
                raise ValueError("LLM did not return valid JSON")
        except Exception as e:
            self.logger.error(f"Error during LLM extraction: {str(e)}")
            raise
    def update_excel_tracking(self, data: Dict[str, Any], pdf_path: str, orig_file: str) -> None:
        """
        Update Excel tracking table with extracted data.
        Only adds new entries if the source file doesn't already exist.

        Args:
            data: Extracted guideline data
            pdf_path: Path to the source PDF file
            orig_file: Original file path used to generate the Filecloud URL
        """
        self.logger.info(f"Updating Excel tracking file: {self.excel_path}")

        # Add source file information
        data['sourceFile'] = os.path.basename(pdf_path)
        source_file = data['sourceFile']

        # Generate document URL using the same logic from OneCo_hybrid_RAG
        data['documentURL'] = self.generate_filecloud_url(orig_file)

        # Load existing Excel file if it exists, or create new DataFrame
        if os.path.exists(self.excel_path):
            try:
                df = pd.read_excel(self.excel_path)
            except Exception as e:
                self.logger.error(f"Error reading Excel file: {str(e)}")
                df = pd.DataFrame()
        else:
            df = pd.DataFrame()

        # Check if this source file already exists in the DataFrame
        if 'sourceFile' in df.columns and source_file in df['sourceFile'].values:
            self.logger.info(f"Entry for {source_file} already exists in tracking file. Skipping.")
            return

        # Convert data dict to DataFrame and append
        new_row = pd.DataFrame([data])
        df = pd.concat([df, new_row], ignore_index=True)

        # Write back to Excel
        try:
            df.to_excel(self.excel_path, index=False)
            self.logger.info(f"Successfully updated tracking file with data for {data.get('title')}")
        except Exception as e:
            self.logger.error(f"Error writing to Excel file: {str(e)}")
            raise
    def generate_filecloud_url(self, filepath: str) -> str:
        """
        Generate a Filecloud URL for the given file path using the same logic
        from OneCo_hybrid_RAG.

        Args:
            filepath: Path to the file

        Returns:
            Filecloud URL for the file
        """
        # Create file basename for display
        filename = os.path.basename(filepath)

        # Escape spaces in filename with + for the first part
        encoded_filename = filename.replace(' ', '+')

        # Extract directory path without filename
        directory_path = os.path.dirname(filepath)

        # Ensure path ends with '/'
        if directory_path and not directory_path.endswith('/'):
            directory_path += '/'

        # Encode path for the second part (after #expl-tabl.)
        encoded_path = directory_path
        encoded_path = encoded_path.replace(' ', '%20')

        # Construct the full URL
        file_url = f"https://filecloud.vicebio.com/ui/core/index.html?filter={encoded_filename}#expl-tabl.{encoded_path}"
        return file_url
    def process_pdf(self, pdf_path: str, orig_file: str) -> Dict[str, Any]:
        """
        Process a single PDF file - extract text, extract data, update Excel.

        Args:
            pdf_path: Path to the PDF file
            orig_file: Original file path used to generate the Filecloud URL

        Returns:
            Extracted data dictionary
        """
        self.logger.info(f"Processing regulatory guideline PDF: {pdf_path}")
        try:
            # Extract text from PDF
            text = self.extract_text_from_pdf(pdf_path)

            # Extract structured data using LLM
            data = self.extract_guideline_data(text)

            # Update Excel tracking
            self.update_excel_tracking(data, pdf_path, orig_file)

            return data
        except Exception as e:
            self.logger.error(f"Error processing {pdf_path}: {str(e)}")
            raise
    def process_directory(self, directory_path: str) -> List[Dict[str, Any]]:
        """
        Process all PDF files in a directory.

        Args:
            directory_path: Path to directory containing PDF files

        Returns:
            List of extracted data dictionaries
        """
        self.logger.info(f"Processing all PDFs in directory: {directory_path}")
        results = []
        for filename in os.listdir(directory_path):
            if filename.lower().endswith('.pdf'):
                pdf_path = os.path.join(directory_path, filename)
                try:
                    # Use the local path as orig_file since no separate original path is available here
                    data = self.process_pdf(pdf_path, pdf_path)
                    results.append(data)
                except Exception as e:
                    self.logger.error(f"Skipping {filename} due to error: {str(e)}")
                    continue
        self.logger.info(f"Processed {len(results)} PDF files successfully")
        return results
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| bases | - | - | - |
Parameter Details
api_key: OpenAI API key for authentication. If None, the class will attempt to load from the OPENAI_API_KEY environment variable. Must be provided either directly or via environment variable.
model: The OpenAI model identifier to use for text extraction. Defaults to 'gpt-4o'. Should be a model capable of handling large context windows and JSON output.
excel_path: File path where the Excel tracking spreadsheet will be created or updated. Defaults to 'regulatory_tracking.xlsx' in the current directory. The file will be created if it doesn't exist.
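For example, a minimal setup can rely on the environment variable and the defaults; the .env contents and paths below are illustrative, and the module name is taken from the source path above:

# .env (illustrative)
# OPENAI_API_KEY=sk-...
from reg_extractor import RegulatoryExtractor
# load_dotenv() inside __init__ picks up OPENAI_API_KEY automatically
extractor = RegulatoryExtractor()  # defaults: model='gpt-4o', excel_path='regulatory_tracking.xlsx'
# Or configure everything explicitly
extractor = RegulatoryExtractor(
    api_key="sk-...",                          # placeholder key
    model="gpt-4o",
    excel_path="tracking/ema_guidelines.xlsx"  # hypothetical path
)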
Return Value
The constructor returns a RegulatoryExtractor instance. Key methods return: extract_text_from_pdf() returns extracted text as string; extract_guideline_data() returns a dictionary with keys 'title', 'referenceNumber', 'jurisdiction', 'adoptionDate', 'effectiveDate', 'summary'; process_pdf() returns the extracted data dictionary; process_directory() returns a list of data dictionaries for all processed PDFs.
Class Interface
Methods
__init__(self, api_key: Optional[str] = None, model: str = 'gpt-4o', excel_path: str = 'regulatory_tracking.xlsx')
Purpose: Initialize the RegulatoryExtractor with API credentials, model selection, and Excel tracking file path
Parameters:
api_key: OpenAI API key (optional if set in environment)
model: OpenAI model identifier to use
excel_path: Path to Excel tracking file
Returns: None - initializes instance attributes
extract_text_from_pdf(self, pdf_path: str) -> str
Purpose: Extract text content from a PDF file with automatic OCR fallback for scanned documents and token-based truncation for large files
Parameters:
pdf_path: File system path to the PDF file to extract
Returns: Extracted text content as a string, truncated to ~70,000 tokens if necessary
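The head-plus-tail truncation can be illustrated in isolation; this is a minimal sketch using the same constants as the source (70,000 token cap, 35,000 kept from each end), with long_text as a placeholder:

import tiktoken
encoding = tiktoken.get_encoding("cl100k_base")
def truncate_head_tail(long_text: str, max_tokens: int = 70000, half: int = 35000) -> str:
    # Keep the first and last `half` tokens when the text exceeds max_tokens
    tokens = encoding.encode(long_text)
    if len(tokens) <= max_tokens:
        return long_text
    beginning = encoding.decode(tokens[:half])
    ending = encoding.decode(tokens[-half:])
    return beginning + "\n\n...[content truncated]...\n\n" + ending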
_extract_text_with_ocr(self, pdf_path: str) -> str
Purpose: Private method to extract text from PDF using OCR via llmsherpa API when standard extraction fails or yields minimal content
Parameters:
pdf_path: File system path to the PDF file
Returns: OCR-extracted text content as a string
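A stripped-down version of the OCR call the method makes might look like this; the endpoint is the internal llmsherpa container named in the source and the file name is hypothetical:

from llmsherpa.readers import LayoutPDFReader
# Endpoint used by the class; only reachable inside the deployment network
api_url = "http://llmsherpa:5001/api/parseDocument?renderFormat=all&useNewIndentParser=yes&applyOcr=yes"
reader = LayoutPDFReader(api_url)
doc = reader.read_pdf("scanned_guideline.pdf")  # hypothetical scanned PDF
# Join the layout-aware chunks into plain text
ocr_text = "\n".join(chunk.to_text() for chunk in doc.chunks() if hasattr(chunk, "to_text"))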
extract_guideline_data(self, text: str) -> Dict[str, Any]
Purpose: Use LLM to extract structured metadata fields from guideline text content
Parameters:
text: Raw text content from the guideline document
Returns: Dictionary with keys: 'title', 'referenceNumber', 'jurisdiction', 'adoptionDate', 'effectiveDate', 'summary'. Values are strings or None if not found.
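Since any field may come back as None, callers should guard against missing values; a small sketch, assuming extractor and text from the earlier steps:

data = extractor.extract_guideline_data(text)
# Every field can be null when the LLM cannot find it in the document
title = data.get("title") or "Untitled guideline"
adopted = data.get("adoptionDate")  # ISO string such as "2023-05-15", or None
if adopted is None:
    print(f"{title}: adoption date not found")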
update_excel_tracking(self, data: Dict[str, Any], pdf_path: str, orig_file: str) -> None
Purpose: Update Excel tracking table with extracted data, adding sourceFile and documentURL fields. Skips if entry already exists.
Parameters:
data: Dictionary of extracted guideline data
pdf_path: Path to the processed PDF file
orig_file: Original file path for generating Filecloud URL
Returns: None - updates Excel file as side effect
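The deduplication hinges on the sourceFile column; the same check the method performs can be run up front to see whether a file would be skipped (paths are illustrative):

import os
import pandas as pd
excel_path = "regulatory_tracking.xlsx"
pdf_path = "guidelines/new_guideline.pdf"  # hypothetical file
already_tracked = False
if os.path.exists(excel_path):
    df = pd.read_excel(excel_path)
    already_tracked = "sourceFile" in df.columns and os.path.basename(pdf_path) in df["sourceFile"].values
print("Skip" if already_tracked else "Process")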
generate_filecloud_url(self, filepath: str) -> str
Purpose: Generate a Filecloud URL for the given file path with proper encoding for spaces and special characters
Parameters:
filepath: File system path to generate URL for
Returns: Formatted Filecloud URL string pointing to the file
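A worked example of the encoding rules in the source (spaces become '+' in the filter part and '%20' in the path part); the input path is hypothetical:

url = extractor.generate_filecloud_url("Regulatory/EMA Guidelines/ICH Q9 QRM.pdf")
# url == "https://filecloud.vicebio.com/ui/core/index.html?filter=ICH+Q9+QRM.pdf#expl-tabl.Regulatory/EMA%20Guidelines/"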
process_pdf(self, pdf_path: str, orig_file: str) -> Dict[str, Any]
Purpose: Complete workflow to process a single PDF: extract text, extract structured data, and update Excel tracking
Parameters:
pdf_path: Path to the PDF file to process
orig_file: Original file path for URL generation
Returns: Dictionary of extracted guideline data
process_directory(self, directory_path: str) -> List[Dict[str, Any]]
Purpose: Process all PDF files in a directory, handling errors gracefully and continuing with remaining files
Parameters:
directory_path: Path to directory containing PDF files
Returns: List of dictionaries containing extracted data for each successfully processed PDF
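Because process_directory() skips failing files, a thin wrapper is needed if you also want a record of what was skipped; this is an illustrative pattern rather than part of the class:

import os
directory = "/path/to/guidelines"  # hypothetical directory
succeeded, failed = [], []
for filename in os.listdir(directory):
    if not filename.lower().endswith(".pdf"):
        continue
    pdf_path = os.path.join(directory, filename)
    try:
        succeeded.append(extractor.process_pdf(pdf_path, pdf_path))
    except Exception as exc:
        failed.append((filename, str(exc)))
print(f"{len(succeeded)} processed, {len(failed)} failed")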
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| api_key | str | OpenAI API key used for authentication with the OpenAI API | instance |
| model | str | OpenAI model identifier (e.g., 'gpt-4o') used for text extraction | instance |
| excel_path | str | File path to the Excel tracking spreadsheet | instance |
| client | OpenAI | Initialized OpenAI client instance for making API calls | instance |
| logger | logging.Logger | Logger instance for tracking operations and errors throughout the extraction process | instance |
Dependencies
os, json, pandas, PyPDF2, typing, logging, openai, dotenv, tiktoken, llmsherpa
Required Imports
import os
import json
import pandas as pd
import PyPDF2
from typing import List, Dict, Any, Optional, Union
import logging
from openai import OpenAI
from dotenv import load_dotenv
import tiktoken
Conditional/Optional Imports
These imports are only needed under specific conditions:
from llmsherpa.readers import LayoutPDFReader
Condition: only when OCR fallback is triggered for scanned PDFs or when standard text extraction yields minimal content (< 50 tokens per page)
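When OCR fallback is expected (for example, a batch of scanned PDFs), a defensive import check surfaces the missing dependency before any processing starts; this mirrors the guard inside _extract_text_with_ocr():

try:
    from llmsherpa.readers import LayoutPDFReader  # noqa: F401
except ImportError:
    raise SystemExit("llmsherpa is required for OCR fallback: pip install llmsherpa")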
Usage Example
# Basic usage
from reg_extractor import RegulatoryExtractor
# Initialize with API key from environment
extractor = RegulatoryExtractor(
    model='gpt-4o',
    excel_path='my_tracking.xlsx'
)
# Process a single PDF
data = extractor.process_pdf('guideline.pdf', 'original/path/guideline.pdf')
print(f"Extracted: {data['title']}")
# Process all PDFs in a directory
results = extractor.process_directory('/path/to/guidelines')
print(f"Processed {len(results)} documents")
# Manual workflow for custom processing
text = extractor.extract_text_from_pdf('document.pdf')
structured_data = extractor.extract_guideline_data(text)
extractor.update_excel_tracking(structured_data, 'document.pdf', 'original/document.pdf')
Best Practices
- Always ensure OPENAI_API_KEY is set before instantiation to avoid ValueError
- The class automatically handles token limits by truncating documents over 70,000 tokens, keeping first and last 35,000 tokens
- Excel tracking file uses deduplication based on sourceFile column - reprocessing the same file will be skipped
- For scanned PDFs, ensure llmsherpa service is accessible at http://llmsherpa:5001 or OCR will fail
- Use process_pdf() for single files and process_directory() for batch processing; process_directory() logs and skips files that fail, while process_pdf() logs the error and re-raises it
- The extract_guideline_data() method uses temperature=0.0 for deterministic extraction results
- Method call order for manual processing: extract_text_from_pdf() -> extract_guideline_data() -> update_excel_tracking()
- The class maintains internal state through instance attributes (client, logger, excel_path); create separate instances for different tracking files (see the sketch after this list)
- OCR fallback is automatic when standard extraction yields < 50 tokens per page
- Generated Filecloud URLs use specific encoding rules - use generate_filecloud_url() for consistency
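For instance, keeping one tracking file per jurisdiction means one extractor instance per file; a minimal sketch with hypothetical file and directory names:

ema_extractor = RegulatoryExtractor(excel_path="ema_tracking.xlsx")
fda_extractor = RegulatoryExtractor(excel_path="fda_tracking.xlsx")
ema_extractor.process_directory("guidelines/ema")
fda_extractor.process_directory("guidelines/fda")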
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
- class QueryBasedExtractor (64.6% similar)
- class PDFTextExtractor (61.0% similar)
- class ContractDataExtractor (60.9% similar)
- class DocumentExtractor (60.6% similar)
- class QueryBasedExtractor_v2 (60.0% similar)