class LiteratureClient
A client class for searching and retrieving scientific literature from multiple academic databases, including PubMed, Semantic Scholar, arXiv, and ScienceOpen.
File: /tf/active/vicechatdev/QA_updater/data_access/literature_client.py
Lines: 14 - 397
Complexity: complex
Purpose
LiteratureClient provides a unified interface for querying multiple scientific literature databases. It handles API authentication, rate limiting, date filtering, and response parsing for each source. The class reads API credentials from a ConfigParser object and enforces per-source rate limiting (currently for PubMed, Semantic Scholar, and arXiv) to comply with API usage policies. It can search individual sources or aggregate results from all configured sources in a single call.
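A configuration file matching this layout might look like the sketch below; the file name config.ini and all key values are illustrative assumptions, and every key is optional because the constructor supplies fallback defaults:

; config.ini (hypothetical example)
[api_keys]
pubmed_api_key = your_pubmed_key
semantic_scholar_api_key = your_ss_key

[rate_limits]
pubmed_rate_limit = 3
arxiv_rate_limit = 0.333

Loading it then takes three lines: config = ConfigParser(); config.read('config.ini'); client = LiteratureClient(config).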
Source Code
class LiteratureClient:
    """Client for accessing scientific literature data from multiple sources."""

    def __init__(self, config: ConfigParser):
        """Initialize the client with required credentials."""
        self.logger = logging.getLogger(__name__)
        self.config = config

        # API keys
        self.pubmed_api_key = self.config.get('api_keys', 'pubmed_api_key', fallback=None)
        self.semantic_scholar_api_key = self.config.get('api_keys', 'semantic_scholar_api_key', fallback=None)
        self.scienceopen_api_key = self.config.get('api_keys', 'scienceopen_api_key', fallback=None)
        self.scienceopen_api_secret = self.config.get('api_keys', 'scienceopen_api_secret', fallback=None)

        # Rate limiting configurations
        self.pubmed_rate_limit = float(self.config.get('rate_limits', 'pubmed_rate_limit', fallback=3))
        self.semantic_scholar_rate_limit = float(self.config.get('rate_limits', 'semantic_scholar_rate_limit', fallback=100/300))  # 100 requests per 5 minutes
        self.arxiv_rate_limit = float(self.config.get('rate_limits', 'arxiv_rate_limit', fallback=1/3))  # 1 request per 3 seconds

        # For rate limiting
        self.last_pubmed_request = 0
        self.last_semantic_scholar_request = 0
        self.last_arxiv_request = 0
        self.last_scienceopen_request = 0

        self.logger.info("LiteratureClient initialized.")

    def search_pubmed(self, query: str, max_results: int = 10, days_back: int = 90) -> List[Dict[str, Any]]:
        """
        Search PubMed for articles matching the query.

        Args:
            query: Search terms
            max_results: Maximum number of results to return
            days_back: Only include articles from this many days ago

        Returns:
            List of article metadata dictionaries
        """
        # Rate limiting
        current_time = time.time()
        time_since_last = current_time - self.last_pubmed_request
        if time_since_last < (1.0 / self.pubmed_rate_limit):
            time.sleep((1.0 / self.pubmed_rate_limit) - time_since_last)

        # Calculate date range for filtering
        date_from = (datetime.now() - timedelta(days=days_back)).strftime("%Y/%m/%d")

        # PubMed API endpoint
        url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"

        # Request parameters
        params = {
            "db": "pubmed",
            "term": f'{query} AND ("{date_from}"[Date - Publication] : "3000"[Date - Publication])',
            "retmax": str(max_results),
            "retmode": "json",
            "api_key": self.pubmed_api_key if self.pubmed_api_key else None,
        }

        try:
            response = requests.get(url, params=params)
            response.raise_for_status()
            data = response.json()
        except requests.exceptions.RequestException as e:
            self.logger.error(f"PubMed API error: {e}")
            return []
        except Exception as e:
            self.logger.exception(f"Error during PubMed API request: {e}")
            return []

        # Extract article IDs
        article_ids = data.get("esearchresult", {}).get("idlist", [])

        # Fetch article details
        articles = []
        if article_ids:
            articles = self._fetch_pubmed_details(article_ids)

        self.last_pubmed_request = time.time()
        return articles

    def _fetch_pubmed_details(self, article_ids: List[str]) -> List[Dict[str, Any]]:
        """Fetch detailed information for a list of PubMed article IDs."""
        # PubMed API endpoint for fetching details
        url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"

        # Request parameters
        params = {
            "db": "pubmed",
            "id": ",".join(article_ids),
            "retmode": "json",
            "api_key": self.pubmed_api_key if self.pubmed_api_key else None,
        }

        try:
            response = requests.get(url, params=params)
            response.raise_for_status()
            data = response.json()
        except requests.exceptions.RequestException as e:
            self.logger.error(f"PubMed API error (fetching details): {e}")
            return []
        except Exception as e:
            self.logger.exception(f"Error during PubMed API request (fetching details): {e}")
            return []

        # Process results
        articles = []
        for article_id, article_data in data.get("result", {}).items():
            if article_id == "uids":
                continue
            article = {
                "source": "pubmed",
                "pubmed_id": article_id,
                "title": article_data.get("title"),
                "authors": article_data.get("authors", []),
                "journal": article_data.get("source"),
                "publication_date": article_data.get("pubdate"),
                "doi": article_data.get("elocationid"),
                "url": f"https://pubmed.ncbi.nlm.nih.gov/{article_id}/",
            }
            articles.append(article)

        return articles

    def search_semantic_scholar(self, query: str, max_results: int = 10, days_back: int = 90) -> List[Dict[str, Any]]:
        """
        Search Semantic Scholar API for articles matching the query.

        Args:
            query: Search terms
            max_results: Maximum number of results to return
            days_back: Only include articles from this many days ago

        Returns:
            List of article metadata dictionaries
        """
        # Rate limiting
        current_time = time.time()
        time_since_last = current_time - self.last_semantic_scholar_request
        if time_since_last < (1.0 / self.semantic_scholar_rate_limit):
            time.sleep((1.0 / self.semantic_scholar_rate_limit) - time_since_last)

        # Calculate date range for filtering
        date_from = (datetime.now() - timedelta(days=days_back)).strftime("%Y-%m-%d")

        # Semantic Scholar API endpoint
        url = "https://api.semanticscholar.org/graph/v1/paper/search"

        # Request parameters
        params = {
            "query": query,
            "limit": max_results,
            "offset": 0,
            "fields": "paperId,url,title,abstract,authors,venue,year,citationCount,influentialCitationCount,isOpenAccess,doi",
            "publicationDateFrom": date_from,
        }

        # Headers
        headers = {}
        if self.semantic_scholar_api_key:
            headers["x-api-key"] = self.semantic_scholar_api_key

        try:
            response = requests.get(url, params=params, headers=headers)
            response.raise_for_status()
            data = response.json()
        except requests.exceptions.RequestException as e:
            self.logger.error(f"Semantic Scholar API error: {e}")
            return []
        except Exception as e:
            self.logger.exception(f"Error during Semantic Scholar API request: {e}")
            return []

        # Process results
        articles = []
        for paper in data.get("data", []):
            authors = [{"name": author.get("name")} for author in paper.get("authors", [])]
            article = {
                "source": "semantic_scholar",
                "paper_id": paper.get("paperId"),
                "url": paper.get("url"),
                "title": paper.get("title"),
                "abstract": paper.get("abstract"),
                "authors": authors,
                "venue": paper.get("venue"),
                "year": paper.get("year"),
                "citation_count": paper.get("citationCount"),
                "influential_citation_count": paper.get("influentialCitationCount"),
                "is_open_access": paper.get("isOpenAccess"),
                "doi": paper.get("doi"),
            }
            articles.append(article)

        self.last_semantic_scholar_request = time.time()
        return articles

    def search_arxiv(self, query: str, max_results: int = 10, days_back: int = 90) -> List[Dict[str, Any]]:
        """
        Search arXiv API for articles matching the query.

        Args:
            query: Search terms
            max_results: Maximum number of results to return
            days_back: Only include articles from this many days ago

        Returns:
            List of article metadata dictionaries
        """
        # Rate limiting
        current_time = time.time()
        time_since_last = current_time - self.last_arxiv_request
        if time_since_last < (1.0 / self.arxiv_rate_limit):
            time.sleep((1.0 / self.arxiv_rate_limit) - time_since_last)

        # Calculate date range for filtering
        date_from = datetime.now() - timedelta(days=days_back)
        date_str = date_from.strftime("%Y%m%d%H%M%S")

        # arXiv API endpoint
        url = "http://export.arxiv.org/api/query"

        # Request parameters
        params = {
            "search_query": f"{query} AND submittedDate:[{date_str} TO 30000101000000]",
            "start": 0,
            "max_results": max_results,
        }

        try:
            response = requests.get(url, params=params)
            response.raise_for_status()
            # Parse XML response
            soup = BeautifulSoup(response.content, "xml")
        except requests.exceptions.RequestException as e:
            self.logger.error(f"ArXiv API error: {e}")
            return []
        except Exception as e:
            self.logger.exception(f"Error during ArXiv API request: {e}")
            return []

        # Process results
        articles = []
        for entry in soup.find_all("entry"):
            # Extract categories
            categories = [category.get("term") for category in entry.find_all("category")]
            # Extract authors
            authors = [{"name": author.find("name").text} for author in entry.find_all("author")]
            # Extract PDF URL
            pdf_url = None
            for link in entry.find_all("link"):
                if link.get("title") == "pdf":
                    pdf_url = link.get("href")
                    break
            article = {
                "source": "arxiv",
                "arxiv_id": entry.find("id").text,
                "title": entry.find("title").text,
                "summary": entry.find("summary").text,
                "authors": authors,
                "published": entry.find("published").text,
                "updated": entry.find("updated").text,
                "categories": categories,
                "pdf_url": pdf_url,
                "url": entry.find("id").text,
            }
            articles.append(article)

        self.last_arxiv_request = time.time()
        return articles

    def search_scienceopen(self, query: str, max_results: int = 10, days_back: int = 90) -> List[Dict[str, Any]]:
        """
        Search ScienceOpen API for articles matching the query.

        Args:
            query: Search terms
            max_results: Maximum number of results to return
            days_back: Only include articles from this many days ago

        Returns:
            List of article metadata dictionaries
        """
        # ScienceOpen API requires authentication
        if not (self.scienceopen_api_key and self.scienceopen_api_secret):
            self.logger.warning("ScienceOpen API credentials not provided. Skipping ScienceOpen search.")
            return []

        # ScienceOpen API endpoint
        url = "https://api.scienceopen.com/search/articles"

        # Calculate date range for filtering
        date_from = datetime.now() - timedelta(days=days_back)
        date_str = date_from.strftime("%Y-%m-%d")

        # Request parameters
        params = {
            "query": query,
            "rows": max_results,
            "published_since": date_str,
            "api_key": self.scienceopen_api_key,
            "api_secret": self.scienceopen_api_secret,
        }

        try:
            response = requests.get(url, params=params)
            response.raise_for_status()
            data = response.json()
        except requests.exceptions.RequestException as e:
            self.logger.error(f"ScienceOpen API error: {e}")
            return []
        except Exception as e:
            self.logger.exception(f"Error during ScienceOpen API request: {e}")
            return []

        # Process results
        articles = []
        for item in data.get("data", []):
            article = {
                "source": "scienceopen",
                "doi": item.get("doi"),
                "title": item.get("title"),
                "abstract": item.get("abstract"),
                "authors": item.get("authors"),
                "publication_date": item.get("date_published"),
                "url": f"https://www.scienceopen.com/document?vid={item.get('id')}",
            }
            articles.append(article)

        return articles

    def search_all(self, query: str, max_results_per_source: int = 5, days_back: int = 90) -> List[Dict[str, Any]]:
        """
        Search all configured literature sources.

        Args:
            query: Search terms
            max_results_per_source: Maximum results to return per source
            days_back: Only include articles from this many days ago

        Returns:
            Combined list of results from all sources
        """
        results = []

        # PubMed results
        try:
            pubmed_results = self.search_pubmed(query, max_results_per_source, days_back)
            results.extend(pubmed_results)
            self.logger.info(f"Retrieved {len(pubmed_results)} results from PubMed")
        except Exception as e:
            self.logger.error(f"PubMed search error: {e}")

        # Semantic Scholar results
        try:
            semantic_scholar_results = self.search_semantic_scholar(query, max_results_per_source, days_back)
            results.extend(semantic_scholar_results)
            self.logger.info(f"Retrieved {len(semantic_scholar_results)} results from Semantic Scholar")
        except Exception as e:
            self.logger.error(f"Semantic Scholar search error: {e}")

        # arXiv results
        try:
            arxiv_results = self.search_arxiv(query, max_results_per_source, days_back)
            results.extend(arxiv_results)
            self.logger.info(f"Retrieved {len(arxiv_results)} results from arXiv")
        except Exception as e:
            self.logger.error(f"arXiv search error: {e}")

        # ScienceOpen results
        try:
            scienceopen_results = self.search_scienceopen(query, max_results_per_source, days_back)
            results.extend(scienceopen_results)
            self.logger.info(f"Retrieved {len(scienceopen_results)} results from ScienceOpen")
        except Exception as e:
            self.logger.error(f"ScienceOpen search error: {e}")

        return results
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
| config | ConfigParser | required | positional |
Parameter Details
config: A ConfigParser object containing API credentials and rate limit configurations. Expected sections: 'api_keys' (with pubmed_api_key, semantic_scholar_api_key, scienceopen_api_key, scienceopen_api_secret) and 'rate_limits' (with pubmed_rate_limit, semantic_scholar_rate_limit, arxiv_rate_limit). All keys are optional with fallback defaults.
Return Value
Instantiation returns a LiteratureClient object. Search methods return List[Dict[str, Any]], a list of article metadata dictionaries. Each dictionary contains source-specific fields such as title, authors, publication date, DOI, and URL. Search methods return an empty list on API errors or when no results are found.
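As a concrete illustration, a single PubMed result might look like the dictionary below; every value shown is an invented placeholder, not a real record:

{
    "source": "pubmed",
    "pubmed_id": "12345678",
    "title": "An Example Article Title",
    "authors": [{"name": "A Author"}],
    "journal": "Example Journal",
    "publication_date": "2024 Mar 15",
    "doi": "10.1000/example.doi",
    "url": "https://pubmed.ncbi.nlm.nih.gov/12345678/",
}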
Class Interface
Methods
__init__(self, config: ConfigParser)
Purpose: Initialize the LiteratureClient with API credentials and rate limiting configuration
Parameters:
config: ConfigParser object containing 'api_keys' and 'rate_limits' sections with optional API credentials and rate limit values
Returns: None - initializes instance attributes including logger, config, API keys, rate limits, and request timestamps
search_pubmed(self, query: str, max_results: int = 10, days_back: int = 90) -> List[Dict[str, Any]]
Purpose: Search PubMed database for articles matching the query with date filtering
Parameters:
query: Search terms to query PubMed
max_results: Maximum number of results to return (default 10)
days_back: Only include articles published within this many days (default 90)
Returns: List of dictionaries containing article metadata with keys: source, pubmed_id, title, authors, journal, publication_date, doi, url. Returns empty list on error.
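To make the date filter concrete: with days_back=90 and an assumed current date of 2025-06-01, the term sent to esearch for the query 'machine learning' would be:

machine learning AND ("2025/03/03"[Date - Publication] : "3000"[Date - Publication])

The open-ended "3000" upper bound simply means "from date_from onward".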
_fetch_pubmed_details(self, article_ids: List[str]) -> List[Dict[str, Any]]
Purpose: Internal method to fetch detailed metadata for a list of PubMed article IDs
Parameters:
article_ids: List of PubMed article ID strings to fetch details for
Returns: List of dictionaries with detailed article metadata including title, authors, journal, publication_date, doi, and url. Returns empty list on error.
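The esummary JSON this method iterates over has roughly the shape sketched below (abridged, with placeholder values); the "uids" entry is an index of the requested IDs rather than an article record, which is why the loop skips it:

{
    "result": {
        "uids": ["12345678"],
        "12345678": {
            "title": "...",
            "authors": [...],
            "source": "Example Journal",
            "pubdate": "2024 Mar 15",
            "elocationid": "doi: 10.1000/example.doi"
        }
    }
}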
search_semantic_scholar(self, query: str, max_results: int = 10, days_back: int = 90) -> List[Dict[str, Any]]
Purpose: Search Semantic Scholar API for academic papers matching the query
Parameters:
query: Search terms to query Semantic Scholar
max_results: Maximum number of results to return (default 10)
days_back: Only include papers published within this many days (default 90)
Returns: List of dictionaries with keys: source, paper_id, url, title, abstract, authors, venue, year, citation_count, influential_citation_count, is_open_access, doi. Returns empty list on error.
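Because each result carries citation counts and an open-access flag, post-filtering the returned list is straightforward; a minimal sketch (the citation threshold of 50 is an arbitrary choice):

results = client.search_semantic_scholar('computer vision', max_results=20)
open_access = [a for a in results if a.get('is_open_access')]
well_cited = [a for a in results if (a.get('citation_count') or 0) >= 50]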
search_arxiv(self, query: str, max_results: int = 10, days_back: int = 90) -> List[Dict[str, Any]]
Purpose: Search arXiv preprint repository for papers matching the query
Parameters:
query: Search terms to query arXiv
max_results: Maximum number of results to return (default 10)
days_back: Only include papers submitted within this many days (default 90)
Returns: List of dictionaries with keys: source, arxiv_id, title, summary, authors, published, updated, categories, pdf_url, url. Returns empty list on error.
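The query string is passed straight into arXiv's search_query parameter (with the submittedDate clause appended), so arXiv's field prefixes such as ti:, au:, abs:, and cat: can be used directly; for example, with an illustrative category:

# Title match restricted to the machine-learning category (cs.LG)
results = client.search_arxiv('ti:"neural networks" AND cat:cs.LG', max_results=5)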
search_scienceopen(self, query: str, max_results: int = 10, days_back: int = 90) -> List[Dict[str, Any]]
Purpose: Search ScienceOpen database for articles (requires API credentials)
Parameters:
query: Search terms to query ScienceOpen
max_results: Maximum number of results to return (default 10)
days_back: Only include articles published within this many days (default 90)
Returns: List of dictionaries with keys: source, doi, title, abstract, authors, publication_date, url. Returns empty list if credentials missing or on error.
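Since the method logs a warning and returns an empty list when credentials are missing, callers that want to distinguish "not configured" from "no hits" can check the credentials first; a small sketch:

if client.scienceopen_api_key and client.scienceopen_api_secret:
    so_results = client.search_scienceopen('gene therapy', max_results=5)
else:
    so_results = []  # search_scienceopen would skip with a warning anyway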
search_all(self, query: str, max_results_per_source: int = 5, days_back: int = 90) -> List[Dict[str, Any]]
Purpose: Search all configured literature sources and aggregate results
Parameters:
query: Search terms to query all sources
max_results_per_source: Maximum results to return from each source (default 5)
days_back: Only include articles/papers from this many days (default 90)
Returns: Combined list of article dictionaries from all sources. Each dictionary structure depends on the source. Returns partial results if some sources fail.
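Because the aggregated list mixes per-source schemas, grouping on the shared 'source' key is a convenient first step before further processing; a minimal sketch:

from collections import defaultdict

by_source = defaultdict(list)
for article in client.search_all('CRISPR', max_results_per_source=5):
    by_source[article['source']].append(article)

for source, articles in by_source.items():
    print(f"{source}: {len(articles)} results")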
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| logger | logging.Logger | Logger instance for recording client operations and errors | instance |
| config | ConfigParser | Configuration object containing API keys and rate limit settings | instance |
| pubmed_api_key | Optional[str] | API key for PubMed/NCBI E-utilities (optional, enables higher rate limits) | instance |
| semantic_scholar_api_key | Optional[str] | API key for Semantic Scholar (optional, enables higher rate limits) | instance |
| scienceopen_api_key | Optional[str] | API key for ScienceOpen (required for ScienceOpen searches) | instance |
| scienceopen_api_secret | Optional[str] | API secret for ScienceOpen (required for ScienceOpen searches) | instance |
| pubmed_rate_limit | float | Maximum requests per second for PubMed API (default 3) | instance |
| semantic_scholar_rate_limit | float | Maximum requests per second for Semantic Scholar API (default 100/300, i.e., 100 requests per 5 minutes) | instance |
| arxiv_rate_limit | float | Maximum requests per second for arXiv API (default 1/3, i.e., 1 request per 3 seconds) | instance |
| last_pubmed_request | float | Timestamp of last PubMed API request for rate limiting (initialized to 0) | instance |
| last_semantic_scholar_request | float | Timestamp of last Semantic Scholar API request for rate limiting (initialized to 0) | instance |
| last_arxiv_request | float | Timestamp of last arXiv API request for rate limiting (initialized to 0) | instance |
| last_scienceopen_request | float | Timestamp of last ScienceOpen API request (initialized to 0); currently never updated, since search_scienceopen does not apply rate limiting | instance |
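All rate limits are expressed in requests per second, and the search methods enforce them by sleeping so that consecutive requests are at least 1/rate seconds apart. A quick worked example of the defaults:

ss_interval = 1.0 / (100 / 300)  # Semantic Scholar default -> 3.0 seconds between requests
pubmed_interval = 1.0 / 3.0      # PubMed default -> ~0.33 seconds between requests
arxiv_interval = 1.0 / (1 / 3)   # arXiv default -> 3.0 seconds between requests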
Dependencies
time, requests, json, os, pandas, typing, datetime, bs4, urllib.parse, configparser, logging
Required Imports
import time
import requests
import json
import os
import pandas as pd
from typing import List, Dict, Any, Optional
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
import urllib.parse
from configparser import ConfigParser
import logging
Usage Example
from configparser import ConfigParser
import logging
# Setup logging
logging.basicConfig(level=logging.INFO)
# Create configuration
config = ConfigParser()
config.add_section('api_keys')
config.set('api_keys', 'pubmed_api_key', 'your_pubmed_key')
config.set('api_keys', 'semantic_scholar_api_key', 'your_ss_key')
config.add_section('rate_limits')
config.set('rate_limits', 'pubmed_rate_limit', '3')
# Initialize client
client = LiteratureClient(config)
# Search single source
pubmed_results = client.search_pubmed('machine learning', max_results=10, days_back=90)
for article in pubmed_results:
    print(f"Title: {article['title']}")
    print(f"URL: {article['url']}")
# Search all sources
all_results = client.search_all('deep learning', max_results_per_source=5, days_back=30)
print(f"Total results: {len(all_results)}")
# Search specific sources
arxiv_results = client.search_arxiv('neural networks', max_results=5, days_back=60)
ss_results = client.search_semantic_scholar('computer vision', max_results=10, days_back=45)
Best Practices
- Provide a ConfigParser object with 'api_keys' and 'rate_limits' sections; all keys (and the sections themselves) are optional thanks to fallback defaults, but defining them explicitly makes the configuration intent clear
- API keys are optional but recommended for higher rate limits and better service
- Rate limiting is handled automatically for PubMed, Semantic Scholar, and arXiv; avoid making rapid successive calls to the same source (ScienceOpen requests are not currently rate limited)
- The client maintains state for rate limiting timestamps; reuse the same instance for multiple searches
- Handle empty result lists gracefully as API errors return empty lists rather than raising exceptions
- For ScienceOpen, both api_key and api_secret must be provided or the search will be skipped
- The days_back parameter filters results by publication date; adjust based on your needs
- Use search_all() for comprehensive coverage but be aware it will take longer due to rate limiting
- Each source returns different metadata fields; check the 'source' field to determine available data
- Consider implementing retry logic for production use, since network errors return empty lists; see the sketch after this list
- Logger messages provide insight into retrieval success/failure; monitor logs in production
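Because the client signals failure with an empty list rather than an exception, a retry wrapper cannot distinguish a transient error from a genuine zero-hit query; the sketch below (a hypothetical helper, not part of the class; the attempt count and delays are arbitrary choices) retries in both cases with exponential backoff:

import time

def search_with_retry(search_fn, query, attempts=3, base_delay=2.0, **kwargs):
    """Retry a LiteratureClient search with exponential backoff.

    Note: an empty list may mean either a transient API error or a query
    with no matches, so this retries in both cases.
    """
    for attempt in range(attempts):
        results = search_fn(query, **kwargs)
        if results:
            return results
        if attempt < attempts - 1:
            time.sleep(base_delay * (2 ** attempt))  # 2s, 4s, ... between attempts
    return []

# Hypothetical usage:
articles = search_with_retry(client.search_pubmed, 'machine learning', max_results=10)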
Similar Components
Components with related functionality, identified by AI-powered semantic similarity:
- class PatentClient (66.4% similar)
- class ClinicalTrialsClient (64.7% similar)
- class CompanyNewsClient (58.9% similar)
- class LLMClient_v1 (55.8% similar)
- class DocumentDownloader (55.4% similar)