class CompanyNewsClient
A client class for accessing company news and information from multiple sources including GDELT Project and NewsAPI, with built-in rate limiting and error handling.
/tf/active/vicechatdev/QA_updater/data_access/company_news_client.py
16 - 219
moderate
Purpose
CompanyNewsClient provides a unified interface for searching news articles from multiple news data sources (GDELT and NewsAPI). It handles API authentication, rate limiting, date filtering, and result normalization across different news sources. The class is designed for applications that need to aggregate news data about companies or topics from multiple sources while respecting API rate limits and handling errors gracefully.
Source Code
class CompanyNewsClient:
    """Client for accessing company information and news data from multiple sources.

    Two back ends are supported: the GDELT article search API (queried directly
    over HTTP) and NewsAPI (queried through ``NewsApiClient``). Each back end has
    its own client-side throttle; every search method returns a list of plain
    dictionaries and returns an empty list instead of raising when a request
    fails.
    """

    # Base spacing between two consecutive requests, in seconds. The effective
    # spacing is this value divided by the configured rate limit.
    _GDELT_BASE_INTERVAL = 15.0
    _NEWS_API_BASE_INTERVAL = 1.0

    # NewsAPI free tier only allows searching this many days back.
    _NEWS_API_MAX_DAYS_BACK = 30

    # Optional GDELT fields that hold delimited lists: (field name, separator).
    _GDELT_LIST_FIELDS = (
        ("themes", ","),
        ("locations", ";"),
        ("persons", ";"),
        ("organizations", ";"),
    )

    def __init__(self, config: ConfigParser):
        """Initialize the client with required credentials.

        Args:
            config: Parsed configuration. ``[api_keys] news_api_key`` enables the
                NewsAPI back end; ``[rate_limits] gdelt_rate_limit`` and
                ``news_api_rate_limit`` scale the request spacing. Missing
                values fall back to defaults (no key, rate limit 1).
        """
        self.logger = logging.getLogger(__name__)
        self.config = config

        # API keys
        self.news_api_key = self.config.get('api_keys', 'news_api_key', fallback=None)

        # Rate limiting configurations (validated: they are used as divisors)
        self.gdelt_rate_limit = self._read_rate_limit('gdelt_rate_limit')
        self.news_api_rate_limit = self._read_rate_limit('news_api_rate_limit')

        # Clients
        if self.news_api_key:
            self.news_api = NewsApiClient(api_key=self.news_api_key)
        else:
            self.news_api = None
            self.logger.warning("NewsAPI client not initialized due to missing API key")

        # Timestamps of the most recent request attempt per back end
        self.last_gdelt_request = 0
        self.last_news_api_request = 0

        self.logger.info("CompanyNewsClient initialized.")

    def _read_rate_limit(self, option: str) -> float:
        """Read a rate limit from ``[rate_limits]``, falling back to 1.0 if unusable."""
        raw = self.config.get('rate_limits', option, fallback=1)
        try:
            value = float(raw)
        except (TypeError, ValueError):
            value = 0.0
        # ``not value > 0`` also rejects NaN, which would silently disable throttling.
        if not value > 0:
            self.logger.warning(
                "Invalid rate limit %r for '%s'; falling back to 1.0", raw, option
            )
            return 1.0
        return value

    @staticmethod
    def _throttle(last_request: float, min_interval: float) -> None:
        """Sleep until at least ``min_interval`` seconds have passed since ``last_request``."""
        remaining = min_interval - (time.time() - last_request)
        if remaining > 0:
            time.sleep(remaining)

    def _normalize_gdelt_article(self, article: Dict[str, Any]) -> Dict[str, Any]:
        """Map one raw GDELT article record onto the client's result schema."""
        processed = {
            "source": "gdelt",
            "title": article.get('title', ''),
            "url": article.get('url', ''),
            "date": article.get('seendate', ''),
            "domain": article.get('domain', ''),
            "source_country": article.get('sourcecountry', ''),
            "language": article.get('language', ''),
            "tone": article.get('tone', 0)
        }
        # Delimited list fields are only copied over when the record has them.
        for field, separator in self._GDELT_LIST_FIELDS:
            if field in article:
                processed[field] = article[field].split(separator)
        return processed

    @staticmethod
    def _normalize_news_api_article(article: Dict[str, Any]) -> Dict[str, Any]:
        """Map one raw NewsAPI article record onto the client's result schema."""
        # ``source`` may be present but null; treat that like a missing source
        # instead of letting one record abort the whole result set.
        source_info = article.get('source') or {}
        return {
            "source": "newsapi",
            "source_name": source_info.get('name'),
            "author": article.get('author'),
            "title": article.get('title'),
            "description": article.get('description'),
            "url": article.get('url'),
            "url_to_image": article.get('urlToImage'),
            "published_at": article.get('publishedAt'),
            "content": article.get('content')
        }

    def search_gdelt(self, query: str, max_results: int = 10, days_back: int = 7) -> List[Dict[str, Any]]:
        """
        Search GDELT Project for news articles matching the query using direct API access.

        Args:
            query: Search terms
            max_results: Maximum number of results to return
            days_back: Only include articles from the last this-many days

        Returns:
            List of article metadata dictionaries; empty if the request or the
            response parsing fails (the error is logged).
        """
        # Rate limiting: 15 seconds per request, scaled by the configured limit
        self._throttle(self.last_gdelt_request, self._GDELT_BASE_INTERVAL / self.gdelt_rate_limit)

        # Calculate date range for filtering
        # NOTE(review): naive local time is used here; confirm whether the GDELT
        # API expects these timestamps in UTC.
        end_date = datetime.now()
        start_date = end_date - timedelta(days=days_back)
        start_date_str = start_date.strftime('%Y%m%d%H%M%S')
        end_date_str = end_date.strftime('%Y%m%d%H%M%S')

        try:
            # GDELT Article Search API endpoint
            base_url = "https://api.gdeltproject.org/api/v2/doc/doc"
            encoded_query = urllib.parse.quote(query)
            url = (f"{base_url}?query={encoded_query}"
                   f"&mode=artlist&format=json"
                   f"&startdatetime={start_date_str}&enddatetime={end_date_str}"
                   f"&maxrecords={max_results}")

            response = requests.get(url, timeout=30)
            response.raise_for_status()
            data = response.json()

            return [
                self._normalize_gdelt_article(article)
                for article in data.get('articles', [])
            ]
        except Exception as e:
            self.logger.error(f"GDELT search error: {e}")
            return []
        finally:
            # Record the attempt even on failure so that an erroring endpoint
            # is not retried without the configured spacing.
            self.last_gdelt_request = time.time()

    def search_news_api(self, query: str, max_results: int = 10, days_back: int = 7) -> List[Dict[str, Any]]:
        """
        Search NewsAPI for articles matching the query.

        Args:
            query: Search terms
            max_results: Maximum number of results to return
            days_back: Only include articles from the last this-many days
                (values above 30 are reduced to 30)

        Returns:
            List of article metadata dictionaries; empty if the NewsAPI client
            is not configured or the request fails (the error is logged).
        """
        # Nothing to throttle when there is no client to call.
        if not self.news_api:
            self.logger.warning("NewsAPI client not initialized. Cannot search news.")
            return []

        # Rate limiting
        self._throttle(self.last_news_api_request, self._NEWS_API_BASE_INTERVAL / self.news_api_rate_limit)

        # Calculate date for filtering (NewsAPI free tier limits to 1 month back)
        if days_back > self._NEWS_API_MAX_DAYS_BACK:
            self.logger.warning("NewsAPI free tier only allows searching up to 30 days back. Using 30 days.")
            days_back = self._NEWS_API_MAX_DAYS_BACK
        from_date = (datetime.now() - timedelta(days=days_back)).strftime('%Y-%m-%d')

        try:
            response = self.news_api.get_everything(
                q=query,
                from_param=from_date,
                language='en',
                sort_by='relevancy',
                page_size=max_results
            )
            return [
                self._normalize_news_api_article(article)
                for article in response.get('articles', [])
            ]
        except Exception as e:
            self.logger.error(f"NewsAPI error: {e}")
            return []
        finally:
            # Record the attempt even on failure so the throttle still applies.
            self.last_news_api_request = time.time()

    def search_all_news(self, query: str, max_results_per_source: int = 5, days_back: int = 7) -> List[Dict[str, Any]]:
        """
        Search all configured news sources.

        Args:
            query: Search terms
            max_results_per_source: Maximum results to return per source
            days_back: Only include articles from the last this-many days

        Returns:
            Combined list of results from all news sources, GDELT first and
            NewsAPI second. A failing source contributes no results.
        """
        results = []

        # GDELT results
        try:
            gdelt_results = self.search_gdelt(query, max_results_per_source, days_back)
            results.extend(gdelt_results)
            self.logger.info(f"Retrieved {len(gdelt_results)} results from GDELT")
        except Exception as e:
            self.logger.error(f"GDELT search error: {e}")

        # NewsAPI results
        try:
            news_api_results = self.search_news_api(query, max_results_per_source, days_back)
            results.extend(news_api_results)
            self.logger.info(f"Retrieved {len(news_api_results)} results from NewsAPI")
        except Exception as e:
            self.logger.error(f"NewsAPI search error: {e}")

        return results
Parameters
| Name | Type | Default | Kind |
|---|---|---|---|
bases |
- | - |
Parameter Details
config: A ConfigParser object containing configuration settings including API keys (news_api_key) and rate limits (gdelt_rate_limit, news_api_rate_limit). The config should have sections 'api_keys' and 'rate_limits' with appropriate key-value pairs. Missing values will use fallback defaults.
Return Value
Instantiation returns a CompanyNewsClient object. The search methods (search_gdelt, search_news_api, search_all_news) return List[Dict[str, Any]] containing article metadata dictionaries. Each dictionary contains source-specific fields like title, url, date, and additional metadata. Returns empty list on errors.
Class Interface
Methods
__init__(self, config: ConfigParser)
Purpose: Initialize the CompanyNewsClient with configuration including API keys and rate limits
Parameters:
config: ConfigParser object containing API keys in 'api_keys' section and rate limits in 'rate_limits' section
Returns: None (constructor)
search_gdelt(self, query: str, max_results: int = 10, days_back: int = 7) -> List[Dict[str, Any]]
Purpose: Search GDELT Project for news articles matching the query using direct API access with rate limiting
Parameters:
query: Search terms to query GDELT for
max_results: Maximum number of results to return (default: 10)
days_back: Only include articles from this many days ago (default: 7)
Returns: List of article metadata dictionaries with fields: source, title, url, date, domain, source_country, language, tone, and optional themes, locations, persons, organizations
search_news_api(self, query: str, max_results: int = 10, days_back: int = 7) -> List[Dict[str, Any]]
Purpose: Search NewsAPI for articles matching the query with rate limiting and date filtering
Parameters:
query: Search terms to query NewsAPI for
max_results: Maximum number of results to return (default: 10)
days_back: Only include articles from this many days ago, max 30 for free tier (default: 7)
Returns: List of article metadata dictionaries with fields: source, source_name, author, title, description, url, url_to_image, published_at, content. Returns empty list if NewsAPI client not initialized.
search_all_news(self, query: str, max_results_per_source: int = 5, days_back: int = 7) -> List[Dict[str, Any]]
Purpose: Search all configured news sources (GDELT and NewsAPI) and combine results
Parameters:
query: Search terms to query all sources for
max_results_per_source: Maximum results to return per source (default: 5)
days_back: Only include articles from this many days ago (default: 7)
Returns: Combined list of article metadata dictionaries from all sources. Each dictionary has a 'source' field indicating origin (gdelt or newsapi).
Attributes
| Name | Type | Description | Scope |
|---|---|---|---|
| logger | logging.Logger | Logger instance for the class, used for info, warning, and error messages | instance |
| config | ConfigParser | Configuration object containing API keys and rate limit settings | instance |
| news_api_key | Optional[str] | API key for NewsAPI service, retrieved from config or None if not provided | instance |
| gdelt_rate_limit | float | Rate limit multiplier for GDELT requests (default: 1.0); the minimum delay between requests is 15 seconds divided by this value | instance |
| news_api_rate_limit | float | Rate limit multiplier for NewsAPI requests (default: 1.0); the minimum delay between requests is 1 second divided by this value | instance |
| news_api | Optional[NewsApiClient] | NewsAPI client instance, initialized only if API key is provided, otherwise None | instance |
| last_gdelt_request | float | Timestamp of the last GDELT API request, used for rate limiting (initialized to 0) | instance |
| last_news_api_request | float | Timestamp of the last NewsAPI request, used for rate limiting (initialized to 0) | instance |
Dependencies
time, requests, json, os, pandas, typing, datetime, bs4, urllib.parse, configparser, logging, newsapi
Required Imports
import time
import requests
import json
import os
import pandas as pd
from typing import List, Dict, Any, Optional
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
import urllib.parse
from configparser import ConfigParser
import logging
from newsapi import NewsApiClient
Usage Example
from configparser import ConfigParser
import logging
# Setup logging
logging.basicConfig(level=logging.INFO)
# Create config
config = ConfigParser()
config.add_section('api_keys')
config.set('api_keys', 'news_api_key', 'your_newsapi_key_here')
config.add_section('rate_limits')
config.set('rate_limits', 'gdelt_rate_limit', '1')
config.set('rate_limits', 'news_api_rate_limit', '1')
# Instantiate client
client = CompanyNewsClient(config)
# Search GDELT only
gdelt_articles = client.search_gdelt('Tesla Inc', max_results=10, days_back=7)
for article in gdelt_articles:
print(f"{article['title']} - {article['url']}")
# Search NewsAPI only
news_articles = client.search_news_api('Tesla Inc', max_results=10, days_back=7)
for article in news_articles:
print(f"{article['title']} - {article['url']}")
# Search all sources
all_articles = client.search_all_news('Tesla Inc', max_results_per_source=5, days_back=7)
print(f"Total articles found: {len(all_articles)}")
Best Practices
- Always provide a properly configured ConfigParser object with at least the 'api_keys' and 'rate_limits' sections
- The NewsAPI client will not be initialized if the API key is missing; check self.news_api before calling search_news_api
- Rate limiting is automatically handled; avoid making concurrent requests with multiple instances to the same APIs
- GDELT enforces a 15-second rate limit per request; plan accordingly for large-scale data collection
- NewsAPI free tier limits searches to 30 days back; the class automatically adjusts if days_back > 30
- Methods return empty lists on errors rather than raising exceptions; check logs for error details
- The class maintains state for rate limiting (last_gdelt_request, last_news_api_request); avoid sharing instances across threads
- search_all_news aggregates results from all sources; use individual search methods for source-specific queries
- Article dictionaries have different schemas depending on the source; check the 'source' field to determine structure
- GDELT results include additional metadata like tone, themes, locations, persons, and organizations when available
Tags
Similar Components
AI-powered semantic similarity - components with related functionality:
-
class ClinicalTrialsClient 60.2% similar
-
class LiteratureClient 58.9% similar
-
class PatentClient 56.8% similar
-
class LLMClient_v1 50.8% similar
-
class GoogleSearchClient 48.8% similar