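"""Search fetchers.

Each fetcher class wraps one search backend (Google News, DuckDuckGo, Google,
Yahoo, AOL), turns a Search row into a backend-specific query, collects result
URLs and hands them to the caller-provided db_writer for insertion.
"""
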
import time
import feedparser
import os

from django.utils import timezone
from datetime import timedelta

from ..models import Search, Source
from .fetch_utils import decode_gnews_urls
from .logger import get_logger

logger = get_logger()


from gnews import GNews
from duckduckgo_search import DDGS
from GoogleNews import GoogleNews
from search_engines import Yahoo, Aol


###########################################################################
###########################################################################
from abc import ABC, abstractmethod


# Generic fetcher (fetches articles, writes to DB)
class FetcherAbstract(ABC):

    @abstractmethod
    def _fetch_raw_urls(self, keyword_search):
        pass

    @abstractmethod
    def _get_name(self):
        pass

    def _get_source_object(self, source):
        # TODO: Cache
        # self.cached_sources = {}
        # Get source object
        obj_source, created = Source.objects.get_or_create(source=source)
        return obj_source

    def _post_process_urls(self, raw_urls, obj_search):
        # Searching URL Host based? Make sure results belong to that site
        if (obj_search.type == Search.TYPE_ENUM.URL_HOST):
            # Get clean URL host
            url_host_clean = obj_search.search.replace("www.", "").replace("http://", "").replace("https://", "")
            # Ensure URL host in URL
            raw_urls = [u for u in raw_urls if url_host_clean in u]

        return raw_urls
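
    # Example of _post_process_urls filtering (illustrative, hypothetical values):
    # a URL_HOST search stored as "https://www.example.com" yields url_host_clean
    # "example.com", so "https://example.com/some-article" is kept while results
    # from other hosts are dropped.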

    def fetch_articles(self, db_writer, obj_search):
        # Source name
        source_name = self._get_name()

        # Search
        keyword_search = obj_search.search
        # URL Host search? -> site:${URL_HOST}
        if (obj_search.type == Search.TYPE_ENUM.URL_HOST):
            keyword_search = "site:{}".format(keyword_search)
        # Keyword search & using a General search? -> ${SEARCH} news after:${LAST_WEEK}
        if ("general" in source_name) and (obj_search.type == Search.TYPE_ENUM.KEYWORD_SEARCH):
            start_date = timezone.now() - timedelta(days=7)
            keyword_search = "{} news after:{}-{}-{}".format(keyword_search, start_date.month, start_date.day, start_date.year)

        logger.debug("Starting search: {} - {}".format(keyword_search, source_name))
        # Fetch
        raw_urls = self._fetch_raw_urls(keyword_search)
        # Post-process
        raw_urls = self._post_process_urls(raw_urls, obj_search)

        # Write to DB
        db_writer.insert_raw_urls(raw_urls, self._get_source_object(source_name), obj_search)


###########################################################################
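
# Usage sketch (assumption, not defined in this module): a caller such as a
# scheduled task is expected to instantiate a fetcher with optional parameters
# and run it against a Search row, providing a db_writer that exposes
# insert_raw_urls(raw_urls, obj_source, obj_search), e.g.
#
#   fetcher = SearchGNews({"language": "en", "country": "US", "max_results": 50})
#   fetcher.fetch_articles(db_writer, obj_search)
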
class SearchGNews(FetcherAbstract):

    def __init__(self, args={}):
        super().__init__()
        # Parameters
        self.language = args.get("language", "en")
        self.country = args.get("country", "US")
        self.period = args.get("period", "7d")
        self.max_results = args.get("max_results", 100)

    def _get_name(self):
        # [source] [period] [language-country] [max_results]
        return "gnews {} {}-{} results={}".format(self.period, self.language, self.country, self.max_results).replace("results=None", "").strip()

    def _fetch_raw_urls(self, keyword_search):
        try:
            # Get news
            results_gnews = GNews(language=self.language, country=self.country, period=self.period, max_results=self.max_results).get_news(keyword_search)
            # Get list of encoded urls
            encoded_urls = [e.get("url") for e in results_gnews]
            # Decode
            urls = decode_gnews_urls(encoded_urls)
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []
        return urls


class SearchDuckDuckGoGeneral(FetcherAbstract):

    def __init__(self, args={}):
        super().__init__()
        # Parameters
        self.language = args.get("language", "wt")
        self.country = args.get("country", "wt")
        self.max_results = args.get("max_results", 20)
        self.region = "{}-{}".format(self.language, self.country).lower()
        self.period = None

    def _get_name(self):
        # [source] [language-country] [max_results]
        return "ddg-general {} results={}".format(self.region, self.max_results).replace("results=None", "").strip()

    def _fetch_raw_urls(self, keyword_search):
        try:
            news = DDGS().text(keyword_search, region=self.region, timelimit=self.period, max_results=self.max_results)
            urls = [e.get("href") for e in news]
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []
        return urls


class SearchDuckDuckGoNews(FetcherAbstract):

    def __init__(self, args={}):
        super().__init__()
        # Parameters
        self.language = args.get("language", "wt")
        self.country = args.get("country", "wt")
        self.max_results = args.get("max_results", 100)
        self.region = "{}-{}".format(self.language, self.country).lower()
        self.period = None

    def _get_name(self):
        # [source] [language-country] [max_results]
        return "ddg-news {} results={}".format(self.region, self.max_results).replace("results=None", "").strip()

    def _fetch_raw_urls(self, keyword_search):
        try:
            news = DDGS().news(keyword_search, region=self.region, timelimit=self.period, max_results=self.max_results)
            urls = [e.get("url") for e in news]
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []
        return urls


class SearchGoogleNews(FetcherAbstract):

    def __init__(self, args={}):
        super().__init__()
        # Parameters
        self.language = args.get("language", "en")
        self.country = args.get("country", "US")
        self.period = args.get("period", "7d")

    def _get_name(self):
        # [source] [period] [language-country]
        return "googlenews {} {}-{}".format(self.period, self.language, self.country)

    def _fetch_raw_urls(self, keyword_search):
        try:
            # Initialize
            googlenews = GoogleNews(period=self.period, lang=self.language, region=self.country)
            googlenews.enableException(True)
            # Search
            googlenews.get_news(keyword_search)
            # Fetch
            encoded_urls = googlenews.get_links()
            # Decode
            urls = decode_gnews_urls(encoded_urls)
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []
        return urls


class SearchGoogleGeneral(FetcherAbstract):

    def __init__(self, args={}):
        super().__init__()
        # Parameters
        self.language = args.get("language", "en")
        self.country = args.get("country", "US")
        self.period = args.get("period", "7d")
        self.pages = args.get("pages", 1)

    def _get_name(self):
        # [source] [period] [language-country] [pages]
        return "google-general {} {}-{} pages={}".format(self.period, self.language, self.country, self.pages).replace("pages=None", "").strip()

    def _fetch_raw_urls(self, keyword_search):
        try:
            # Initialize
            googlenews = GoogleNews(period=self.period, lang=self.language, region=self.country)
            googlenews.enableException(True)
            # Search
            googlenews.search(keyword_search)

            set_links = set()
            # Iterate pages
            for i in range(self.pages):
                # Sleep between page fetches
                time.sleep(int(os.getenv("FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP", 4)))
                # Number of URLs fetched so far
                num_before = len(set_links)
                # Get page
                try:
                    links = googlenews.page_at(i + 1)
                except Exception as e:
                    logger.warning("Exception fetching page - {}: {}".format(self._get_name(), str(e)))
                    break
                # Links, e.g.
                # 'link': 'https://uk.news.yahoo.com/leaving-neverland-2-michael-jackson-lawyer-channel-4-102017088.html&ved=2ahUKEwjl38eJm5aMAxVvqJUCHXgnGzwQxfQBegQICRAC&usg=AOvVaw1osa6b3o_xXfcNinMDpLoK'
                for l in links:
                    # Strip the Google tracking suffix from the result link
                    set_links.add(l.get("link").split("&ved=")[0])
                # Finished? (no new links on this page)
                if (num_before == len(set_links)):
                    break
            # To list
            urls = list(set_links)
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []
        return urls


class SearchGoogleNewsRSS(FetcherAbstract):

    def __init__(self, args={}):
        super().__init__()
        # Parameters
        self.language = args.get("language", "en")
        self.country = args.get("country", "US")

    def _get_name(self):
        # [source] [language-country]
        return "googlenews-rss {}-{}".format(self.language, self.country).strip()

    def _fetch_raw_urls(self, keyword_search):
        try:
            # Search URL with parameters filled: https://news.google.com/rss/search?q={}&hl=en-US&gl=US&ceid=US:en
            search_url = "https://news.google.com/rss/search?q={}&hl={}&gl={}&ceid={}:{}".format(keyword_search, "{}-{}".format(self.language, self.country.upper()), self.country.upper(), self.country.upper(), self.language)
            # Control characters
            search_url = search_url.replace(" ", "+")  # urllib.parse.quote(search_url) # Issue: https%3A//news.google.com/rss/search%3Fq%3Dbreitbart.com%26hl%3Den-US%26gl%3DUS%26ceid%3DUS%3Aen

            # Initialize
            encoded_urls = []
            # Fetch feeds
            feeds = feedparser.parse(search_url)
            # Parse
            for f in feeds.get("entries", []):
                # Encoded URL
                encoded_url = f.get("link", None)
                '''
                # Available publish date?
                publish_date_parsed = f.get("published_parsed")
                if (publish_date_parsed is None):
                    publish_date = f.get("published", None)
                    if (publish_date is not None):
                        publish_date_parsed = dateutil.parser.parse(publish_date)

                # Published date
                urls_publish_date.append(publish_date_parsed)
                '''
                # Append
                encoded_urls.append(encoded_url)

            # Decode
            urls = decode_gnews_urls(encoded_urls)

        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []

        return urls


class SearchYahooGeneral(FetcherAbstract):

    def __init__(self, args={}):
        super().__init__()
        # Parameters
        self.pages = args.get("pages", 2)

    def _get_name(self):
        # [source] [pages]
        return "yahoo-general pages={}".format(self.pages).replace("pages=None", "").strip()

    def _fetch_raw_urls(self, keyword_search):
        try:
            results = Yahoo().search(keyword_search, pages=self.pages)
            urls = results.links()
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []
        return urls


class SearchAOLGeneral(FetcherAbstract):

    def __init__(self, args={}):
        super().__init__()
        # Parameters
        self.pages = args.get("pages", 2)

    def _get_name(self):
        # [source] [pages]
        return "aol-general pages={}".format(self.pages).replace("pages=None", "").strip()

    def _fetch_raw_urls(self, keyword_search):
        try:
            results = Aol().search(keyword_search, pages=self.pages)
            urls = results.links()
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []
        return urls


###########################################################################

# List of instances
ListSearchInstances = [SearchGNews, SearchDuckDuckGoNews, SearchGoogleNews, SearchAOLGeneral, SearchYahooGeneral, SearchDuckDuckGoGeneral, SearchGoogleGeneral, SearchGoogleNewsRSS]
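
# Usage sketch (assumption, not defined here): run every fetcher for a search,
# with db_writer and obj_search supplied by the caller (e.g. a scheduled job):
#
#   for fetcher_cls in ListSearchInstances:
#       fetcher_cls().fetch_articles(db_writer, obj_search)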