import time
import os
from datetime import timedelta

import feedparser
from django.utils import timezone

from gnews import GNews
from duckduckgo_search import DDGS
from GoogleNews import GoogleNews
from search_engines import Yahoo, Aol

from ..models import Search, Source
from .fetch_utils import decode_gnews_urls
from .logger import get_logger

logger = get_logger()

###########################################################################
###########################################################################

from abc import ABC, abstractmethod

# Generic fetcher (fetches articles, writes to DB)
class FetcherAbstract(ABC):

    @abstractmethod
    def _fetch_raw_urls(self, keyword_search):
        pass

    @abstractmethod
    def _get_name(self):
        pass

    def _get_source_object(self, source):
        # TODO: Cache
        # self.cached_sources = {}

        # Get source object
        obj_source, created = Source.objects.get_or_create(source=source)

        return obj_source

    def _post_process_urls(self, raw_urls, obj_search):
        # Searching URL host based? Make sure results belong to that site
        if (obj_search.type == Search.TYPE_ENUM.URL_HOST):
            # Get clean URL host
            url_host_clean = obj_search.search.replace("www.", "").replace("http://", "").replace("https://", "")
            # Ensure URL host in URL
            raw_urls = [u for u in raw_urls if url_host_clean in u]

        return raw_urls

    def fetch_articles(self, db_writer, obj_search):
        # Source name
        source_name = self._get_name()

        # Search
        keyword_search = obj_search.search

        # URL host search? -> site:${URL_HOST}
        if (obj_search.type == Search.TYPE_ENUM.URL_HOST):
            keyword_search = "{}{}".format("site:", keyword_search)

        # Keyword search & using a general search? -> ${SEARCH} news after:${LAST_WEEK}
        if ("general" in source_name) and (obj_search.type == Search.TYPE_ENUM.KEYWORD_SEARCH):
            start_date = timezone.now() - timedelta(days=7)
            keyword_search = "{} {}".format(keyword_search, "news after:{}-{}-{}".format(start_date.month, start_date.day, start_date.year))

        logger.debug("Starting search: {} - {}".format(keyword_search, source_name))

        # Fetch
        raw_urls = self._fetch_raw_urls(keyword_search)

        # Post-process
        raw_urls = self._post_process_urls(raw_urls, obj_search)

        # Write to DB
        db_writer.insert_raw_urls(raw_urls, self._get_source_object(source_name), obj_search)

###########################################################################

class SearchGNews(FetcherAbstract):

    def __init__(self, args={}):
        super().__init__()

        # Parameters
        self.language = args.get("language", "en")
        self.country = args.get("country", "US")
        self.period = args.get("period", "7d")
        self.max_results = args.get("max_results", 100)

    def _get_name(self):
        # [source] [period] [language-country] [max_results]
        return "gnews {} {}-{} results={}".format(self.period, self.language, self.country, self.max_results).replace("results=None", "").strip()

    def _fetch_raw_urls(self, keyword_search):
        try:
            # Get news
            results_gnews = GNews(language=self.language, country=self.country, period=self.period, max_results=self.max_results).get_news(keyword_search)

            # Get list of encoded URLs
            encoded_urls = [e.get("url") for e in results_gnews]

            # Decode
            urls = decode_gnews_urls(encoded_urls)
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []

        return urls

class SearchDuckDuckGoGeneral(FetcherAbstract):

    def __init__(self, args={}):
        super().__init__()

        # Parameters
        self.language = args.get("language", "wt")
        self.country = args.get("country", "wt")
        self.max_results = args.get("max_results", 20)
        self.region = "{}-{}".format(self.language, self.country).lower()
        self.period = None

    def _get_name(self):
        # [source] [language-country] [max_results]
        return "ddg-general {} results={}".format(self.region, self.max_results).replace("results=None", "").strip()

    def _fetch_raw_urls(self, keyword_search):
        try:
            news = DDGS().text(keyword_search, region=self.region, timelimit=self.period, max_results=self.max_results)
            urls = [e.get("href") for e in news]
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []

        return urls

class SearchDuckDuckGoNews(FetcherAbstract):

    def __init__(self, args={}):
        super().__init__()

        # Parameters
        self.language = args.get("language", "wt")
        self.country = args.get("country", "wt")
        self.max_results = args.get("max_results", 100)
        self.region = "{}-{}".format(self.language, self.country).lower()
        self.period = None

    def _get_name(self):
        # [source] [language-country] [max_results]
        return "ddg-news {} results={}".format(self.region, self.max_results).replace("results=None", "").strip()

    def _fetch_raw_urls(self, keyword_search):
        try:
            news = DDGS().news(keyword_search, region=self.region, timelimit=self.period, max_results=self.max_results)
            urls = [e.get("url") for e in news]
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []

        return urls

class SearchGoogleNews(FetcherAbstract):

    def __init__(self, args={}):
        super().__init__()

        # Parameters
        self.language = args.get("language", "en")
        self.country = args.get("country", "US")
        self.period = args.get("period", "7d")

    def _get_name(self):
        # [source] [period] [language-country]
        return "googlenews {} {}-{}".format(self.period, self.language, self.country)

    def _fetch_raw_urls(self, keyword_search):
        try:
            # Initialize
            googlenews = GoogleNews(period=self.period, lang=self.language, region=self.country)
            googlenews.enableException(True)

            # Search
            googlenews.get_news(keyword_search)

            # Fetch
            encoded_urls = googlenews.get_links()

            # Decode
            urls = decode_gnews_urls(encoded_urls)
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []

        return urls

class SearchGoogleGeneral(FetcherAbstract):

    def __init__(self, args={}):
        super().__init__()

        # Parameters
        self.language = args.get("language", "en")
        self.country = args.get("country", "US")
        self.period = args.get("period", "7d")
        self.pages = args.get("pages", 1)

    def _get_name(self):
        # [source] [period] [language-country] [pages]
        return "google-general {} {}-{} pages={}".format(self.period, self.language, self.country, self.pages).replace("pages=None", "").strip()

    def _fetch_raw_urls(self, keyword_search):
        try:
            # Initialize
            googlenews = GoogleNews(period=self.period, lang=self.language, region=self.country)
            googlenews.enableException(True)

            # Search
            googlenews.search(keyword_search)

            set_links = set()

            # Iterate pages
            for i in range(self.pages):
                # Sleep between page fetches
                time.sleep(int(os.getenv("FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP", 4)))

                # Number of URLs fetched so far
                num_before = len(set_links)

                # Get page
                try:
                    links = googlenews.page_at(i + 1)
                except Exception as e:
                    logger.warning("Exception fetching page - {}: {}".format(self._get_name(), str(e)))
                    break

                # Links (strip tracking suffix), e.g.:
                # 'link': 'https://uk.news.yahoo.com/leaving-neverland-2-michael-jackson-lawyer-channel-4-102017088.html&ved=2ahUKEwjl38eJm5aMAxVvqJUCHXgnGzwQxfQBegQICRAC&usg=AOvVaw1osa6b3o_xXfcNinMDpLoK'
                for l in links:
                    set_links.add(l.get("link").split("&ved=")[0])

                # Finished? No new links on this page
                if (num_before == len(set_links)):
                    break

            # To list
            urls = list(set_links)
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []

        return urls

class SearchGoogleNewsRSS(FetcherAbstract):

    def __init__(self, args={}):
        super().__init__()

        # Parameters
        self.language = args.get("language", "en")
        self.country = args.get("country", "US")

    def _get_name(self):
        # [source] [language-country]
        return "googlenews-rss {}-{}".format(self.language, self.country).strip()

    def _fetch_raw_urls(self, keyword_search):
        try:
            # Search URL with parameters filled, e.g.: https://news.google.com/rss/search?q={}&hl=en-US&gl=US&ceid=US:en
            search_url = "https://news.google.com/rss/search?q={}&hl={}&gl={}&ceid={}:{}".format(keyword_search, "{}-{}".format(self.language, self.country.upper()), self.country.upper(), self.country.upper(), self.language)

            # Control characters
            search_url = search_url.replace(" ", "+")
            # urllib.parse.quote(search_url)
            # Issue: https%3A//news.google.com/rss/search%3Fq%3Dbreitbart.com%26hl%3Den-US%26gl%3DUS%26ceid%3DUS%3Aen

            # Initialize
            encoded_urls = []

            # Fetch feeds
            feeds = feedparser.parse(search_url)

            # Parse
            for f in feeds.get("entries", []):
                # Encoded URL
                encoded_url = f.get("link", None)

                '''
                # Available publish date?
                publish_date_parsed = f.get("published_parsed")
                if (publish_date_parsed is None):
                    publish_date = f.get("published", None)
                    if (publish_date is not None):
                        publish_date_parsed = dateutil.parser.parse(publish_date)

                # Published date
                urls_publish_date.append(publish_date_parsed)
                '''

                # Append
                encoded_urls.append(encoded_url)

            # Decode
            urls = decode_gnews_urls(encoded_urls)
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []

        return urls

class SearchYahooGeneral(FetcherAbstract):

    def __init__(self, args={}):
        super().__init__()

        # Parameters
        self.pages = args.get("pages", 2)

    def _get_name(self):
        # [source] [pages]
        return "yahoo-general pages={}".format(self.pages).replace("pages=None", "").strip()

    def _fetch_raw_urls(self, keyword_search):
        try:
            results = Yahoo().search(keyword_search, pages=self.pages)
            urls = results.links()
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []

        return urls

class SearchAOLGeneral(FetcherAbstract):

    def __init__(self, args={}):
        super().__init__()

        # Parameters
        self.pages = args.get("pages", 2)

    def _get_name(self):
        # [source] [pages]
        return "aol-general pages={}".format(self.pages).replace("pages=None", "").strip()

    def _fetch_raw_urls(self, keyword_search):
        try:
            results = Aol().search(keyword_search, pages=self.pages)
            urls = results.links()
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []

        return urls

###########################################################################

# List of fetcher classes
ListSearchInstances = [SearchGNews, SearchDuckDuckGoNews, SearchGoogleNews, SearchAOLGeneral, SearchYahooGeneral, SearchDuckDuckGoGeneral, SearchGoogleGeneral, SearchGoogleNewsRSS]
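
###########################################################################

# Illustrative driver sketch (not part of the original interface): one way to
# run every fetcher in ListSearchInstances against every stored Search. The
# `db_writer` and `fetcher_args` names are assumptions introduced here;
# `db_writer` stands for whatever object the surrounding project passes to
# fetch_articles() and must expose insert_raw_urls(urls, source, search) as
# called above.
def fetch_all_searches(db_writer, fetcher_args=None):
    fetcher_args = fetcher_args or {}
    for fetcher_cls in ListSearchInstances:
        # Each entry is a class; instantiate it with the shared (or default) args
        fetcher = fetcher_cls(fetcher_args)
        for obj_search in Search.objects.all():
            fetcher.fetch_articles(db_writer, obj_search)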