from duckduckgo_search import DDGS from gnews import GNews from GoogleNews import GoogleNews import requests from bs4 import BeautifulSoup import os import time import json import numpy as np import random from .google_bypass import GoogleByPass from abc import ABC, abstractmethod from .logger import get_logger logger = get_logger() # Generic fetcher (fetches articles, writes to DB) class FetcherAbstract(ABC): @abstractmethod def _fetch(self): pass def fetch_articles(self, db_writer): logger.debug("Starting fetch() for {}".format(self.name)) # Fetch articles list_news = self._fetch() logger.info("Found #{} articles for search: {}".format(len(list_news), self.name)) # Write to DB db_writer.write_batch(list_news, self.name) # https://techblog.willshouse.com/2012/01/03/most-common-user-agents/ user_agents_list = [ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/111.0", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; rv:111.0) Gecko/20100101 Firefox/111.0", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0", "Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36", "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.44", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 OPR/96.0.0.0", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 OPR/97.0.0.0", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48", "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.34", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.39", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.2 Safari/605.1.15", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; rv:112.0) Gecko/20100101 Firefox/112.0", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.51", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/112.0", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/110.0", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0", "Mozilla/5.0 (X11; CrOS x86_64 14541.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.0.2246 Yowser/2.5 Safari/537.36", "Mozilla/5.0 (X11; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.6.1 Safari/605.1.15", "Mozilla/5.0 (Windows NT 6.1; rv:102.0) Gecko/20100101 Goanna/6.0 Firefox/102.0 PaleMoon/32.0.0", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.41", "Mozilla/5.0 (Windows NT 10.0; rv:110.0) Gecko/20100101 Firefox/110.0", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 YaBrowser/23.1.5.708 Yowser/2.5 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" ] class FetcherPreSearch(FetcherAbstract): def __init__(self, search): """ # period -> - h = hours (eg: 12h) - d = days (eg: 7d) - m = months (eg: 6m) - y = years (eg: 1y) """ self.search = search self.period = "1d" # TODO Fixed for the moment # self.lang = lang # self.region = region search_category = "news" self.name = "presearch {} {} {}".format(search, search_category, self.period) def _fetch(self): try: # PreSearch fetching endpoint, parameter search keyword presearch_fetch_endpoint = "http://selenium_app:80/fetch_presearch/?search_keyword={}".format(self.search) # Timeout: 15 minutes r = requests.get(presearch_fetch_endpoint, timeout=900) # Decode list_news = json.loads(r.text).get("list_urls", []) except Exception as e: logger.warning("Timeout on request: {}. {}".format(presearch_fetch_endpoint, str(e))) list_news = [] return list_news class FetcherGNews(FetcherAbstract): def __init__(self, search, period, lang="en", region="US"): """ # period -> - h = hours (eg: 12h) - d = days (eg: 7d) - m = months (eg: 6m) - y = years (eg: 1y) """ self.search = search self.period = period self.lang = lang self.region = region search_category = "news" self.name = "gnews {} {} {} {}".format(search, search_category, period, "{}-{}".format(lang, region)) def _fetch(self): try: list_dict_news = GNews(self.lang, self.region, period=self.period).get_news(self.search) # Decode list_news = [] for l in list_dict_news: list_news.append(l.get("url")) except Exception as e: logger.warning("Exception fetching {}: {}".format(self.name, str(e))) list_news = [] # Bypass Google links list_news_redirections = GoogleByPass().bypass_google_urls(list_news) return list_news_redirections class FetcherGoogleNews(FetcherAbstract): def __init__(self, search, search_category="news", period="1d", lang="en", region="US"): assert(search_category in ["news", "general"]) self.lang = lang self.region = region self.period = period self.search_category = search_category self.search = search self.name = "googlenews {} {} {} {}".format(search, search_category, period, "{}-{}".format(lang, region)) def _fetch(self): try: # Initialize g = GoogleNews(encode="utf-8", period=self.period, lang=self.lang, region=self.region) g.enableException(True) if (self.search_category == "general"): set_links = set() # Search g.search(self.search) # Iterate pages MAX_ITER_PAGES = 15 for i in range(MAX_ITER_PAGES): time.sleep(random.uniform(1, 1.5)) num_before = len(set_links) # Get page try: links = g.page_at(i) except Exception as e: logger.warning("Exception fetching page in GoogleNews {}: {}".format(self.name, str(e))) break # Links for l in links: # '/url?esrc=s&q=&rct=j&sa=U&url=https://www.breitbart.com/news/scent-of-luxury-indias-jasmine-infuses-global-perfume/&ved=2ahUKEwjOybGSiN-AAxX1gv0HHfqSBpMQxfQBegQICBAC&usg=AOvVaw06GdoHyzPbIopUaEuUSQPQ' url = l.get("link").split("url=")[-1] set_links.add(url) num_after = len(set_links) # Finished? if (num_before == num_after): logger.debug("Iterated {} pages on GoogleNews general search".format(i)) break # To list list_news = list(set_links) elif (self.search_category == "news"): # Search g.get_news(self.search) # Fetch list_news = g.get_links() except Exception as e: logger.warning("Exception fetching {}: {}".format(self.name, str(e))) list_news = [] # Bypass Google links list_news_redirections = GoogleByPass().bypass_google_urls(list_news) return list_news_redirections class FetcherDuckDuckGo(FetcherAbstract): def __init__(self, search, search_category, period, lang="wt", region="wt"): assert(search_category in ["news", "general"]) assert(period in ["d", "w", "m", "y"]) self.search = search self.search_category = search_category self.period = period self.lang_region = "{}-{}".format(lang, region) self.name = "duckduckgo {} {} {} {}".format(search, search_category, "1{}".format(period), region) def _fetch(self): try: list_news = [] with DDGS(timeout=10) as ddgs: if (self.search_category == "general"): generator_links = ddgs.text(keywords=self.search, timelimit=self.period, region=self.lang_region) elif (self.search_category == "news"): generator_links = ddgs.news(keywords=self.search, timelimit=self.period, region=self.lang_region) for l in generator_links: list_news.append( l.get("url", l.get("href")) ) except Exception as e: logger.warning("Exception fetching {}: {}".format(self.name, str(e))) list_news = [] return list_news class FetcherSearxNews(FetcherAbstract): def __init__(self, search="child abuse", searx_instance="https://serx.ml/", lang="en", region="US", search_category="news", period="day"): assert(search_category in ["news", "general"]) assert(period in [None, "day", "week", "month", "year"]) # Random header (minimize prob of web-scrapping detection) self.headers = { 'User-agent': str(np.random.choice(user_agents_list)), 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive', } """ # Optional header self.headers = { 'User-agent': str(np.random.choice(user_agents_list)), 'Accept-Encoding': 'gzip, deflate, br', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1', 'TE': 'trailers', 'Sec-Fetch-Site': 'cross-site', 'Sec-Fetch-Mode': 'navigate', 'Sec-Fetch-Dest': 'document', } """ self.search = search self.searx_instance = searx_instance self.lang_region = "{}-{}".format(lang, region) self.search_category = search_category self.period = period self.t_sleep_lower, self.t_sleep_higher = 0.5, 1.5 self.request_timeout = 240 period_name_mapping = { None: "no_date_range", "day": "1d", "week": "1w", "month": "1m", "year": "1y", } self.name = "searxng {} {} {} {} {}".format(searx_instance.replace("https://", "").replace("/", ""), search, search_category, period_name_mapping[period], self.lang_region) logger.info("SearX - Initialized SearX fetcher: {}".format(self.name)) def _request_and_decode(self, url_search): # Initial random time sleep (minimize chance of getting blocked) time.sleep(random.uniform(self.t_sleep_lower, self.t_sleep_higher)) # Request logger.debug("SearX - Searching: {}".format(url_search)) try: r = requests.get(url_search, headers=self.headers, timeout=self.request_timeout) except Exception as e: logger.warning("SearX - Exception in request: {}".format(url_search), "\n", str(e)) return [] if (r.status_code == 200): # Status code Ok pass elif (r.status_code == 429): # TooManyRequests, "Rate limit exceeded" logger.warning("SearX {} - Too many requests while running: {}. Request output: {}".format(self.name, r.url, r.text)) return [] elif (r.status_code != 200): logger.warning("SearX {} - Status code: {}. Request output: {}".format(self.name, r.status_code, r.text)) return [] else: logger.debug("SearX - Status code: {}".format(r.status_code)) # Decode request soup = BeautifulSoup(r.text, 'html.parser') page_url_set = set() # h3 links for elem in soup.find_all('h3'): # Get url url = elem.find('a').get('href') page_url_set.add(url) return page_url_set def _get_news_list(self): ############################################################ # Domain & search parameter search_domain = os.path.join(self.searx_instance, "search?q=") # Search keywords search_formatted = self.search.replace(" ", "+").replace(":", "%3A") # Period formatted period_formatted = "&time_range={}".format(self.period) if self.period is not None else "" # Search parameters search_parameters = "&category_{}=on&language={}{}".format(self.search_category, self.lang_region, period_formatted) # Combined url search url_search_nopage = "{}{}{}".format(search_domain, search_formatted, search_parameters) ############################################################ # Request and decode on page=1 url_set = self._request_and_decode(url_search_nopage) # No results? if (len(url_set) == 0): logger.warning("SearX {} - Empty results on search: {}".format(self.name, url_search_nopage)) return [] # Iterate pages search_numpage = 2 while True: # Combine url search with page number url_search_with_page = "{}&pageno={}".format(url_search_nopage, search_numpage) # Request and decode on page=X url_set_i = self._request_and_decode(url_search_with_page) # Length before merging length_current = len(url_set) # Merge url_set = url_set.union(url_set_i) # Length after merging length_merged = len(url_set) # No new elements? if (length_current == length_merged): logger.debug("SearX {} - Finished processing search, #pages: {}".format(self.name, search_numpage)) break # Next page search_numpage += 1 return list(url_set) def _fetch(self): try: # Fetch news list_news = self._get_news_list() except Exception as e: logger.warning("Exception fetching {}: {}".format(self.name, str(e))) list_news = [] return list_news