385 lines
19 KiB
Python
385 lines
19 KiB
Python
from duckduckgo_search import DDGS
|
|
from gnews import GNews
|
|
from GoogleNews import GoogleNews
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
import os
|
|
import time
|
|
import json
|
|
import numpy as np
|
|
import random
|
|
from .google_bypass import GoogleByPass
|
|
from abc import ABC, abstractmethod
|
|
from .logger import get_logger
|
|
logger = get_logger()
|
|
|
|
|
|
|
|
# Generic fetcher (fetches articles, writes to DB)
|
|
# Generic fetcher (fetches articles, writes to DB)
class FetcherAbstract(ABC):
    """Base class for all article fetchers.

    Subclasses implement ``_fetch`` (return a list of article URLs);
    ``fetch_articles`` runs the fetch and persists the result through
    the supplied DB writer.
    """

    @abstractmethod
    def _fetch(self):
        """Return the list of article URLs for this fetcher's search."""
        pass

    def fetch_articles(self, db_writer):
        """Fetch articles via ``_fetch`` and write them in one batch to the DB."""
        logger.debug("Starting fetch() for {}".format(self.name))

        # Run the concrete fetch implementation
        articles = self._fetch()
        logger.info("Found #{} articles for search: {}".format(len(articles), self.name))

        # Persist the whole batch under this fetcher's name
        db_writer.write_batch(articles, self.name)
|
|
|
|
# https://techblog.willshouse.com/2012/01/03/most-common-user-agents/
|
|
|
|
# https://techblog.willshouse.com/2012/01/03/most-common-user-agents/
# Pool of common desktop browser User-Agent strings; one is picked at random
# per request to reduce the chance of scraper detection/blocking.
user_agents_list = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/111.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; rv:111.0) Gecko/20100101 Firefox/111.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0",
    "Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.44",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 OPR/96.0.0.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 OPR/97.0.0.0",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48",
    "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.34",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.39",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.2 Safari/605.1.15",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; rv:112.0) Gecko/20100101 Firefox/112.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.51",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/112.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/110.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0",
    "Mozilla/5.0 (X11; CrOS x86_64 14541.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.0.2246 Yowser/2.5 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.6.1 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 6.1; rv:102.0) Gecko/20100101 Goanna/6.0 Firefox/102.0 PaleMoon/32.0.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.41",
    "Mozilla/5.0 (Windows NT 10.0; rv:110.0) Gecko/20100101 Firefox/110.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 YaBrowser/23.1.5.708 Yowser/2.5 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
]
|
|
|
|
|
|
|
|
|
|
|
|
class FetcherPreSearch(FetcherAbstract):
    """Fetcher that delegates the search to the Selenium-based PreSearch service."""

    def __init__(self, search):
        """
        # period ->
          - h = hours (eg: 12h)
          - d = days (eg: 7d)
          - m = months (eg: 6m)
          - y = years (eg: 1y)
        """
        self.search = search
        self.period = "1d"  # TODO Fixed for the moment
        # self.lang = lang
        # self.region = region
        search_category = "news"
        self.name = "presearch {} {} {}".format(search, search_category, self.period)

    def _fetch(self):
        """Query the selenium_app PreSearch endpoint; return a list of URLs ([] on error)."""
        # Defined outside the try so the except handler can always reference it
        # (the original built it inside the try, risking an unbound name in the handler).
        presearch_fetch_endpoint = "http://selenium_app:80/fetch_presearch/"
        try:
            # Let requests URL-encode the search keyword (spaces/special chars were
            # previously interpolated raw into the query string).
            # Timeout: 15 minutes
            r = requests.get(presearch_fetch_endpoint,
                             params={"search_keyword": self.search},
                             timeout=900)
            # Decode the JSON payload; missing key -> empty list
            list_news = r.json().get("list_urls", [])
        except Exception as e:
            logger.warning("Timeout on request: {}. {}".format(presearch_fetch_endpoint, str(e)))
            list_news = []
        return list_news
|
|
|
|
|
|
|
|
class FetcherGNews(FetcherAbstract):
    """Fetcher backed by the GNews library (Google News RSS)."""

    def __init__(self, search, period, lang="en", region="US"):
        """
        # period ->
          - h = hours (eg: 12h)
          - d = days (eg: 7d)
          - m = months (eg: 6m)
          - y = years (eg: 1y)
        """
        self.search = search
        self.period = period
        self.lang = lang
        self.region = region
        search_category = "news"
        self.name = "gnews {} {} {} {}".format(search, search_category, period, "{}-{}".format(lang, region))

    def _fetch(self):
        """Fetch article URLs via GNews and resolve Google redirect links."""
        try:
            list_dict_news = GNews(self.lang, self.region, period=self.period).get_news(self.search)
            # Keep only entries that actually carry a URL — the original appended
            # d.get("url") unconditionally, which could push None into the bypass step.
            list_news = [d.get("url") for d in list_dict_news if d.get("url")]
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
            list_news = []

        # Bypass Google links (resolve redirections to the real article URLs)
        list_news_redirections = GoogleByPass().bypass_google_urls(list_news)

        return list_news_redirections
|
|
|
|
class FetcherGoogleNews(FetcherAbstract):
    """Fetcher backed by the GoogleNews library; supports 'news' and 'general' search."""

    def __init__(self, search, search_category="news", period="1d", lang="en", region="US"):
        # Only these two categories map to GoogleNews API calls below
        assert(search_category in ["news", "general"])

        self.lang = lang
        self.region = region
        self.period = period
        self.search_category = search_category
        self.search = search
        self.name = "googlenews {} {} {} {}".format(search, search_category, period, "{}-{}".format(lang, region))

    def _fetch(self):
        """Fetch article URLs, then resolve Google redirect links. Returns [] on failure."""
        try:
            # Initialize
            g = GoogleNews(encode="utf-8", period=self.period, lang=self.lang, region=self.region)
            g.enableException(True)

            if (self.search_category == "general"):
                set_links = set()
                # Search
                g.search(self.search)

                # Iterate pages; stop early when a page adds no new links
                MAX_ITER_PAGES = 15
                for i in range(MAX_ITER_PAGES):
                    # Random delay between page requests (reduce scraper detection)
                    time.sleep(random.uniform(1, 1.5))
                    num_before = len(set_links)

                    # Get page
                    try:
                        links = g.page_at(i)
                    except Exception as e:
                        logger.warning("Exception fetching page in GoogleNews {}: {}".format(self.name, str(e)))
                        break
                    # Links
                    for l in links:
                        # Result links are Google redirect URLs; keep the target after "url=", e.g.:
                        # '/url?esrc=s&q=&rct=j&sa=U&url=https://www.breitbart.com/news/scent-of-luxury-indias-jasmine-infuses-global-perfume/&ved=2ahUKEwjOybGSiN-AAxX1gv0HHfqSBpMQxfQBegQICBAC&usg=AOvVaw06GdoHyzPbIopUaEuUSQPQ'
                        url = l.get("link").split("url=")[-1]
                        set_links.add(url)

                    num_after = len(set_links)

                    # Finished? (page contributed nothing new -> no more results)
                    if (num_before == num_after):
                        logger.debug("Iterated {} pages on GoogleNews general search".format(i))
                        break
                # To list
                list_news = list(set_links)

            elif (self.search_category == "news"):
                # Search
                g.get_news(self.search)
                # Fetch
                list_news = g.get_links()

        except Exception as e:
            # Any failure above falls through here, so list_news is always bound
            logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
            list_news = []

        # Bypass Google links (resolve redirections to the real article URLs)
        list_news_redirections = GoogleByPass().bypass_google_urls(list_news)

        return list_news_redirections
|
|
|
|
class FetcherDuckDuckGo(FetcherAbstract):
    """Fetcher backed by the DuckDuckGo search API (text or news vertical)."""

    def __init__(self, search, search_category, period, lang="wt", region="wt"):
        # Validate inputs up front: category selects the DDGS endpoint,
        # period is DuckDuckGo's single-letter time limit.
        assert(search_category in ["news", "general"])
        assert(period in ["d", "w", "m", "y"])
        self.search = search
        self.search_category = search_category
        self.period = period
        self.lang_region = "{}-{}".format(lang, region)
        self.name = "duckduckgo {} {} {} {}".format(search, search_category, "1{}".format(period), region)

    def _fetch(self):
        """Run the configured DuckDuckGo search and return the result URLs ([] on error)."""
        try:
            with DDGS(timeout=10) as ddgs:
                if (self.search_category == "general"):
                    results = ddgs.text(keywords=self.search, timelimit=self.period, region=self.lang_region)
                elif (self.search_category == "news"):
                    results = ddgs.news(keywords=self.search, timelimit=self.period, region=self.lang_region)

                # news results expose "url", text results expose "href"
                collected = [item.get("url", item.get("href")) for item in results]
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
            collected = []
        return collected
|
|
|
|
|
|
class FetcherSearxNews(FetcherAbstract):
    """Fetcher that scrapes a SearXNG instance's HTML result pages."""

    def __init__(self, search="child abuse", searx_instance="https://serx.ml/", lang="en", region="US", search_category="news", period="day"):
        assert(search_category in ["news", "general"])
        assert(period in [None, "day", "week", "month", "year"])
        # Random header (minimize prob of web-scrapping detection)
        self.headers = {
            'User-agent': str(np.random.choice(user_agents_list)),
            'Accept-Encoding': 'gzip, deflate',
            'Accept': '*/*',
            'Connection': 'keep-alive',
        }
        self.search = search
        self.searx_instance = searx_instance
        self.lang_region = "{}-{}".format(lang, region)
        self.search_category = search_category
        self.period = period
        # Random sleep bounds (seconds) between page requests
        self.t_sleep_lower, self.t_sleep_higher = 0.5, 1.5
        self.request_timeout = 240

        # Human-readable period label used only for the fetcher name
        period_name_mapping = {
            None: "no_date_range",
            "day": "1d",
            "week": "1w",
            "month": "1m",
            "year": "1y",
        }
        self.name = "searxng {} {} {} {} {}".format(searx_instance.replace("https://", "").replace("/", ""), search, search_category, period_name_mapping[period], self.lang_region)
        logger.info("SearX - Initialized SearX fetcher: {}".format(self.name))

    def _request_and_decode(self, url_search):
        """GET one SearX result page and return the set of result URLs (empty set on error)."""
        # Initial random time sleep (minimize chance of getting blocked)
        time.sleep(random.uniform(self.t_sleep_lower, self.t_sleep_higher))
        # Request
        logger.debug("SearX - Searching: {}".format(url_search))
        try:
            r = requests.get(url_search, headers=self.headers, timeout=self.request_timeout)
        except Exception as e:
            # Fix: the original passed extra positional args to logger.warning,
            # which logging treats as %-format args and fails to render,
            # silently dropping the exception text.
            logger.warning("SearX - Exception in request: {}\n{}".format(url_search, str(e)))
            # Empty set (not list) so the return type is consistent with success
            return set()

        if (r.status_code == 429):
            # TooManyRequests, "Rate limit exceeded"
            logger.warning("SearX {} - Too many requests while running: {}. Request output: {}".format(self.name, r.url, r.text))
            return set()
        elif (r.status_code != 200):
            logger.warning("SearX {} - Status code: {}. Request output: {}".format(self.name, r.status_code, r.text))
            return set()
        # Status code Ok (200): fall through and parse the page

        # Decode request: result links are anchors nested in <h3> elements
        soup = BeautifulSoup(r.text, 'html.parser')
        page_url_set = set()
        for elem in soup.find_all('h3'):
            anchor = elem.find('a')
            # Guard: an <h3> without a nested link would raise AttributeError
            if anchor is None:
                continue
            page_url_set.add(anchor.get('href'))
        return page_url_set

    def _get_news_list(self):
        """Iterate result pages until no new URLs appear; return the deduplicated URL list."""
        ############################################################
        # Domain & search parameter
        search_domain = os.path.join(self.searx_instance, "search?q=")
        # Search keywords (minimal manual encoding of spaces and colons)
        search_formatted = self.search.replace(" ", "+").replace(":", "%3A")
        # Period formatted (omitted entirely when no date range is requested)
        period_formatted = "&time_range={}".format(self.period) if self.period is not None else ""
        # Search parameters
        search_parameters = "&category_{}=on&language={}{}".format(self.search_category, self.lang_region, period_formatted)
        # Combined url search
        url_search_nopage = "{}{}{}".format(search_domain, search_formatted, search_parameters)
        ############################################################

        # Request and decode on page=1
        url_set = self._request_and_decode(url_search_nopage)
        # No results?
        if (len(url_set) == 0):
            logger.warning("SearX {} - Empty results on search: {}".format(self.name, url_search_nopage))
            return []

        # Iterate pages, merging until a page contributes no new URLs
        search_numpage = 2
        while True:
            # Combine url search with page number
            url_search_with_page = "{}&pageno={}".format(url_search_nopage, search_numpage)
            # Request and decode on page=X
            url_set_i = self._request_and_decode(url_search_with_page)

            # Length before merging
            length_current = len(url_set)
            # Merge
            url_set = url_set.union(url_set_i)
            # Length after merging
            length_merged = len(url_set)

            # No new elements? -> all pages consumed
            if (length_current == length_merged):
                logger.debug("SearX {} - Finished processing search, #pages: {}".format(self.name, search_numpage))
                break
            # Next page
            search_numpage += 1

        return list(url_set)

    def _fetch(self):
        """Fetch news URLs from the SearX instance; return [] on any failure."""
        try:
            # Fetch news
            list_news = self._get_news_list()
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
            list_news = []
        return list_news
|