# matitos_news/app_urls/api/obsolete_src/fetch_search_sources.py

import json
import os
import random
import time
from abc import ABC, abstractmethod

import numpy as np
import requests
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS
from gnews import GNews
from GoogleNews import GoogleNews

from .google_bypass import GoogleByPass
from .logger import get_logger

logger = get_logger()


# Generic fetcher (fetches articles, writes to DB)
class FetcherAbstract(ABC):
    @abstractmethod
    def _fetch(self):
        pass

    def fetch_articles(self, db_writer):
        logger.debug("Starting fetch() for {}".format(self.name))
        # Fetch articles
        list_news = self._fetch()
        logger.info("Found #{} articles for search: {}".format(len(list_news), self.name))
        # Write to DB
        db_writer.write_batch(list_news, self.name)
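

# Illustrative only (not part of the original module): fetch_articles() above
# only requires that db_writer expose write_batch(list_news, name). A minimal
# in-memory stand-in, assuming that interface, could look like this:
class _ExampleListWriter:
    """Hypothetical writer that collects fetched URLs per search name."""

    def __init__(self):
        self.batches = {}

    def write_batch(self, list_news, name):
        self.batches.setdefault(name, []).extend(list_news)
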
# https://techblog.willshouse.com/2012/01/03/most-common-user-agents/
user_agents_list = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/111.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; rv:111.0) Gecko/20100101 Firefox/111.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0",
"Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.44",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 OPR/96.0.0.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 OPR/97.0.0.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48",
"Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.34",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.39",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.2 Safari/605.1.15",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; rv:112.0) Gecko/20100101 Firefox/112.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.51",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/112.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/110.0",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0",
"Mozilla/5.0 (X11; CrOS x86_64 14541.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.0.2246 Yowser/2.5 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.6.1 Safari/605.1.15",
"Mozilla/5.0 (Windows NT 6.1; rv:102.0) Gecko/20100101 Goanna/6.0 Firefox/102.0 PaleMoon/32.0.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.41",
"Mozilla/5.0 (Windows NT 10.0; rv:110.0) Gecko/20100101 Firefox/110.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 YaBrowser/23.1.5.708 Yowser/2.5 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
]
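

# Illustrative helper (not part of the original module): build request headers
# with a random User-Agent drawn from the list above. FetcherSearxNews below
# does the same thing inline via np.random.choice.
def _example_random_headers():
    return {
        "User-agent": random.choice(user_agents_list),
        "Accept-Encoding": "gzip, deflate",
        "Accept": "*/*",
        "Connection": "keep-alive",
    }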


class FetcherPreSearch(FetcherAbstract):
    def __init__(self, search):
        """
        period ->
            - h = hours (e.g. 12h)
            - d = days (e.g. 7d)
            - m = months (e.g. 6m)
            - y = years (e.g. 1y)
        """
        self.search = search
        self.period = "1d"  # TODO: fixed for the moment
        # self.lang = lang
        # self.region = region
        search_category = "news"
        self.name = "presearch {} {} {}".format(search, search_category, self.period)

    def _fetch(self):
        # PreSearch fetching endpoint; the search keyword is passed as a query parameter
        presearch_fetch_endpoint = "http://selenium_app:80/fetch_presearch/?search_keyword={}".format(self.search)
        try:
            # Timeout: 15 minutes
            r = requests.get(presearch_fetch_endpoint, timeout=900)
            # Decode the JSON payload; expected shape: {"list_urls": [...]}
            list_news = json.loads(r.text).get("list_urls", [])
        except Exception as e:
            logger.warning("Request failed: {}. {}".format(presearch_fetch_endpoint, str(e)))
            list_news = []
        return list_news


class FetcherGNews(FetcherAbstract):
    def __init__(self, search, period, lang="en", region="US"):
        """
        period ->
            - h = hours (e.g. 12h)
            - d = days (e.g. 7d)
            - m = months (e.g. 6m)
            - y = years (e.g. 1y)
        """
        self.search = search
        self.period = period
        self.lang = lang
        self.region = region
        search_category = "news"
        self.name = "gnews {} {} {} {}".format(search, search_category, period, "{}-{}".format(lang, region))

    def _fetch(self):
        try:
            list_dict_news = GNews(self.lang, self.region, period=self.period).get_news(self.search)
            # Decode: keep only the article URLs
            list_news = [item.get("url") for item in list_dict_news]
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
            list_news = []
        # Resolve Google redirect links to their target URLs
        list_news_redirections = GoogleByPass().bypass_google_urls(list_news)
        return list_news_redirections


class FetcherGoogleNews(FetcherAbstract):
    def __init__(self, search, search_category="news", period="1d", lang="en", region="US"):
        assert search_category in ["news", "general"]
        self.lang = lang
        self.region = region
        self.period = period
        self.search_category = search_category
        self.search = search
        self.name = "googlenews {} {} {} {}".format(search, search_category, period, "{}-{}".format(lang, region))

    def _fetch(self):
        try:
            # Initialize
            g = GoogleNews(encode="utf-8", period=self.period, lang=self.lang, region=self.region)
            g.enableException(True)
            if self.search_category == "general":
                set_links = set()
                # Search
                g.search(self.search)
                # Iterate result pages until a page yields no new links
                MAX_ITER_PAGES = 15
                for i in range(MAX_ITER_PAGES):
                    time.sleep(random.uniform(1, 1.5))
                    num_before = len(set_links)
                    # Get page
                    try:
                        links = g.page_at(i)
                    except Exception as e:
                        logger.warning("Exception fetching page in GoogleNews {}: {}".format(self.name, str(e)))
                        break
                    # Each result is a Google redirect link, e.g.:
                    # '/url?esrc=s&q=&rct=j&sa=U&url=https://www.breitbart.com/news/scent-of-luxury-indias-jasmine-infuses-global-perfume/&ved=2ahUKEwjOybGSiN-AAxX1gv0HHfqSBpMQxfQBegQICBAC&usg=AOvVaw06GdoHyzPbIopUaEuUSQPQ'
                    # Keep the target after "url=" and drop the trailing tracking parameters.
                    for link in links:
                        url = link.get("link").split("url=")[-1].split("&ved=")[0]
                        set_links.add(url)
                    num_after = len(set_links)
                    # Finished?
                    if num_before == num_after:
                        logger.debug("Iterated {} pages on GoogleNews general search".format(i))
                        break
                # To list
                list_news = list(set_links)
            elif self.search_category == "news":
                # Search
                g.get_news(self.search)
                # Fetch
                list_news = g.get_links()
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
            list_news = []
        # Resolve Google redirect links to their target URLs
        list_news_redirections = GoogleByPass().bypass_google_urls(list_news)
        return list_news_redirections
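

# Illustrative only (not part of the original module): the redirect-stripping
# logic from FetcherGoogleNews above, isolated. Assumes the '/url?...url=...'
# link shape shown in the comment there; real result links may vary.
def _example_strip_google_redirect(raw_link):
    # Keep everything after "url=", then drop the trailing tracking parameters.
    return raw_link.split("url=")[-1].split("&ved=")[0]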


class FetcherDuckDuckGo(FetcherAbstract):
    def __init__(self, search, search_category, period, lang="wt", region="wt"):
        assert search_category in ["news", "general"]
        assert period in ["d", "w", "m", "y"]
        self.search = search
        self.search_category = search_category
        self.period = period
        self.lang_region = "{}-{}".format(lang, region)
        self.name = "duckduckgo {} {} {} {}".format(search, search_category, "1{}".format(period), region)

    def _fetch(self):
        try:
            list_news = []
            with DDGS(timeout=10) as ddgs:
                if self.search_category == "general":
                    generator_links = ddgs.text(keywords=self.search, timelimit=self.period, region=self.lang_region)
                elif self.search_category == "news":
                    generator_links = ddgs.news(keywords=self.search, timelimit=self.period, region=self.lang_region)
                # General results expose "href"; news results expose "url"
                for item in generator_links:
                    list_news.append(item.get("url", item.get("href")))
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
            list_news = []
        return list_news


class FetcherSearxNews(FetcherAbstract):
    def __init__(self, search="child abuse", searx_instance="https://serx.ml/", lang="en", region="US", search_category="news", period="day"):
        assert search_category in ["news", "general"]
        assert period in [None, "day", "week", "month", "year"]
        # Random header (minimize probability of web-scraping detection)
        self.headers = {
            'User-agent': str(np.random.choice(user_agents_list)),
            'Accept-Encoding': 'gzip, deflate',
            'Accept': '*/*',
            'Connection': 'keep-alive',
        }
        """ # Optional header
        self.headers = {
            'User-agent': str(np.random.choice(user_agents_list)),
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'TE': 'trailers',
            'Sec-Fetch-Site': 'cross-site',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Dest': 'document',
        }
        """
        self.search = search
        self.searx_instance = searx_instance
        self.lang_region = "{}-{}".format(lang, region)
        self.search_category = search_category
        self.period = period
        self.t_sleep_lower, self.t_sleep_higher = 0.5, 1.5
        self.request_timeout = 240
        period_name_mapping = {
            None: "no_date_range",
            "day": "1d",
            "week": "1w",
            "month": "1m",
            "year": "1y",
        }
        self.name = "searxng {} {} {} {} {}".format(searx_instance.replace("https://", "").replace("/", ""), search, search_category, period_name_mapping[period], self.lang_region)
        logger.info("SearX - Initialized SearX fetcher: {}".format(self.name))

    def _request_and_decode(self, url_search):
        # Initial random sleep (minimize chance of getting blocked)
        time.sleep(random.uniform(self.t_sleep_lower, self.t_sleep_higher))
        # Request
        logger.debug("SearX - Searching: {}".format(url_search))
        try:
            r = requests.get(url_search, headers=self.headers, timeout=self.request_timeout)
        except Exception as e:
            logger.warning("SearX - Exception in request: {}. {}".format(url_search, str(e)))
            return set()
        if r.status_code == 429:
            # TooManyRequests, "Rate limit exceeded"
            logger.warning("SearX {} - Too many requests while running: {}. Request output: {}".format(self.name, r.url, r.text))
            return set()
        elif r.status_code != 200:
            logger.warning("SearX {} - Status code: {}. Request output: {}".format(self.name, r.status_code, r.text))
            return set()
        logger.debug("SearX - Status code: {}".format(r.status_code))
        # Decode request: result links are anchors inside h3 elements
        soup = BeautifulSoup(r.text, 'html.parser')
        page_url_set = set()
        for elem in soup.find_all('h3'):
            # Get url, skipping h3 elements without a link
            a = elem.find('a')
            if a is not None and a.get('href'):
                page_url_set.add(a.get('href'))
        return page_url_set

    def _get_news_list(self):
        ############################################################
        # Domain & search parameter (strip any trailing slash so the path joins cleanly)
        search_domain = "{}/search?q=".format(self.searx_instance.rstrip("/"))
        # Search keywords
        search_formatted = self.search.replace(" ", "+").replace(":", "%3A")
        # Period formatted
        period_formatted = "&time_range={}".format(self.period) if self.period is not None else ""
        # Search parameters
        search_parameters = "&category_{}=on&language={}{}".format(self.search_category, self.lang_region, period_formatted)
        # Combined url search
        url_search_nopage = "{}{}{}".format(search_domain, search_formatted, search_parameters)
        ############################################################
        # Request and decode on page=1
        url_set = self._request_and_decode(url_search_nopage)
        # No results?
        if len(url_set) == 0:
            logger.warning("SearX {} - Empty results on search: {}".format(self.name, url_search_nopage))
            return []
        # Iterate pages until a page adds no new URLs
        search_numpage = 2
        while True:
            # Combine url search with page number
            url_search_with_page = "{}&pageno={}".format(url_search_nopage, search_numpage)
            # Request and decode on page=X
            url_set_i = self._request_and_decode(url_search_with_page)
            # Length before merging
            length_current = len(url_set)
            # Merge
            url_set = url_set.union(url_set_i)
            # Length after merging
            length_merged = len(url_set)
            # No new elements?
            if length_current == length_merged:
                logger.debug("SearX {} - Finished processing search, #pages: {}".format(self.name, search_numpage))
                break
            # Next page
            search_numpage += 1
        return list(url_set)
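
    # For illustration (hypothetical public instance): with the defaults above,
    # _get_news_list() issues a page-1 request of roughly this shape:
    #   https://serx.ml/search?q=child+abuse&category_news=on&language=en-US&time_range=day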

    def _fetch(self):
        try:
            # Fetch news
            list_news = self._get_news_list()
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
            list_news = []
        return list_news
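

# Minimal usage sketch (not part of the original module): wire one fetcher to
# the hypothetical _ExampleListWriter defined above. Requires network access
# and the third-party packages imported at the top of this file.
if __name__ == "__main__":
    writer = _ExampleListWriter()
    fetcher = FetcherDuckDuckGo(search="open source", search_category="news", period="d")
    fetcher.fetch_articles(writer)
    print(writer.batches)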