73 lines
3.6 KiB
Python
73 lines
3.6 KiB
Python
from .db_utils import DB_Handler
|
|
from .utils import get_searxng_instances
|
|
from .fetch_search_sources import FetcherDuckDuckGo, FetcherGNews, FetcherGoogleNews, FetcherSearxNews, FetcherPreSearch
|
|
from .logger import get_logger
|
|
# Module-level logger shared by everything in this file.
logger = get_logger()
|
|
|
|
class FetchSearcher():
    """Run every search of interest through the available search fetchers.

    Search texts and URL hosts are read from the database handler; each
    resulting query is handed to the fetcher classes imported from
    ``fetch_search_sources``, and the same handler is passed on to their
    ``fetch_articles`` method.
    """

    # SearxNG fetching is currently switched off. The code path is kept so it
    # can be re-enabled by flipping this flag.
    _SEARX_ENABLED = False

    def __init__(self, db_handler: DB_Handler, full=True) -> None:
        """Store the database handler and the search depth.

        Args:
            db_handler: Handler used to read the searches of interest and
                passed to every fetcher's ``fetch_articles``.
            full: When true, GNews and GoogleNews are also queried for
                searches that do not contain ``site:``.
        """
        logger.debug("Initializing News feed")
        self.db_handler = db_handler
        self.full_search = full

    def _run_fetching(self, search_text):
        """Query every enabled search source for ``search_text``.

        Args:
            search_text: Query string; ``run`` also passes ``site:<host>``
                queries built from the URL hosts of interest.
        """
        logger.debug("Starting _run_fetching() for {}".format(search_text))

        # Common parameters
        lang, region = "en", "US"

        ### PreSearch
        FetcherPreSearch(search=search_text).fetch_articles(self.db_handler)

        ### DuckDuckGo (news first, then general)
        period = "d"
        for search_category in ("news", "general"):
            FetcherDuckDuckGo(
                search=search_text,
                lang="wt",
                region="wt",
                search_category=search_category,
                period=period,
            ).fetch_articles(self.db_handler)

        # Avoid site:{} search due to G-Bypass required time
        # NOTE(review): both Google-backed fetchers are assumed to sit under
        # this guard - confirm against the intended nesting.
        if self.full_search and "site:" not in search_text:
            ### GNews
            FetcherGNews(
                search=search_text,
                lang="wt",
                region="wt",
                period=period,
            ).fetch_articles(self.db_handler)

            ### GoogleNews
            FetcherGoogleNews(
                search=search_text,
                lang=lang,
                region=region,
                search_category="news",
                period=period,
            ).fetch_articles(self.db_handler)

        if self._SEARX_ENABLED:
            ### SearxNG (uses its own period value)
            period = "day"
            for searx_instance in get_searxng_instances():
                for search_category in ("news", "general"):
                    FetcherSearxNews(
                        search=search_text,
                        searx_instance=searx_instance,
                        lang=lang,
                        region=region,
                        search_category=search_category,
                        period=period,
                    ).fetch_articles(self.db_handler)

        logger.debug("Finished _run_fetching()")

    def run(self):
        """Fetch articles for all text searches and URL hosts of interest.

        Any ``Exception`` is logged as a warning instead of being raised.
        A failure while fetching one search does not stop the remaining
        searches from being processed.
        """
        try:
            logger.info("Fetching text searches & URL hosts of interest")

            # Get text searches of interest
            list_search_text_of_interest = self.db_handler._get_search_list()

            # Get URL hosts of interest and turn them into site: searches
            list_url_host = self.db_handler._get_url_host_list()
            list_search_text_url_host = ["site:{}".format(url_host) for url_host in list_url_host]

            for search_text in list_search_text_of_interest + list_search_text_url_host:
                logger.debug("Fetching news for search: {}".format(search_text))
                try:
                    self._run_fetching(search_text)
                except Exception as e:
                    # Isolate the failure so the remaining searches still run
                    logger.warning("Exception in FetchSearcher.run() for search '{}': {}".format(search_text, str(e)))

            logger.info("Finished fetching text searches & URL hosts of interest")
        except Exception as e:
            logger.warning("Exception in FetchSearcher.run(): {}".format(str(e)))
|
|
|