Fetch search fix

Luciano Gervasoni
2025-04-02 22:39:51 +02:00
parent 84da104dc8
commit 3b54e247e7
5 changed files with 33 additions and 34 deletions

View File

@@ -3,7 +3,7 @@
 conda create -n matitos_urls python=3.12
 conda activate matitos_urls
 # Core
-pip install django psycopg[binary] django-redis django-tasks-scheduler
+pip install django==5.1 psycopg[binary] django-redis django-tasks-scheduler
 # Fetcher
 pip install feedparser python-dateutil newspaper4k[all] lxml[html_clean] googlenewsdecoder gnews duckduckgo_search GoogleNews langdetect
 # News visualization
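
The pin above locks Django to an exact release so a later reinstall cannot silently pull in a breaking upgrade. Note that django==5.1 matches only the 5.1.0 release itself; if patch releases should still flow in, a compatible-release pin is an option (a hedged alternative, not what the commit does):

pip install "django~=5.1.0"  # allows 5.1.x patch releases, blocks 5.2+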

View File

@@ -76,7 +76,7 @@ class DB_Handler():
             cache.set("insert_{}".format(url), True, timeout=self._cache_timeout_insert_url)
             cache.set("insert_{}{}{}".format(url, obj_source.source, obj_search.search), True, timeout=self._cache_timeout_insert_url)
-            logger.info("Inserted #{} raw URLs, Source-Search {}-{}".format(len(urls_to_insert), obj_source.source, obj_search.search))
+            logger.info("Inserted #{} raw URLs, Source-Search {} - {}".format(len(urls_to_insert), obj_source.source, obj_search.search))
         except Exception as e:
             logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))

View File

@@ -44,9 +44,7 @@ class FetchSearcher():
         args = {
             "language": "en",
             "country": "US",
-            "period": "7d",
-            "max_results": 100,
-            "max_pages": 1,
+            # "period": ["7d", "1d"], # TODO: List of periods to iterate
         }
         for SearchInstance in ListSearchInstances:
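
A hedged sketch of how the TODO above might iterate several periods, assuming each SearchInstance accepts the args dict at construction (the loop body names are illustrative, not from this commit):

periods = ["7d", "1d"]  # candidates from the TODO comment
for SearchInstance in ListSearchInstances:
    for period in periods:
        # Override only the period per pass; the other args stay shared
        searcher = SearchInstance(args={**args, "period": period})
        # ... then run the instance's fetch entry point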

View File

@@ -59,17 +59,17 @@ class FetcherAbstract(ABC):
 ###########################################################################
 class SearchGNews(FetcherAbstract):
-    def __init__(self, args={"language":"en", "country":"US", "period":"7d", "max_results":100}):
+    def __init__(self, args={}):
         super().__init__()
         # Parameters
-        self.language = args.get("language")
-        self.country = args.get("country")
-        self.period = args.get("period")
-        self.max_results = args.get("max_results")
+        self.language = args.get("language", "en")
+        self.country = args.get("country", "US")
+        self.period = args.get("period", "7d")
+        self.max_results = args.get("max_results", 100)
     def _get_name(self):
         # [source] [period] [language-country] [max_results]
-        return "gnews {} {}-{} results={}".format("news", self.period, self.language, self.country, self.max_results).replace("results=None", "").strip()
+        return "gnews {} {}-{} results={}".format(self.period, self.language, self.country, self.max_results).replace("results=None", "").strip()
     def _fetch_raw_urls(self, keyword_search):
         try:
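
Dropping the populated dict defaults in favor of args={} plus per-key fallbacks also sidesteps Python's shared-mutable-default pitfall: a default container is built once at function definition and reused across calls, so any mutation leaks between callers. A standalone illustration (not project code):

def risky(items=[]):
    # The default list is created once and shared by every call
    items.append("hit")
    return items

print(risky())  # ['hit']
print(risky())  # ['hit', 'hit']  <- state leaked from the first call

# The commit's pattern stays safe because args is only ever read:
def safe(args={}):
    return args.get("language", "en")

The .get(key, default) form also keeps the documented defaults visible next to each attribute instead of buried in the signature.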
@@ -85,12 +85,12 @@ class SearchGNews(FetcherAbstract):
         return urls
 class SearchDuckDuckGoGeneral(FetcherAbstract):
-    def __init__(self, args={"language":"wt", "country":"wt", "max_results":100}):
+    def __init__(self, args={}):
         super().__init__()
         # Parameters
-        self.language = args.get("language")
-        self.country = args.get("country")
-        self.max_results = args.get("max_results")
+        self.language = args.get("language", "wt")
+        self.country = args.get("country", "wt")
+        self.max_results = args.get("max_results", 20)
         self.region = "{}-{}".format(self.language, self.country).lower()
         self.period = None
@@ -108,12 +108,12 @@ class SearchDuckDuckGoGeneral(FetcherAbstract):
         return urls
 class SearchDuckDuckGoNews(FetcherAbstract):
-    def __init__(self, args={"language":"wt", "country":"wt", "max_results":100}):
+    def __init__(self, args={}):
         super().__init__()
         # Parameters
-        self.language = args.get("language")
-        self.country = args.get("country")
-        self.max_results = args.get("max_results")
+        self.language = args.get("language", "wt")
+        self.country = args.get("country", "wt")
+        self.max_results = args.get("max_results", 100)
         self.region = "{}-{}".format(self.language, self.country).lower()
         self.period = None
@@ -131,12 +131,12 @@ class SearchDuckDuckGoNews(FetcherAbstract):
         return urls
 class SearchGoogleNews(FetcherAbstract):
-    def __init__(self, args={"language":"en", "country":"US", "period":"7d"}):
+    def __init__(self, args={}):
         super().__init__()
         # Parameters
-        self.language = args.get("language")
-        self.country = args.get("country")
-        self.period = args.get("period")
+        self.language = args.get("language", "en")
+        self.country = args.get("country", "US")
+        self.period = args.get("period", "7d")
     def _get_name(self):
         # [source] [period] [language-country]
@@ -159,13 +159,13 @@ class SearchGoogleNews(FetcherAbstract):
         return urls
 class SearchGoogleGeneral(FetcherAbstract):
-    def __init__(self, args={"language":"en", "country":"US", "period":"7d", "max_pages":1}):
+    def __init__(self, args={}):
         super().__init__()
         # Parameters
-        self.language = args.get("language")
-        self.country = args.get("country")
-        self.period = args.get("period")
-        self.max_pages = args.get("max_pages")
+        self.language = args.get("language", "en")
+        self.country = args.get("country", "US")
+        self.period = args.get("period", "7d")
+        self.max_pages = args.get("max_pages", 1)
     def _get_name(self):
         # [source] [period] [language-country] [pages]
@@ -202,16 +202,16 @@ class SearchGoogleGeneral(FetcherAbstract):
             # To list
             urls = list(set_links)
         except Exception as e:
-            logger.warning("Exception fetching {}: {}\n{}".format(self._get_name(), str(e)))
+            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
             urls = []
         return urls
 class SearchGoogleNewsRSS(FetcherAbstract):
-    def __init__(self, args={"language":"en", "country":"US"}):
+    def __init__(self, args={}):
         super().__init__()
         # Parameters
-        self.language = args.get("language")
-        self.country = args.get("country")
+        self.language = args.get("language", "en")
+        self.country = args.get("country", "US")
     def _get_name(self):
         # [source] [language-country]
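
The logger.warning fix in this hunk is more than cosmetic: the old format string "Exception fetching {}: {}\n{}" has three placeholders but only two arguments, so str.format itself raised IndexError inside the except block and masked the original fetch error. A standalone illustration:

try:
    "{}: {}\n{}".format("name", "boom")  # 3 placeholders, 2 args
except IndexError as e:
    print(e)  # Replacement index 2 out of range for positional args tuple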

View File

@@ -13,6 +13,7 @@ def decode_gnews_urls(encoded_urls, interval=int(os.getenv("FETCHER_GNEWS_DECODE
     for url in encoded_urls:
         # Already cached?
         decoded_url = cache.get("gnews_decode_{}".format(url))
         if (decoded_url is not None):
             logger.debug("Already cached decoded URL: {} -> {}".format(url, decoded_url))
             # Append decoded URL
@@ -29,7 +30,7 @@ def decode_gnews_urls(encoded_urls, interval=int(os.getenv("FETCHER_GNEWS_DECODE
                 # Cache decoded URL
                 cache.set("gnews_decode_{}".format(url), decoded_url, timeout=60*60*12)
             else:
-                logger.warning("Error decoding news.google.com, URL {}".format(url))
+                logger.info("Bad status while decoding news.google.com, URL {}".format(url))
         except Exception as e:
-            logger.warning("Error decoding news.google.com, URL: {}\n{}".format(url, traceback.format_exc()))
+            logger.warning("Error decoding news.google.com, URL: {}".format(url))
     return list_decoded_urls
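
Successful decodes are cached for 12 hours (timeout=60*60*12), so repeated fetch cycles skip the slow news.google.com round-trip for URLs already seen. A minimal sketch of the same pattern against Django's cache API, assuming a decode_url helper that performs the network call (illustrative, not this module's code):

from django.core.cache import cache

def cached_decode(url, decode_url):
    key = "gnews_decode_{}".format(url)
    decoded = cache.get(key)
    if decoded is None:
        decoded = decode_url(url)  # slow network round-trip
        cache.set(key, decoded, timeout=60 * 60 * 12)  # keep for 12h
    return decoded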