Fetch search fix

Luciano Gervasoni
2025-04-02 22:39:51 +02:00
parent 84da104dc8
commit 3b54e247e7
5 changed files with 33 additions and 34 deletions

View File

@@ -3,7 +3,7 @@
 conda create -n matitos_urls python=3.12
 conda activate matitos_urls
 # Core
-pip install django psycopg[binary] django-redis django-tasks-scheduler
+pip install django==5.1 psycopg[binary] django-redis django-tasks-scheduler
 # Fetcher
 pip install feedparser python-dateutil newspaper4k[all] lxml[html_clean] googlenewsdecoder gnews duckduckgo_search GoogleNews langdetect
 # News visualization

View File

@@ -76,7 +76,7 @@ class DB_Handler():
                 cache.set("insert_{}".format(url), True, timeout=self._cache_timeout_insert_url)
                 cache.set("insert_{}{}{}".format(url, obj_source.source, obj_search.search), True, timeout=self._cache_timeout_insert_url)
-            logger.info("Inserted #{} raw URLs, Source-Search {}-{}".format(len(urls_to_insert), obj_source.source, obj_search.search))
+            logger.info("Inserted #{} raw URLs, Source-Search {} - {}".format(len(urls_to_insert), obj_source.source, obj_search.search))
         except Exception as e:
             logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))

View File

@@ -44,9 +44,7 @@ class FetchSearcher():
         args = {
             "language": "en",
             "country": "US",
-            "period": "7d",
-            "max_results": 100,
-            "max_pages": 1,
+            # "period": ["7d", "1d"], # TODO: List of periods to iterate
         }
         for SearchInstance in ListSearchInstances:
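With the per-key defaults introduced in the fetchers below, this shared args dict only needs the keys that should override them; everything else falls back per class. A hypothetical sketch of the instantiation loop (its body is not part of the diff):

# Hypothetical sketch: the loop body is not shown in the diff, but the
# implied pattern is one shared args dict overriding per-class defaults.
args = {"language": "en", "country": "US"}
for SearchInstance in [SearchGNews, SearchDuckDuckGoNews, SearchGoogleNewsRSS]:
    fetcher = SearchInstance(args=args)  # omitted keys use class defaults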

View File

@@ -59,17 +59,17 @@ class FetcherAbstract(ABC):
 ###########################################################################
 class SearchGNews(FetcherAbstract):
-    def __init__(self, args={"language":"en", "country":"US", "period":"7d", "max_results":100}):
+    def __init__(self, args={}):
         super().__init__()
         # Parameters
-        self.language = args.get("language")
-        self.country = args.get("country")
-        self.period = args.get("period")
-        self.max_results = args.get("max_results")
+        self.language = args.get("language", "en")
+        self.country = args.get("country", "US")
+        self.period = args.get("period", "7d")
+        self.max_results = args.get("max_results", 100)
     def _get_name(self):
         # [source] [period] [language-country] [max_results]
-        return "gnews {} {}-{} results={}".format("news", self.period, self.language, self.country, self.max_results).replace("results=None", "").strip()
+        return "gnews {} {}-{} results={}".format(self.period, self.language, self.country, self.max_results).replace("results=None", "").strip()
     def _fetch_raw_urls(self, keyword_search):
         try:
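Moving the defaults out of the signature's dict literal and into args.get(key, default) is what lets FetchSearcher above pass only language and country: each key now falls back independently, instead of the whole default dict being replaced by whatever the caller supplies. A minimal illustration with a hypothetical class:

# Hypothetical illustration of per-key defaults vs. a default dict.
class Fetcher:
    def __init__(self, args={}):
        # Each key falls back on its own, so partial overrides keep the rest.
        self.language = args.get("language", "en")
        self.period = args.get("period", "7d")

f = Fetcher({"period": "1d"})
print(f.language, f.period)  # -> en 1d

# With def __init__(self, args={"language": "en", "period": "7d"}),
# Fetcher({"period": "1d"}) would leave self.language as None: the default
# dict is replaced wholesale, never merged with the caller's dict.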
@@ -85,12 +85,12 @@ class SearchGNews(FetcherAbstract):
         return urls
 class SearchDuckDuckGoGeneral(FetcherAbstract):
-    def __init__(self, args={"language":"wt", "country":"wt", "max_results":100}):
+    def __init__(self, args={}):
         super().__init__()
         # Parameters
-        self.language = args.get("language")
-        self.country = args.get("country")
-        self.max_results = args.get("max_results")
+        self.language = args.get("language", "wt")
+        self.country = args.get("country", "wt")
+        self.max_results = args.get("max_results", 20)
         self.region = "{}-{}".format(self.language, self.country).lower()
         self.period = None
@@ -108,12 +108,12 @@ class SearchDuckDuckGoGeneral(FetcherAbstract):
         return urls
 class SearchDuckDuckGoNews(FetcherAbstract):
-    def __init__(self, args={"language":"wt", "country":"wt", "max_results":100}):
+    def __init__(self, args={}):
         super().__init__()
         # Parameters
-        self.language = args.get("language")
-        self.country = args.get("country")
-        self.max_results = args.get("max_results")
+        self.language = args.get("language", "wt")
+        self.country = args.get("country", "wt")
+        self.max_results = args.get("max_results", 100)
         self.region = "{}-{}".format(self.language, self.country).lower()
         self.period = None
@@ -131,12 +131,12 @@ class SearchDuckDuckGoNews(FetcherAbstract):
         return urls
 class SearchGoogleNews(FetcherAbstract):
-    def __init__(self, args={"language":"en", "country":"US", "period":"7d"}):
+    def __init__(self, args={}):
         super().__init__()
         # Parameters
-        self.language = args.get("language")
-        self.country = args.get("country")
-        self.period = args.get("period")
+        self.language = args.get("language", "en")
+        self.country = args.get("country", "US")
+        self.period = args.get("period", "7d")
     def _get_name(self):
         # [source] [period] [language-country]
@@ -159,13 +159,13 @@ class SearchGoogleNews(FetcherAbstract):
         return urls
 class SearchGoogleGeneral(FetcherAbstract):
-    def __init__(self, args={"language":"en", "country":"US", "period":"7d", "max_pages":1}):
+    def __init__(self, args={}):
         super().__init__()
         # Parameters
-        self.language = args.get("language")
-        self.country = args.get("country")
-        self.period = args.get("period")
-        self.max_pages = args.get("max_pages")
+        self.language = args.get("language", "en")
+        self.country = args.get("country", "US")
+        self.period = args.get("period", "7d")
+        self.max_pages = args.get("max_pages", 1)
     def _get_name(self):
         # [source] [period] [language-country] [pages]
@@ -202,16 +202,16 @@ class SearchGoogleGeneral(FetcherAbstract):
             # To list
             urls = list(set_links)
         except Exception as e:
-            logger.warning("Exception fetching {}: {}\n{}".format(self._get_name(), str(e)))
+            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
             urls = []
         return urls
 class SearchGoogleNewsRSS(FetcherAbstract):
-    def __init__(self, args={"language":"en", "country":"US"}):
+    def __init__(self, args={}):
         super().__init__()
         # Parameters
-        self.language = args.get("language")
-        self.country = args.get("country")
+        self.language = args.get("language", "en")
+        self.country = args.get("country", "US")
     def _get_name(self):
         # [source] [language-country]
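The logging fix in the last hunk is an arity bug: the old format string had three placeholders but only two arguments (the traceback argument had presumably been dropped at some point), and unlike surplus arguments, a surplus placeholder makes str.format raise, so the except block itself would have thrown. A quick illustration:

# Surplus arguments to str.format are silently ignored...
"fetching {}: {}".format("gnews", "timeout", "extra")  # fine

# ...but a surplus placeholder raises inside the exception handler.
try:
    "fetching {}: {}\n{}".format("gnews", "timeout")
except IndexError as e:
    print("IndexError:", e)  # Replacement index 2 out of range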

View File

@@ -13,6 +13,7 @@ def decode_gnews_urls(encoded_urls, interval=int(os.getenv("FETCHER_GNEWS_DECODE
     for url in encoded_urls:
         # Already cached?
         decoded_url = cache.get("gnews_decode_{}".format(url))
+        if (decoded_url is not None):
             logger.debug("Already cached decoded URL: {} -> {}".format(url, decoded_url))
             # Append decoded URL
@@ -29,7 +30,7 @@ def decode_gnews_urls(encoded_urls, interval=int(os.getenv("FETCHER_GNEWS_DECODE
                 # Cache decoded URL
                 cache.set("gnews_decode_{}".format(url), decoded_url, timeout=60*60*12)
             else:
-                logger.warning("Error decoding news.google.com, URL {}".format(url))
+                logger.info("Bad status while decoding news.google.com, URL {}".format(url))
         except Exception as e:
-            logger.warning("Error decoding news.google.com, URL: {}\n{}".format(url, traceback.format_exc()))
+            logger.warning("Error decoding news.google.com, URL: {}".format(url))
     return list_decoded_urls
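Taken together, this file implements a cache-first decode loop: cached results are reused for 12 hours, bad statuses are logged at info level, and hard failures at warning level. A minimal sketch under assumptions, with a placeholder decode_one standing in for the googlenewsdecoder call, whose exact API is not shown in this diff:

from django.core.cache import cache

def decode_gnews_urls_sketch(encoded_urls, decode_one):
    # decode_one(url) -> decoded URL string or None; a stand-in for the
    # googlenewsdecoder call, whose exact API is not shown in this diff.
    list_decoded_urls = []
    for url in encoded_urls:
        # Cache hit: reuse the earlier result instead of re-decoding.
        decoded_url = cache.get("gnews_decode_{}".format(url))
        if decoded_url is None:
            decoded_url = decode_one(url)
            if decoded_url is not None:
                # Keep decoded URLs for 12 hours, as the handler above does.
                cache.set("gnews_decode_{}".format(url), decoded_url, timeout=60*60*12)
        if decoded_url is not None:
            list_decoded_urls.append(decoded_url)
    return list_decoded_urls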