diff --git a/app_urls/README.md b/app_urls/README.md index 09e5048..a23b61e 100644 --- a/app_urls/README.md +++ b/app_urls/README.md @@ -3,7 +3,7 @@ conda create -n matitos_urls python=3.12 conda activate matitos_urls # Core -pip install django psycopg[binary] django-redis django-tasks-scheduler +pip install django==5.1 psycopg[binary] django-redis django-tasks-scheduler # Fetcher pip install feedparser python-dateutil newspaper4k[all] lxml[html_clean] googlenewsdecoder gnews duckduckgo_search GoogleNews langdetect # News visualization diff --git a/app_urls/api/src/db_utils.py b/app_urls/api/src/db_utils.py index 79a02c8..4bf23d4 100644 --- a/app_urls/api/src/db_utils.py +++ b/app_urls/api/src/db_utils.py @@ -76,7 +76,7 @@ class DB_Handler(): cache.set("insert_{}".format(url), True, timeout=self._cache_timeout_insert_url) cache.set("insert_{}{}{}".format(url, obj_source.source, obj_search.search), True, timeout=self._cache_timeout_insert_url) - logger.info("Inserted #{} raw URLs, Source-Search {}-{}".format(len(urls_to_insert), obj_source.source, obj_search.search)) + logger.info("Inserted #{} raw URLs, Source-Search {} - {}".format(len(urls_to_insert), obj_source.source, obj_search.search)) except Exception as e: logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc())) diff --git a/app_urls/api/src/fetch_search.py b/app_urls/api/src/fetch_search.py index 9da003f..728e165 100644 --- a/app_urls/api/src/fetch_search.py +++ b/app_urls/api/src/fetch_search.py @@ -44,9 +44,7 @@ class FetchSearcher(): args = { "language": "en", "country": "US", - "period": "7d", - "max_results": 100, - "max_pages": 1, + # "period": ["7d", "1d"], # TODO: List of periods to iterate } for SearchInstance in ListSearchInstances: diff --git a/app_urls/api/src/fetch_search_instances.py b/app_urls/api/src/fetch_search_instances.py index 10397f4..22ba642 100644 --- a/app_urls/api/src/fetch_search_instances.py +++ b/app_urls/api/src/fetch_search_instances.py @@ -59,17 +59,17 @@ class FetcherAbstract(ABC): ########################################################################### class SearchGNews(FetcherAbstract): - def __init__(self, args={"language":"en", "country":"US", "period":"7d", "max_results":100}): + def __init__(self, args={}): super().__init__() # Parameters - self.language = args.get("language") - self.country = args.get("country") - self.period = args.get("period") - self.max_results = args.get("max_results") + self.language = args.get("language", "en") + self.country = args.get("country", "US") + self.period = args.get("period", "7d") + self.max_results = args.get("max_results", 100) def _get_name(self): # [source] [period] [language-country] [max_results] - return "gnews {} {}-{} results={}".format("news", self.period, self.language, self.country, self.max_results).replace("results=None", "").strip() + return "gnews {} {}-{} results={}".format(self.period, self.language, self.country, self.max_results).replace("results=None", "").strip() def _fetch_raw_urls(self, keyword_search): try: @@ -85,12 +85,12 @@ class SearchGNews(FetcherAbstract): return urls class SearchDuckDuckGoGeneral(FetcherAbstract): - def __init__(self, args={"language":"wt", "country":"wt", "max_results":100}): + def __init__(self, args={}): super().__init__() # Parameters - self.language = args.get("language") - self.country = args.get("country") - self.max_results = args.get("max_results") + self.language = args.get("language", "wt") + self.country = args.get("country", "wt") + self.max_results = args.get("max_results", 20) self.region = "{}-{}".format(self.language, self.country).lower() self.period = None @@ -108,12 +108,12 @@ class SearchDuckDuckGoGeneral(FetcherAbstract): return urls class SearchDuckDuckGoNews(FetcherAbstract): - def __init__(self, args={"language":"wt", "country":"wt", "max_results":100}): + def __init__(self, args={}): super().__init__() # Parameters - self.language = args.get("language") - self.country = args.get("country") - self.max_results = args.get("max_results") + self.language = args.get("language", "wt") + self.country = args.get("country", "wt") + self.max_results = args.get("max_results", 100) self.region = "{}-{}".format(self.language, self.country).lower() self.period = None @@ -131,12 +131,12 @@ class SearchDuckDuckGoNews(FetcherAbstract): return urls class SearchGoogleNews(FetcherAbstract): - def __init__(self, args={"language":"en", "country":"US", "period":"7d"}): + def __init__(self, args={}): super().__init__() # Parameters - self.language = args.get("language") - self.country = args.get("country") - self.period = args.get("period") + self.language = args.get("language", "en") + self.country = args.get("country", "US") + self.period = args.get("period", "7d") def _get_name(self): # [source] [period] [language-country] @@ -159,13 +159,13 @@ class SearchGoogleNews(FetcherAbstract): return urls class SearchGoogleGeneral(FetcherAbstract): - def __init__(self, args={"language":"en", "country":"US", "period":"7d", "max_pages":1}): + def __init__(self, args={}): super().__init__() # Parameters - self.language = args.get("language") - self.country = args.get("country") - self.period = args.get("period") - self.max_pages = args.get("max_pages") + self.language = args.get("language", "en") + self.country = args.get("country", "US") + self.period = args.get("period", "7d") + self.max_pages = args.get("max_pages", 1) def _get_name(self): # [source] [period] [language-country] [pages] @@ -202,16 +202,16 @@ class SearchGoogleGeneral(FetcherAbstract): # To list urls = list(set_links) except Exception as e: - logger.warning("Exception fetching {}: {}\n{}".format(self._get_name(), str(e))) + logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e))) urls = [] return urls class SearchGoogleNewsRSS(FetcherAbstract): - def __init__(self, args={"language":"en", "country":"US"}): + def __init__(self, args={}): super().__init__() # Parameters - self.language = args.get("language") - self.country = args.get("country") + self.language = args.get("language", "en") + self.country = args.get("country", "US") def _get_name(self): # [source] [language-country] diff --git a/app_urls/api/src/fetch_utils.py b/app_urls/api/src/fetch_utils.py index 29621e1..e5c1346 100644 --- a/app_urls/api/src/fetch_utils.py +++ b/app_urls/api/src/fetch_utils.py @@ -13,6 +13,7 @@ def decode_gnews_urls(encoded_urls, interval=int(os.getenv("FETCHER_GNEWS_DECODE for url in encoded_urls: # Already cached? decoded_url = cache.get("gnews_decode_{}".format(url)) + if (decoded_url is not None): logger.debug("Already cached decoded URL: {} -> {}".format(url, decoded_url)) # Append decoded URL @@ -29,7 +30,7 @@ def decode_gnews_urls(encoded_urls, interval=int(os.getenv("FETCHER_GNEWS_DECODE # Cache decoded URL cache.set("gnews_decode_{}".format(url), decoded_url, timeout=60*60*12) else: - logger.warning("Error decoding news.google.com, URL {}".format(url)) + logger.info("Bad status while decoding news.google.com, URL {}".format(url)) except Exception as e: - logger.warning("Error decoding news.google.com, URL: {}\n{}".format(url, traceback.format_exc())) + logger.warning("Error decoding news.google.com, URL: {}".format(url)) return list_decoded_urls \ No newline at end of file