Fetch search fix
@@ -3,7 +3,7 @@
 conda create -n matitos_urls python=3.12
 conda activate matitos_urls
 # Core
-pip install django psycopg[binary] django-redis django-tasks-scheduler
+pip install django==5.1 psycopg[binary] django-redis django-tasks-scheduler
 # Fetcher
 pip install feedparser python-dateutil newspaper4k[all] lxml[html_clean] googlenewsdecoder gnews duckduckgo_search GoogleNews langdetect
 # News visualization
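A quick sanity check after installing (hypothetical, not part of the commit) to confirm the new pin resolved correctly inside the activated environment:

# Hypothetical check: confirm the pinned Django version was installed.
import django
print(django.get_version())  # expected to print a 5.1.x version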
@@ -76,7 +76,7 @@ class DB_Handler():
-                cache.set("insert_{}".format(url), True, timeout=self._cache_timeout_insert_url)
+                cache.set("insert_{}{}{}".format(url, obj_source.source, obj_search.search), True, timeout=self._cache_timeout_insert_url)
 
-            logger.info("Inserted #{} raw URLs, Source-Search {}-{}".format(len(urls_to_insert), obj_source.source, obj_search.search))
+            logger.info("Inserted #{} raw URLs, Source-Search {} - {}".format(len(urls_to_insert), obj_source.source, obj_search.search))
 
         except Exception as e:
             logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
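The widened cache key matters because the same URL can legitimately arrive from several source/search combinations: keyed on the URL alone, the second arrival would be treated as already inserted. A minimal sketch of the idea, with assumed helper names and timeout:

# Sketch with assumed helper names; the commit keys the django-redis cache on
# URL + source + search so dedup is scoped per combination, not global.
from django.core.cache import cache

def mark_inserted(url, source, search, timeout=3600):
    cache.set("insert_{}{}{}".format(url, source, search), True, timeout=timeout)

def recently_inserted(url, source, search):
    # True only if this exact URL/source/search triple was marked before
    return cache.get("insert_{}{}{}".format(url, source, search)) is not None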
@@ -44,9 +44,7 @@ class FetchSearcher():
         args = {
             "language": "en",
             "country": "US",
-            "period": "7d",
-            "max_results": 100,
-            "max_pages": 1,
+            # "period": ["7d", "1d"], # TODO: List of periods to iterate
         }
 
         for SearchInstance in ListSearchInstances:
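With per-class defaults in place (see the fetcher hunks below), the shared args dict only needs explicit overrides; anything omitted falls back to each fetcher's own default. A sketch under that assumption:

# Sketch: only overrides need to be passed; omitted keys fall back to the
# defaults each fetcher class reads via args.get(key, default).
searcher = SearchGNews(args={"language": "en", "country": "US"})  # period, max_results defaulted
default_searcher = SearchGNews()                                  # everything defaulted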
@@ -59,17 +59,17 @@ class FetcherAbstract(ABC):
 ###########################################################################
 
 class SearchGNews(FetcherAbstract):
-    def __init__(self, args={"language":"en", "country":"US", "period":"7d", "max_results":100}):
+    def __init__(self, args={}):
         super().__init__()
         # Parameters
-        self.language = args.get("language")
-        self.country = args.get("country")
-        self.period = args.get("period")
-        self.max_results = args.get("max_results")
+        self.language = args.get("language", "en")
+        self.country = args.get("country", "US")
+        self.period = args.get("period", "7d")
+        self.max_results = args.get("max_results", 100)
 
     def _get_name(self):
         # [source] [period] [language-country] [max_results]
-        return "gnews {} {}-{} results={}".format("news", self.period, self.language, self.country, self.max_results).replace("results=None", "").strip()
+        return "gnews {} {}-{} results={}".format(self.period, self.language, self.country, self.max_results).replace("results=None", "").strip()
 
     def _fetch_raw_urls(self, keyword_search):
         try:
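Two subtle Python behaviors are at play in this hunk. First, a mutable default such as args={"language": "en", ...} is built once at definition time and shared by every call, so moving the fallbacks into args.get(key, default) keeps per-call state isolated. Second, str.format silently ignores surplus arguments, so the stray "news" argument never raised: it shifted every field one place and dropped max_results entirely. An illustrative sketch:

# Illustrative only: why a mutable default dict is risky, and why the old
# _get_name bug was silent.
def risky(args={"language": "en"}):
    args["touched"] = True          # mutates the one shared default dict
    return args

risky()
print(risky())                      # {'language': 'en', 'touched': True} -- leaked state

# str.format ignores surplus arguments instead of raising:
print("gnews {} {}-{} results={}".format("news", "7d", "en", "US", 100))
# -> "gnews news 7d-en results=US"  (fields shifted, max_results dropped)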
@@ -85,12 +85,12 @@ class SearchGNews(FetcherAbstract):
         return urls
 
 class SearchDuckDuckGoGeneral(FetcherAbstract):
-    def __init__(self, args={"language":"wt", "country":"wt", "max_results":100}):
+    def __init__(self, args={}):
         super().__init__()
         # Parameters
-        self.language = args.get("language")
-        self.country = args.get("country")
-        self.max_results = args.get("max_results")
+        self.language = args.get("language", "wt")
+        self.country = args.get("country", "wt")
+        self.max_results = args.get("max_results", 20)
         self.region = "{}-{}".format(self.language, self.country).lower()
         self.period = None
@@ -108,12 +108,12 @@ class SearchDuckDuckGoGeneral(FetcherAbstract):
         return urls
 
 class SearchDuckDuckGoNews(FetcherAbstract):
-    def __init__(self, args={"language":"wt", "country":"wt", "max_results":100}):
+    def __init__(self, args={}):
         super().__init__()
         # Parameters
-        self.language = args.get("language")
-        self.country = args.get("country")
-        self.max_results = args.get("max_results")
+        self.language = args.get("language", "wt")
+        self.country = args.get("country", "wt")
+        self.max_results = args.get("max_results", 100)
         self.region = "{}-{}".format(self.language, self.country).lower()
         self.period = None
@@ -131,12 +131,12 @@ class SearchDuckDuckGoNews(FetcherAbstract):
         return urls
 
 class SearchGoogleNews(FetcherAbstract):
-    def __init__(self, args={"language":"en", "country":"US", "period":"7d"}):
+    def __init__(self, args={}):
         super().__init__()
         # Parameters
-        self.language = args.get("language")
-        self.country = args.get("country")
-        self.period = args.get("period")
+        self.language = args.get("language", "en")
+        self.country = args.get("country", "US")
+        self.period = args.get("period", "7d")
 
     def _get_name(self):
         # [source] [period] [language-country]
@@ -159,13 +159,13 @@ class SearchGoogleNews(FetcherAbstract):
         return urls
 
 class SearchGoogleGeneral(FetcherAbstract):
-    def __init__(self, args={"language":"en", "country":"US", "period":"7d", "max_pages":1}):
+    def __init__(self, args={}):
         super().__init__()
         # Parameters
-        self.language = args.get("language")
-        self.country = args.get("country")
-        self.period = args.get("period")
-        self.max_pages = args.get("max_pages")
+        self.language = args.get("language", "en")
+        self.country = args.get("country", "US")
+        self.period = args.get("period", "7d")
+        self.max_pages = args.get("max_pages", 1)
 
     def _get_name(self):
         # [source] [period] [language-country] [pages]
@@ -202,16 +202,16 @@ class SearchGoogleGeneral(FetcherAbstract):
             # To list
             urls = list(set_links)
         except Exception as e:
-            logger.warning("Exception fetching {}: {}\n{}".format(self._get_name(), str(e)))
+            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
             urls = []
         return urls
 
 class SearchGoogleNewsRSS(FetcherAbstract):
-    def __init__(self, args={"language":"en", "country":"US"}):
+    def __init__(self, args={}):
         super().__init__()
         # Parameters
-        self.language = args.get("language")
-        self.country = args.get("country")
+        self.language = args.get("language", "en")
+        self.country = args.get("country", "US")
 
     def _get_name(self):
         # [source] [language-country]
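The removed logging line is the inverse of the _get_name bug above: three placeholders but only two arguments. Unlike surplus arguments, missing ones raise, so the old except handler would itself have thrown. A quick demonstration:

# Demonstration: str.format raises when placeholders outnumber arguments,
# which is why the trailing "\n{}" had to go (or a third argument be added).
try:
    "Exception fetching {}: {}\n{}".format("google", "boom")
except IndexError as err:
    print("too few format arguments:", err)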
@@ -13,6 +13,7 @@ def decode_gnews_urls(encoded_urls, interval=int(os.getenv("FETCHER_GNEWS_DECODE
         for url in encoded_urls:
             # Already cached?
             decoded_url = cache.get("gnews_decode_{}".format(url))
 
             if (decoded_url is not None):
+                logger.debug("Already cached decoded URL: {} -> {}".format(url, decoded_url))
                 # Append decoded URL
@@ -29,7 +30,7 @@ def decode_gnews_urls(encoded_urls, interval=int(os.getenv("FETCHER_GNEWS_DECODE
                 # Cache decoded URL
                 cache.set("gnews_decode_{}".format(url), decoded_url, timeout=60*60*12)
             else:
-                logger.warning("Error decoding news.google.com, URL {}".format(url))
+                logger.info("Bad status while decoding news.google.com, URL {}".format(url))
     except Exception as e:
-        logger.warning("Error decoding news.google.com, URL: {}\n{}".format(url, traceback.format_exc()))
+        logger.warning("Error decoding news.google.com, URL: {}".format(url))
     return list_decoded_urls
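Taken together, the two hunks implement a cache-aside loop around the decoder. A condensed sketch; the helper name and decode function are assumptions, not the module's API:

# Condensed cache-aside sketch; decode_fn stands in for whatever decoder the
# module calls (e.g. googlenewsdecoder). Successes are cached for 12 hours so
# repeated runs skip already-decoded news.google.com URLs.
from django.core.cache import cache

def decode_once(url, decode_fn):
    decoded = cache.get("gnews_decode_{}".format(url))
    if decoded is not None:
        return decoded                  # cache hit: no network call
    decoded = decode_fn(url)            # miss: decode now
    if decoded:
        cache.set("gnews_decode_{}".format(url), decoded, timeout=60*60*12)
    return decoded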