Fetch search fix
This commit is contained in:
@@ -3,7 +3,7 @@
|
|||||||
conda create -n matitos_urls python=3.12
|
conda create -n matitos_urls python=3.12
|
||||||
conda activate matitos_urls
|
conda activate matitos_urls
|
||||||
# Core
|
# Core
|
||||||
pip install django psycopg[binary] django-redis django-tasks-scheduler
|
pip install django==5.1 psycopg[binary] django-redis django-tasks-scheduler
|
||||||
# Fetcher
|
# Fetcher
|
||||||
pip install feedparser python-dateutil newspaper4k[all] lxml[html_clean] googlenewsdecoder gnews duckduckgo_search GoogleNews langdetect
|
pip install feedparser python-dateutil newspaper4k[all] lxml[html_clean] googlenewsdecoder gnews duckduckgo_search GoogleNews langdetect
|
||||||
# News visualization
|
# News visualization
|
||||||
|
|||||||
@@ -76,7 +76,7 @@ class DB_Handler():
|
|||||||
cache.set("insert_{}".format(url), True, timeout=self._cache_timeout_insert_url)
|
cache.set("insert_{}".format(url), True, timeout=self._cache_timeout_insert_url)
|
||||||
cache.set("insert_{}{}{}".format(url, obj_source.source, obj_search.search), True, timeout=self._cache_timeout_insert_url)
|
cache.set("insert_{}{}{}".format(url, obj_source.source, obj_search.search), True, timeout=self._cache_timeout_insert_url)
|
||||||
|
|
||||||
logger.info("Inserted #{} raw URLs, Source-Search {}-{}".format(len(urls_to_insert), obj_source.source, obj_search.search))
|
logger.info("Inserted #{} raw URLs, Source-Search {} - {}".format(len(urls_to_insert), obj_source.source, obj_search.search))
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
|
logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
|
||||||
|
|||||||
@@ -44,9 +44,7 @@ class FetchSearcher():
|
|||||||
args = {
|
args = {
|
||||||
"language": "en",
|
"language": "en",
|
||||||
"country": "US",
|
"country": "US",
|
||||||
"period": "7d",
|
# "period": ["7d", "1d"], # TODO: List of periods to iterate
|
||||||
"max_results": 100,
|
|
||||||
"max_pages": 1,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for SearchInstance in ListSearchInstances:
|
for SearchInstance in ListSearchInstances:
|
||||||
|
|||||||
@@ -59,17 +59,17 @@ class FetcherAbstract(ABC):
|
|||||||
###########################################################################
|
###########################################################################
|
||||||
|
|
||||||
class SearchGNews(FetcherAbstract):
|
class SearchGNews(FetcherAbstract):
|
||||||
def __init__(self, args={"language":"en", "country":"US", "period":"7d", "max_results":100}):
|
def __init__(self, args={}):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
# Parameters
|
# Parameters
|
||||||
self.language = args.get("language")
|
self.language = args.get("language", "en")
|
||||||
self.country = args.get("country")
|
self.country = args.get("country", "US")
|
||||||
self.period = args.get("period")
|
self.period = args.get("period", "7d")
|
||||||
self.max_results = args.get("max_results")
|
self.max_results = args.get("max_results", 100)
|
||||||
|
|
||||||
def _get_name(self):
|
def _get_name(self):
|
||||||
# [source] [period] [language-country] [max_results]
|
# [source] [period] [language-country] [max_results]
|
||||||
return "gnews {} {}-{} results={}".format("news", self.period, self.language, self.country, self.max_results).replace("results=None", "").strip()
|
return "gnews {} {}-{} results={}".format(self.period, self.language, self.country, self.max_results).replace("results=None", "").strip()
|
||||||
|
|
||||||
def _fetch_raw_urls(self, keyword_search):
|
def _fetch_raw_urls(self, keyword_search):
|
||||||
try:
|
try:
|
||||||
@@ -85,12 +85,12 @@ class SearchGNews(FetcherAbstract):
|
|||||||
return urls
|
return urls
|
||||||
|
|
||||||
class SearchDuckDuckGoGeneral(FetcherAbstract):
|
class SearchDuckDuckGoGeneral(FetcherAbstract):
|
||||||
def __init__(self, args={"language":"wt", "country":"wt", "max_results":100}):
|
def __init__(self, args={}):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
# Parameters
|
# Parameters
|
||||||
self.language = args.get("language")
|
self.language = args.get("language", "wt")
|
||||||
self.country = args.get("country")
|
self.country = args.get("country", "wt")
|
||||||
self.max_results = args.get("max_results")
|
self.max_results = args.get("max_results", 20)
|
||||||
self.region = "{}-{}".format(self.language, self.country).lower()
|
self.region = "{}-{}".format(self.language, self.country).lower()
|
||||||
self.period = None
|
self.period = None
|
||||||
|
|
||||||
@@ -108,12 +108,12 @@ class SearchDuckDuckGoGeneral(FetcherAbstract):
|
|||||||
return urls
|
return urls
|
||||||
|
|
||||||
class SearchDuckDuckGoNews(FetcherAbstract):
|
class SearchDuckDuckGoNews(FetcherAbstract):
|
||||||
def __init__(self, args={"language":"wt", "country":"wt", "max_results":100}):
|
def __init__(self, args={}):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
# Parameters
|
# Parameters
|
||||||
self.language = args.get("language")
|
self.language = args.get("language", "wt")
|
||||||
self.country = args.get("country")
|
self.country = args.get("country", "wt")
|
||||||
self.max_results = args.get("max_results")
|
self.max_results = args.get("max_results", 100)
|
||||||
self.region = "{}-{}".format(self.language, self.country).lower()
|
self.region = "{}-{}".format(self.language, self.country).lower()
|
||||||
self.period = None
|
self.period = None
|
||||||
|
|
||||||
@@ -131,12 +131,12 @@ class SearchDuckDuckGoNews(FetcherAbstract):
|
|||||||
return urls
|
return urls
|
||||||
|
|
||||||
class SearchGoogleNews(FetcherAbstract):
|
class SearchGoogleNews(FetcherAbstract):
|
||||||
def __init__(self, args={"language":"en", "country":"US", "period":"7d"}):
|
def __init__(self, args={}):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
# Parameters
|
# Parameters
|
||||||
self.language = args.get("language")
|
self.language = args.get("language", "en")
|
||||||
self.country = args.get("country")
|
self.country = args.get("country", "US")
|
||||||
self.period = args.get("period")
|
self.period = args.get("period", "7d")
|
||||||
|
|
||||||
def _get_name(self):
|
def _get_name(self):
|
||||||
# [source] [period] [language-country]
|
# [source] [period] [language-country]
|
||||||
@@ -159,13 +159,13 @@ class SearchGoogleNews(FetcherAbstract):
|
|||||||
return urls
|
return urls
|
||||||
|
|
||||||
class SearchGoogleGeneral(FetcherAbstract):
|
class SearchGoogleGeneral(FetcherAbstract):
|
||||||
def __init__(self, args={"language":"en", "country":"US", "period":"7d", "max_pages":1}):
|
def __init__(self, args={}):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
# Parameters
|
# Parameters
|
||||||
self.language = args.get("language")
|
self.language = args.get("language", "en")
|
||||||
self.country = args.get("country")
|
self.country = args.get("country", "US")
|
||||||
self.period = args.get("period")
|
self.period = args.get("period", "7d")
|
||||||
self.max_pages = args.get("max_pages")
|
self.max_pages = args.get("max_pages", 1)
|
||||||
|
|
||||||
def _get_name(self):
|
def _get_name(self):
|
||||||
# [source] [period] [language-country] [pages]
|
# [source] [period] [language-country] [pages]
|
||||||
@@ -202,16 +202,16 @@ class SearchGoogleGeneral(FetcherAbstract):
|
|||||||
# To list
|
# To list
|
||||||
urls = list(set_links)
|
urls = list(set_links)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("Exception fetching {}: {}\n{}".format(self._get_name(), str(e)))
|
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
|
||||||
urls = []
|
urls = []
|
||||||
return urls
|
return urls
|
||||||
|
|
||||||
class SearchGoogleNewsRSS(FetcherAbstract):
|
class SearchGoogleNewsRSS(FetcherAbstract):
|
||||||
def __init__(self, args={"language":"en", "country":"US"}):
|
def __init__(self, args={}):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
# Parameters
|
# Parameters
|
||||||
self.language = args.get("language")
|
self.language = args.get("language", "en")
|
||||||
self.country = args.get("country")
|
self.country = args.get("country", "US")
|
||||||
|
|
||||||
def _get_name(self):
|
def _get_name(self):
|
||||||
# [source] [language-country]
|
# [source] [language-country]
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ def decode_gnews_urls(encoded_urls, interval=int(os.getenv("FETCHER_GNEWS_DECODE
|
|||||||
for url in encoded_urls:
|
for url in encoded_urls:
|
||||||
# Already cached?
|
# Already cached?
|
||||||
decoded_url = cache.get("gnews_decode_{}".format(url))
|
decoded_url = cache.get("gnews_decode_{}".format(url))
|
||||||
|
|
||||||
if (decoded_url is not None):
|
if (decoded_url is not None):
|
||||||
logger.debug("Already cached decoded URL: {} -> {}".format(url, decoded_url))
|
logger.debug("Already cached decoded URL: {} -> {}".format(url, decoded_url))
|
||||||
# Append decoded URL
|
# Append decoded URL
|
||||||
@@ -29,7 +30,7 @@ def decode_gnews_urls(encoded_urls, interval=int(os.getenv("FETCHER_GNEWS_DECODE
|
|||||||
# Cache decoded URL
|
# Cache decoded URL
|
||||||
cache.set("gnews_decode_{}".format(url), decoded_url, timeout=60*60*12)
|
cache.set("gnews_decode_{}".format(url), decoded_url, timeout=60*60*12)
|
||||||
else:
|
else:
|
||||||
logger.warning("Error decoding news.google.com, URL {}".format(url))
|
logger.info("Bad status while decoding news.google.com, URL {}".format(url))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("Error decoding news.google.com, URL: {}\n{}".format(url, traceback.format_exc()))
|
logger.warning("Error decoding news.google.com, URL: {}".format(url))
|
||||||
return list_decoded_urls
|
return list_decoded_urls
|
||||||
Reference in New Issue
Block a user