General search fix, status pattern match regex, find feeds on startup
This commit is contained in:
@@ -81,6 +81,7 @@ class DB_Handler():
|
||||
except Exception as e:
|
||||
logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
|
||||
|
||||
|
||||
def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error):
|
||||
|
||||
def set_status(obj_url, status):
|
||||
@@ -89,17 +90,17 @@ class DB_Handler():
|
||||
obj_url.status = status
|
||||
obj_url.save()
|
||||
|
||||
##### Filter URL? -> Invalid
|
||||
if (status_pattern_match == "invalid"):
|
||||
logger.debug("Domain filter applied to input URL: {}".format(obj_url.url))
|
||||
# Found a pattern match -> Override status
|
||||
if (status_pattern_match is not None):
|
||||
logger.debug("Pattern match, status '{}' for input URL: {}".format(status_pattern_match, obj_url.url))
|
||||
# Update status
|
||||
set_status(obj_url, Urls.STATUS_ENUM.INVALID)
|
||||
# Next URL
|
||||
return
|
||||
set_status(obj_url, status_pattern_match)
|
||||
##### Filter URL? -> Invalid (don't extract content)
|
||||
if (status_pattern_match == "invalid"):
|
||||
return
|
||||
|
||||
##### Process URL
|
||||
try:
|
||||
# Get data
|
||||
# Extract URL content
|
||||
dict_url_data = process_url(obj_url.url)
|
||||
except Exception as e:
|
||||
if (raise_exception_on_error):
|
||||
@@ -110,25 +111,10 @@ class DB_Handler():
|
||||
# Set status to error
|
||||
dict_url_data = None
|
||||
|
||||
# (dict_url_data is None) or (Exception while processing URL) ? -> Error status
|
||||
if (dict_url_data is None):
|
||||
# Update status
|
||||
set_status(obj_url, Urls.STATUS_ENUM.ERROR)
|
||||
# Next URL
|
||||
return
|
||||
|
||||
# Invalid? e.g. binary data
|
||||
if (dict_url_data.get("override_status") == "invalid"):
|
||||
# Update status
|
||||
set_status(obj_url, Urls.STATUS_ENUM.INVALID)
|
||||
# Next URL
|
||||
return
|
||||
|
||||
##### Canonical URL different? -> Duplicate
|
||||
if (dict_url_data.get("url_canonical") is not None) and(dict_url_data.get("url") != dict_url_data.get("url_canonical")):
|
||||
if (dict_url_data is not None) and (dict_url_data.get("url_canonical") is not None) and (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
|
||||
# Update status
|
||||
set_status(obj_url, Urls.STATUS_ENUM.DUPLICATE)
|
||||
|
||||
# Get or create URL with canonical form
|
||||
obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
|
||||
# Get the source-search IDs associated to obj_url.id
|
||||
@@ -136,42 +122,54 @@ class DB_Handler():
|
||||
for obj_url_source_search in list_url_source_search:
|
||||
# Associate same sources to url_canonical (it might already exist)
|
||||
UrlsSourceSearch.objects.get_or_create(id_url=obj_url_canonical, id_source=obj_url_source_search.id_source, id_search=obj_url_source_search.id_search)
|
||||
|
||||
# URLs duplciate association
|
||||
UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url)
|
||||
|
||||
# TODO: return obj_url_canonical so as to directly process the recently inserted URL
|
||||
# Wherever this function is called, add:
|
||||
# self._process_single_url(obj_url_canonical, status_pattern_match, raise_exception_on_error)
|
||||
|
||||
# Next URL
|
||||
return
|
||||
|
||||
##### Valid URL
|
||||
# Update status
|
||||
set_status(obj_url, Urls.STATUS_ENUM.VALID)
|
||||
|
||||
# Not overriding status given pattern matching?
|
||||
if (status_pattern_match is None):
|
||||
# (dict_url_data is None) or (Exception while processing URL) ? -> Error status
|
||||
if (dict_url_data is None):
|
||||
# Update status
|
||||
set_status(obj_url, Urls.STATUS_ENUM.ERROR)
|
||||
# Next URL
|
||||
return
|
||||
|
||||
# Invalid? e.g. binary data
|
||||
if (dict_url_data.get("override_status") == "invalid"):
|
||||
# Update status
|
||||
set_status(obj_url, Urls.STATUS_ENUM.INVALID)
|
||||
# Next URL
|
||||
return
|
||||
|
||||
##### Valid URL
|
||||
# Update status
|
||||
set_status(obj_url, Urls.STATUS_ENUM.VALID)
|
||||
|
||||
try:
|
||||
# Create or update extracted URL data
|
||||
UrlContent.objects.update_or_create(
|
||||
id_url=obj_url,
|
||||
defaults = {
|
||||
"date_published" : dict_url_data.get("publish_date"),
|
||||
"title" : dict_url_data.get("title"),
|
||||
"description" : dict_url_data.get("description"),
|
||||
"content" : dict_url_data.get("content"),
|
||||
"valid_content" : dict_url_data.get("valid_content"),
|
||||
"language" : dict_url_data.get("language"),
|
||||
"keywords" : dict_url_data.get("keywords"),
|
||||
"tags" : dict_url_data.get("tags"),
|
||||
"authors" : dict_url_data.get("authors"),
|
||||
"image_main_url" : dict_url_data.get("image_main_url"),
|
||||
"images_url" : dict_url_data.get("images_url"),
|
||||
"videos_url" : dict_url_data.get("videos_url"),
|
||||
"url_host" : dict_url_data.get("url_host"),
|
||||
"site_name" : dict_url_data.get("site_name"),
|
||||
}
|
||||
)
|
||||
if (dict_url_data is not None):
|
||||
# Create or update extracted URL data
|
||||
UrlContent.objects.update_or_create(
|
||||
id_url=obj_url,
|
||||
defaults = {
|
||||
"date_published" : dict_url_data.get("publish_date"),
|
||||
"title" : dict_url_data.get("title"),
|
||||
"description" : dict_url_data.get("description"),
|
||||
"content" : dict_url_data.get("content"),
|
||||
"valid_content" : dict_url_data.get("valid_content"),
|
||||
"language" : dict_url_data.get("language"),
|
||||
"keywords" : dict_url_data.get("keywords"),
|
||||
"tags" : dict_url_data.get("tags"),
|
||||
"authors" : dict_url_data.get("authors"),
|
||||
"image_main_url" : dict_url_data.get("image_main_url"),
|
||||
"images_url" : dict_url_data.get("images_url"),
|
||||
"videos_url" : dict_url_data.get("videos_url"),
|
||||
"url_host" : dict_url_data.get("url_host"),
|
||||
"site_name" : dict_url_data.get("site_name"),
|
||||
}
|
||||
)
|
||||
except Exception as e:
|
||||
logger.debug("Error in update_or_create UrlContent: {}\ndict_url_data: {}\n{}\n{}".format(obj_url.url, dict_url_data, str(e), traceback.format_exc()))
|
||||
|
||||
@@ -179,13 +177,12 @@ class DB_Handler():
|
||||
def process_raw_urls(self, batch_size):
|
||||
|
||||
def _get_status_pattern_matching(url, list_pattern_status_tuple):
|
||||
""" Be careful: Regex pattern should update status on "valid", "invalid", and "unknown" status only
|
||||
"""
|
||||
""" Be careful: Regex pattern should update status on "valid", "invalid", and "unknown" status only """
|
||||
# Sort pattern tuples by priority. (pattern, priority, status)
|
||||
for regex_pattern, regex_priority, status_if_match in sorted(list_pattern_status_tuple, key=lambda tup: tup[1], reverse=True):
|
||||
# Regular expression pattern matching: https://regexr.com/
|
||||
if bool(re.match(regex_pattern, obj_url.url)):
|
||||
logger.debug("Regex pattern found, status '{}' for URL: {}".format(status_if_match, url))
|
||||
# logger.debug("Regex pattern found, status '{}' for URL: {}".format(status_if_match, url))
|
||||
return status_if_match
|
||||
return None
|
||||
|
||||
|
||||
Reference in New Issue
Block a user