Working fetch search, refactoring DB towards source search

Luciano Gervasoni
2025-03-20 11:42:33 +01:00
parent 83f76232b2
commit 05e17266f1
14 changed files with 558 additions and 120 deletions


@@ -12,7 +12,6 @@ logger = get_logger()
class DB_Handler():
def __init__(self):
logger.debug("Initializing URL DB Handler")
# Inserting raw URL, cache time: 1 day
self._cache_timeout_insert_url = 86400
# Processing error URL, cache time: 2 days
@@ -37,16 +36,15 @@ class DB_Handler():
else:
return cache.get(cache_key) is not None
def insert_raw_urls(self, urls, source):
def clean_protocol(url):
# http:// -> https://
url = url.replace("http://", "https://")
# "" -> https://
if not (url.startswith("https://")):
url = "https://" + url
return url
def _clean_protocol(self, url):
# http:// -> https://
url = url.replace("http://", "https://")
# "" -> https://
if not (url.startswith("https://")):
url = "https://" + url
return url
def insert_raw_urls(self, urls, source):
try:
logger.debug("Inserting raw URLs")
# Empty?
@@ -55,7 +53,7 @@ class DB_Handler():
return
# Default protocol https://
urls_clean = [clean_protocol(url) for url in urls]
urls_clean = [self._clean_protocol(url) for url in urls]
# Get the source (create if not exists)
source_obj, created = Source.objects.get_or_create(source=source)
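
For reference, a standalone sketch of the normalization now factored out into _clean_protocol (behaviour copied from the hunk above; the example URLs are illustrative):

    def _clean_protocol(url):
        # Force https:// on every URL. Note that str.replace substitutes every
        # occurrence of "http://", not only a leading one.
        url = url.replace("http://", "https://")
        # Bare host -> https://
        if not url.startswith("https://"):
            url = "https://" + url
        return url

    assert _clean_protocol("http://example.org/page") == "https://example.org/page"
    assert _clean_protocol("example.org/page") == "https://example.org/page"
    assert _clean_protocol("https://example.org/page") == "https://example.org/page"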
@@ -90,7 +88,7 @@ class DB_Handler():
UrlsSource.objects.bulk_create([UrlsSource(id_source=source_obj, id_url=url_obj) for url_obj in bulk_created_urls], ignore_conflicts=True)
except IntegrityError as e:
### Fallback to one-by-one insert
logger.debug("bulk_create exception while inserting raw URLs, falling back to non-bulk method")
logger.debug("bulk_create exception while inserting raw URLs (fails if duplicated URL), falling back to non-bulk method")
# One by one
for url in urls_to_insert:
# URL
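
A minimal sketch of the bulk-insert-plus-fallback pattern this hunk touches (Urls, UrlsSource and their url/id_source/id_url fields come from the diff; the helper function itself is an illustrative assumption, not the project's code):

    from django.db import IntegrityError

    def insert_urls_with_fallback(urls_to_insert, source_obj):
        try:
            # Fast path: one INSERT per table, silently skipping duplicates.
            created = Urls.objects.bulk_create(
                [Urls(url=url) for url in urls_to_insert], ignore_conflicts=True
            )
            UrlsSource.objects.bulk_create(
                [UrlsSource(id_source=source_obj, id_url=url_obj) for url_obj in created],
                ignore_conflicts=True,
            )
        except IntegrityError:
            # Slow path: insert one by one so a single conflict cannot abort the batch.
            for url in urls_to_insert:
                url_obj, _ = Urls.objects.get_or_create(url=url)
                UrlsSource.objects.get_or_create(id_source=source_obj, id_url=url_obj)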
@@ -177,9 +175,16 @@ class DB_Handler():
set_status(obj_url, Urls.STATUS_ENUM.ERROR)
# Next URL
return
# Invalid? e.g. binary data
if (dict_url_data.get("override_status") == "invalid"):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.INVALID)
# Next URL
return
##### Canonical URL different? -> Duplicate
if (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
if (dict_url_data.get("url_canonical") is not None) and(dict_url_data.get("url") != dict_url_data.get("url_canonical")):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.DUPLICATE)
@@ -194,6 +199,10 @@ class DB_Handler():
# URLs duplicate association
obj_urls_duplicate, created = UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url)
# TODO: return obj_url_canonical so as to directly process the recently inserted URL
# Wherever this function is called, add:
# self._process_single_url(obj_url_canonical, status_pattern_match, raise_exception_on_error)
# Next URL
return
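
A hedged sketch of the call-site change the TODO describes; _process_single_url and its arguments are quoted from the comment above, while the wrapper and _handle_fetched_url are hypothetical names:

    def _process_fetched_url(self, obj_url, dict_url_data, status_pattern_match, raise_exception_on_error):
        # Hypothetical wrapper: everything except _process_single_url is assumed.
        obj_url_canonical = self._handle_fetched_url(obj_url, dict_url_data)
        if obj_url_canonical is not None:
            # Process the freshly inserted canonical URL right away instead of
            # waiting for it to be picked up by a later batch.
            self._process_single_url(obj_url_canonical, status_pattern_match, raise_exception_on_error)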
@@ -273,6 +282,7 @@ class DB_Handler():
for obj_url in error_urls:
# URL ID cached? -> Tried to process recently already, skip
if (self._is_cached_key("error_{}".format(obj_url.id), hash_encoded=False)):
logger.debug("Already cached URL ID: {}".format(obj_url.id))
num_urls_skipped += 1
continue
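
The write side of this skip marker is not part of the hunk; a sketch of how it could be set with Django's cache after a failed retry (the key format mirrors the check above, the 2-day timeout follows the comment in __init__, everything else is an assumption):

    from django.core.cache import cache

    def mark_error_url(url_id, timeout=2 * 86400):
        # Remember a failed URL ID for ~2 days so later batches skip it.
        cache.set("error_{}".format(url_id), True, timeout=timeout)

    def is_error_url_cached(url_id):
        # Mirrors the `cache.get(cache_key) is not None` check in _is_cached_key.
        return cache.get("error_{}".format(url_id)) is not None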
@@ -299,7 +309,7 @@ class DB_Handler():
missingkids_urls = Urls.objects.order_by("-ts_fetch").filter(
(Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster"))
&
(Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID))
(Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.ERROR))
)[:batch_size]
# Per URL
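
For comparison, a sketch of an equivalent way to express the widened status filter with status__in (same model and enum as in the diff; batch_size as above):

    from django.db.models import Q

    missingkids_urls = Urls.objects.order_by("-ts_fetch").filter(
        Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster"),
        status__in=[Urls.STATUS_ENUM.VALID, Urls.STATUS_ENUM.INVALID, Urls.STATUS_ENUM.ERROR],
    )[:batch_size]

Both forms are semantically equivalent; the Q-based version in the diff keeps the status conditions in the same style as the URL pattern OR.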