Update missing kids poster status

This commit is contained in:
Luciano Gervasoni
2025-03-18 17:38:01 +01:00
parent fb4b30f05e
commit 83f76232b2
4 changed files with 58 additions and 24 deletions

View File

@@ -1,4 +1,5 @@
from ..models import Urls, UrlContent, UrlsSource, UrlsDuplicate, Source, StatusPatternMatching
from django.db.models import Q
from .url_processor import process_url
from django.core.cache import cache
from django.db import IntegrityError
@@ -135,13 +136,25 @@ class DB_Handler():
cache.set("processed_{}".format(url_host), time.time(), timeout=60*5) # Expire after 5 minutes
def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error):
def set_status(obj_url, status):
    """Store ``status`` on ``obj_url`` and save it, unless it is already set.

    Args:
        obj_url: Object exposing a ``status`` attribute and a ``save()`` method.
        status: Status value to assign.
    """
    # Unchanged status -> nothing to write
    if obj_url.status == status:
        return
    obj_url.status = status
    obj_url.save()
# updating_urls.append(obj_url)
# TODO: Fix enum type issue. Bulk update instead of .save() for each object
# List of objects to bulk update
# updating_urls = []
# ... general processing, append to updating_urls
# Urls.objects.bulk_update(updating_urls, ['status'])
##### Filter URL? -> Invalid
if (status_pattern_match == "invalid"):
logger.debug("Domain filter applied to input URL: {}".format(obj_url.url))
# Update status
obj_url.status = Urls.STATUS_ENUM.INVALID
obj_url.save()
# updating_urls.append(obj_url)
set_status(obj_url, Urls.STATUS_ENUM.INVALID)
# Next URL
return
@@ -161,19 +174,15 @@ class DB_Handler():
# Set status to error
logger.debug("Error processing URL: {}\n{}\n".format(obj_url.url, str(e), traceback.format_exc()))
# Update status
obj_url.status = Urls.STATUS_ENUM.ERROR
obj_url.save()
# updating_urls.append(obj_url)
set_status(obj_url, Urls.STATUS_ENUM.ERROR)
# Next URL
return
##### Canonical URL different? -> Duplicate
if (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
# Update status
obj_url.status = Urls.STATUS_ENUM.DUPLICATE
obj_url.save()
# updating_urls.append(obj_url)
set_status(obj_url, Urls.STATUS_ENUM.DUPLICATE)
# Get or create URL with canonical form
obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
# Get the sources id associated to obj_url.id
@@ -190,9 +199,7 @@ class DB_Handler():
##### Valid URL
# Update status
obj_url.status = Urls.STATUS_ENUM.VALID
obj_url.save()
# updating_urls.append(obj_url)
set_status(obj_url, Urls.STATUS_ENUM.VALID)
# Create or update extracted URL data
UrlContent.objects.update_or_create(
@@ -240,8 +247,6 @@ class DB_Handler():
# Get list of (pattern, priority, status) tuples to override status if required
list_pattern_status_tuple = list(StatusPatternMatching.objects.values_list("pattern", "priority", "status"))
# List of objects to bulk update
# updating_urls = []
# Per URL
for obj_url in raw_urls:
@@ -250,9 +255,6 @@ class DB_Handler():
# Process URL
self._process_single_url(obj_url, status_pattern_match, raise_exception_on_error=False)
# TODO: Fix enum type issue. Bulk update instead of .save() for each object
# Urls.objects.bulk_update(updating_urls, ['status'])
logger.info("Updated #{} raw URLs".format(len(raw_urls)))
except Exception as e:
logger.warning("Exception processing raw URLs: {}\n{}".format(e, traceback.format_exc()))
@@ -288,4 +290,29 @@ class DB_Handler():
logger.info("Updated #{}, skipped #{} error URLs".format(num_urls_processed, num_urls_skipped))
except Exception as e:
logger.warning("Exception processing error URLs: {}\n{}".format(e, traceback.format_exc()))
logger.warning("Exception processing error URLs: {}\n{}".format(e, traceback.format_exc()))
def process_missing_kids_urls(self, batch_size):
    """Re-verify the status of stored missingkids.org poster URLs.

    Selects up to ``batch_size`` URLs, ordered by ``-ts_fetch``, whose address
    contains ``missingkids.org/poster`` or ``missingkids.org/new-poster`` and
    whose status is currently VALID or INVALID. Each one is re-processed with
    ``_process_single_url(..., raise_exception_on_error=True)``; if that call
    raises, the URL is marked INVALID.

    Exceptions are never propagated to the caller: per-URL failures are logged
    at debug level, and any other failure is logged as a warning.

    Args:
        batch_size: Maximum number of URLs to verify in this run.
    """
    try:
        logger.debug("Processing MissingKids URLs")
        # Get batch of URLs, %missingkids.org/poster% AND (status='valid' OR status='invalid')
        is_poster_url = Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster")
        is_verifiable = Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID)
        missingkids_urls = Urls.objects.order_by("-ts_fetch").filter(is_poster_url & is_verifiable)[:batch_size]

        # Per URL
        for obj_url in missingkids_urls:
            try:
                # Process URL. If no exception -> status is set by _process_single_url
                self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True)
            except Exception as e:
                # Raised exception -> Invalid (treated by the original author as a 404 on the poster).
                # Keep a trace of the cause instead of discarding it silently.
                logger.debug("MissingKids URL marked invalid: {}\n{}".format(obj_url.url, e))
                # Only hit the database when the status actually changes
                if obj_url.status != Urls.STATUS_ENUM.INVALID:
                    obj_url.status = Urls.STATUS_ENUM.INVALID
                    obj_url.save()

        logger.info("Verified status of #{} missingkids.org/poster URLs".format(len(missingkids_urls)))
    except Exception as e:
        logger.warning("Exception processing MissingKids URLs: {}\n{}".format(e, traceback.format_exc()))