Update missing kids poster status
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
from ..models import Urls, UrlContent, UrlsSource, UrlsDuplicate, Source, StatusPatternMatching
|
||||
from django.db.models import Q
|
||||
from .url_processor import process_url
|
||||
from django.core.cache import cache
|
||||
from django.db import IntegrityError
|
||||
@@ -135,13 +136,25 @@ class DB_Handler():
|
||||
cache.set("processed_{}".format(url_host), time.time(), timeout=60*5) # Expire after 5 minutes
|
||||
|
||||
def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error):
|
||||
|
||||
def set_status(obj_url, status):
    """Store ``status`` on ``obj_url`` and save it, unless it already holds that value."""
    # Nothing to write when the status is unchanged
    if (obj_url.status == status):
        return
    obj_url.status = status
    obj_url.save()
|
||||
# updating_urls.append(obj_url)
|
||||
|
||||
# TODO: Fix enum type issue. Bulk update instead of .save() for each object
|
||||
# List of objects to bulk update
|
||||
# updating_urls = []
|
||||
# ... general processing, append to updating_urls
|
||||
# Urls.objects.bulk_update(updating_urls, ['status'])
|
||||
|
||||
##### Filter URL? -> Invalid
|
||||
if (status_pattern_match == "invalid"):
|
||||
logger.debug("Domain filter applied to input URL: {}".format(obj_url.url))
|
||||
# Update status
|
||||
obj_url.status = Urls.STATUS_ENUM.INVALID
|
||||
obj_url.save()
|
||||
# updating_urls.append(obj_url)
|
||||
set_status(obj_url, Urls.STATUS_ENUM.INVALID)
|
||||
# Next URL
|
||||
return
|
||||
|
||||
@@ -161,19 +174,15 @@ class DB_Handler():
|
||||
# Set status to error
|
||||
logger.debug("Error processing URL: {}\n{}\n".format(obj_url.url, str(e), traceback.format_exc()))
|
||||
# Update status
|
||||
obj_url.status = Urls.STATUS_ENUM.ERROR
|
||||
obj_url.save()
|
||||
# updating_urls.append(obj_url)
|
||||
set_status(obj_url, Urls.STATUS_ENUM.ERROR)
|
||||
# Next URL
|
||||
return
|
||||
|
||||
##### Canonical URL different? -> Duplicate
|
||||
if (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
|
||||
# Update status
|
||||
obj_url.status = Urls.STATUS_ENUM.DUPLICATE
|
||||
obj_url.save()
|
||||
# updating_urls.append(obj_url)
|
||||
|
||||
set_status(obj_url, Urls.STATUS_ENUM.DUPLICATE)
|
||||
|
||||
# Get or create URL with canonical form
|
||||
obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
|
||||
# Get the sources id associated to obj_url.id
|
||||
@@ -190,9 +199,7 @@ class DB_Handler():
|
||||
|
||||
##### Valid URL
|
||||
# Update status
|
||||
obj_url.status = Urls.STATUS_ENUM.VALID
|
||||
obj_url.save()
|
||||
# updating_urls.append(obj_url)
|
||||
set_status(obj_url, Urls.STATUS_ENUM.VALID)
|
||||
|
||||
# Create or update extracted URL data
|
||||
UrlContent.objects.update_or_create(
|
||||
@@ -240,8 +247,6 @@ class DB_Handler():
|
||||
|
||||
# Get list of (pattern, priority, status) tuples to override status if required
|
||||
list_pattern_status_tuple = list(StatusPatternMatching.objects.values_list("pattern", "priority", "status"))
|
||||
# List of objects to bulk update
|
||||
# updating_urls = []
|
||||
|
||||
# Per URL
|
||||
for obj_url in raw_urls:
|
||||
@@ -250,9 +255,6 @@ class DB_Handler():
|
||||
# Process URL
|
||||
self._process_single_url(obj_url, status_pattern_match, raise_exception_on_error=False)
|
||||
|
||||
# TODO: Fix enum type issue. Bulk update instead of .save() for each object
|
||||
# Urls.objects.bulk_update(updating_urls, ['status'])
|
||||
|
||||
logger.info("Updated #{} raw URLs".format(len(raw_urls)))
|
||||
except Exception as e:
|
||||
logger.warning("Exception processing raw URLs: {}\n{}".format(e, traceback.format_exc()))
|
||||
@@ -288,4 +290,29 @@ class DB_Handler():
|
||||
|
||||
logger.info("Updated #{}, skipped #{} error URLs".format(num_urls_processed, num_urls_skipped))
|
||||
except Exception as e:
|
||||
logger.warning("Exception processing error URLs: {}\n{}".format(e, traceback.format_exc()))
|
||||
logger.warning("Exception processing error URLs: {}\n{}".format(e, traceback.format_exc()))
|
||||
|
||||
def process_missing_kids_urls(self, batch_size):
    """Re-check the status of stored missingkids.org poster URLs.

    Selects at most ``batch_size`` URLs, ordered by descending ``ts_fetch``,
    whose address contains ``missingkids.org/poster`` or
    ``missingkids.org/new-poster`` and whose status is VALID or INVALID.
    Each one is passed to ``_process_single_url`` with
    ``raise_exception_on_error=True``; if that call raises, the URL is
    marked INVALID.

    Args:
        batch_size: Maximum number of URLs to verify in this call.

    Returns:
        None. Exceptions are logged as warnings instead of propagating.
    """
    try:
        logger.debug("Processing MissingKids URLs")
        # Get batch of URLs, %missingkids.org/poster% AND (status='valid' OR status='invalid')
        is_poster = Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster")
        is_verifiable = Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID)
        missingkids_urls = Urls.objects.order_by("-ts_fetch").filter(is_poster & is_verifiable)[:batch_size]

        # Per URL
        for obj_url in missingkids_urls:
            try:
                # Process URL. If no exception -> status is set by _process_single_url
                self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True)
            except Exception as e:
                # Raised exception -> Invalid (presumably the poster page is gone, e.g. a 404)
                # Keep a trace of the cause instead of swallowing it silently
                logger.debug("MissingKids URL failed verification: {}\n{}".format(obj_url.url, e))
                # Update status only if setting a new value (avoids a redundant DB write)
                if (obj_url.status != Urls.STATUS_ENUM.INVALID):
                    obj_url.status = Urls.STATUS_ENUM.INVALID
                    obj_url.save()

        logger.info("Verified status of #{} missingkids.org/poster URLs".format(len(missingkids_urls)))
    except Exception as e:
        logger.warning("Exception processing MissingKids URLs: {}\n{}".format(e, traceback.format_exc()))
|
||||
|
||||
Reference in New Issue
Block a user