missing kids status code handling
This commit is contained in:
@@ -6,6 +6,7 @@ from django.utils import timezone
|
||||
from datetime import timedelta
|
||||
from .fetch_utils_url_processor import process_url, get_with_protocol
|
||||
import re
|
||||
import requests
|
||||
import os
|
||||
import traceback
|
||||
from .logger import get_logger
|
||||
@@ -80,24 +81,56 @@ class DB_Handler():
|
||||
except Exception as e:
|
||||
logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
|
||||
|
||||
def _set_status(self, obj_url, status):
|
||||
# Update status if setting a new value
|
||||
if (obj_url.status != status):
|
||||
obj_url.status = status
|
||||
obj_url.save()
|
||||
|
||||
def _set_duplicate_and_insert_canonical(self, obj_url, url_canonical):
    """Flag ``obj_url`` as a duplicate and link it to its canonical URL.

    Steps, in order:
      1. Set the status of ``obj_url`` to ``Urls.STATUS_ENUM.DUPLICATE``.
      2. Get or create the ``Urls`` row whose ``url`` is ``url_canonical``.
      3. For each ``UrlsSourceSearch`` row filtered by ``id_url=obj_url``,
         get or create a row with the same ``id_source`` / ``id_search``
         attached to the canonical URL (it might already exist).
      4. Get or create the ``UrlsDuplicate`` row pairing the canonical URL
         with ``obj_url``.

    Args:
        obj_url: URL object being marked as duplicate.
        url_canonical: Canonical URL value used for the ``Urls`` lookup.
    """
    # The duplicate is flagged first, before any canonical row is touched
    self._set_status(obj_url, Urls.STATUS_ENUM.DUPLICATE)

    # The "created" flag is not needed, only the canonical row itself
    canonical, _ = Urls.objects.get_or_create(url=url_canonical)

    # Mirror every source/search association of the duplicate onto the canonical URL
    for link in UrlsSourceSearch.objects.filter(id_url=obj_url):
        UrlsSourceSearch.objects.get_or_create(
            id_url=canonical,
            id_source=link.id_source,
            id_search=link.id_search,
        )

    # URLs duplicate association
    # NOTE(review): assumed to run once, after the loop above - confirm against the original indentation
    UrlsDuplicate.objects.get_or_create(
        id_url_canonical=canonical,
        id_url_duplicated=obj_url,
    )
|
||||
|
||||
def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error, paywall_bypass=False):
|
||||
|
||||
def set_status(obj_url, status):
|
||||
# Update status if setting a new value
|
||||
if (obj_url.status != status):
|
||||
obj_url.status = status
|
||||
obj_url.save()
|
||||
##########################################################################
|
||||
# URL pattern: missingkids.org/poster OR missingkids.org/new-poster
|
||||
if ("missingkids.org" in obj_url.url) and ("poster" in obj_url.url):
|
||||
# Request
|
||||
r = requests.get(obj_url.url, allow_redirects=True)
|
||||
|
||||
if (r.url != obj_url.url):
|
||||
# Canonical
|
||||
url_canonical = r.url
|
||||
# Set duplicate, and insert new canonical form
|
||||
self._set_duplicate_and_insert_canonical(obj_url, url_canonical)
|
||||
elif (r.status_code == 200):
|
||||
self._set_status(self, obj_url, Urls.STATUS_ENUM.VALID)
|
||||
elif (r.status_code == 404):
|
||||
self._set_status(self, obj_url, Urls.STATUS_ENUM.INVALID)
|
||||
else:
|
||||
logger.info("Unknown request status: {} for missing kids request: {}".format(r.status_code, obj_url.url))
|
||||
|
||||
return
|
||||
##########################################################################
|
||||
|
||||
# Found a pattern match -> Override status
|
||||
if (status_pattern_match is not None):
|
||||
logger.debug("Pattern match, status '{}' for input URL: {}".format(status_pattern_match, obj_url.url))
|
||||
# Update status
|
||||
set_status(obj_url, status_pattern_match)
|
||||
self._set_status(obj_url, status_pattern_match)
|
||||
##### Filter URL? -> Invalid (don't extract content)
|
||||
if (status_pattern_match == "invalid"):
|
||||
return
|
||||
|
||||
|
||||
try:
|
||||
# Extract URL content
|
||||
dict_url_data = process_url(obj_url.url, paywall_bypass)
|
||||
@@ -111,19 +144,9 @@ class DB_Handler():
|
||||
dict_url_data = None
|
||||
|
||||
##### Canonical URL different? -> Duplicate
|
||||
if (dict_url_data is not None) and (dict_url_data.get("url_canonical") is not None) and (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
|
||||
# Update status
|
||||
set_status(obj_url, Urls.STATUS_ENUM.DUPLICATE)
|
||||
# Get or create URL with canonical form
|
||||
obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
|
||||
# Get the source-search IDs associated to obj_url.id
|
||||
list_url_source_search = UrlsSourceSearch.objects.filter(id_url=obj_url)
|
||||
for obj_url_source_search in list_url_source_search:
|
||||
# Associate same sources to url_canonical (it might already exist)
|
||||
UrlsSourceSearch.objects.get_or_create(id_url=obj_url_canonical, id_source=obj_url_source_search.id_source, id_search=obj_url_source_search.id_search)
|
||||
# URLs duplciate association
|
||||
UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url)
|
||||
|
||||
if (dict_url_data is not None) and (dict_url_data.get("url_canonical") is not None) and (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
|
||||
# URL as duplicate, insert canonical URL
|
||||
self._set_duplicate_and_insert_canonical(obj_url, dict_url_data.get("url_canonical"))
|
||||
# Next URL
|
||||
return
|
||||
|
||||
@@ -132,20 +155,20 @@ class DB_Handler():
|
||||
# (dict_url_data is None) or (Exception while processing URL) ? -> Error status
|
||||
if (dict_url_data is None):
|
||||
# Update status
|
||||
set_status(obj_url, Urls.STATUS_ENUM.ERROR)
|
||||
self._set_status(obj_url, Urls.STATUS_ENUM.ERROR)
|
||||
# Next URL
|
||||
return
|
||||
|
||||
# Invalid? e.g. binary data
|
||||
if (dict_url_data.get("override_status") == "invalid"):
|
||||
# Update status
|
||||
set_status(obj_url, Urls.STATUS_ENUM.INVALID)
|
||||
self._set_status(obj_url, Urls.STATUS_ENUM.INVALID)
|
||||
# Next URL
|
||||
return
|
||||
|
||||
##### Valid URL
|
||||
# Update status
|
||||
set_status(obj_url, Urls.STATUS_ENUM.VALID)
|
||||
self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
|
||||
|
||||
try:
|
||||
if (dict_url_data is not None):
|
||||
@@ -260,14 +283,12 @@ class DB_Handler():
|
||||
# Per URL
|
||||
for obj_url in missingkids_urls:
|
||||
try:
|
||||
# Process URL. If no exception -> Valid
|
||||
self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True)
|
||||
# Process URL
|
||||
self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=None)
|
||||
except Exception as e:
|
||||
# Raised exception -> Invalid (404 error)
|
||||
obj_url.status = Urls.STATUS_ENUM.INVALID
|
||||
obj_url.save()
|
||||
logger.warning("Unknown error processing missing kids poster for URL: {}\n{}".format(obj_url.url, str(e)))
|
||||
|
||||
logger.info("Verified status of #{} missingkids.org/poster URLs".format(len(missingkids_urls)))
|
||||
logger.info("Verified status of #{} missingkids.org/poster / missingkids.org/new-poster URLs".format(len(missingkids_urls)))
|
||||
except Exception as e:
|
||||
logger.warning("Exception processing MissingKids URLs: {}\n{}".format(e, traceback.format_exc()))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user