missing kids status code handling

This commit is contained in:
Luciano Gervasoni
2025-07-07 12:57:57 +02:00
parent b559f8cd8c
commit 4c0dd70bc3
2 changed files with 51 additions and 31 deletions

View File

@@ -6,6 +6,7 @@ from django.utils import timezone
from datetime import timedelta
from .fetch_utils_url_processor import process_url, get_with_protocol
import re
import requests
import os
import traceback
from .logger import get_logger
@@ -80,24 +81,56 @@ class DB_Handler():
except Exception as e:
logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
def _set_status(self, obj_url, status):
    """Store ``status`` on ``obj_url``, saving only when the value actually changes.

    Args:
        obj_url: URL record exposing a ``status`` attribute and a ``save()`` method.
        status: Status value to assign.
    """
    if obj_url.status == status:
        # Already in the requested state: skip the save() call
        return
    obj_url.status = status
    obj_url.save()
def _set_duplicate_and_insert_canonical(self, obj_url, url_canonical):
    """Flag ``obj_url`` as a duplicate and register its canonical counterpart.

    Args:
        obj_url: URL record whose URL differs from its canonical form.
        url_canonical: Canonical URL string to get or create.
    """
    # The original record becomes a duplicate
    self._set_status(obj_url, Urls.STATUS_ENUM.DUPLICATE)
    # Fetch the canonical record, creating it when missing
    canonical, _ = Urls.objects.get_or_create(url=url_canonical)
    # Copy every source/search association of the duplicate onto the canonical
    # record (get_or_create: the association might already exist)
    for link in UrlsSourceSearch.objects.filter(id_url=obj_url):
        UrlsSourceSearch.objects.get_or_create(
            id_url=canonical,
            id_source=link.id_source,
            id_search=link.id_search,
        )
    # Record the canonical <-> duplicate association
    UrlsDuplicate.objects.get_or_create(id_url_canonical=canonical, id_url_duplicated=obj_url)
def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error, paywall_bypass=False):
def set_status(obj_url, status):
# Update status if setting a new value
if (obj_url.status != status):
obj_url.status = status
obj_url.save()
##########################################################################
# URL pattern: missingkids.org/poster OR missingkids.org/new-poster
if ("missingkids.org" in obj_url.url) and ("poster" in obj_url.url):
# Request
r = requests.get(obj_url.url, allow_redirects=True)
if (r.url != obj_url.url):
# Canonical
url_canonical = r.url
# Set duplicate, and insert new canonical form
self._set_duplicate_and_insert_canonical(obj_url, url_canonical)
elif (r.status_code == 200):
self._set_status(self, obj_url, Urls.STATUS_ENUM.VALID)
elif (r.status_code == 404):
self._set_status(self, obj_url, Urls.STATUS_ENUM.INVALID)
else:
logger.info("Unknown request status: {} for missing kids request: {}".format(r.status_code, obj_url.url))
return
##########################################################################
# Found a pattern match -> Override status
if (status_pattern_match is not None):
logger.debug("Pattern match, status '{}' for input URL: {}".format(status_pattern_match, obj_url.url))
# Update status
set_status(obj_url, status_pattern_match)
self._set_status(obj_url, status_pattern_match)
##### Filter URL? -> Invalid (don't extract content)
if (status_pattern_match == "invalid"):
return
try:
# Extract URL content
dict_url_data = process_url(obj_url.url, paywall_bypass)
@@ -111,19 +144,9 @@ class DB_Handler():
dict_url_data = None
##### Canonical URL different? -> Duplicate
if (dict_url_data is not None) and (dict_url_data.get("url_canonical") is not None) and (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.DUPLICATE)
# Get or create URL with canonical form
obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
# Get the source-search IDs associated to obj_url.id
list_url_source_search = UrlsSourceSearch.objects.filter(id_url=obj_url)
for obj_url_source_search in list_url_source_search:
# Associate same sources to url_canonical (it might already exist)
UrlsSourceSearch.objects.get_or_create(id_url=obj_url_canonical, id_source=obj_url_source_search.id_source, id_search=obj_url_source_search.id_search)
# URLs duplciate association
UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url)
if (dict_url_data is not None) and (dict_url_data.get("url_canonical") is not None) and (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
# URL as duplicate, insert canonical URL
self._set_duplicate_and_insert_canonical(obj_url, dict_url_data.get("url_canonical"))
# Next URL
return
@@ -132,20 +155,20 @@ class DB_Handler():
# (dict_url_data is None) or (Exception while processing URL) ? -> Error status
if (dict_url_data is None):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.ERROR)
self._set_status(obj_url, Urls.STATUS_ENUM.ERROR)
# Next URL
return
# Invalid? e.g. binary data
if (dict_url_data.get("override_status") == "invalid"):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.INVALID)
self._set_status(obj_url, Urls.STATUS_ENUM.INVALID)
# Next URL
return
##### Valid URL
# Update status
set_status(obj_url, Urls.STATUS_ENUM.VALID)
self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
try:
if (dict_url_data is not None):
@@ -260,14 +283,12 @@ class DB_Handler():
# Per URL
for obj_url in missingkids_urls:
try:
# Process URL. If no exception -> Valid
self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True)
# Process URL
self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=None)
except Exception as e:
# Raised exception -> Invalid (404 error)
obj_url.status = Urls.STATUS_ENUM.INVALID
obj_url.save()
logger.warning("Unknown error processing missing kids poster for URL: {}\n{}".format(obj_url.url, str(e)))
logger.info("Verified status of #{} missingkids.org/poster URLs".format(len(missingkids_urls)))
logger.info("Verified status of #{} missingkids.org/poster / missingkids.org/new-poster URLs".format(len(missingkids_urls)))
except Exception as e:
logger.warning("Exception processing MissingKids URLs: {}\n{}".format(e, traceback.format_exc()))

View File

@@ -30,6 +30,5 @@
[".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50],
[".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50],
[".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50]
[".*missingkids\\.org\\/poster\\/.*", "valid", 50]
]
}