missing kids status code handling
This commit is contained in:
@@ -6,6 +6,7 @@ from django.utils import timezone
|
|||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
from .fetch_utils_url_processor import process_url, get_with_protocol
|
from .fetch_utils_url_processor import process_url, get_with_protocol
|
||||||
import re
|
import re
|
||||||
|
import requests
|
||||||
import os
|
import os
|
||||||
import traceback
|
import traceback
|
||||||
from .logger import get_logger
|
from .logger import get_logger
|
||||||
@@ -80,24 +81,56 @@ class DB_Handler():
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
|
logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
|
||||||
|
|
||||||
|
def _set_status(self, obj_url, status):
    """Store ``status`` on ``obj_url`` and save it, but only when the value changes.

    Args:
        obj_url: Object exposing a ``status`` attribute and a ``save()`` method.
        status: New status value to assign.
    """
    # Already holding the requested status -> skip the write entirely
    if obj_url.status == status:
        return
    obj_url.status = status
    obj_url.save()
|
||||||
|
|
||||||
|
def _set_duplicate_and_insert_canonical(self, obj_url, url_canonical):
    """Mark ``obj_url`` as a duplicate and attach it to its canonical URL row.

    Steps performed:
      1. Set the status of ``obj_url`` to ``Urls.STATUS_ENUM.DUPLICATE``.
      2. Get or create the ``Urls`` row whose ``url`` is ``url_canonical``.
      3. Copy every source/search association of ``obj_url`` to the canonical row.
      4. Get or create the ``UrlsDuplicate`` row linking canonical and duplicate.

    Args:
        obj_url: The URL row found to be a duplicate.
        url_canonical: Canonical form of the URL (string).
    """
    # Flag the original row first
    self._set_status(obj_url, Urls.STATUS_ENUM.DUPLICATE)

    # Canonical row: reused when it already exists, created otherwise
    canonical_row, _ = Urls.objects.get_or_create(url=url_canonical)

    # Mirror each source-search association onto the canonical row (it might already exist)
    for link in UrlsSourceSearch.objects.filter(id_url=obj_url):
        UrlsSourceSearch.objects.get_or_create(
            id_url=canonical_row,
            id_source=link.id_source,
            id_search=link.id_search,
        )

    # URLs duplicate association
    UrlsDuplicate.objects.get_or_create(
        id_url_canonical=canonical_row,
        id_url_duplicated=obj_url,
    )
|
||||||
|
|
||||||
def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error, paywall_bypass=False):
|
def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error, paywall_bypass=False):
|
||||||
|
##########################################################################
|
||||||
def set_status(obj_url, status):
|
# URL pattern: missingkids.org/poster OR missingkids.org/new-poster
|
||||||
# Update status if setting a new value
|
if ("missingkids.org" in obj_url.url) and ("poster" in obj_url.url):
|
||||||
if (obj_url.status != status):
|
# Request
|
||||||
obj_url.status = status
|
r = requests.get(obj_url.url, allow_redirects=True)
|
||||||
obj_url.save()
|
|
||||||
|
if (r.url != obj_url.url):
    # The request was redirected: the final URL is the canonical form
    url_canonical = r.url
    # Set duplicate, and insert new canonical form
    self._set_duplicate_and_insert_canonical(obj_url, url_canonical)
elif (r.status_code == 200):
    # Bound-method call: `self` is supplied implicitly, so it must not be
    # passed again (the extra argument raised TypeError on every 200 response)
    self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
elif (r.status_code == 404):
    # Same fix as above: no explicit `self` argument
    self._set_status(obj_url, Urls.STATUS_ENUM.INVALID)
else:
    # Any other status code leaves the stored status untouched and is only logged
    logger.info("Unknown request status: {} for missing kids request: {}".format(r.status_code, obj_url.url))
|
||||||
|
|
||||||
|
return
|
||||||
|
##########################################################################
|
||||||
|
|
||||||
# Found a pattern match -> Override status
|
# Found a pattern match -> Override status
|
||||||
if (status_pattern_match is not None):
|
if (status_pattern_match is not None):
|
||||||
logger.debug("Pattern match, status '{}' for input URL: {}".format(status_pattern_match, obj_url.url))
|
logger.debug("Pattern match, status '{}' for input URL: {}".format(status_pattern_match, obj_url.url))
|
||||||
# Update status
|
# Update status
|
||||||
set_status(obj_url, status_pattern_match)
|
self._set_status(obj_url, status_pattern_match)
|
||||||
##### Filter URL? -> Invalid (don't extract content)
|
##### Filter URL? -> Invalid (don't extract content)
|
||||||
if (status_pattern_match == "invalid"):
|
if (status_pattern_match == "invalid"):
|
||||||
return
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Extract URL content
|
# Extract URL content
|
||||||
dict_url_data = process_url(obj_url.url, paywall_bypass)
|
dict_url_data = process_url(obj_url.url, paywall_bypass)
|
||||||
@@ -111,19 +144,9 @@ class DB_Handler():
|
|||||||
dict_url_data = None
|
dict_url_data = None
|
||||||
|
|
||||||
##### Canonical URL different? -> Duplicate
|
##### Canonical URL different? -> Duplicate
|
||||||
if (dict_url_data is not None) and (dict_url_data.get("url_canonical") is not None) and (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
|
if (dict_url_data is not None) and (dict_url_data.get("url_canonical") is not None) and (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
|
||||||
# Update status
|
# URL as duplicate, insert canonical URL
|
||||||
set_status(obj_url, Urls.STATUS_ENUM.DUPLICATE)
|
self._set_duplicate_and_insert_canonical(obj_url, dict_url_data.get("url_canonical"))
|
||||||
# Get or create URL with canonical form
|
|
||||||
obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
|
|
||||||
# Get the source-search IDs associated to obj_url.id
|
|
||||||
list_url_source_search = UrlsSourceSearch.objects.filter(id_url=obj_url)
|
|
||||||
for obj_url_source_search in list_url_source_search:
|
|
||||||
# Associate same sources to url_canonical (it might already exist)
|
|
||||||
UrlsSourceSearch.objects.get_or_create(id_url=obj_url_canonical, id_source=obj_url_source_search.id_source, id_search=obj_url_source_search.id_search)
|
|
||||||
# URLs duplciate association
|
|
||||||
UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url)
|
|
||||||
|
|
||||||
# Next URL
|
# Next URL
|
||||||
return
|
return
|
||||||
|
|
||||||
@@ -132,20 +155,20 @@ class DB_Handler():
|
|||||||
# (dict_url_data is None) or (Exception while processing URL) ? -> Error status
|
# (dict_url_data is None) or (Exception while processing URL) ? -> Error status
|
||||||
if (dict_url_data is None):
|
if (dict_url_data is None):
|
||||||
# Update status
|
# Update status
|
||||||
set_status(obj_url, Urls.STATUS_ENUM.ERROR)
|
self._set_status(obj_url, Urls.STATUS_ENUM.ERROR)
|
||||||
# Next URL
|
# Next URL
|
||||||
return
|
return
|
||||||
|
|
||||||
# Invalid? e.g. binary data
|
# Invalid? e.g. binary data
|
||||||
if (dict_url_data.get("override_status") == "invalid"):
|
if (dict_url_data.get("override_status") == "invalid"):
|
||||||
# Update status
|
# Update status
|
||||||
set_status(obj_url, Urls.STATUS_ENUM.INVALID)
|
self._set_status(obj_url, Urls.STATUS_ENUM.INVALID)
|
||||||
# Next URL
|
# Next URL
|
||||||
return
|
return
|
||||||
|
|
||||||
##### Valid URL
|
##### Valid URL
|
||||||
# Update status
|
# Update status
|
||||||
set_status(obj_url, Urls.STATUS_ENUM.VALID)
|
self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if (dict_url_data is not None):
|
if (dict_url_data is not None):
|
||||||
@@ -260,14 +283,12 @@ class DB_Handler():
|
|||||||
# Per URL
|
# Per URL
|
||||||
for obj_url in missingkids_urls:
|
for obj_url in missingkids_urls:
|
||||||
try:
|
try:
|
||||||
# Process URL. If no exception -> Valid
|
# Process URL
|
||||||
self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True)
|
self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=None)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Raised exception -> Invalid (404 error)
|
logger.warning("Unknown error processing missing kids poster for URL: {}\n{}".format(obj_url.url, str(e)))
|
||||||
obj_url.status = Urls.STATUS_ENUM.INVALID
|
|
||||||
obj_url.save()
|
|
||||||
|
|
||||||
logger.info("Verified status of #{} missingkids.org/poster URLs".format(len(missingkids_urls)))
|
logger.info("Verified status of #{} missingkids.org/poster / missingkids.org/new-poster URLs".format(len(missingkids_urls)))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("Exception processing MissingKids URLs: {}\n{}".format(e, traceback.format_exc()))
|
logger.warning("Exception processing MissingKids URLs: {}\n{}".format(e, traceback.format_exc()))
|
||||||
|
|
||||||
|
|||||||
@@ -30,6 +30,5 @@
|
|||||||
[".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50],
|
[".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50],
|
||||||
[".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50],
|
[".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50],
|
||||||
[".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50]
|
[".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50]
|
||||||
[".*missingkids\\.org\\/poster\\/.*", "valid", 50]
|
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user