Selenium-based verification of missing kid URLs
This commit is contained in:
@@ -115,12 +115,14 @@ class DB_Handler():
|
||||
# Set duplicate, and insert new canonical form
|
||||
self._set_duplicate_and_insert_canonical(obj_url, url_canonical)
|
||||
elif (r.status_code == 200):
|
||||
self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
|
||||
# A 200 response is not enough to determine whether the URL is valid: the page's JavaScript must finish running first, and it might redirect to a 404
|
||||
# self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
|
||||
self._set_status(obj_url, Urls.STATUS_ENUM.UNKNOWN)
|
||||
elif (r.status_code == 404):
|
||||
self._set_status(obj_url, Urls.STATUS_ENUM.INVALID)
|
||||
else:
|
||||
logger.info("Unknown request status: {} for missing kids request: {}".format(r.status_code, obj_url.url))
|
||||
|
||||
logger.debug("Unknown request status: {} for missing kids request: {}".format(r.status_code, obj_url.url))
|
||||
self._set_status(obj_url, Urls.STATUS_ENUM.UNKNOWN)
|
||||
return
|
||||
##########################################################################
|
||||
|
||||
@@ -275,18 +277,37 @@ class DB_Handler():
|
||||
missingkids_urls = Urls.objects.order_by("-ts_fetch").filter(
|
||||
(Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster"))
|
||||
&
|
||||
(Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.ERROR))
|
||||
(Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.UNKNOWN) | Q(status=Urls.STATUS_ENUM.ERROR))
|
||||
)
|
||||
|
||||
# Get batch size
|
||||
if (batch_size is not None):
|
||||
missingkids_urls = missingkids_urls[:batch_size]
|
||||
|
||||
# TODO: Cache the URLs processed during the last X hours and filter them out...
|
||||
|
||||
# Per URL
|
||||
for obj_url in missingkids_urls:
|
||||
try:
|
||||
# Process URL
|
||||
self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=None)
|
||||
# Missing kids fetching endpoint, verify URL
|
||||
missingkids_fetch_endpoint = os.path.join(os.getenv("SELENIUM_ENDPOINT", "http://localhost:80"), "verify_missing_kid/")
|
||||
data = {"url": obj_url.url}
|
||||
# POST
|
||||
r = requests.post(missingkids_fetch_endpoint, json=data)
|
||||
# Jsonify
|
||||
results = r.json()
|
||||
|
||||
if (results.get("status") == "valid"):
|
||||
self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
|
||||
elif (results.get("status") == "invalid"):
|
||||
self._set_status(obj_url, Urls.STATUS_ENUM.INVALID)
|
||||
elif (results.get("status") == "duplicate"):
|
||||
self._set_duplicate_and_insert_canonical(obj_url, results.get("redirection"))
|
||||
elif (results.get("status") == "unknown"):
|
||||
# Nothing to do: the verification result was inconclusive, so the status is left unchanged
|
||||
logger.info("Missing kid verification returned unknown for URL: {}".format(obj_url.url))
|
||||
pass
|
||||
|
||||
except Exception as e:
|
||||
logger.warning("Unknown error processing missing kids poster for URL: {}\n{}".format(obj_url.url, str(e)))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user