Process missing kids URLs using the API endpoint
This commit is contained in:
@@ -4,7 +4,7 @@ from django.core.cache import cache
|
||||
from django.db import IntegrityError
|
||||
from django.utils import timezone
|
||||
from datetime import timedelta
|
||||
from .fetch_utils_url_processor import process_url, get_with_protocol, url_host_slowdown
|
||||
from .fetch_utils_url_processor import process_url, verify_missing_kid_url
|
||||
import re
|
||||
import requests
|
||||
import os
|
||||
@@ -16,7 +16,15 @@ class DB_Handler():
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def insert_raw_urls(self, urls, obj_source, obj_search):
|
||||
def insert_raw_urls(self, urls, obj_source, obj_search):
|
||||
def get_with_protocol(url):
    """Return *url* with an ``https://`` scheme.

    A leading ``http://`` is upgraded to ``https://``; a URL that does not
    start with ``https://`` gets the scheme prepended. Only the leading
    scheme is touched, so ``http://`` occurrences embedded later in the
    string (e.g. a redirect target in the query string) are kept verbatim.

    Args:
        url: URL string, with or without a scheme.

    Returns:
        The URL string, always starting with ``https://``.
    """
    secure_scheme = "https://"
    plain_scheme = "http://"

    # Already in the desired form
    if url.startswith(secure_scheme):
        return url
    # http:// -> https:// (leading scheme only, not every occurrence)
    if url.startswith(plain_scheme):
        return secure_scheme + url[len(plain_scheme):]
    # "" -> https://
    return secure_scheme + url
|
||||
|
||||
try:
|
||||
logger.debug("Inserting raw URLs")
|
||||
# Empty?
|
||||
@@ -104,11 +112,9 @@ class DB_Handler():
|
||||
##########################################################################
|
||||
# URL pattern: missingkids.org/poster OR missingkids.org/new-poster
|
||||
if ("missingkids.org" in obj_url.url) and ("poster" in obj_url.url):
|
||||
# Sleep required? To avoid too many requests error (original URL, not paywall bypassing endpoint)
|
||||
url_host_slowdown(obj_url.url, url_host_slowdown_seconds=float(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
|
||||
try:
|
||||
# Request
|
||||
r = requests.get(obj_url.url, allow_redirects=True)
|
||||
# Verify missing kid URL
|
||||
results = verify_missing_kid_url(obj_url.url)
|
||||
except Exception as e:
|
||||
if (raise_exception_on_error):
|
||||
# Simply raise exception, handled in a different way
|
||||
@@ -118,20 +124,16 @@ class DB_Handler():
|
||||
# Set status to error
|
||||
self._set_status(obj_url, Urls.STATUS_ENUM.ERROR)
|
||||
return
|
||||
|
||||
if (r.url != obj_url.url):
|
||||
# Canonical
|
||||
url_canonical = r.url
|
||||
# Set duplicate, and insert new canonical form
|
||||
self._set_duplicate_and_insert_canonical(obj_url, url_canonical)
|
||||
elif (r.status_code == 200):
|
||||
# A 200 response is not enough to determine validity: the page may still redirect to a 404 once its JavaScript finishes running
|
||||
# self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
|
||||
self._set_status(obj_url, Urls.STATUS_ENUM.UNKNOWN)
|
||||
elif (r.status_code == 404):
|
||||
|
||||
if (results.get("status") == "valid"):
|
||||
self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
|
||||
elif (results.get("status") == "invalid"):
|
||||
self._set_status(obj_url, Urls.STATUS_ENUM.INVALID)
|
||||
else:
|
||||
logger.debug("Unknown request status: {} for missing kids request: {}".format(r.status_code, obj_url.url))
|
||||
elif (results.get("status") == "duplicate"):
|
||||
self._set_duplicate_and_insert_canonical(obj_url, results.get("redirection"))
|
||||
elif (results.get("status") == "unknown"):
|
||||
# Nothing to do, not sure about it...
|
||||
logger.info("Missing kid verification returned unknown for URL: {}".format(obj_url.url))
|
||||
self._set_status(obj_url, Urls.STATUS_ENUM.UNKNOWN)
|
||||
return
|
||||
##########################################################################
|
||||
@@ -314,14 +316,20 @@ class DB_Handler():
|
||||
# Per URL
|
||||
for obj_url in missingkids_urls:
|
||||
try:
|
||||
# Missing kids fetching endpoint, verify URL
|
||||
missingkids_fetch_endpoint = os.path.join(os.getenv("SELENIUM_ENDPOINT", "http://localhost:80"), "verify_missing_kid/")
|
||||
data = {"url": obj_url.url}
|
||||
# POST
|
||||
r = requests.post(missingkids_fetch_endpoint, json=data, timeout=120)
|
||||
# Jsonify
|
||||
results = r.json()
|
||||
logger.debug("Selenium results for URL {}: {}".format(obj_url.url, str(results)))
|
||||
SELENIUM_BASED_MISSINGKID_VERIFICATION = False
|
||||
if (SELENIUM_BASED_MISSINGKID_VERIFICATION):
|
||||
# Missing kids fetching endpoint, verify URL
|
||||
missingkids_fetch_endpoint = os.path.join(os.getenv("SELENIUM_ENDPOINT", "http://localhost:80"), "verify_missing_kid/")
|
||||
data = {"url": obj_url.url}
|
||||
# POST
|
||||
r = requests.post(missingkids_fetch_endpoint, json=data, timeout=120)
|
||||
# Jsonify
|
||||
results = r.json()
|
||||
logger.debug("Missingkids Selenium results for URL {}: {}".format(obj_url.url, str(results)))
|
||||
else:
|
||||
# Verify
|
||||
results = verify_missing_kid_url(obj_url.url)
|
||||
logger.debug("Missingkids verify results for URL {}: {}".format(obj_url.url, str(results)))
|
||||
|
||||
if (results.get("status") == "valid"):
|
||||
self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
|
||||
|
||||
Reference in New Issue
Block a user