Process missing kids URL based on API endpoint
This commit is contained in:
@@ -4,7 +4,7 @@ from django.core.cache import cache
|
|||||||
from django.db import IntegrityError
|
from django.db import IntegrityError
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
from .fetch_utils_url_processor import process_url, get_with_protocol, url_host_slowdown
|
from .fetch_utils_url_processor import process_url, verify_missing_kid_url
|
||||||
import re
|
import re
|
||||||
import requests
|
import requests
|
||||||
import os
|
import os
|
||||||
@@ -16,7 +16,15 @@ class DB_Handler():
|
|||||||
def __init__(self):
|
def __init__(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def insert_raw_urls(self, urls, obj_source, obj_search):
|
def insert_raw_urls(self, urls, obj_source, obj_search):
|
||||||
|
def get_with_protocol(url):
    """Return *url* with an ``https://`` scheme.

    A leading ``http://`` is upgraded to ``https://``, a leading ``https://``
    is kept, and a URL with neither prefix gets ``https://`` prepended.

    Only the leading scheme is touched: an ``http://`` that appears later in
    the string (e.g. a redirect target inside the query string) is left
    as-is, since rewriting it would change which resource the URL points to.
    The scheme check is case-insensitive so ``HTTP://host`` is not turned
    into ``https://HTTP://host``.
    """
    lowered = url.lower()
    for scheme in ("https://", "http://"):
        if lowered.startswith(scheme):
            # Swap just the scheme prefix, keep the remainder byte-for-byte
            return "https://" + url[len(scheme):]
    # "" -> https://
    return "https://" + url
|
||||||
|
|
||||||
try:
|
try:
|
||||||
logger.debug("Inserting raw URLs")
|
logger.debug("Inserting raw URLs")
|
||||||
# Empty?
|
# Empty?
|
||||||
@@ -104,11 +112,9 @@ class DB_Handler():
|
|||||||
##########################################################################
|
##########################################################################
|
||||||
# URL pattern: missingkids.org/poster OR missingkids.org/new-poster
|
# URL pattern: missingkids.org/poster OR missingkids.org/new-poster
|
||||||
if ("missingkids.org" in obj_url.url) and ("poster" in obj_url.url):
|
if ("missingkids.org" in obj_url.url) and ("poster" in obj_url.url):
|
||||||
# Sleep required? To avoid too many requests error (original URL, not paywall bypassing endpoint)
|
|
||||||
url_host_slowdown(obj_url.url, url_host_slowdown_seconds=float(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
|
|
||||||
try:
|
try:
|
||||||
# Request
|
# Verify missing kid URL
|
||||||
r = requests.get(obj_url.url, allow_redirects=True)
|
results = verify_missing_kid_url(obj_url.url)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if (raise_exception_on_error):
|
if (raise_exception_on_error):
|
||||||
# Simply raise exception, handled in a different way
|
# Simply raise exception, handled in a different way
|
||||||
@@ -118,20 +124,16 @@ class DB_Handler():
|
|||||||
# Set status to error
|
# Set status to error
|
||||||
self._set_status(obj_url, Urls.STATUS_ENUM.ERROR)
|
self._set_status(obj_url, Urls.STATUS_ENUM.ERROR)
|
||||||
return
|
return
|
||||||
|
|
||||||
if (r.url != obj_url.url):
|
if (results.get("status") == "valid"):
|
||||||
# Canonical
|
self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
|
||||||
url_canonical = r.url
|
elif (results.get("status") == "invalid"):
|
||||||
# Set duplicate, and insert new canonical form
|
|
||||||
self._set_duplicate_and_insert_canonical(obj_url, url_canonical)
|
|
||||||
elif (r.status_code == 200):
|
|
||||||
# Not enough to determine if it is valid. Need to wait to finish javascript, it might redirect to 404
|
|
||||||
# self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
|
|
||||||
self._set_status(obj_url, Urls.STATUS_ENUM.UNKNOWN)
|
|
||||||
elif (r.status_code == 404):
|
|
||||||
self._set_status(obj_url, Urls.STATUS_ENUM.INVALID)
|
self._set_status(obj_url, Urls.STATUS_ENUM.INVALID)
|
||||||
else:
|
elif (results.get("status") == "duplicate"):
|
||||||
logger.debug("Unknown request status: {} for missing kids request: {}".format(r.status_code, obj_url.url))
|
self._set_duplicate_and_insert_canonical(obj_url, results.get("redirection"))
|
||||||
|
elif (results.get("status") == "unknown"):
|
||||||
|
# Nothing to do, not sure about it...
|
||||||
|
logger.info("Missing kid verification returned unknown for URL: {}".format(obj_url.url))
|
||||||
self._set_status(obj_url, Urls.STATUS_ENUM.UNKNOWN)
|
self._set_status(obj_url, Urls.STATUS_ENUM.UNKNOWN)
|
||||||
return
|
return
|
||||||
##########################################################################
|
##########################################################################
|
||||||
@@ -314,14 +316,20 @@ class DB_Handler():
|
|||||||
# Per URL
|
# Per URL
|
||||||
for obj_url in missingkids_urls:
|
for obj_url in missingkids_urls:
|
||||||
try:
|
try:
|
||||||
# Missing kids fetching endpoint, verify URL
|
SELENIUM_BASED_MISSINGKID_VERIFICATION = False
|
||||||
missingkids_fetch_endpoint = os.path.join(os.getenv("SELENIUM_ENDPOINT", "http://localhost:80"), "verify_missing_kid/")
|
if (SELENIUM_BASED_MISSINGKID_VERIFICATION):
|
||||||
data = {"url": obj_url.url}
|
# Missing kids fetching endpoint, verify URL
|
||||||
# POST
|
missingkids_fetch_endpoint = os.path.join(os.getenv("SELENIUM_ENDPOINT", "http://localhost:80"), "verify_missing_kid/")
|
||||||
r = requests.post(missingkids_fetch_endpoint, json=data, timeout=120)
|
data = {"url": obj_url.url}
|
||||||
# Jsonify
|
# POST
|
||||||
results = r.json()
|
r = requests.post(missingkids_fetch_endpoint, json=data, timeout=120)
|
||||||
logger.debug("Selenium results for URL {}: {}".format(obj_url.url, str(results)))
|
# Jsonify
|
||||||
|
results = r.json()
|
||||||
|
logger.debug("Missingkids Selenium results for URL {}: {}".format(obj_url.url, str(results)))
|
||||||
|
else:
|
||||||
|
# Verify
|
||||||
|
results = verify_missing_kid_url(obj_url.url)
|
||||||
|
logger.debug("Missingkids verify results for URL {}: {}".format(obj_url.url, str(results)))
|
||||||
|
|
||||||
if (results.get("status") == "valid"):
|
if (results.get("status") == "valid"):
|
||||||
self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
|
self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
|
||||||
|
|||||||
@@ -9,14 +9,6 @@ from urllib.parse import unquote
|
|||||||
import langdetect
|
import langdetect
|
||||||
langdetect.DetectorFactory.seed = 0
|
langdetect.DetectorFactory.seed = 0
|
||||||
|
|
||||||
def get_with_protocol(url):
    """Return *url* with an ``https://`` scheme.

    A leading ``http://`` is upgraded to ``https://``, a leading ``https://``
    is kept, and a URL with neither prefix gets ``https://`` prepended.

    Only the leading scheme is touched: an ``http://`` that appears later in
    the string (e.g. a redirect target inside the query string) is left
    as-is, since rewriting it would change which resource the URL points to.
    The scheme check is case-insensitive so ``HTTP://host`` is not turned
    into ``https://HTTP://host``.
    """
    lowered = url.lower()
    for scheme in ("https://", "http://"):
        if lowered.startswith(scheme):
            # Swap just the scheme prefix, keep the remainder byte-for-byte
            return "https://" + url[len(scheme):]
    # "" -> https://
    return "https://" + url
|
|
||||||
|
|
||||||
def get_url_host(url):
|
def get_url_host(url):
|
||||||
# URL no protocol, first substring before '/'
|
# URL no protocol, first substring before '/'
|
||||||
url_host = url.replace("https://", "").replace("http://", "").split("/")[0]
|
url_host = url.replace("https://", "").replace("http://", "").split("/")[0]
|
||||||
@@ -39,8 +31,48 @@ def url_host_slowdown(url, url_host_slowdown_seconds):
|
|||||||
# About to process URL host, cache time
|
# About to process URL host, cache time
|
||||||
cache.set("process_{}".format(url_host).encode("utf-8"), time.time(), timeout=60*5) # Expire after 5 minutes
|
cache.set("process_{}".format(url_host).encode("utf-8"), time.time(), timeout=60*5) # Expire after 5 minutes
|
||||||
|
|
||||||
|
|
||||||
|
def _missing_kid_case_from_url(url):
    """Extract ``(org_prefix, case_num)`` from a missing-kid poster URL.

    Sample URL: "https://www.missingkids.org/poster/NCMC/2058896/1"
    -> ("NCMC", "2058896").

    Query strings, fragments and trailing slashes are ignored. Returns
    ``None`` when the path does not carry enough segments.
    """
    # Imported locally so this block does not rely on the module's import list
    from urllib.parse import urlsplit

    segments = [segment for segment in urlsplit(url).path.split("/") if segment]
    # Preferred: the two segments that follow the poster marker
    # NOTE(review): assumes "new-poster" URLs share the "poster" layout — confirm
    for index, segment in enumerate(segments):
        if segment.lower() in ("poster", "new-poster") and index + 2 < len(segments):
            return segments[index + 1], segments[index + 2]
    # Fallback: positional rule (third-to-last and second-to-last segment)
    if len(segments) >= 3:
        return segments[-3], segments[-2]
    return None


def verify_missing_kid_url(url, request_timeout=15):
    """Verify a missing-kid poster URL against the missingkids.org API endpoint.

    Args:
        url: Poster URL to verify.
        request_timeout: Seconds allowed for each HTTP request. Without a
            timeout ``requests`` may wait on an unresponsive server forever.

    Returns:
        A dict with a "status" key:
        - {"status": "duplicate", "redirection": <final url>} when requesting
          ``url`` ends on a different URL.
        - {"status": "valid"} when the API answers with status "success".
        - {"status": "invalid"} when the API answers with status "error".
        - {"status": "unknown"} for any other API answer, a non-200 status
          code, a non-JSON body, or a URL the case details cannot be read from.

    Raises:
        requests.exceptions.RequestException: Network failures and timeouts
            are not handled here; they propagate to the caller.
    """
    # Sleep required? To avoid too many requests error
    url_host_slowdown(url, url_host_slowdown_seconds=float(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))

    # Request, get redirection
    r = requests.get(url, allow_redirects=True, timeout=request_timeout)
    # Redirection?
    if url != r.url:
        return {"status": "duplicate", "redirection": r.url}

    case = _missing_kid_case_from_url(url)
    if case is None:
        logger.info("Could not extract case details when verifying missing kid: {}".format(url))
        return {"status": "unknown"}
    org_prefix, case_num = case

    # Cache timeout missingkids.org
    time.sleep(0.25)

    # Request; passing the values via `params` lets requests URL-encode them
    r = requests.get(
        "https://www.missingkids.org/bin/ncmecEndpoint",
        params={"action": "childDetail", "orgPrefix": org_prefix, "caseNum": case_num},
        timeout=request_timeout,
    )
    # Analyze status code and status result
    if r.status_code != 200:
        # Error status code
        logger.info("Unknown request status: {} when verifying missing kid: {}".format(r.status_code, url))
        return {"status": "unknown"}

    try:
        r_json = r.json()
    except ValueError:
        # A 200 response whose body is not JSON says nothing about the poster
        logger.info("Non-JSON response when verifying missing kid: {}".format(url))
        return {"status": "unknown"}

    # Guard against a JSON payload that is not an object (no .get available)
    api_status = r_json.get("status") if isinstance(r_json, dict) else None
    if api_status == "success":
        # Valid poster
        return {"status": "valid"}
    if api_status == "error":
        # Invalid poster
        return {"status": "invalid"}
    # ?
    logger.info("Unknown json status: {} when verifying missing kid: {}".format(str(r_json), url))
    return {"status": "unknown"}
|
||||||
|
|
||||||
def process_url(url, paywall_bypass=False, request_timeout=15):
|
def process_url(url, paywall_bypass=False, request_timeout=15):
|
||||||
logger.debug("Processing raw URL 1: {}".format(url))
|
|
||||||
|
|
||||||
if (paywall_bypass):
|
if (paywall_bypass):
|
||||||
# TODO: Implement self-hosted instance
|
# TODO: Implement self-hosted instance
|
||||||
@@ -72,7 +104,6 @@ def process_url(url, paywall_bypass=False, request_timeout=15):
|
|||||||
# Default mode
|
# Default mode
|
||||||
article = newspaper.article(url_of_interest, config=config)
|
article = newspaper.article(url_of_interest, config=config)
|
||||||
|
|
||||||
logger.debug("Processing raw URL 2: {}".format(url))
|
|
||||||
except newspaper.ArticleBinaryDataException:
|
except newspaper.ArticleBinaryDataException:
|
||||||
logger.warning("ArticleException for input URL {}".format(url))
|
logger.warning("ArticleException for input URL {}".format(url))
|
||||||
return {"override_status": "invalid"}
|
return {"override_status": "invalid"}
|
||||||
@@ -121,9 +152,7 @@ def process_url(url, paywall_bypass=False, request_timeout=15):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
|
logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
|
||||||
return None
|
return None
|
||||||
|
|
||||||
logger.debug("Processing raw URL 3: {}".format(url))
|
|
||||||
|
|
||||||
# Not a valid URL?
|
# Not a valid URL?
|
||||||
if (not article.is_valid_url()):
|
if (not article.is_valid_url()):
|
||||||
logger.debug("Invalid URL found: {}".format(url))
|
logger.debug("Invalid URL found: {}".format(url))
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ import requests
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
|
|
||||||
def notify_telegram(last_hours=24):
|
def notify_telegram(last_hours=12):
|
||||||
start_date = timezone.now() - timedelta(hours=last_hours)
|
start_date = timezone.now() - timedelta(hours=last_hours)
|
||||||
|
|
||||||
# Count the number of URLs grouped by status within the date range
|
# Count the number of URLs grouped by status within the date range
|
||||||
|
|||||||
@@ -17,19 +17,20 @@
|
|||||||
"cnbc.com"
|
"cnbc.com"
|
||||||
],
|
],
|
||||||
"keyword_search": [
|
"keyword_search": [
|
||||||
"child abuse"
|
"child abuse",
|
||||||
|
"child neglect"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"REGEX_PATTERN_STATUS_PRIORITY": [
|
"REGEX_PATTERN_STATUS_PRIORITY": [
|
||||||
[".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50],
|
[".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50],
|
||||||
["https:\\/\\/x.com\\/.*", "invalid", 50],
|
["https:\\/\\/x.com\\/.*", "invalid", 50],
|
||||||
[".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75],
|
[".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75],
|
||||||
[".*foxnews\\.com\\/(video|category|person|books|html-sitemap)\\/.*", "invalid", 75],
|
|
||||||
[".*radio\\.foxnews\\.com\\/.*", "invalid", 75],
|
|
||||||
[".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75],
|
[".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75],
|
||||||
|
[".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50],
|
||||||
[".*zerohedge\\.com\\/(user|contributors)\\/.*", "invalid", 75],
|
[".*zerohedge\\.com\\/(user|contributors)\\/.*", "invalid", 75],
|
||||||
[".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50],
|
[".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50],
|
||||||
[".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50],
|
[".*radio\\.foxnews\\.com\\/.*", "invalid", 75],
|
||||||
|
[".*foxnews\\.com\\/(video|category|person|books|html-sitemap)\\/.*", "invalid", 75],
|
||||||
[".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50],
|
[".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50],
|
||||||
[".*foxnews\\.com\\/[^\\/]+\\/?$", "invalid", 25]
|
[".*foxnews\\.com\\/[^\\/]+\\/?$", "invalid", 25]
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -160,7 +160,7 @@
|
|||||||
"expire_seconds": null,
|
"expire_seconds": null,
|
||||||
"one_off": false,
|
"one_off": false,
|
||||||
"start_time": null,
|
"start_time": null,
|
||||||
"enabled": false,
|
"enabled": true,
|
||||||
"last_run_at": null,
|
"last_run_at": null,
|
||||||
"total_run_count": 0,
|
"total_run_count": 0,
|
||||||
"date_changed": "2025-07-17T16:20:19.969Z",
|
"date_changed": "2025-07-17T16:20:19.969Z",
|
||||||
@@ -188,7 +188,7 @@
|
|||||||
"expire_seconds": null,
|
"expire_seconds": null,
|
||||||
"one_off": false,
|
"one_off": false,
|
||||||
"start_time": null,
|
"start_time": null,
|
||||||
"enabled": false,
|
"enabled": true,
|
||||||
"last_run_at": null,
|
"last_run_at": null,
|
||||||
"total_run_count": 0,
|
"total_run_count": 0,
|
||||||
"date_changed": "2025-07-17T16:21:30.809Z",
|
"date_changed": "2025-07-17T16:21:30.809Z",
|
||||||
@@ -356,7 +356,7 @@
|
|||||||
"expire_seconds": null,
|
"expire_seconds": null,
|
||||||
"one_off": false,
|
"one_off": false,
|
||||||
"start_time": null,
|
"start_time": null,
|
||||||
"enabled": false,
|
"enabled": true,
|
||||||
"last_run_at": null,
|
"last_run_at": null,
|
||||||
"total_run_count": 0,
|
"total_run_count": 0,
|
||||||
"date_changed": "2025-07-17T16:25:50.597Z",
|
"date_changed": "2025-07-17T16:25:50.597Z",
|
||||||
|
|||||||
Reference in New Issue
Block a user