def get_with_protocol(url):
    """Return *url* with an ``https://`` scheme.

    Only the leading scheme is normalised:
      * ``http://host/...``  -> ``https://host/...``
      * ``host/...``         -> ``https://host/...``
      * ``https://host/...`` -> returned unchanged

    Any ``http://`` that appears later in the string (for example a URL
    carried inside a query parameter) is left untouched; a blanket
    ``str.replace`` would silently rewrite it as well.
    """
    # Already in the target form
    if url.startswith("https://"):
        return url
    # http:// -> https:// (prefix only)
    if url.startswith("http://"):
        return "https://" + url[len("http://"):]
    # "" -> https://
    return "https://" + url
To avoid too many requests error (original URL, not paywall bypassing endpoint) - url_host_slowdown(obj_url.url, url_host_slowdown_seconds=float(os.getenv("FETCHER_URL_HOST_SLEEP", 5))) try: - # Request - r = requests.get(obj_url.url, allow_redirects=True) + # Verify missing kid URL + results = verify_missing_kid_url(obj_url.url) except Exception as e: if (raise_exception_on_error): # Simply raise exception, handled in a different way @@ -118,20 +124,16 @@ class DB_Handler(): # Set status to error self._set_status(obj_url, Urls.STATUS_ENUM.ERROR) return - - if (r.url != obj_url.url): - # Canonical - url_canonical = r.url - # Set duplicate, and insert new canonical form - self._set_duplicate_and_insert_canonical(obj_url, url_canonical) - elif (r.status_code == 200): - # Not enough to determine if it is valid. Need to wait to finish javascript, it might redirect to 404 - # self._set_status(obj_url, Urls.STATUS_ENUM.VALID) - self._set_status(obj_url, Urls.STATUS_ENUM.UNKNOWN) - elif (r.status_code == 404): + + if (results.get("status") == "valid"): + self._set_status(obj_url, Urls.STATUS_ENUM.VALID) + elif (results.get("status") == "invalid"): self._set_status(obj_url, Urls.STATUS_ENUM.INVALID) - else: - logger.debug("Unknown request status: {} for missing kids request: {}".format(r.status_code, obj_url.url)) + elif (results.get("status") == "duplicate"): + self._set_duplicate_and_insert_canonical(obj_url, results.get("redirection")) + elif (results.get("status") == "unknown"): + # Nothing to do, not sure about it... 
+ logger.info("Missing kid verification returned unknown for URL: {}".format(obj_url.url)) self._set_status(obj_url, Urls.STATUS_ENUM.UNKNOWN) return ########################################################################## @@ -314,14 +316,20 @@ class DB_Handler(): # Per URL for obj_url in missingkids_urls: try: - # Missing kids fetching endpoint, verify URL - missingkids_fetch_endpoint = os.path.join(os.getenv("SELENIUM_ENDPOINT", "http://localhost:80"), "verify_missing_kid/") - data = {"url": obj_url.url} - # POST - r = requests.post(missingkids_fetch_endpoint, json=data, timeout=120) - # Jsonify - results = r.json() - logger.debug("Selenium results for URL {}: {}".format(obj_url.url, str(results))) + SELENIUM_BASED_MISSINGKID_VERIFICATION = False + if (SELENIUM_BASED_MISSINGKID_VERIFICATION): + # Missing kids fetching endpoint, verify URL + missingkids_fetch_endpoint = os.path.join(os.getenv("SELENIUM_ENDPOINT", "http://localhost:80"), "verify_missing_kid/") + data = {"url": obj_url.url} + # POST + r = requests.post(missingkids_fetch_endpoint, json=data, timeout=120) + # Jsonify + results = r.json() + logger.debug("Missingkids Selenium results for URL {}: {}".format(obj_url.url, str(results))) + else: + # Verify + results = verify_missing_kid_url(obj_url.url) + logger.debug("Missingkids verify results for URL {}: {}".format(obj_url.url, str(results))) if (results.get("status") == "valid"): self._set_status(obj_url, Urls.STATUS_ENUM.VALID) diff --git a/app_urls/fetcher/src/fetch_utils_url_processor.py b/app_urls/fetcher/src/fetch_utils_url_processor.py index 5ae6d9e..d7b25cb 100644 --- a/app_urls/fetcher/src/fetch_utils_url_processor.py +++ b/app_urls/fetcher/src/fetch_utils_url_processor.py @@ -9,14 +9,6 @@ from urllib.parse import unquote import langdetect langdetect.DetectorFactory.seed = 0 -def get_with_protocol(url): - # http:// -> https:// - url = url.replace("http://", "https://") - # "" -> https:// - if not (url.startswith("https://")): - url = 
def _missing_kid_case_from_url(url):
    """Extract ``(org_prefix, case_num)`` from a missingkids.org poster URL.

    Sample URL: "https://www.missingkids.org/poster/NCMC/2058896/1"
    -> ("NCMC", "2058896")

    Returns None when the URL path does not hold enough segments.
    """
    from urllib.parse import urlsplit

    # Work on the path only, dropping empty segments, so that a query string,
    # a fragment or a trailing slash cannot shift the positions
    segments = [segment for segment in urlsplit(url).path.split("/") if segment]

    # Preferred: the two segments right after the "poster" marker
    # NOTE(review): assumes "new-poster" URLs share the layout of the sample URL - confirm
    for idx, segment in enumerate(segments):
        if segment in ("poster", "new-poster") and len(segments) >= idx + 3:
            return segments[idx + 1], segments[idx + 2]

    # Fallback: same positions as in the sample URL (third and second from the end)
    if len(segments) >= 3:
        return segments[-3], segments[-2]
    return None


def verify_missing_kid_url(url, request_timeout=15):
    """Check a missingkids.org poster URL against the site's JSON endpoint.

    Steps:
      1. Throttle per host via ``url_host_slowdown`` (delay taken from the
         FETCHER_URL_HOST_SLEEP environment variable, default 5 seconds).
      2. GET the URL following redirects; a different final URL is reported
         as a duplicate of that final URL.
      3. Query the ``ncmecEndpoint`` ``childDetail`` action with the org prefix
         and case number taken from the URL, and map its answer to a status.

    Args:
        url: Poster URL to verify.
        request_timeout: Seconds before each HTTP request is abandoned.

    Returns:
        A dict with key "status" set to one of:
          * "duplicate" - the URL redirected; key "redirection" holds the final URL
          * "valid"     - the endpoint answered with status "success"
          * "invalid"   - the endpoint answered with status "error"
          * "unknown"   - anything else (unexpected HTTP code, unexpected or
                          non-JSON body, case details not found in the URL)

    Raises:
        requests.RequestException: connection problems and timeouts are not
            caught here and propagate to the caller.
    """
    # Sleep required? To avoid too many requests error
    url_host_slowdown(url, url_host_slowdown_seconds=float(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))

    # Request, get redirection (always bounded by a timeout, otherwise the call may never return)
    r = requests.get(url, allow_redirects=True, timeout=request_timeout)
    # Redirection?
    if (url != r.url):
        return {"status": "duplicate", "redirection": r.url}

    # Details required by the API endpoint
    case_details = _missing_kid_case_from_url(url)
    if (case_details is None):
        # Nothing meaningful to ask the endpoint, skip the request
        logger.info("Could not extract case details when verifying missing kid: {}".format(url))
        return {"status": "unknown"}
    org_prefix, case_num = case_details

    # Short pause between the two consecutive requests to missingkids.org
    time.sleep(0.25)

    # Request; query values are passed as params so they get URL-encoded
    r = requests.get(
        "https://www.missingkids.org/bin/ncmecEndpoint",
        params={"action": "childDetail", "orgPrefix": org_prefix, "caseNum": case_num},
        timeout=request_timeout,
    )

    # Analyze status code and status result
    if (r.status_code != 200):
        # Error status code
        logger.info("Unknown request status: {} when verifying missing kid: {}".format(r.status_code, url))
        return {"status": "unknown"}

    try:
        r_json = r.json()
    except ValueError:
        # 200 with a body that is not JSON (e.g. an HTML page)
        logger.info("Non-JSON response when verifying missing kid: {}".format(url))
        return {"status": "unknown"}

    # Only a JSON object can carry the "status" field
    api_status = r_json.get("status") if isinstance(r_json, dict) else None
    if (api_status == "success"):
        # Valid poster
        return {"status": "valid"}
    if (api_status == "error"):
        # Invalid poster
        return {"status": "invalid"}

    # ?
    logger.info("Unknown json status: {} when verifying missing kid: {}".format(str(r_json), url))
    return {"status": "unknown"}
if (not article.is_valid_url()): logger.debug("Invalid URL found: {}".format(url)) diff --git a/app_urls/fetcher/src/notifier.py b/app_urls/fetcher/src/notifier.py index 89685f2..eabb1cb 100644 --- a/app_urls/fetcher/src/notifier.py +++ b/app_urls/fetcher/src/notifier.py @@ -6,7 +6,7 @@ import requests import os -def notify_telegram(last_hours=24): +def notify_telegram(last_hours=12): start_date = timezone.now() - timedelta(hours=last_hours) # Count the number of URLs grouped by status within the date range diff --git a/app_urls/init_data.json b/app_urls/init_data.json index 2940549..859c84e 100644 --- a/app_urls/init_data.json +++ b/app_urls/init_data.json @@ -17,19 +17,20 @@ "cnbc.com" ], "keyword_search": [ - "child abuse" + "child abuse", + "child neglect" ] }, "REGEX_PATTERN_STATUS_PRIORITY": [ [".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50], ["https:\\/\\/x.com\\/.*", "invalid", 50], [".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75], - [".*foxnews\\.com\\/(video|category|person|books|html-sitemap)\\/.*", "invalid", 75], - [".*radio\\.foxnews\\.com\\/.*", "invalid", 75], [".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75], + [".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50], [".*zerohedge\\.com\\/(user|contributors)\\/.*", "invalid", 75], [".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50], - [".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50], + [".*radio\\.foxnews\\.com\\/.*", "invalid", 75], + [".*foxnews\\.com\\/(video|category|person|books|html-sitemap)\\/.*", "invalid", 75], [".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50], [".*foxnews\\.com\\/[^\\/]+\\/?$", "invalid", 25] ] diff --git a/app_urls/scheduled_tasks.json b/app_urls/scheduled_tasks.json index f6c422e..3712ab0 100644 --- a/app_urls/scheduled_tasks.json +++ b/app_urls/scheduled_tasks.json @@ -160,7 +160,7 @@ "expire_seconds": null, "one_off": false, 
"start_time": null, - "enabled": false, + "enabled": true, "last_run_at": null, "total_run_count": 0, "date_changed": "2025-07-17T16:20:19.969Z", @@ -188,7 +188,7 @@ "expire_seconds": null, "one_off": false, "start_time": null, - "enabled": false, + "enabled": true, "last_run_at": null, "total_run_count": 0, "date_changed": "2025-07-17T16:21:30.809Z", @@ -356,7 +356,7 @@ "expire_seconds": null, "one_off": false, "start_time": null, - "enabled": false, + "enabled": true, "last_run_at": null, "total_run_count": 0, "date_changed": "2025-07-17T16:25:50.597Z",