Selenium-based verification of missing kid URLs

This commit is contained in:
Luciano Gervasoni
2025-07-07 16:02:11 +02:00
parent 15035c108d
commit a8b236bac0
3 changed files with 82 additions and 6 deletions

View File

@@ -1,4 +1,5 @@
from fastapi import FastAPI from fastapi import FastAPI
from pydantic import BaseModel
from missing_kids import MissingKidsFetcher from missing_kids import MissingKidsFetcher
from logger import get_logger from logger import get_logger
logger = get_logger() logger = get_logger()
@@ -12,3 +13,14 @@ def get_missing_kids(pages: int = -1):
except Exception as e: except Exception as e:
res = {} res = {}
return res return res
# Request payload accepted by the POST /verify_missing_kid/ endpoint.
class Body(BaseModel):
    # URL to verify; forwarded as-is to MissingKidsFetcher.verify_missing_kid_url
    url: str
@app.post("/verify_missing_kid/")
def get_missing_kids(data: Body):
    """Verify a single missing-kid poster URL.

    Delegates to ``MissingKidsFetcher.verify_missing_kid_url`` and returns
    its result. Any failure is logged and reported as an empty dict, so the
    endpoint stays best-effort for the caller.

    Args:
        data: Request body carrying the ``url`` to verify.

    Returns:
        The verification result, or ``{}`` if verification raised.
    """
    # NOTE(review): this function reuses the name of the GET handler defined
    # earlier in this module. Both routes are still registered (registration
    # happens at decoration time), but a distinct name would be clearer.
    try:
        res = MissingKidsFetcher().verify_missing_kid_url(data.url)
    except Exception as e:
        # Keep the best-effort contract, but do not hide the failure
        logger.warning("Error verifying missing kid URL: {}\n{}".format(data.url, str(e)))
        res = {}
    return res

View File

@@ -2,6 +2,8 @@ from selenium import webdriver
from selenium.webdriver.common.by import By from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time import time
import os import os
@@ -22,6 +24,47 @@ class MissingKidsFetcher():
def __init__(self) -> None: def __init__(self) -> None:
pass pass
def verify_missing_kid_url(self, url):
    """Load a poster URL in the browser and classify what it ends up showing.

    The page is loaded with Selenium and inspected after a bounded wait, so
    that content or redirections produced after the initial load are seen.

    Args:
        url: Poster URL to verify.

    Returns:
        dict: Always contains a "status" key, one of:
            - "invalid": a 404 page was detected (title, final URL, or
              404 thumbnail image).
            - "valid": the poster page title was found.
            - "duplicate": the browser ended on a different URL, which is
              returned under the extra "redirection" key.
            - "unknown": none of the above could be established.
    """
    # Function-scope import: only this method needs the exception type
    from selenium.common.exceptions import TimeoutException

    def image_sources(driver):
        # Find all <img> tags with src attributes. Extract src URLs
        return [
            img.get_attribute("src") or ""
            for img in driver.find_elements(By.XPATH, "//img[@src]")
        ]

    def load_finished(driver):
        # If base64 image exists, loading finished
        return any("data:image/png;base64" in src for src in image_sources(driver))

    # Upper bound on the one-second polls while waiting for the page to load
    max_checks = 5

    # Initialize
    # NOTE(review): assumes get_webdriver() returns a fresh driver per call,
    # so it is released below -- confirm against its definition.
    driver = get_webdriver()
    try:
        # Load URL
        driver.get(url)

        # Give a 404 title a moment to show up. until() raises
        # TimeoutException when the title never matches, which is the
        # normal case for an existing page, not an error.
        try:
            WebDriverWait(driver, 1).until(EC.title_contains("404"))
        except TimeoutException:
            pass
        if "404" in driver.title:
            # Status invalid (same dict shape as every other outcome)
            return {"status": "invalid"}

        # Check until finished loading, but never wait forever
        for _ in range(max_checks):
            if load_finished(driver):
                break
            time.sleep(1)

        image_urls = image_sources(driver)
        current_url = driver.current_url

        if ("missingkids.org/404" in current_url) or any("thumb-404.png" in src for src in image_urls):
            # Status invalid
            return {"status": "invalid"}
        elif "Have you seen this child?" in driver.title:
            # Status valid
            return {"status": "valid"}
        elif current_url != url:
            # Redirection (duplicate)
            return {"status": "duplicate", "redirection": current_url}
        else:
            return {"status": "unknown"}
    finally:
        # Always release the browser, including on early returns and errors
        driver.quit()
def get_missing_kids_urls(self, first_n_pages=-1): def get_missing_kids_urls(self, first_n_pages=-1):
logger.info("Get MissingKids, #pages: {}".format(first_n_pages)) logger.info("Get MissingKids, #pages: {}".format(first_n_pages))
# Poster URL # Poster URL

View File

@@ -115,12 +115,14 @@ class DB_Handler():
# Set duplicate, and insert new canonical form # Set duplicate, and insert new canonical form
self._set_duplicate_and_insert_canonical(obj_url, url_canonical) self._set_duplicate_and_insert_canonical(obj_url, url_canonical)
elif (r.status_code == 200): elif (r.status_code == 200):
self._set_status(obj_url, Urls.STATUS_ENUM.VALID) # Not enough to determine if it is valid. Need to wait to finish javascript, it might redirect to 404
# self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
self._set_status(obj_url, Urls.STATUS_ENUM.UNKNOWN)
elif (r.status_code == 404): elif (r.status_code == 404):
self._set_status(obj_url, Urls.STATUS_ENUM.INVALID) self._set_status(obj_url, Urls.STATUS_ENUM.INVALID)
else: else:
logger.info("Unknown request status: {} for missing kids request: {}".format(r.status_code, obj_url.url)) logger.debug("Unknown request status: {} for missing kids request: {}".format(r.status_code, obj_url.url))
self._set_status(obj_url, Urls.STATUS_ENUM.UNKNOWN)
return return
########################################################################## ##########################################################################
@@ -275,18 +277,37 @@ class DB_Handler():
missingkids_urls = Urls.objects.order_by("-ts_fetch").filter( missingkids_urls = Urls.objects.order_by("-ts_fetch").filter(
(Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster")) (Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster"))
& &
(Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.ERROR)) (Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.UNKNOWN) | Q(status=Urls.STATUS_ENUM.ERROR))
) )
# Get batch size # Get batch size
if (batch_size is not None): if (batch_size is not None):
missingkids_urls = missingkids_urls[:batch_size] missingkids_urls = missingkids_urls[:batch_size]
# TODO: Cache processed during last X hours, filter them...
# Per URL # Per URL
for obj_url in missingkids_urls: for obj_url in missingkids_urls:
try: try:
# Process URL # Missing kids fetching endpoint, verify URL
self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=None) missingkids_fetch_endpoint = os.path.join(os.getenv("SELENIUM_ENDPOINT", "http://localhost:80"), "verify_missing_kid/")
data = {"url": obj_url.url}
# POST
r = requests.post(missingkids_fetch_endpoint, json=data)
# Jsonify
results = r.json()
if (results.get("status") == "valid"):
self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
elif (results.get("status") == "invalid"):
self._set_status(obj_url, Urls.STATUS_ENUM.INVALID)
elif (results.get("status") == "duplicate"):
self._set_duplicate_and_insert_canonical(obj_url, results.get("redirection"))
elif (results.get("status") == "unknown"):
# Nothing to do, not sure about it...
logger.info("Missing kid verification returned unknown for URL: {}".format(obj_url.url))
pass
except Exception as e: except Exception as e:
logger.warning("Unknown error processing missing kids poster for URL: {}\n{}".format(obj_url.url, str(e))) logger.warning("Unknown error processing missing kids poster for URL: {}\n{}".format(obj_url.url, str(e)))