Selenium-based verification of missing kid URLs

This commit is contained in:
Luciano Gervasoni
2025-07-07 16:02:11 +02:00
parent 15035c108d
commit a8b236bac0
3 changed files with 82 additions and 6 deletions

View File

@@ -1,4 +1,5 @@
from fastapi import FastAPI
from pydantic import BaseModel
from missing_kids import MissingKidsFetcher
from logger import get_logger
logger = get_logger()
@@ -12,3 +13,14 @@ def get_missing_kids(pages: int = -1):
except Exception as e:
res = {}
return res
class Body(BaseModel):
    """Request payload of the POST /verify_missing_kid/ endpoint."""

    # URL handed to MissingKidsFetcher.verify_missing_kid_url by the handler.
    url: str
@app.post("/verify_missing_kid/")
def get_missing_kids(data: Body):
    """Verify the poster URL carried in the request body.

    Args:
        data: Request payload; only ``data.url`` is read.

    Returns:
        Whatever ``MissingKidsFetcher.verify_missing_kid_url`` returns, or an
        empty dict when the verification raised.
    """
    # NOTE(review): this handler shares its name with the GET handler defined
    # earlier in this file. FastAPI registers routes at decoration time, so
    # both endpoints keep working, but the module-level name now refers to
    # this function only. Consider renaming it.
    try:
        return MissingKidsFetcher().verify_missing_kid_url(data.url)
    except Exception as e:
        # API boundary: keep the best-effort empty response, but leave a
        # trace instead of discarding the failure silently.
        logger.warning("Exception verifying missing kid URL {}: {}".format(data.url, str(e)))
        return {}

View File

@@ -2,6 +2,8 @@ from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import os
@@ -22,6 +24,47 @@ class MissingKidsFetcher():
def __init__(self) -> None:
    """Create the fetcher; there is no instance state to set up."""
def verify_missing_kid_url(self, url):
    """Load a poster URL in a browser and classify what it shows.

    Args:
        url: Address of the page to check.

    Returns:
        A dict with a "status" key whose value is one of:
        - "invalid": the page is a 404 (title, final URL or 404 thumbnail).
        - "valid": the page title is the poster title.
        - "duplicate": the browser ended on a different URL, which is
          returned under the "redirection" key.
        - "unknown": none of the above.
    """
    # Function-scope import so the file's import block needs no change.
    from selenium.common.exceptions import TimeoutException

    def image_sources(driver):
        # Find all <img> tags with src attributes. Extract src URLs, dropping
        # empty/None values so the substring checks below cannot raise.
        elements = driver.find_elements(By.XPATH, "//img[@src]")
        return [src for src in (img.get_attribute("src") for img in elements) if src]

    def load_finished(driver):
        # If base64 image exists, loading finished
        return any("data:image/png;base64" in src for src in image_sources(driver))

    # Initialize
    driver = get_webdriver()
    try:
        # Load URL
        driver.get(url)

        # Give the page up to one second to show a 404 title. until() raises
        # TimeoutException when the condition is never met, which is the
        # normal outcome for any page that is not a 404.
        try:
            WebDriverWait(driver, 1).until(EC.title_contains("404"))
        except TimeoutException:
            pass
        if "404" in driver.title:
            # Status invalid
            return {"status": "invalid"}

        # Check until finished loading, bounded so a page that never renders
        # the base64 image cannot block forever.
        num_checks = 5
        while (not load_finished(driver)) and (num_checks >= 0):
            time.sleep(1)
            num_checks -= 1

        image_urls = image_sources(driver)
        current_url = driver.current_url

        if ("missingkids.org/404" in current_url) or any("thumb-404.png" in src for src in image_urls):
            # Status invalid
            return {"status": "invalid"}
        if "Have you seen this child?" in driver.title:
            # Status valid
            return {"status": "valid"}
        if current_url != url:
            # Redirection (duplicate)
            return {"status": "duplicate", "redirection": current_url}
        return {"status": "unknown"}
    finally:
        # Release the browser on every exit path.
        # NOTE(review): assumes get_webdriver() returns a fresh browser per
        # call; confirm it is not a shared instance before relying on this.
        driver.quit()
def get_missing_kids_urls(self, first_n_pages=-1):
logger.info("Get MissingKids, #pages: {}".format(first_n_pages))
# Poster URL