Selenium based missing kid verify url fix (2)

This commit is contained in:
Luciano Gervasoni
2025-07-07 16:34:21 +02:00
parent a8b236bac0
commit 8cf2b52325
2 changed files with 44 additions and 30 deletions

View File

@@ -25,17 +25,6 @@ class MissingKidsFetcher():
pass
def verify_missing_kid_url(self, url):
# Load `url` in a webdriver obtained from get_webdriver() and classify the page.
# Visible outcomes: "invalid" (404 page or 404 thumbnail), "valid" (title match),
# "duplicate" (browser ended on a different URL, reported under "redirection"),
# and "unknown" (none of the above).
# NOTE(review): this listing shows two variants of the body (one returning
# directly, one collecting `results` inside try/except) and has lost its
# indentation; the comments below describe individual statements only.
# Initialize
driver = get_webdriver()
# Load URL
driver.get(url)
# Wait for 404?
# NOTE(review): WebDriverWait.until raises TimeoutException when the condition
# is not met within the timeout (1 second here), so a page whose title never
# contains "404" does not simply fall through to the check below - confirm
# that this is the intended behaviour.
WebDriverWait(driver, 1).until(EC.title_contains("404"))
if ("404" in driver.title):
# Status invalid
# NOTE(review): this returns a bare string, whereas the other returns in this
# listing use {"status": ...} dicts.
return "invalid"
def load_finished(driver):
# Find all <img> tags with src attributes. Extract src URLs
image_urls = [img.get_attribute("src") for img in driver.find_elements(By.XPATH, "//img[@src]")]
@@ -44,25 +33,50 @@ class MissingKidsFetcher():
# logger.debug("Finished loading URL")
return finished
# Check until finished loading
num_checks = 5
# NOTE(review): num_checks is not decremented in any visible line, so the
# `num_checks>=0` guard cannot end this loop by itself - verify.
while (not load_finished(driver)) and (num_checks>=0):
try:
# Initialize
driver = get_webdriver()
# Load URL
driver.get(url)
# Wait for 404?
# NOTE(review): same TimeoutException concern as above; here the wait appears
# to sit inside the try, so a timeout would presumably be handled by the
# `except Exception` below (results = {}) - confirm against the real file.
WebDriverWait(driver, 1).until(EC.title_contains("404"))
if ("404" in driver.title):
# Status invalid
results = {"status": "invalid"}
else:
# Check until finished loading
num_checks = 5
# NOTE(review): as above, no visible line decrements num_checks.
while (not load_finished(driver)) and (num_checks>=0):
time.sleep(1)
# Find all <img> tags with src attributes. Extract src URLs
image_urls = [img.get_attribute("src") for img in driver.find_elements(By.XPATH, "//img[@src]")]
# A 404 landing URL or a 404 placeholder thumbnail marks the entry as invalid.
if ("missingkids.org/404" in driver.current_url) or (any(["thumb-404.png" in i for i in image_urls])):
# Status invalid
results = {"status": "invalid"}
# NOTE(review): "Haven you seen this child?" looks like a typo for
# "Have you seen this child?"; if so this branch never matches - verify
# against the actual page title before changing the literal.
elif ("Haven you seen this child?" in driver.title):
# Status valid
results = {"status": "valid"}
elif (driver.current_url != url):
# Redirection (duplicate)
results = {"status": "duplicate", "redirection": driver.current_url}
else:
results = {"status": "unknown"}
except Exception as e:
logger.warning("Exception while verifying MissingKid URL {}\n{}".format(url, str(e)), exc_info=True)
# On failure `results` is an empty dict, i.e. it carries no "status" key.
results = {}
# Release memory
try:
# NOTE(review): if get_webdriver() itself raised, `driver` would be unbound
# here; presumably the `except Exception` further down absorbs that - confirm.
driver.quit() #driver.close()
time.sleep(1)
# Find all <img> tags with src attributes. Extract src URLs
image_urls = [img.get_attribute("src") for img in driver.find_elements(By.XPATH, "//img[@src]")]
if ("missingkids.org/404" in driver.current_url) or (any(["thumb-404.png" in i for i in image_urls])):
# Status invalid
return {"status": "invalid"}
# NOTE(review): same suspected "Haven"/"Have" typo as in the branch above.
elif ("Haven you seen this child?" in driver.title):
# Status valid
return {"status": "valid"}
elif (driver.current_url != url):
# Redirection (duplicate)
return {"status": "duplicate", "redirection": driver.current_url}
else:
return {"status": "unknown"}
# import atexit
# atexit.register(driver.quit) # Will always be called on exit
except Exception as e:
logger.warning("Exception while closing/quitting driver: {}".format(str(e)), exc_info=True)
return results
def get_missing_kids_urls(self, first_n_pages=-1):