From 8cf2b52325e9eb882bbcd6e19531a99ef890c69d Mon Sep 17 00:00:00 2001 From: Luciano Gervasoni Date: Mon, 7 Jul 2025 16:34:21 +0200 Subject: [PATCH] Selenium based missing kid verify url fix (2) --- app_selenium/missing_kids.py | 72 +++++++++++++++++++------------- app_urls/fetcher/src/db_utils.py | 2 +- 2 files changed, 44 insertions(+), 30 deletions(-) diff --git a/app_selenium/missing_kids.py b/app_selenium/missing_kids.py index 7c8a03f..da89073 100644 --- a/app_selenium/missing_kids.py +++ b/app_selenium/missing_kids.py @@ -25,17 +25,6 @@ class MissingKidsFetcher(): pass def verify_missing_kid_url(self, url): - # Initialize - driver = get_webdriver() - # Load URL - driver.get(url) - # Wait for 404? - WebDriverWait(driver, 1).until(EC.title_contains("404")) - - if ("404" in driver.title): - # Status invalid - return "invalid" - def load_finished(driver): # Find all tags with src attributes. Extract src URLs image_urls = [img.get_attribute("src") for img in driver.find_elements(By.XPATH, "//img[@src]")] @@ -44,25 +33,50 @@ class MissingKidsFetcher(): # logger.debug("Finished loading URL") return finished - # Check until finished loading - num_checks = 5 - while (not load_finished(driver)) and (num_checks>=0): + try: + # Initialize + driver = get_webdriver() + # Load URL + driver.get(url) + # Wait for 404? + WebDriverWait(driver, 1).until(EC.title_contains("404")) + + if ("404" in driver.title): + # Status invalid + results = {"status": "invalid"} + else: + # Check until finished loading + num_checks = 5 + while (not load_finished(driver)) and (num_checks>=0): + time.sleep(1) + + # Find all tags with src attributes. Extract src URLs + image_urls = [img.get_attribute("src") for img in driver.find_elements(By.XPATH, "//img[@src]")] + + if ("missingkids.org/404" in driver.current_url) or (any(["thumb-404.png" in i for i in image_urls])): + # Status invalid + results = {"status": "invalid"} + elif ("Haven you seen this child?" in driver.title): + # Status valid + results = {"status": "valid"} + elif (driver.current_url != url): + # Redirection (duplicate) + results = {"status": "duplicate", "redirection": driver.current_url} + else: + results = {"status": "unknown"} + except Exception as e: + logger.warning("Exception while verifying MissingKid URL {}\n{}".format(url, str(e)), exc_info=True) + results = {} + + # Release memory + try: + driver.quit() #driver.close() time.sleep(1) - - # Find all tags with src attributes. Extract src URLs - image_urls = [img.get_attribute("src") for img in driver.find_elements(By.XPATH, "//img[@src]")] - - if ("missingkids.org/404" in driver.current_url) or (any(["thumb-404.png" in i for i in image_urls])): - # Status invalid - return {"status": "invalid"} - elif ("Haven you seen this child?" in driver.title): - # Status valid - return {"status": "valid"} - elif (driver.current_url != url): - # Redirection (duplicate) - return {"status": "duplicate", "redirection": driver.current_url} - else: - return {"status": "unknown"} + # import atexit + # atexit.register(driver.quit) # Will always be called on exit + except Exception as e: + logger.warning("Exception while closing/quitting driver: {}".format(str(e)), exc_info=True) + return results def get_missing_kids_urls(self, first_n_pages=-1): diff --git a/app_urls/fetcher/src/db_utils.py b/app_urls/fetcher/src/db_utils.py index 0c9c077..bf09b2e 100644 --- a/app_urls/fetcher/src/db_utils.py +++ b/app_urls/fetcher/src/db_utils.py @@ -293,7 +293,7 @@ class DB_Handler(): missingkids_fetch_endpoint = os.path.join(os.getenv("SELENIUM_ENDPOINT", "http://localhost:80"), "verify_missing_kid/") data = {"url": obj_url.url} # POST - r = requests.post(missingkids_fetch_endpoint, json=data) + r = requests.post(missingkids_fetch_endpoint, json=data, timeout=30) # Jsonify results = r.json()