diff --git a/app_selenium/app.py b/app_selenium/app.py
index 7aa7323..822de05 100644
--- a/app_selenium/app.py
+++ b/app_selenium/app.py
@@ -1,4 +1,5 @@
 from fastapi import FastAPI
+from pydantic import BaseModel
 from missing_kids import MissingKidsFetcher
 from logger import get_logger
 logger = get_logger()
@@ -12,3 +13,26 @@ def get_missing_kids(pages: int = -1):
     except Exception as e:
         res = {}
     return res
+
+
+class Body(BaseModel):
+    """Request payload of the poster verification endpoint."""
+
+    url: str
+
+
+@app.post("/verify_missing_kid/")
+def verify_missing_kid(data: Body):
+    """Verify a single MissingKids poster URL.
+
+    Returns the status dict produced by
+    MissingKidsFetcher.verify_missing_kid_url, or an empty dict when the
+    verification raised.
+    """
+    try:
+        res = MissingKidsFetcher().verify_missing_kid_url(data.url)
+    except Exception as e:
+        # Best-effort endpoint: report the failure, answer with an empty result
+        logger.warning("Error verifying missing kid URL: {}\n{}".format(data.url, str(e)))
+        res = {}
+    return res
diff --git a/app_selenium/missing_kids.py b/app_selenium/missing_kids.py
index f42d00d..7c8a03f 100644
--- a/app_selenium/missing_kids.py
+++ b/app_selenium/missing_kids.py
@@ -2,6 +2,8 @@ from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.firefox.options import Options
 from selenium.webdriver.firefox.service import Service
+from selenium.common.exceptions import TimeoutException
+from selenium.webdriver.support.ui import WebDriverWait
 import time
 import os
 
@@ -22,6 +24,54 @@ class MissingKidsFetcher():
     def __init__(self) -> None:
         pass
 
+    def verify_missing_kid_url(self, url):
+        """Render a MissingKids poster URL in the browser and classify it.
+
+        Returns a dict whose "status" is "invalid" (404 page), "valid"
+        (poster title found), "duplicate" (redirected elsewhere, final URL
+        under "redirection") or "unknown".
+        """
+        # Initialize
+        driver = get_webdriver()
+        try:
+            # Load URL
+            driver.get(url)
+
+            def get_image_urls(driver):
+                # Find all tags with src attributes. Extract src URLs
+                return [img.get_attribute("src") or "" for img in driver.find_elements(By.XPATH, "//img[@src]")]
+
+            def load_finished(driver):
+                # The 404 page never renders a poster, nothing else to wait for
+                if ("404" in driver.title) or ("missingkids.org/404" in driver.current_url):
+                    return True
+                # If base64 image exists, loading finished
+                return any("data:image/png;base64" in i for i in get_image_urls(driver))
+
+            try:
+                # Check once per second until finished loading, 5 seconds at most
+                WebDriverWait(driver, 5, poll_frequency=1).until(load_finished)
+            except TimeoutException:
+                # Not fatal: classify whatever has been rendered so far
+                logger.debug("Timeout waiting for URL to finish loading: {}".format(url))
+
+            image_urls = get_image_urls(driver)
+
+            if ("404" in driver.title) or ("missingkids.org/404" in driver.current_url) or any("thumb-404.png" in i for i in image_urls):
+                # Status invalid
+                return {"status": "invalid"}
+            elif ("Have you seen this child?" in driver.title):
+                # Status valid
+                return {"status": "valid"}
+            elif (driver.current_url != url):
+                # Redirection (duplicate)
+                return {"status": "duplicate", "redirection": driver.current_url}
+            else:
+                return {"status": "unknown"}
+        finally:
+            # Always release the browser, also when loading or classification raises
+            driver.quit()
+
     def get_missing_kids_urls(self, first_n_pages=-1):
         logger.info("Get MissingKids, #pages: {}".format(first_n_pages))
         # Poster URL
diff --git a/app_urls/fetcher/src/db_utils.py b/app_urls/fetcher/src/db_utils.py
index f65df66..0c9c077 100644
--- a/app_urls/fetcher/src/db_utils.py
+++ b/app_urls/fetcher/src/db_utils.py
@@ -115,12 +115,14 @@ class DB_Handler():
                 # Set duplicate, and insert new canonical form
                 self._set_duplicate_and_insert_canonical(obj_url, url_canonical)
             elif (r.status_code == 200):
-                self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
+                # Not enough to determine if it is valid. Need to wait to finish javascript, it might redirect to 404
+                # self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
+                self._set_status(obj_url, Urls.STATUS_ENUM.UNKNOWN)
             elif (r.status_code == 404):
                 self._set_status(obj_url, Urls.STATUS_ENUM.INVALID)
             else:
-                logger.info("Unknown request status: {} for missing kids request: {}".format(r.status_code, obj_url.url))
-
+                logger.debug("Unknown request status: {} for missing kids request: {}".format(r.status_code, obj_url.url))
+                self._set_status(obj_url, Urls.STATUS_ENUM.UNKNOWN)
             return
 
     ##########################################################################
@@ -275,18 +277,42 @@ class DB_Handler():
             missingkids_urls = Urls.objects.order_by("-ts_fetch").filter(
                 (Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster"))
                 &
-                (Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.ERROR))
+                (Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.UNKNOWN) | Q(status=Urls.STATUS_ENUM.ERROR))
             )
 
             # Get batch size
             if (batch_size is not None):
                 missingkids_urls = missingkids_urls[:batch_size]
 
+            # TODO: Cache processed during last X hours, filter them...
+
+            # Missing kids verification endpoint, identical for every URL of the batch.
+            # Built by plain concatenation: os.path.join is meant for filesystem paths, not URLs
+            missingkids_fetch_endpoint = os.getenv("SELENIUM_ENDPOINT", "http://localhost:80").rstrip("/") + "/verify_missing_kid/"
+
             # Per URL
             for obj_url in missingkids_urls:
                 try:
-                    # Process URL
-                    self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=None)
+                    # POST, verify URL. Rendering takes a while, but never block forever on a hung service
+                    r = requests.post(missingkids_fetch_endpoint, json={"url": obj_url.url}, timeout=120)
+                    # HTTP errors are reported by the except below
+                    r.raise_for_status()
+                    # Jsonify
+                    results = r.json()
+                    # An empty answer means the verification failed on the service side
+                    status = results.get("status") if isinstance(results, dict) else None
+
+                    if (status == "valid"):
+                        self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
+                    elif (status == "invalid"):
+                        self._set_status(obj_url, Urls.STATUS_ENUM.INVALID)
+                    elif (status == "duplicate") and results.get("redirection"):
+                        self._set_duplicate_and_insert_canonical(obj_url, results.get("redirection"))
+                    elif (status == "unknown"):
+                        # Nothing to do, not sure about it...
+                        logger.info("Missing kid verification returned unknown for URL: {}".format(obj_url.url))
+                    else:
+                        logger.warning("Unexpected missing kid verification result for URL: {}\n{}".format(obj_url.url, results))
                 except Exception as e:
                     logger.warning("Unknown error processing missing kids poster for URL: {}\n{}".format(obj_url.url, str(e)))
 