diff --git a/app_selenium/app.py b/app_selenium/app.py
index 7aa7323..822de05 100644
--- a/app_selenium/app.py
+++ b/app_selenium/app.py
@@ -1,4 +1,5 @@
from fastapi import FastAPI
+from pydantic import BaseModel
from missing_kids import MissingKidsFetcher
from logger import get_logger
logger = get_logger()
@@ -12,3 +13,14 @@ def get_missing_kids(pages: int = -1):
except Exception as e:
res = {}
return res
+
+# Request payload accepted by the POST /verify_missing_kid/ endpoint.
+class Body(BaseModel):
+ # URL handed to MissingKidsFetcher.verify_missing_kid_url() by the endpoint
+ url: str
+
+@app.post("/verify_missing_kid/")
+def verify_missing_kid(data: Body):
+    """Verify a single MissingKids poster URL.
+
+    The handler has its own name so it does not rebind the module-level
+    `get_missing_kids` used by the GET endpoint; the route path is unchanged.
+
+    Args:
+        data: Request body carrying the poster URL in `data.url`.
+
+    Returns:
+        Whatever `MissingKidsFetcher.verify_missing_kid_url` returns, or an
+        empty dict when verification raised.
+    """
+    try:
+        res = MissingKidsFetcher().verify_missing_kid_url(data.url)
+    except Exception as e:
+        # Best-effort endpoint: still answer with an empty JSON object, but leave a trace of the failure
+        logger.warning("Exception verifying missing kid URL: {}\n{}".format(data.url, str(e)))
+        res = {}
+    return res
\ No newline at end of file
diff --git a/app_selenium/missing_kids.py b/app_selenium/missing_kids.py
index f42d00d..7c8a03f 100644
--- a/app_selenium/missing_kids.py
+++ b/app_selenium/missing_kids.py
@@ -2,6 +2,8 @@ from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
import time
import os
@@ -22,6 +24,47 @@ class MissingKidsFetcher():
def __init__(self) -> None:
pass
+    def verify_missing_kid_url(self, url):
+        """Load a MissingKids poster URL in a browser and classify it.
+
+        Args:
+            url: Poster URL to verify.
+
+        Returns:
+            dict: Always carries a "status" key, one of:
+                "invalid"   - the page is, or ends up on, a 404
+                "valid"     - the poster page title was found
+                "duplicate" - the browser ended on a different URL, given under "redirection"
+                "unknown"   - none of the above could be established
+        """
+        # Local import: keeps this method self-contained with respect to the module import block
+        from selenium.common.exceptions import TimeoutException
+
+        def get_image_urls():
+            # Find all <img> tags with src attributes. Extract src URLs
+            sources = (img.get_attribute("src") for img in driver.find_elements(By.XPATH, "//img[@src]"))
+            return [src for src in sources if src]
+
+        def load_finished():
+            # If base64 image exists, loading finished
+            return any("data:image/png;base64" in i for i in get_image_urls())
+
+        # Initialize
+        driver = get_webdriver()
+        try:
+            # Load URL
+            driver.get(url)
+
+            # Give the page up to one second to turn into a 404
+            try:
+                WebDriverWait(driver, 1).until(EC.title_contains("404"))
+            except TimeoutException:
+                # until() raises when the title never matches, which is the normal case for a live poster
+                pass
+
+            if "404" in driver.title:
+                # Status invalid (same dict shape as every other outcome)
+                return {"status": "invalid"}
+
+            # Check until finished loading; the counter bounds the wait so a page
+            # that never renders its base64 image cannot block forever
+            num_checks = 5
+            while (num_checks >= 0) and (not load_finished()):
+                time.sleep(1)
+                num_checks -= 1
+
+            image_urls = get_image_urls()
+            current_url = driver.current_url
+
+            if ("missingkids.org/404" in current_url) or any("thumb-404.png" in i for i in image_urls):
+                # Status invalid
+                return {"status": "invalid"}
+            elif "Have you seen this child?" in driver.title:
+                # Status valid
+                return {"status": "valid"}
+            elif current_url != url:
+                # Redirection (duplicate)
+                return {"status": "duplicate", "redirection": current_url}
+            else:
+                return {"status": "unknown"}
+        finally:
+            # Release the browser on every exit path.
+            # NOTE(review): assumes get_webdriver() returns a fresh driver per call - confirm
+            driver.quit()
+
+
def get_missing_kids_urls(self, first_n_pages=-1):
logger.info("Get MissingKids, #pages: {}".format(first_n_pages))
# Poster URL
diff --git a/app_urls/fetcher/src/db_utils.py b/app_urls/fetcher/src/db_utils.py
index f65df66..0c9c077 100644
--- a/app_urls/fetcher/src/db_utils.py
+++ b/app_urls/fetcher/src/db_utils.py
@@ -115,12 +115,14 @@ class DB_Handler():
# Set duplicate, and insert new canonical form
self._set_duplicate_and_insert_canonical(obj_url, url_canonical)
elif (r.status_code == 200):
- self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
+ # Not enough to determine if it is valid. Need to wait to finish javascript, it might redirect to 404
+ # self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
+ self._set_status(obj_url, Urls.STATUS_ENUM.UNKNOWN)
elif (r.status_code == 404):
self._set_status(obj_url, Urls.STATUS_ENUM.INVALID)
else:
- logger.info("Unknown request status: {} for missing kids request: {}".format(r.status_code, obj_url.url))
-
+ logger.debug("Unknown request status: {} for missing kids request: {}".format(r.status_code, obj_url.url))
+ self._set_status(obj_url, Urls.STATUS_ENUM.UNKNOWN)
return
##########################################################################
@@ -275,18 +277,37 @@ class DB_Handler():
missingkids_urls = Urls.objects.order_by("-ts_fetch").filter(
(Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster"))
&
- (Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.ERROR))
+ (Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.UNKNOWN) | Q(status=Urls.STATUS_ENUM.ERROR))
)
# Get batch size
if (batch_size is not None):
missingkids_urls = missingkids_urls[:batch_size]
+ # TODO: Cache processed during last X hours, filter them...
+
# Per URL
for obj_url in missingkids_urls:
try:
- # Process URL
- self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=None)
+ # Missing kids fetching endpoint, verify URL
+ missingkids_fetch_endpoint = os.path.join(os.getenv("SELENIUM_ENDPOINT", "http://localhost:80"), "verify_missing_kid/")
+ data = {"url": obj_url.url}
+ # POST
+ r = requests.post(missingkids_fetch_endpoint, json=data)
+ # Jsonify
+ results = r.json()
+
+ if (results.get("status") == "valid"):
+ self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
+ elif (results.get("status") == "invalid"):
+ self._set_status(obj_url, Urls.STATUS_ENUM.INVALID)
+ elif (results.get("status") == "duplicate"):
+ self._set_duplicate_and_insert_canonical(obj_url, results.get("redirection"))
+ elif (results.get("status") == "unknown"):
+ # Nothing to do, not sure about it...
+ logger.info("Missing kid verification returned unknown for URL: {}".format(obj_url.url))
+ pass
+
except Exception as e:
logger.warning("Unknown error processing missing kids poster for URL: {}\n{}".format(obj_url.url, str(e)))