Selenium-based verification of missing kid URLs
This commit is contained in:
@@ -1,4 +1,5 @@
|
|||||||
from fastapi import FastAPI
|
from fastapi import FastAPI
|
||||||
|
from pydantic import BaseModel
|
||||||
from missing_kids import MissingKidsFetcher
|
from missing_kids import MissingKidsFetcher
|
||||||
from logger import get_logger
|
from logger import get_logger
|
||||||
logger = get_logger()
|
logger = get_logger()
|
||||||
@@ -12,3 +13,14 @@ def get_missing_kids(pages: int = -1):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
res = {}
|
res = {}
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
class Body(BaseModel):
    """Request payload for the POST /verify_missing_kid/ endpoint."""

    # URL of the missing-kid poster page to verify.
    url: str
|
||||||
|
|
||||||
|
@app.post("/verify_missing_kid/")
def get_missing_kids(data: Body):
    """Verify a single missing-kid poster URL.

    Delegates to ``MissingKidsFetcher.verify_missing_kid_url`` and returns
    its result. On any failure an empty dict is returned (best effort), so
    callers should treat a response without a "status" key as "could not
    verify".

    NOTE(review): this handler shares its function name with the GET
    handler ``get_missing_kids(pages)`` declared earlier in the module. The
    routes are registered by their decorators, so both still work, but a
    distinct name would be clearer.
    """
    try:
        res = MissingKidsFetcher().verify_missing_kid_url(data.url)
    except Exception as e:
        # Keep the best-effort contract, but do not hide the failure.
        logger.warning("Error verifying missing kid URL: {}\n{}".format(data.url, str(e)))
        res = {}
    return res
|
||||||
@@ -2,6 +2,8 @@ from selenium import webdriver
|
|||||||
from selenium.webdriver.common.by import By
|
from selenium.webdriver.common.by import By
|
||||||
from selenium.webdriver.firefox.options import Options
|
from selenium.webdriver.firefox.options import Options
|
||||||
from selenium.webdriver.firefox.service import Service
|
from selenium.webdriver.firefox.service import Service
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
import time
|
import time
|
||||||
import os
|
import os
|
||||||
|
|
||||||
@@ -22,6 +24,47 @@ class MissingKidsFetcher():
|
|||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def verify_missing_kid_url(self, url):
    """Load a poster URL in a browser and classify the page it ends up on.

    Args:
        url: Poster URL to verify.

    Returns:
        A dict with a "status" key, one of:
          - "invalid": the page is a 404 (by title, final URL or 404 thumbnail).
          - "valid": the page title matches a poster page.
          - "duplicate": the browser was redirected elsewhere; the final
            URL is returned under "redirection".
          - "unknown": none of the above could be established.
    """
    # Function-scope import: only this method needs the exception type.
    from selenium.common.exceptions import TimeoutException

    # Maximum number of one-second waits for the page to finish rendering.
    max_checks = 5

    # NOTE(review): the driver is never released here. If get_webdriver()
    # creates a new browser per call, it should be quit once done - confirm.
    driver = get_webdriver()
    driver.get(url)

    # Give a 404 page up to one second to show in the title. until() raises
    # TimeoutException when the condition is not met, which here only means
    # "not a 404 (yet)", so it must not abort the verification.
    try:
        WebDriverWait(driver, 1).until(EC.title_contains("404"))
    except TimeoutException:
        pass

    if "404" in driver.title:
        # Same dict shape as every other return path.
        return {"status": "invalid"}

    def get_image_urls():
        # src values of all <img> tags that carry a src attribute.
        return [img.get_attribute("src") for img in driver.find_elements(By.XPATH, "//img[@src]")]

    # A base64-embedded PNG is taken as the signal that rendering has
    # finished. Poll for it a bounded number of times.
    for _ in range(max_checks):
        if any("data:image/png;base64" in (src or "") for src in get_image_urls()):
            break
        time.sleep(1)

    image_urls = get_image_urls()
    current_url = driver.current_url

    if ("missingkids.org/404" in current_url) or any("thumb-404.png" in (src or "") for src in image_urls):
        return {"status": "invalid"}
    if "have you seen this child?" in driver.title.lower():
        return {"status": "valid"}
    if current_url != url:
        # Redirection (duplicate)
        return {"status": "duplicate", "redirection": current_url}
    return {"status": "unknown"}
|
||||||
|
|
||||||
|
|
||||||
def get_missing_kids_urls(self, first_n_pages=-1):
|
def get_missing_kids_urls(self, first_n_pages=-1):
|
||||||
logger.info("Get MissingKids, #pages: {}".format(first_n_pages))
|
logger.info("Get MissingKids, #pages: {}".format(first_n_pages))
|
||||||
# Poster URL
|
# Poster URL
|
||||||
|
|||||||
@@ -115,12 +115,14 @@ class DB_Handler():
|
|||||||
# Set duplicate, and insert new canonical form
|
# Set duplicate, and insert new canonical form
|
||||||
self._set_duplicate_and_insert_canonical(obj_url, url_canonical)
|
self._set_duplicate_and_insert_canonical(obj_url, url_canonical)
|
||||||
elif (r.status_code == 200):
|
elif (r.status_code == 200):
|
||||||
self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
|
# Not enough to determine if it is valid. Need to wait to finish javascript, it might redirect to 404
|
||||||
|
# self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
|
||||||
|
self._set_status(obj_url, Urls.STATUS_ENUM.UNKNOWN)
|
||||||
elif (r.status_code == 404):
|
elif (r.status_code == 404):
|
||||||
self._set_status(obj_url, Urls.STATUS_ENUM.INVALID)
|
self._set_status(obj_url, Urls.STATUS_ENUM.INVALID)
|
||||||
else:
|
else:
|
||||||
logger.info("Unknown request status: {} for missing kids request: {}".format(r.status_code, obj_url.url))
|
logger.debug("Unknown request status: {} for missing kids request: {}".format(r.status_code, obj_url.url))
|
||||||
|
self._set_status(obj_url, Urls.STATUS_ENUM.UNKNOWN)
|
||||||
return
|
return
|
||||||
##########################################################################
|
##########################################################################
|
||||||
|
|
||||||
@@ -275,18 +277,37 @@ class DB_Handler():
|
|||||||
missingkids_urls = Urls.objects.order_by("-ts_fetch").filter(
|
missingkids_urls = Urls.objects.order_by("-ts_fetch").filter(
|
||||||
(Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster"))
|
(Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster"))
|
||||||
&
|
&
|
||||||
(Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.ERROR))
|
(Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.UNKNOWN) | Q(status=Urls.STATUS_ENUM.ERROR))
|
||||||
)
|
)
|
||||||
|
|
||||||
# Get batch size
|
# Get batch size
|
||||||
if (batch_size is not None):
|
if (batch_size is not None):
|
||||||
missingkids_urls = missingkids_urls[:batch_size]
|
missingkids_urls = missingkids_urls[:batch_size]
|
||||||
|
|
||||||
|
# TODO: Cache processed during last X hours, filter them...
|
||||||
|
|
||||||
# Per URL
|
# Per URL
|
||||||
for obj_url in missingkids_urls:
|
for obj_url in missingkids_urls:
|
||||||
try:
|
try:
|
||||||
# Process URL
|
# Missing kids fetching endpoint, verify URL
|
||||||
self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=None)
|
missingkids_fetch_endpoint = os.path.join(os.getenv("SELENIUM_ENDPOINT", "http://localhost:80"), "verify_missing_kid/")
|
||||||
|
data = {"url": obj_url.url}
|
||||||
|
# POST
|
||||||
|
r = requests.post(missingkids_fetch_endpoint, json=data)
|
||||||
|
# Jsonify
|
||||||
|
results = r.json()
|
||||||
|
|
||||||
|
if (results.get("status") == "valid"):
|
||||||
|
self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
|
||||||
|
elif (results.get("status") == "invalid"):
|
||||||
|
self._set_status(obj_url, Urls.STATUS_ENUM.INVALID)
|
||||||
|
elif (results.get("status") == "duplicate"):
|
||||||
|
self._set_duplicate_and_insert_canonical(obj_url, results.get("redirection"))
|
||||||
|
elif (results.get("status") == "unknown"):
|
||||||
|
# Nothing to do, not sure about it...
|
||||||
|
logger.info("Missing kid verification returned unknown for URL: {}".format(obj_url.url))
|
||||||
|
pass
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("Unknown error processing missing kids poster for URL: {}\n{}".format(obj_url.url, str(e)))
|
logger.warning("Unknown error processing missing kids poster for URL: {}\n{}".format(obj_url.url, str(e)))
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user