Selenium-based verification of missing kid URLs
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
from fastapi import FastAPI
|
||||
from pydantic import BaseModel
|
||||
from missing_kids import MissingKidsFetcher
|
||||
from logger import get_logger
|
||||
logger = get_logger()
|
||||
@@ -12,3 +13,14 @@ def get_missing_kids(pages: int = -1):
|
||||
except Exception as e:
|
||||
res = {}
|
||||
return res
|
||||
|
||||
# Request payload for the POST /verify_missing_kid/ endpoint.
class Body(BaseModel):
    # URL to verify; the endpoint passes it to
    # MissingKidsFetcher.verify_missing_kid_url.
    url: str
|
||||
|
||||
@app.post("/verify_missing_kid/")
def get_missing_kids(data: Body):
    """Verify the URL given in the request body.

    Delegates to ``MissingKidsFetcher.verify_missing_kid_url`` and returns
    whatever it produces. The endpoint is best-effort: on any failure an
    empty dict is returned instead of propagating the error to the client.

    NOTE(review): this handler reuses the name of the GET handler defined
    earlier in the file. The route is registered at decoration time, so both
    endpoints work, but the module-level name now points to this function.
    """
    try:
        res = MissingKidsFetcher().verify_missing_kid_url(data.url)
    except Exception as e:
        # Keep the best-effort contract (empty result), but leave a trace so
        # failures are not silently lost.
        logger.warning("Exception verifying missing kid URL {}: {}".format(data.url, str(e)))
        res = {}
    return res
|
||||
@@ -2,6 +2,8 @@ from selenium import webdriver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.firefox.options import Options
|
||||
from selenium.webdriver.firefox.service import Service
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
import time
|
||||
import os
|
||||
|
||||
@@ -22,6 +24,47 @@ class MissingKidsFetcher():
|
||||
def __init__(self) -> None:
|
||||
pass
|
||||
|
||||
def verify_missing_kid_url(self, url):
    """Load ``url`` in a browser and classify the page it lands on.

    Args:
        url: Address of the page to verify.

    Returns:
        dict: Always contains a ``"status"`` key with one of:
            * ``"invalid"``   - the page is a 404 (by title, final URL or
              the ``thumb-404.png`` placeholder image).
            * ``"valid"``     - the page title contains
              "Have you seen this child?".
            * ``"duplicate"`` - the browser ended on a different URL; the
              final URL is returned under ``"redirection"``.
            * ``"unknown"``   - none of the above matched.
    """
    # Function-scope import: selenium is already a dependency of this
    # module, but TimeoutException is not among its top-level imports.
    from selenium.common.exceptions import TimeoutException

    def image_sources(drv):
        # src of every <img> tag that declares one; empty/None values are
        # dropped so the substring checks below cannot fail on them.
        found = (
            img.get_attribute("src")
            for img in drv.find_elements(By.XPATH, "//img[@src]")
        )
        return [src for src in found if src]

    def load_finished(drv):
        # The page is considered loaded once a base64-embedded PNG exists.
        return any("data:image/png;base64" in src for src in image_sources(drv))

    # Initialize
    driver = get_webdriver()
    try:
        # Load URL
        driver.get(url)

        # Give a 404 title up to one second to show up. until() raises
        # TimeoutException when it does not, which is the normal case for
        # an existing page, so it must not abort the verification.
        try:
            WebDriverWait(driver, 1).until(EC.title_contains("404"))
        except TimeoutException:
            pass

        if "404" in driver.title:
            # Same dict shape as every other outcome.
            return {"status": "invalid"}

        # Poll until the page finished loading, at most max_checks
        # one-second waits, so a page that never finishes cannot hang the
        # caller forever.
        max_checks = 5
        for _ in range(max_checks):
            if load_finished(driver):
                break
            time.sleep(1)

        image_urls = image_sources(driver)

        if ("missingkids.org/404" in driver.current_url) or any(
            "thumb-404.png" in src for src in image_urls
        ):
            # Status invalid
            return {"status": "invalid"}
        if "Have you seen this child?" in driver.title:
            # Status valid
            return {"status": "valid"}
        if driver.current_url != url:
            # Redirection (duplicate)
            return {"status": "duplicate", "redirection": driver.current_url}
        return {"status": "unknown"}
    finally:
        # Release the browser on every path.
        # NOTE(review): assumes get_webdriver() returns a fresh driver per
        # call - confirm it is not a shared instance.
        driver.quit()
|
||||
|
||||
|
||||
def get_missing_kids_urls(self, first_n_pages=-1):
|
||||
logger.info("Get MissingKids, #pages: {}".format(first_n_pages))
|
||||
# Poster URL
|
||||
|
||||
Reference in New Issue
Block a user