Selenium based missing kid verify url fix (2)
This commit is contained in:
@@ -25,6 +25,15 @@ class MissingKidsFetcher():
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
def verify_missing_kid_url(self, url):
|
def verify_missing_kid_url(self, url):
|
||||||
|
def load_finished(driver):
|
||||||
|
# Find all <img> tags with src attributes. Extract src URLs
|
||||||
|
image_urls = [img.get_attribute("src") for img in driver.find_elements(By.XPATH, "//img[@src]")]
|
||||||
|
# If base64 image exists, loading finished
|
||||||
|
finished = any(["data:image/png;base64" in i for i in image_urls])
|
||||||
|
# logger.debug("Finished loading URL")
|
||||||
|
return finished
|
||||||
|
|
||||||
|
try:
|
||||||
# Initialize
|
# Initialize
|
||||||
driver = get_webdriver()
|
driver = get_webdriver()
|
||||||
# Load URL
|
# Load URL
|
||||||
@@ -34,16 +43,8 @@ class MissingKidsFetcher():
|
|||||||
|
|
||||||
if ("404" in driver.title):
|
if ("404" in driver.title):
|
||||||
# Status invalid
|
# Status invalid
|
||||||
return "invalid"
|
results = {"status": "invalid"}
|
||||||
|
else:
|
||||||
def load_finished(driver):
|
|
||||||
# Find all <img> tags with src attributes. Extract src URLs
|
|
||||||
image_urls = [img.get_attribute("src") for img in driver.find_elements(By.XPATH, "//img[@src]")]
|
|
||||||
# If base64 image exists, loading finished
|
|
||||||
finished = any(["data:image/png;base64" in i for i in image_urls])
|
|
||||||
# logger.debug("Finished loading URL")
|
|
||||||
return finished
|
|
||||||
|
|
||||||
# Check until finished loading
|
# Check until finished loading
|
||||||
num_checks = 5
|
num_checks = 5
|
||||||
while (not load_finished(driver)) and (num_checks>=0):
|
while (not load_finished(driver)) and (num_checks>=0):
|
||||||
@@ -54,15 +55,28 @@ class MissingKidsFetcher():
|
|||||||
|
|
||||||
if ("missingkids.org/404" in driver.current_url) or (any(["thumb-404.png" in i for i in image_urls])):
|
if ("missingkids.org/404" in driver.current_url) or (any(["thumb-404.png" in i for i in image_urls])):
|
||||||
# Status invalid
|
# Status invalid
|
||||||
return {"status": "invalid"}
|
results = {"status": "invalid"}
|
||||||
elif ("Haven you seen this child?" in driver.title):
|
elif ("Haven you seen this child?" in driver.title):
|
||||||
# Status valid
|
# Status valid
|
||||||
return {"status": "valid"}
|
results = {"status": "valid"}
|
||||||
elif (driver.current_url != url):
|
elif (driver.current_url != url):
|
||||||
# Redirection (duplicate)
|
# Redirection (duplicate)
|
||||||
return {"status": "duplicate", "redirection": driver.current_url}
|
results = {"status": "duplicate", "redirection": driver.current_url}
|
||||||
else:
|
else:
|
||||||
return {"status": "unknown"}
|
results = {"status": "unknown"}
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Exception while verifying MissingKid URL {}\n{}".format(url, str(e)), exc_info=True)
|
||||||
|
results = {}
|
||||||
|
|
||||||
|
# Release memory
|
||||||
|
try:
|
||||||
|
driver.quit() #driver.close()
|
||||||
|
time.sleep(1)
|
||||||
|
# import atexit
|
||||||
|
# atexit.register(driver.quit) # Will always be called on exit
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Exception while closing/quitting driver: {}".format(str(e)), exc_info=True)
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
def get_missing_kids_urls(self, first_n_pages=-1):
|
def get_missing_kids_urls(self, first_n_pages=-1):
|
||||||
|
|||||||
@@ -293,7 +293,7 @@ class DB_Handler():
|
|||||||
missingkids_fetch_endpoint = os.path.join(os.getenv("SELENIUM_ENDPOINT", "http://localhost:80"), "verify_missing_kid/")
|
missingkids_fetch_endpoint = os.path.join(os.getenv("SELENIUM_ENDPOINT", "http://localhost:80"), "verify_missing_kid/")
|
||||||
data = {"url": obj_url.url}
|
data = {"url": obj_url.url}
|
||||||
# POST
|
# POST
|
||||||
r = requests.post(missingkids_fetch_endpoint, json=data)
|
r = requests.post(missingkids_fetch_endpoint, json=data, timeout=30)
|
||||||
# Jsonify
|
# Jsonify
|
||||||
results = r.json()
|
results = r.json()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user