85 lines
3.1 KiB
Python
85 lines
3.1 KiB
Python
from selenium import webdriver
|
|
from selenium.webdriver.common.by import By
|
|
from utils import get_chrome_options
|
|
import time
|
|
import os
|
|
|
|
from logger import get_logger
|
|
# Module-level logger, obtained from the project's `logger.get_logger` helper
# and used by the code below for all of its info/debug/warning output.
logger = get_logger()
|
|
|
|
class MissingKidsFetcher():
    """Collect poster URLs from the missingkids.org poster search results.

    The result pages are walked in a Selenium-driven Chrome session by
    clicking the numbered pagination links ("2", "3", ...) one after another.
    """

    # Pause (seconds) before scraping each page when the
    # SELENIUM_SLEEP_PER_PAGE environment variable is unset or invalid.
    DEFAULT_SLEEP_PER_PAGE = 4
    # Number of consecutive failures on the same page before giving up.
    MAX_CONSECUTIVE_EXCEPTIONS = 3

    def __init__(self) -> None:
        pass

    @classmethod
    def _get_sleep_per_page(cls):
        """Return the per-page pause in seconds from SELENIUM_SLEEP_PER_PAGE.

        Environment variables are always strings, so the value is converted
        here once; unset, non-numeric, negative or non-finite values fall
        back to ``DEFAULT_SLEEP_PER_PAGE`` (``time.sleep`` rejects them).
        """
        raw_value = os.getenv("SELENIUM_SLEEP_PER_PAGE")
        if raw_value is None:
            return cls.DEFAULT_SLEEP_PER_PAGE
        try:
            seconds = float(raw_value)
        except ValueError:
            seconds = -1.0
        # The chained comparison is False for negatives, NaN and infinity.
        if not 0 <= seconds < float("inf"):
            logger.warning(
                "Invalid SELENIUM_SLEEP_PER_PAGE value {!r}, using {}".format(
                    raw_value, cls.DEFAULT_SLEEP_PER_PAGE
                )
            )
            return cls.DEFAULT_SLEEP_PER_PAGE
        return seconds

    @staticmethod
    def _log_pagination_links(driver):
        """Log the text of every link from "<<" up to (excluding) ">>".

        Purely diagnostic: it shows which page links were present when a
        page failed. Errors raised here are logged and swallowed so that
        diagnostics can never abort the crawl.
        """
        try:
            inside_pager = False
            for link in driver.find_elements(By.PARTIAL_LINK_TEXT, ""):
                text = link.text
                if text == "<<":
                    inside_pager = True
                if text == ">>":
                    break
                if inside_pager:
                    logger.info(text)
        except Exception as exc:
            logger.debug("Could not list pagination links: {}".format(exc))

    def get_missing_kids_urls(self, first_n_pages=-1):
        """Return the set of poster URLs found on the search result pages.

        Args:
            first_n_pages: Number of result pages to process, counted from 1.
                A value that never matches a page number (such as the
                default -1, or 0) means "no limit": pages are processed
                until the link to the next page cannot be found.

        Returns:
            set[str]: every ``href`` seen on the visited pages that contains
            ``"missingkids.org/poster"``. Exceptions raised while browsing
            are logged rather than propagated, so a partial (possibly empty)
            set is returned on failure.
        """
        logger.info("Get MissingKids, #pages: {}".format(first_n_pages))
        # Poster URL
        url = "https://www.missingkids.org/gethelpnow/search/poster-search-results"
        # URLs
        set_urls = set()
        # Parsed once: os.getenv returns a str, which time.sleep rejects.
        sleep_per_page = self._get_sleep_per_page()

        # Bound before the try block so the except/finally clauses below can
        # rely on them even when Chrome fails to start.
        driver = None
        i, num_exceptions = 1, 0

        try:
            # Initialize
            driver = webdriver.Chrome(options=get_chrome_options())
            # Go to URL
            driver.get(url)

            # Iterate
            while True:
                logger.debug("Processing page: {}...".format(i))

                try:
                    # Give the page time to render before reading it.
                    time.sleep(sleep_per_page)

                    # Fetch poster URLs
                    for elem in driver.find_elements(By.TAG_NAME, "a"):
                        href = elem.get_attribute("href")
                        if href is not None and "missingkids.org/poster" in href:
                            set_urls.add(href)

                    logger.debug("#URLS: {}".format(len(set_urls)))

                    # Requested number of pages reached: stop before
                    # navigating to a page that would never be read.
                    if i == first_n_pages:
                        break

                    # Next page
                    next_link = driver.find_element(By.LINK_TEXT, str(i + 1))
                    logger.debug("Clicking: {}...".format(next_link.text))
                    next_link.click()
                except Exception:
                    # +1 exception
                    num_exceptions += 1

                    if num_exceptions >= self.MAX_CONSECUTIVE_EXCEPTIONS:
                        # Also the normal way out of an unlimited crawl: the
                        # last page has no link to a following page.
                        logger.info(
                            "Stopping at page {} after {} consecutive exceptions".format(
                                i, num_exceptions
                            )
                        )
                        break

                    logger.info("Exception while clicking page {}, retrying...".format(i + 1))
                    logger.debug("Failure details for page {}".format(i), exc_info=True)
                    self._log_pagination_links(driver)
                    # driver.refresh()
                    time.sleep(sleep_per_page)
                else:
                    # Page fully processed and next page requested.
                    i += 1
                    num_exceptions = 0

        except Exception as e:
            logger.warning(
                "Exception while fetching MissingKids URLs (page {}). {}".format(i, str(e)),
                exc_info=True,
            )
        finally:
            if driver is not None:
                try:
                    # quit() ends the whole session (browser and driver
                    # process); close() would only close the current window.
                    driver.quit()
                except Exception as exc:
                    logger.debug("Exception while quitting the driver: {}".format(exc))

        return set_urls
|