import os
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

from utils import get_chrome_options
from logger import get_logger

logger = get_logger()


class MissingKidsFetcher():
    """Collect poster URLs from the MissingKids poster search results pages."""

    def __init__(self) -> None:
        pass

    def get_missing_kids_urls(self, first_n_pages: int = -1) -> set:
        """Walk the paginated poster search results and gather poster links.

        A Chrome driver is started with the options from
        ``get_chrome_options()``, the search results page is opened, and every
        ``<a>`` element whose ``href`` contains ``missingkids.org/poster`` is
        recorded. The next page is reached by clicking the link whose text is
        the next page number.

        The wait between page interactions is read from the
        ``SELENIUM_SLEEP_PER_PAGE`` environment variable (seconds, integer,
        default 4).

        Args:
            first_n_pages: Stop once this page number has been handled. The
                default ``-1`` never matches a page number, so iteration only
                ends after three consecutive failures on the same page.

        Returns:
            The set of poster URLs (``href`` strings) collected before
            iteration stopped. Errors while browsing are logged, not raised,
            so the set may be partial or empty.
        """
        logger.info("Get MissingKids, #pages: {}".format(first_n_pages))
        # Poster URL
        url = "https://www.missingkids.org/gethelpnow/search/poster-search-results"
        # Consecutive failures on one page before giving up
        max_exceptions = 3
        # URLs
        set_urls = set()
        # Bound before the try block so the outer handler can always report
        # the page number, even when the very first driver call fails
        i, continue_iterating, num_exceptions = 1, True, 0

        # Initialize
        driver = webdriver.Chrome(options=get_chrome_options())
        try:
            # os.getenv returns a str when the variable is set, so convert it
            # once here and reuse the number for every sleep
            sleep_per_page = int(os.getenv("SELENIUM_SLEEP_PER_PAGE", 4))

            # Go to URL
            driver.get(url)

            while continue_iterating:
                logger.debug("Processing page: {}...".format(i))

                try:
                    # Give the page time to render before reading it
                    time.sleep(sleep_per_page)

                    # Fetch poster URLs; adding one by one keeps whatever was
                    # found if a later element lookup fails
                    for anchor in driver.find_elements(By.TAG_NAME, "a"):
                        href = anchor.get_attribute('href')
                        if href is not None and "missingkids.org/poster" in href:
                            set_urls.add(href)

                    logger.debug("#URLS: {}".format(len(set_urls)))

                    # Next page
                    next_link = driver.find_element(By.LINK_TEXT, str(i + 1))
                    logger.debug("Clicking: {}...".format(next_link.text))
                    next_link.click()
                    processed_ok = True
                except Exception:
                    num_exceptions += 1
                    processed_ok = False

                    if num_exceptions == max_exceptions:
                        continue_iterating = False
                    else:
                        logger.info("Exception while clicking page {}, retrying...".format(i+1))
                        self._log_pagination_links(driver)
                        # driver.refresh()
                        time.sleep(sleep_per_page)

                if i == first_n_pages:
                    continue_iterating = False

                if processed_ok:
                    i += 1
                    num_exceptions = 0

        except Exception as e:
            logger.warning("Exception while clicking page {}. \n{}".format(i+1, str(e)), exc_info=True)
        finally:
            try:
                logger.info("Closing web driver and returning results")
                # quit() ends the whole WebDriver session; close() would only
                # close the current window
                driver.quit()
            except Exception as e:
                logger.warning("Exception closing web driver: {}".format(str(e)))

        return set_urls

    @staticmethod
    def _log_pagination_links(driver) -> None:
        """Log the text of the links found after "<<" and before ">>"."""
        start_print = False
        for link in driver.find_elements(By.PARTIAL_LINK_TEXT, ""):
            text = link.text
            if text == "<<":
                start_print = True
            if text == ">>":
                break
            if start_print:
                logger.info(text)