"""Selenium-based collector of poster URLs from the MissingKids search page."""
import os
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service

from logger import get_logger

logger = get_logger()

# Substring an anchor's href must contain to be collected as a poster URL.
_POSTER_URL_MARKER = "missingkids.org/poster"
# Number of consecutive failures on the same page after which paging stops.
_MAX_CONSECUTIVE_FAILURES = 2


def get_webdriver():
    """Create a headless Firefox WebDriver.

    The Firefox binary is taken from ``/opt/firefox/firefox`` and the
    geckodriver executable from ``/usr/local/bin/geckodriver``.

    Returns:
        A new ``webdriver.Firefox`` instance. The caller owns it and is
        responsible for calling ``quit()`` on it.
    """
    options = Options()
    options.add_argument('--headless')  # Optional
    options.binary_location = '/opt/firefox/firefox'
    service = Service('/usr/local/bin/geckodriver')
    return webdriver.Firefox(options=options, service=service)


def _collect_poster_urls(driver, found):
    """Add the href of every poster anchor on the current page to *found*."""
    for anchor in driver.find_elements(By.TAG_NAME, "a"):
        href = anchor.get_attribute('href')
        if href is not None and _POSTER_URL_MARKER in href:
            found.add(href)


def _log_pagination_links(driver):
    """Log the text of the links found from "<<" up to (excluding) ">>"."""
    started = False
    for link in driver.find_elements(By.PARTIAL_LINK_TEXT, ""):
        text = link.text
        if text == "<<":
            started = True
        if text == ">>":
            break
        if started:
            logger.info(text)


class MissingKidsFetcher():
    """Fetches poster URLs by paging through the MissingKids poster search."""

    def __init__(self) -> None:
        pass

    def get_missing_kids_urls(self, first_n_pages=-1):
        """Collect poster URLs from the poster search results pages.

        Pages are visited in order by clicking the link whose text is the
        next page number. Paging stops when page ``first_n_pages`` has been
        handled, or when the same page fails twice in a row (which is also
        what happens when there is no next-page link).

        The pause before each page is read from the
        ``SELENIUM_SLEEP_PER_PAGE`` environment variable (seconds, default 4).

        Args:
            first_n_pages: Page number at which to stop. With the default
                ``-1`` the page counter never matches, so paging continues
                until the failure limit is hit.

        Returns:
            A set of poster URL strings. If an exception escapes the paging
            loop, a warning is logged and an empty set is returned (URLs
            gathered so far are discarded).
        """
        logger.info("Get MissingKids, #pages: {}".format(first_n_pages))
        # Poster URL
        url = "https://www.missingkids.org/gethelpnow/search/poster-search-results"
        # URLs
        set_urls = set()
        driver = None
        try:
            # Read once per call; an invalid value is reported by the outer
            # handler instead of being retried on every page.
            sleep_seconds = int(os.getenv("SELENIUM_SLEEP_PER_PAGE", "4"))

            driver = get_webdriver()
            # Go to URL
            driver.get(url)

            page, keep_going, consecutive_failures = 1, True, 0
            while keep_going:
                logger.debug("Processing page: {}...".format(page))
                try:
                    # Give the page time to render before scraping it.
                    time.sleep(sleep_seconds)
                    _collect_poster_urls(driver, set_urls)
                    logger.debug("#URLS: {}".format(len(set_urls)))
                    # Next page
                    next_link = driver.find_element(By.LINK_TEXT, str(page + 1))
                    logger.debug("Clicking: {}...".format(next_link.text))
                    next_link.click()
                    processed_ok = True
                except Exception as exc:
                    consecutive_failures += 1
                    processed_ok = False
                    logger.debug("Page {} failed: {}".format(page, exc))
                    if consecutive_failures == _MAX_CONSECUTIVE_FAILURES:
                        keep_going = False
                    else:
                        logger.info("Exception while clicking page {}, retrying...".format(page + 1))
                        # Dump the visible pagination links to help diagnose
                        # why the next-page link could not be clicked.
                        _log_pagination_links(driver)
                        time.sleep(sleep_seconds)

                if page == first_n_pages:
                    keep_going = False
                if processed_ok:
                    page += 1
                    consecutive_failures = 0
        except Exception as e:
            logger.warning("Exception while fetching MissingKids {}".format(str(e)), exc_info=True)
            set_urls = set()
        finally:
            # Always release the browser and geckodriver process; a failure
            # to shut down must not mask the result being returned.
            if driver is not None:
                try:
                    driver.quit()
                except Exception as quit_error:
                    logger.debug("Exception while closing webdriver: {}".format(quit_error))

        return set_urls