Files
matitos_news/app_selenium/missing_kids.py
2025-04-07 09:51:58 +02:00

88 lines
3.2 KiB
Python

from selenium import webdriver
from selenium.webdriver.common.by import By
from utils import get_chrome_options
import time
import os
from logger import get_logger
# Module-level logger, obtained from the project's `logger` module.
logger = get_logger()
class MissingKidsFetcher():
    """Collect poster URLs from the missingkids.org poster search results.

    The search results are paginated; pages are visited one after another
    with a Selenium-driven Chrome instance by clicking the link whose text
    is the next page number.
    """

    # Seconds to wait per page when SELENIUM_SLEEP_PER_PAGE is unset/invalid.
    _DEFAULT_SLEEP_SECONDS = 4.0
    # Consecutive failures on the same page before the crawl gives up.
    _MAX_CONSECUTIVE_EXCEPTIONS = 3

    def __init__(self) -> None:
        pass

    @staticmethod
    def _parse_sleep_seconds(raw, default=4.0):
        """Convert a raw SELENIUM_SLEEP_PER_PAGE value into seconds.

        Args:
            raw: Value read from the environment (a string), a number, or
                None when the variable is not set.
            default: Seconds to use when `raw` is missing or unusable.

        Returns:
            A finite, non-negative float that is safe to pass to
            `time.sleep`. Unparsable, negative, NaN or infinite values
            yield `default`.
        """
        if raw is None:
            return float(default)
        try:
            seconds = float(raw)
        except (TypeError, ValueError):
            return float(default)
        # Rejects negatives, NaN (all comparisons are False) and infinity.
        if not (0 <= seconds < float("inf")):
            return float(default)
        return seconds

    @staticmethod
    def _collect_poster_urls(driver, set_urls):
        """Add every poster link found on the current page to `set_urls`.

        URLs are added one by one so that links gathered before a failure
        (e.g. an element going stale mid-iteration) are not lost.
        """
        for element_type in ["a"]:  # ["a", "p", "div"]:
            for elem in driver.find_elements(By.TAG_NAME, element_type):
                href = elem.get_attribute('href')
                if (href is not None) and ("missingkids.org/poster" in href):
                    set_urls.add(href)

    @staticmethod
    def _log_pagination_links(driver):
        """Log the link texts found between the "<<" and ">>" links.

        Diagnostic aid used when the next-page link could not be clicked.
        """
        start_print = False
        # An empty partial link text is used to enumerate the page's links.
        for link in driver.find_elements(By.PARTIAL_LINK_TEXT, ""):
            link_text = link.text
            if (link_text == "<<"):
                start_print = True
            if (link_text == ">>"):
                break
            if (start_print):
                logger.info(link_text)

    def get_missing_kids_urls(self, first_n_pages=-1):
        """Return the set of poster URLs found on the search result pages.

        Args:
            first_n_pages: Stop once this page number has been processed.
                With the default of -1 the page counter never matches, so
                the crawl continues until the next-page link cannot be
                clicked three times in a row.

        Returns:
            A set of href strings containing "missingkids.org/poster".
            Whatever was collected so far is returned even if the crawl
            is interrupted by an exception.
        """
        logger.info("Get MissingKids, #pages: {}".format(first_n_pages))
        # Poster URL
        url = "https://www.missingkids.org/gethelpnow/search/poster-search-results"
        # URLs
        set_urls = set()
        # Parse the delay once: os.getenv returns a str when the variable is
        # set, and time.sleep raises TypeError on a str.
        sleep_seconds = self._parse_sleep_seconds(
            os.getenv("SELENIUM_SLEEP_PER_PAGE"), self._DEFAULT_SLEEP_SECONDS
        )
        # Initialize
        driver = webdriver.Chrome(options=get_chrome_options())
        try:
            # Go to URL
            driver.get(url)
            # Iterate
            i, continue_iterating, num_exceptions = 1, True, 0
            while continue_iterating:
                logger.debug("Processing page: {}...".format(i))
                try:
                    # Give the page time to render before reading it
                    time.sleep(sleep_seconds)
                    # Fetch poster URLs
                    self._collect_poster_urls(driver, set_urls)
                    logger.debug("#URLS: {}".format(len(set_urls)))
                    # Next page
                    elem = driver.find_element(By.LINK_TEXT, str(i + 1))
                    logger.debug("Clicking: {}...".format(elem.text))
                    elem.click()
                    # Ok
                    processed_ok = True
                except Exception:
                    # +1 exception
                    num_exceptions += 1
                    processed_ok = False
                    if num_exceptions == self._MAX_CONSECUTIVE_EXCEPTIONS:
                        # Also the normal way out after the last page, where
                        # no further page link exists.
                        continue_iterating = False
                    else:
                        logger.info("Exception while clicking page {}, retrying...".format(i + 1))
                        logger.debug("Failure details for page {}".format(i + 1), exc_info=True)
                        self._log_pagination_links(driver)
                        time.sleep(sleep_seconds)

                if (i == first_n_pages):
                    continue_iterating = False
                if (processed_ok):
                    i += 1
                    num_exceptions = 0
        except Exception as e:
            logger.warning("Exception while clicking page {}. {}".format(i + 1, str(e)), exc_info=True)
        finally:
            try:
                logger.info("Closing web driver and returning results")
                # quit() ends the whole session and the driver process;
                # close() would only close the current window.
                driver.quit()
            except Exception as e:
                logger.warning("Exception closing web driver: {}".format(str(e)))

        return set_urls