Valid content filter, language detect on min chars, fetch missingkids.org
This commit is contained in:
83
app_selenium/missing_kids.py
Normal file
83
app_selenium/missing_kids.py
Normal file
@@ -0,0 +1,83 @@
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.common.by import By
|
||||
from utils import get_chrome_options
|
||||
import time
|
||||
import os
|
||||
|
||||
from logger import get_logger
|
||||
logger = get_logger()
|
||||
|
||||
class MissingKidsFetcher():
    """Collect poster URLs from the missingkids.org poster search results.

    The result list is paginated; a Selenium-driven Chrome session walks the
    pages by clicking the numbered pagination links.
    """

    # Consecutive failures on the same page before pagination gives up.
    _MAX_CONSECUTIVE_EXCEPTIONS = 3

    def __init__(self) -> None:
        pass

    @staticmethod
    def _page_delay_seconds(default=4.0):
        """Return the per-page wait in seconds.

        Reads the ``SELENIUM_SLEEP_PER_PAGE`` environment variable. Environment
        values are always strings, so the value is converted to ``float``
        before it reaches ``time.sleep``. Unset, non-numeric, negative or
        non-finite values fall back to ``default``.
        """
        raw = os.getenv("SELENIUM_SLEEP_PER_PAGE")
        if raw is None:
            return float(default)
        try:
            delay = float(raw)
        except ValueError:
            return float(default)
        # Rejects negatives, NaN and infinity, all of which time.sleep refuses.
        if not (0 <= delay < float("inf")):
            return float(default)
        return delay

    @staticmethod
    def _log_pagination_links(driver):
        """Log the link texts from "<<" up to (not including) ">>".

        Purely diagnostic and best effort: a failure here must not abort the
        crawl, so any exception is logged at debug level and dropped.
        """
        try:
            inside_pager = False
            for link in driver.find_elements(By.PARTIAL_LINK_TEXT, ""):
                text = link.text
                if text == "<<":
                    inside_pager = True
                if text == ">>":
                    break
                if inside_pager:
                    logger.info(text)
        except Exception as ex:
            logger.debug("Could not list pagination links: {}".format(ex))

    def get_missing_kids_urls(self, first_n_pages=-1):
        """Return the set of poster URLs found in the search result pages.

        Args:
            first_n_pages: Stop after this page number has been processed.
                The default ``-1`` never matches a page number, so paging
                continues until the next-page link cannot be clicked
                ``_MAX_CONSECUTIVE_EXCEPTIONS`` times in a row.

        Returns:
            A ``set`` of ``href`` strings containing ``missingkids.org/poster``.
            Whatever was collected so far is returned even when the browser
            session fails part-way through.
        """
        # Poster URL
        url = "https://www.missingkids.org/gethelpnow/search/poster-search-results"
        # URLs
        set_urls = set()
        page_delay = self._page_delay_seconds()

        # Bound before the try so the except/finally clauses can always use them,
        # even when the browser fails to start.
        driver = None
        i = 1

        try:
            # Initialize
            driver = webdriver.Chrome(options=get_chrome_options())
            # Go to URL
            driver.get(url)
            # Iterate
            continue_iterating, num_exceptions = True, 0
            while continue_iterating:
                logger.debug("Processing page: {}...".format(i))

                try:
                    # Give the page time to render its results.
                    time.sleep(page_delay)
                    # Fetch poster URLs
                    for elem in driver.find_elements(By.TAG_NAME, "a"):
                        href = elem.get_attribute('href')
                        if (href is not None) and ("missingkids.org/poster" in href):
                            set_urls.add(href)

                    logger.debug("#URLS: {}".format(len(set_urls)))

                    # Next page: the pagination links are labelled with page numbers.
                    elem = driver.find_element(By.LINK_TEXT, str(i+1))
                    logger.debug("Clicking: {}...".format(elem.text))
                    elem.click()
                    # Ok
                    processed_ok = True
                except Exception:
                    # +1 exception
                    num_exceptions += 1
                    processed_ok = False

                    if num_exceptions == self._MAX_CONSECUTIVE_EXCEPTIONS:
                        continue_iterating = False
                    else:
                        logger.info("Exception while clicking page {}, retrying...".format(i+1))
                        self._log_pagination_links(driver)
                        time.sleep(page_delay)

                if i == first_n_pages:
                    continue_iterating = False
                if processed_ok:
                    i += 1
                    num_exceptions = 0

        except Exception as e:
            logger.warning("Exception while clicking page {}. {}".format(i+1, str(e)), exc_info=True)
        finally:
            if driver is not None:
                try:
                    # quit() ends the whole session; close() would only close
                    # the current window and leave the driver process running.
                    driver.quit()
                except Exception as ex:
                    logger.debug("Exception while quitting the driver: {}".format(ex))

        return set_urls
|
||||
Reference in New Issue
Block a user