Valid content filter, language detect on min chars, fetch missingkids.org
This commit is contained in:
46
app_selenium/Dev.ipynb
Normal file
46
app_selenium/Dev.ipynb
Normal file
@@ -0,0 +1,46 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"\n",
|
||||
"endpoint = \"http://localhost:80/get_missing_kids?pages=2\"\n",
|
||||
"r = requests.get(endpoint)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"r.text"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "matitos_urls",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
16
app_selenium/Dockerfile
Normal file
16
app_selenium/Dockerfile
Normal file
@@ -0,0 +1,16 @@
|
||||
FROM python:3.12

# Install Chromium, its driver and curl.
# Update, install and cleanup run in ONE layer: files removed in a later RUN
# would still be stored in the earlier layer and would not shrink the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends chromium chromium-driver curl \
    && apt-get autoclean \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /opt/app
# Dependencies are installed before copying the sources so that this layer
# stays cached when only the application code changes.
RUN pip install --no-cache-dir selenium fastapi "uvicorn[standard]"
COPY . /opt/app/

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "80"]

# docker build -f Dockerfile -t selenium_app .
# docker run --rm -it --shm-size=512m --name selenium_app selenium_app

# docker exec -it selenium_app bash -c "curl localhost:80/get_missing_kids/?pages=5"
# docker exec -it selenium_app bash -c "curl localhost:80/get_missing_kids/?pages=-1"
|
||||
@@ -1,3 +1,8 @@
|
||||
# Selenium app
|
||||
|
||||
* Missing kids posters fetch (`GET /get_missing_kids/?pages=X`)
|
||||
* ...
|
||||
|
||||
```
|
||||
SELENIUM_SLEEP_PER_PAGE=4
|
||||
PATH_LOGS_PARAMETERIZATION="logs/log_app_selenium_{}.log"
|
||||
```
|
||||
14
app_selenium/app.py
Normal file
14
app_selenium/app.py
Normal file
@@ -0,0 +1,14 @@
|
||||
from fastapi import FastAPI
|
||||
from missing_kids import MissingKidsFetcher
|
||||
from logger import get_logger
|
||||
logger = get_logger()
|
||||
|
||||
app = FastAPI()
|
||||
|
||||
@app.get("/get_missing_kids/")
def get_missing_kids(pages: int = -1):
    """Return the poster URLs collected by MissingKidsFetcher.

    Args:
        pages: Forwarded to the fetcher as ``first_n_pages``. Defaults to -1.

    Returns:
        ``{"list_urls": <collected URLs>}`` on success, or an empty dict when
        the fetch raised.
    """
    try:
        res = {"list_urls": MissingKidsFetcher().get_missing_kids_urls(first_n_pages=pages)}
    except Exception as e:
        # Keep the best-effort contract (empty payload instead of an HTTP
        # error), but leave a trace so that failures are not invisible.
        logger.warning("Exception while fetching missing kids URLs: {}".format(str(e)), exc_info=True)
        res = {}
    return res
|
||||
34
app_selenium/logger.py
Normal file
34
app_selenium/logger.py
Normal file
@@ -0,0 +1,34 @@
|
||||
import logging
import logging.handlers
import os
|
||||
|
||||
# Log file path template; "{}" is replaced by the level name of each log file
path_logs_parameterization = os.getenv("PATH_LOGS_PARAMETERIZATION", "logs/log_app_selenium_{}.log")

# Directory of logs. It is empty when the template has no directory component,
# and os.makedirs("") would raise, so it is only created when there is one.
directory = os.path.dirname(path_logs_parameterization)
if directory:
    os.makedirs(directory, exist_ok=True)

logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
logger.setLevel(logging.DEBUG)

# One rotating log file per threshold; each file receives its own level and above:
#   debug   -> DEBUG / INFO / WARNING / ERROR / CRITICAL
#   info    -> INFO / WARNING / ERROR / CRITICAL
#   warning -> WARNING / ERROR / CRITICAL
for _level_name, _level in (("debug", logging.DEBUG), ("info", logging.INFO), ("warning", logging.WARNING)):
    fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format(_level_name), mode="a", maxBytes=10000000, backupCount=1)
    fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
    fh.setLevel(_level)
    logger.addHandler(fh)
|
||||
|
||||
def get_logger() -> logging.Logger:
    """Return the module-level ``logger`` object configured in this module."""
    return logger
|
||||
83
app_selenium/missing_kids.py
Normal file
83
app_selenium/missing_kids.py
Normal file
@@ -0,0 +1,83 @@
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.common.by import By
|
||||
from utils import get_chrome_options
|
||||
import time
|
||||
import os
|
||||
|
||||
from logger import get_logger
|
||||
logger = get_logger()
|
||||
|
||||
class MissingKidsFetcher():
    """Collects poster URLs from the missingkids.org poster search results using Selenium."""

    def __init__(self) -> None:
        pass

    @staticmethod
    def _page_sleep_seconds(default=4.0):
        """Return the per-page wait, in seconds, read from SELENIUM_SLEEP_PER_PAGE.

        Environment variables are always strings, and time.sleep() rejects
        strings, so the value is converted here. An unset variable yields
        ``default``; a value that is not a finite, non-negative number is
        reported and also replaced by ``default``.
        """
        raw_value = os.getenv("SELENIUM_SLEEP_PER_PAGE")
        if raw_value is None:
            return float(default)
        try:
            seconds = float(raw_value)
        except ValueError:
            seconds = None
        # The chained comparison is False for negatives, infinity and NaN
        if seconds is None or not (0.0 <= seconds < float("inf")):
            logger.warning("Invalid SELENIUM_SLEEP_PER_PAGE value: {}. Using default: {}".format(raw_value, default))
            return float(default)
        return seconds

    def get_missing_kids_urls(self, first_n_pages=-1):
        """Walk the paginated poster search results and collect poster links.

        Args:
            first_n_pages: Stop once the page with this number has been
                handled. The page counter starts at 1, so the default -1
                never matches and iteration only ends after three consecutive
                failures to move to the next page.

        Returns:
            set: The hrefs found so far that contain "missingkids.org/poster".
            Errors are logged rather than raised, so the set may be partial
            or empty.
        """
        # Poster URL
        url = "https://www.missingkids.org/gethelpnow/search/poster-search-results"
        # URLs
        set_urls = set()
        # Bound before the try so that the cleanup can tell whether Chrome ever started
        driver = None
        sleep_seconds = self._page_sleep_seconds()

        try:
            # Initialize
            driver = webdriver.Chrome(options=get_chrome_options())
            # Go to URL
            driver.get(url)
            # Iterate
            i, continue_iterating, num_exceptions = 1, True, 0
            while continue_iterating:
                logger.debug("Processing page: {}...".format(i))

                try:
                    # Fixed wait before reading the links of the current page
                    time.sleep(sleep_seconds)
                    # Fetch poster URLs
                    for elem in driver.find_elements(By.TAG_NAME, "a"):
                        href = elem.get_attribute('href')
                        if (href is not None) and ("missingkids.org/poster" in href):
                            set_urls.add(href)

                    logger.debug("#URLS: {}".format(len(set_urls)))

                    # Next page
                    elem = driver.find_element(By.LINK_TEXT, str(i+1))
                    logger.debug("Clicking: {}...".format(elem.text))
                    elem.click()
                    # Ok
                    processed_ok = True
                except Exception:
                    # +1 exception
                    num_exceptions += 1
                    processed_ok = False

                    if num_exceptions == 3:
                        # Three consecutive failures on the same page: give up
                        continue_iterating = False
                    else:
                        logger.info("Exception while clicking page {}, retrying...".format(i+1))

                        # Log the link texts found between "<<" and ">>"
                        start_print = False
                        for link in driver.find_elements(By.PARTIAL_LINK_TEXT, ""):
                            link_text = link.text
                            if link_text == "<<":
                                start_print = True
                            if link_text == ">>":
                                break
                            if start_print:
                                logger.info(link_text)

                        # driver.refresh()
                        time.sleep(sleep_seconds)

                if i == first_n_pages:
                    continue_iterating = False
                if processed_ok:
                    i += 1
                    num_exceptions = 0

        except Exception as e:
            logger.warning("Exception while clicking page {}. {}".format(i+1, str(e)), exc_info=True)
        finally:
            if driver is not None:
                try:
                    # quit() ends the whole WebDriver session; close() only closes the current window
                    driver.quit()
                except Exception as e:
                    logger.debug("Exception while quitting the driver: {}".format(str(e)))

        return set_urls
|
||||
14
app_selenium/utils.py
Normal file
14
app_selenium/utils.py
Normal file
@@ -0,0 +1,14 @@
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
|
||||
def get_chrome_options():
    """Build the Chrome options object handed to the Selenium driver.

    Adds the --headless, --no-sandbox and --disable-dev-shm-usage arguments
    and registers a "prefs" experimental option.
    """
    options = Options()
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        options.add_argument(flag)
    # NOTE(review): "images": 2 presumably blocks image loading - confirm against Chrome's content settings
    prefs = {"profile.default_content_settings": {"images": 2}}
    options.experimental_options["prefs"] = prefs
    return options
|
||||
Reference in New Issue
Block a user