Valid content filter, language detect on min chars, fetch missingkids.org

This commit is contained in:
Luciano Gervasoni
2025-04-03 09:44:46 +02:00
parent 3b54e247e7
commit 5addfa5ba9
18 changed files with 533 additions and 66 deletions

46
app_selenium/Dev.ipynb Normal file
View File

@@ -0,0 +1,46 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"\n",
"endpoint = \"http://localhost:80/get_missing_kids?pages=2\"\n",
"r = requests.get(endpoint)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"r.text"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "matitos_urls",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

16
app_selenium/Dockerfile Normal file
View File

@@ -0,0 +1,16 @@
FROM python:3.12

# Install Chromium, its WebDriver and curl. The package-list cleanup is done in
# the same RUN instruction: files removed in a later layer would still be
# stored in the layer that created them, so a separate cleanup step saves nothing.
RUN apt-get update \
    && apt-get install -y --no-install-recommends chromium chromium-driver curl \
    && apt-get autoclean \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /opt/app

# Python dependencies of the Selenium API service
RUN pip install --no-cache-dir selenium fastapi "uvicorn[standard]"

COPY . /opt/app/

# Serve the FastAPI application defined in app.py on port 80
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "80"]

# Usage examples:
# docker build -f Dockerfile -t selenium_app .
# docker run --rm -it --shm-size=512m --name selenium_app selenium_app
# docker exec -it selenium_app bash -c "curl localhost:80/get_missing_kids/?pages=5"
# docker exec -it selenium_app bash -c "curl localhost:80/get_missing_kids/?pages=-1"

View File

@@ -1,3 +1,8 @@
# Selenium app
* Missing kids posters fetch (`GET /get_missing_kids/?pages=X`; `pages=-1` walks every result page)
* ...
```
SELENIUM_SLEEP_PER_PAGE=4
PATH_LOGS_PARAMETERIZATION="logs/log_app_selenium_{}.log"
```

14
app_selenium/app.py Normal file
View File

@@ -0,0 +1,14 @@
from fastapi import FastAPI
from missing_kids import MissingKidsFetcher
from logger import get_logger
logger = get_logger()
app = FastAPI()
@app.get("/get_missing_kids/")
def get_missing_kids(pages: int = -1):
    """Return the poster URLs collected from the missing kids poster search.

    Args:
        pages: Number of result pages to process, forwarded as
            ``first_n_pages``. The default of -1 never matches a page
            index, so the fetcher keeps going until it cannot advance.

    Returns:
        ``{"list_urls": <collection of poster URLs>}`` on success, or an
        empty dict if the fetch raised.
    """
    try:
        res = {"list_urls": MissingKidsFetcher().get_missing_kids_urls(first_n_pages=pages)}
    except Exception:
        # Keep the best-effort contract (empty payload instead of an HTTP 500),
        # but record the failure so it can be told apart from "no results".
        logger.warning("Exception while fetching missing kids URLs (pages={})".format(pages), exc_info=True)
        res = {}
    return res

34
app_selenium/logger.py Normal file
View File

@@ -0,0 +1,34 @@
import logging
import logging.handlers
import os
# Log file path template; "{}" is replaced by the level suffix (debug / info / warning)
path_logs_parameterization = os.getenv("PATH_LOGS_PARAMETERIZATION", "logs/log_app_selenium_{}.log")

# Directory of logs. It is empty when the template is a bare file name, and
# os.makedirs("") raises, so the directory is only created when there is one.
directory = os.path.dirname(path_logs_parameterization)
if directory:
    os.makedirs(directory, exist_ok=True)

logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
logger.setLevel(logging.DEBUG)

# One rotating file per threshold; each file receives records at its level and above:
#   debug   -> DEBUG / INFO / WARNING / ERROR / CRITICAL
#   info    -> INFO / WARNING / ERROR / CRITICAL
#   warning -> WARNING / ERROR / CRITICAL
for suffix, level in (("debug", logging.DEBUG), ("info", logging.INFO), ("warning", logging.WARNING)):
    fh = logging.handlers.RotatingFileHandler(
        filename=path_logs_parameterization.format(suffix),
        mode="a",
        maxBytes=10000000,
        backupCount=1,
    )
    fh.setFormatter(logging.Formatter('%(levelname)s | %(levelname)s | %(asctime)s | %(message)s'.replace('%(levelname)s | ', '', 1)))
    fh.setLevel(level)
    logger.addHandler(fh)


def get_logger():
    """Return the module-level "news_fetcher" logger configured at import time."""
    return logger

View File

@@ -0,0 +1,83 @@
from selenium import webdriver
from selenium.webdriver.common.by import By
from utils import get_chrome_options
import time
import os
from logger import get_logger
logger = get_logger()
class MissingKidsFetcher():
    """Collects poster URLs from the missingkids.org poster search with headless Chrome."""

    def __init__(self) -> None:
        pass

    @staticmethod
    def _sleep_per_page():
        """Return the per-page wait in seconds from SELENIUM_SLEEP_PER_PAGE (default 4).

        Environment variables are always strings, while time.sleep() requires
        a number, so the value is converted here. Unparsable or negative
        values fall back to the default.
        """
        raw_value = os.getenv("SELENIUM_SLEEP_PER_PAGE", "4")
        try:
            seconds = float(raw_value)
        except ValueError:
            seconds = -1.0
        if seconds < 0:
            logger.warning("Invalid SELENIUM_SLEEP_PER_PAGE value: {}. Using 4 seconds".format(raw_value))
            seconds = 4.0
        return seconds

    def get_missing_kids_urls(self, first_n_pages=-1):
        """Walk the poster search result pages and collect poster links.

        Args:
            first_n_pages: Stop after this many result pages. The default
                of -1 never matches a page index, so pages are processed
                until moving to the next one fails three times in a row.

        Returns:
            Set of hrefs containing "missingkids.org/poster". Whatever was
            collected before a failure is still returned.
        """
        # Poster URL
        url = "https://www.missingkids.org/gethelpnow/search/poster-search-results"
        # URLs
        set_urls = set()
        sleep_seconds = self._sleep_per_page()
        # Bound before the try block so the cleanup below can tell whether Chrome ever started
        driver = None

        try:
            # Initialize
            driver = webdriver.Chrome(options=get_chrome_options())
            # Go to URL
            driver.get(url)

            # Iterate
            i, continue_iterating, num_exceptions = 1, True, 0
            while continue_iterating:
                logger.debug("Processing page: {}...".format(i))

                try:
                    # Give the page time to render its results
                    time.sleep(sleep_seconds)
                    # Fetch poster URLs
                    for elem in driver.find_elements(By.TAG_NAME, "a"):
                        href = elem.get_attribute('href')
                        if (href is not None) and ("missingkids.org/poster" in href):
                            set_urls.add(href)
                    logger.debug("#URLS: {}".format(len(set_urls)))

                    if i == first_n_pages:
                        # Requested page count reached: skip the pointless
                        # navigation to a page that would never be read
                        break

                    # Next page
                    elem = driver.find_element(By.LINK_TEXT, str(i+1))
                    logger.debug("Clicking: {}...".format(elem.text))
                    elem.click()
                    # Ok
                    processed_ok = True
                except Exception:
                    # +1 exception
                    num_exceptions += 1
                    processed_ok = False

                    if num_exceptions == 3:
                        # Three consecutive failures: assume there is no further page
                        continue_iterating = False
                    else:
                        logger.info("Exception while clicking page {}, retrying...".format(i+1))

                        # Log the link texts found between "<<" and ">>" to help diagnose the failure
                        start_print = False
                        for link in driver.find_elements(By.PARTIAL_LINK_TEXT, ""):
                            if link.text == "<<":
                                start_print = True
                            if link.text == ">>":
                                break
                            if start_print:
                                logger.info(link.text)

                        time.sleep(sleep_seconds)

                if i == first_n_pages:
                    continue_iterating = False
                if processed_ok:
                    i += 1
                    num_exceptions = 0

        except Exception as e:
            logger.warning("Exception while clicking page {}. {}".format(i+1, str(e)), exc_info=True) if driver is not None and 'i' in locals() else logger.warning("Exception while fetching missing kids URLs. {}".format(str(e)), exc_info=True)
        finally:
            if driver is not None:
                try:
                    # quit() ends the whole WebDriver session; close() would only
                    # close the window and leave the driver process running
                    driver.quit()
                except Exception:
                    logger.debug("Exception while quitting the web driver", exc_info=True)

        return set_urls

14
app_selenium/utils.py Normal file
View File

@@ -0,0 +1,14 @@
from selenium.webdriver.chrome.options import Options
def get_chrome_options():
    """Build the Chrome options used by the Selenium driver.

    The browser is configured to run headless, without the sandbox and
    without relying on /dev/shm, and a content-settings preference for
    images is registered under the "prefs" experimental option.

    Returns:
        The configured ``Options`` instance.
    """
    options = Options()
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        options.add_argument(flag)

    # Browser preferences: value 2 for "images" in the default content settings
    prefs = {"profile.default_content_settings": {"images": 2}}
    options.experimental_options["prefs"] = prefs
    return options