Missing kids selenium fixes

This commit is contained in:
Luciano Gervasoni
2025-07-08 09:43:40 +02:00
parent e81a96f4bd
commit 522c1cb8b3
8 changed files with 105 additions and 37 deletions

View File

@@ -22,9 +22,6 @@ PATH_LOGS_DIRECTORY=/opt/logs
DB_NAME=matitos DB_NAME=matitos
DB_PASSWORD=supermatitos DB_PASSWORD=supermatitos
DB_USER=supermatitos DB_USER=supermatitos
PATH_DB_DATA=.
# Database: Django
DB_HOST=fetcher_db DB_HOST=fetcher_db
DB_PORT=5432 DB_PORT=5432
REDIS_HOST=fetcher_redis REDIS_HOST=fetcher_redis
@@ -46,7 +43,6 @@ FETCHER_ERROR_URL_CACHE_TIME=172800
SELENIUM_ENDPOINT=http://fetcher_app_selenium:80 SELENIUM_ENDPOINT=http://fetcher_app_selenium:80
ARCH=amd64 # arm64, amd64 ARCH=amd64 # arm64, amd64
SELENIUM_SLEEP_PER_PAGE=4 SELENIUM_SLEEP_PER_PAGE=4
PATH_LOGS_DIRECTORY=/opt/logs
# Deploy resources per App # Deploy resources per App
DEPLOY_CPUS=2 DEPLOY_CPUS=2
@@ -54,7 +50,7 @@ DEPLOY_RAM=4G
# Ghost # Ghost
GHOST_ADMIN_API_URL=https://news.matitos.org/ghost/api/admin/ GHOST_ADMIN_API_URL=https://news.matitos.org/ghost/api/admin/
GHOST_ADMIN_API_KEY=67fffe1b8a57a80001ecec5b:59f580020c196f92e05e208d288702082f8edad6366e2b2c8940b54e41cc355a GHOST_ADMIN_API_KEY=
PEXELS_API_KEY=Y6clJkY32eihf34ukX4JsINYu9lzxh3xDdNq2HMAmGwXp0a0tt6vr6S9 PEXELS_API_KEY=Y6clJkY32eihf34ukX4JsINYu9lzxh3xDdNq2HMAmGwXp0a0tt6vr6S9
# Ollama # Ollama
ENDPOINT_OLLAMA=https://ollamamodelnpu.matitos.org ENDPOINT_OLLAMA=https://ollamamodelnpu.matitos.org

View File

@@ -57,15 +57,18 @@ class MissingKidsFetcher():
# Find all <img> tags with src attributes. Extract src URLs # Find all <img> tags with src attributes. Extract src URLs
image_urls = [img.get_attribute("src") for img in driver.find_elements(By.XPATH, "//img[@src]")] image_urls = [img.get_attribute("src") for img in driver.find_elements(By.XPATH, "//img[@src]")]
# Redirects to 404?
if ("missingkids.org/404" in driver.current_url) or (any(["thumb-404.png" in i for i in image_urls])): if ("missingkids.org/404" in driver.current_url) or (any(["thumb-404.png" in i for i in image_urls])):
# Status invalid # Status invalid
results = {"status": "invalid"} results = {"status": "invalid"}
elif ("Have you seen this child?" in driver.title): # Redirection to valid URL? -> Duplicate
# Status valid
results = {"status": "valid"}
elif (driver.current_url != url): elif (driver.current_url != url):
# Redirection (duplicate) # Redirection (duplicate)
results = {"status": "duplicate", "redirection": driver.current_url} results = {"status": "duplicate", "redirection": driver.current_url}
# Valid
elif ("Have you seen this child?" in driver.title):
# Status valid
results = {"status": "valid"}
else: else:
results = {"status": "unknown"} results = {"status": "unknown"}
except Exception as e: except Exception as e:

View File

@@ -270,22 +270,37 @@ class DB_Handler():
except Exception as e: except Exception as e:
logger.warning("Exception processing error URLs: {}\n{}".format(e, traceback.format_exc())) logger.warning("Exception processing error URLs: {}\n{}".format(e, traceback.format_exc()))
def process_missing_kids_urls(self, batch_size=None): def process_missing_kids_urls(self, batch_size=None, process_status_only=None):
try: try:
logger.debug("Processing MissingKids URLs - batch_size={}".format(batch_size)) logger.info("Processing MissingKids URLs - batch_size={} process_status_only={}".format(batch_size, process_status_only))
# Select which URL statuses this run re-checks.
#   process_status_only=None -> default set: valid, invalid, unknown, error
#   process_status_only=<name> -> only that single status
# The guard used to read "is not None", which sent every explicit status to the
# default set and sent None into the per-status branch, where nothing matched.
# NOTE: the name `filter` shadows the builtin; it is kept because the query
# built right after this block references it.
if (process_status_only is None):
    filter = (Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.UNKNOWN) | Q(status=Urls.STATUS_ENUM.ERROR))
else:
    # Status names accepted from the task arguments -> model status values
    status_by_name = {
        "valid": Urls.STATUS_ENUM.VALID,
        "invalid": Urls.STATUS_ENUM.INVALID,
        "error": Urls.STATUS_ENUM.ERROR,
        "unknown": Urls.STATUS_ENUM.UNKNOWN,
        "raw": Urls.STATUS_ENUM.RAW,
        "duplicate": Urls.STATUS_ENUM.DUPLICATE,
    }
    if (process_status_only not in status_by_name):
        # Previously this case only logged and left `filter` unassigned here, so
        # the query failed later with an unrelated error. Fail with a clear message
        # instead (presumably reported by the enclosing try's handler - confirm).
        raise ValueError("Unknown status to filter: {}".format(process_status_only))
    filter = Q(status=status_by_name[process_status_only])
# Get batch of URLs, %missingkids.org/poster% AND (status='valid' OR status='invalid') # Get batch of URLs, %missingkids.org/poster% AND (status='valid' OR status='invalid')
missingkids_urls = Urls.objects.order_by("-ts_fetch").filter( missingkids_urls = Urls.objects.order_by("-ts_fetch").filter(
(Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster")) filter & (Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster"))
&
(Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.UNKNOWN) | Q(status=Urls.STATUS_ENUM.ERROR))
) )
# Get batch size # Get batch size
if (batch_size is not None): if (batch_size is not None):
missingkids_urls = missingkids_urls[:batch_size] missingkids_urls = missingkids_urls[:batch_size]
# TODO: Cache processed during last X hours, filter them...
# Per URL # Per URL
for obj_url in missingkids_urls: for obj_url in missingkids_urls:
try: try:
@@ -296,6 +311,7 @@ class DB_Handler():
r = requests.post(missingkids_fetch_endpoint, json=data, timeout=120) r = requests.post(missingkids_fetch_endpoint, json=data, timeout=120)
# Jsonify # Jsonify
results = r.json() results = r.json()
logger.debug("Selenium results for URL {}: {}".format(obj_url.url, str(results)))
if (results.get("status") == "valid"): if (results.get("status") == "valid"):
self._set_status(obj_url, Urls.STATUS_ENUM.VALID) self._set_status(obj_url, Urls.STATUS_ENUM.VALID)

View File

@@ -60,17 +60,10 @@ def process_error_urls(batch_size=50):
logger.info("Task completed: {}".format(task)) logger.info("Task completed: {}".format(task))
@job('default') @job('default')
def process_missing_kids_urls(batch_size=50): def process_missing_kids_urls(batch_size=None, process_status_only=None):
task = "Process Missing Kids URLs" task = "Process Missing Kids URLs - batch_size={} process_status_only={}".format(batch_size, process_status_only)
logger.info("Task triggered: {}".format(task)) logger.info("Task triggered: {}".format(task))
DB_Handler().process_missing_kids_urls(batch_size=batch_size) DB_Handler().process_missing_kids_urls(batch_size=batch_size, process_status_only=process_status_only)
logger.info("Task completed: {}".format(task))
@job('default')
def process_missing_kids_urls_all(batch_size=None):
task = "Process Missing Kids URLs ALL"
logger.info("Task triggered: {}".format(task))
DB_Handler().process_missing_kids_urls(batch_size=batch_size)
logger.info("Task completed: {}".format(task)) logger.info("Task completed: {}".format(task))
@job('default') @job('default')

View File

@@ -46,12 +46,18 @@
"name": "Process MissingKids URLs", "name": "Process MissingKids URLs",
"callable": "fetcher.tasks.process_missing_kids_urls", "callable": "fetcher.tasks.process_missing_kids_urls",
"callable_args": [], "callable_args": [],
"callable_kwargs": [], "callable_kwargs": [
{
"arg_type": "int",
"key": "batch_size",
"val": 50
}
],
"enabled": false, "enabled": false,
"queue": "default", "queue": "default",
"repeat": null, "repeat": null,
"at_front": false, "at_front": false,
"timeout": 1800, "timeout": 10800,
"result_ttl": 86400, "result_ttl": 86400,
"cron_string": null, "cron_string": null,
"scheduled_time": "2025-01-01T00:00:00+00:00", "scheduled_time": "2025-01-01T00:00:00+00:00",
@@ -64,19 +70,79 @@
}, },
{ {
"model": "RepeatableTaskType", "model": "RepeatableTaskType",
"name": "Process MissingKids URLs ALL", "name": "Process MissingKids URLs ALL - unknown",
"callable": "fetcher.tasks.process_missing_kids_urls_all", "callable": "fetcher.tasks.process_missing_kids_urls",
"callable_args": [], "callable_args": [],
"callable_kwargs": [], "callable_kwargs": [
{
"arg_type": "str",
"key": "process_status_only",
"val": "unknown"
}
],
"enabled": false, "enabled": false,
"queue": "default", "queue": "default",
"repeat": null, "repeat": null,
"at_front": false, "at_front": false,
"timeout": 43200, "timeout": 86400,
"result_ttl": 86400, "result_ttl": 86400,
"cron_string": null, "cron_string": null,
"scheduled_time": "2025-01-01T00:00:00+00:00", "scheduled_time": "2025-01-01T00:00:00+00:00",
"interval": 1, "interval": 1,
"interval_unit": "days",
"successful_runs": 0,
"failed_runs": 0,
"last_successful_run": null,
"last_failed_run": null
},
{
"model": "RepeatableTaskType",
"name": "Process MissingKids URLs ALL - valid",
"callable": "fetcher.tasks.process_missing_kids_urls",
"callable_args": [],
"callable_kwargs": [
{
"arg_type": "str",
"key": "process_status_only",
"val": "valid"
}
],
"enabled": false,
"queue": "default",
"repeat": null,
"at_front": false,
"timeout": 86400,
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-01-01T00:00:00+00:00",
"interval": 2,
"interval_unit": "days",
"successful_runs": 0,
"failed_runs": 0,
"last_successful_run": null,
"last_failed_run": null
},
{
"model": "RepeatableTaskType",
"name": "Process MissingKids URLs ALL - invalid",
"callable": "fetcher.tasks.process_missing_kids_urls",
"callable_args": [],
"callable_kwargs": [
{
"arg_type": "str",
"key": "process_status_only",
"val": "invalid"
}
],
"enabled": false,
"queue": "default",
"repeat": null,
"at_front": false,
"timeout": 86400,
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-01-01T00:00:00+00:00",
"interval": 4,
"interval_unit": "weeks", "interval_unit": "weeks",
"successful_runs": 0, "successful_runs": 0,
"failed_runs": 0, "failed_runs": 0,

View File

@@ -1,5 +1,3 @@
version: '3.9'
services: services:
fetcher_app_selenium: fetcher_app_selenium:

View File

@@ -1,5 +1,3 @@
version: '3.9'
services: services:
fetcher_app_selenium: fetcher_app_selenium:

View File

@@ -1,5 +1,3 @@
version: '3.9'
services: services:
fetcher_app_selenium: fetcher_app_selenium: