Missing kids selenium fixes
This commit is contained in:
@@ -22,9 +22,6 @@ PATH_LOGS_DIRECTORY=/opt/logs
|
|||||||
DB_NAME=matitos
|
DB_NAME=matitos
|
||||||
DB_PASSWORD=supermatitos
|
DB_PASSWORD=supermatitos
|
||||||
DB_USER=supermatitos
|
DB_USER=supermatitos
|
||||||
PATH_DB_DATA=.
|
|
||||||
|
|
||||||
# Database: Django
|
|
||||||
DB_HOST=fetcher_db
|
DB_HOST=fetcher_db
|
||||||
DB_PORT=5432
|
DB_PORT=5432
|
||||||
REDIS_HOST=fetcher_redis
|
REDIS_HOST=fetcher_redis
|
||||||
@@ -46,7 +43,6 @@ FETCHER_ERROR_URL_CACHE_TIME=172800
|
|||||||
SELENIUM_ENDPOINT=http://fetcher_app_selenium:80
|
SELENIUM_ENDPOINT=http://fetcher_app_selenium:80
|
||||||
ARCH=amd64 # arm64, amd64
|
ARCH=amd64 # arm64, amd64
|
||||||
SELENIUM_SLEEP_PER_PAGE=4
|
SELENIUM_SLEEP_PER_PAGE=4
|
||||||
PATH_LOGS_DIRECTORY=/opt/logs
|
|
||||||
|
|
||||||
# Deploy resources per App
|
# Deploy resources per App
|
||||||
DEPLOY_CPUS=2
|
DEPLOY_CPUS=2
|
||||||
@@ -54,7 +50,7 @@ DEPLOY_RAM=4G
|
|||||||
|
|
||||||
# Ghost
|
# Ghost
|
||||||
GHOST_ADMIN_API_URL=https://news.matitos.org/ghost/api/admin/
|
GHOST_ADMIN_API_URL=https://news.matitos.org/ghost/api/admin/
|
||||||
GHOST_ADMIN_API_KEY=67fffe1b8a57a80001ecec5b:59f580020c196f92e05e208d288702082f8edad6366e2b2c8940b54e41cc355a
|
GHOST_ADMIN_API_KEY=
|
||||||
PEXELS_API_KEY=Y6clJkY32eihf34ukX4JsINYu9lzxh3xDdNq2HMAmGwXp0a0tt6vr6S9
|
PEXELS_API_KEY=Y6clJkY32eihf34ukX4JsINYu9lzxh3xDdNq2HMAmGwXp0a0tt6vr6S9
|
||||||
# Ollama
|
# Ollama
|
||||||
ENDPOINT_OLLAMA=https://ollamamodelnpu.matitos.org
|
ENDPOINT_OLLAMA=https://ollamamodelnpu.matitos.org
|
||||||
@@ -57,15 +57,18 @@ class MissingKidsFetcher():
|
|||||||
# Find all <img> tags with src attributes. Extract src URLs
|
# Find all <img> tags with src attributes. Extract src URLs
|
||||||
image_urls = [img.get_attribute("src") for img in driver.find_elements(By.XPATH, "//img[@src]")]
|
image_urls = [img.get_attribute("src") for img in driver.find_elements(By.XPATH, "//img[@src]")]
|
||||||
|
|
||||||
|
# Redirects to 404?
|
||||||
if ("missingkids.org/404" in driver.current_url) or (any(["thumb-404.png" in i for i in image_urls])):
|
if ("missingkids.org/404" in driver.current_url) or (any(["thumb-404.png" in i for i in image_urls])):
|
||||||
# Status invalid
|
# Status invalid
|
||||||
results = {"status": "invalid"}
|
results = {"status": "invalid"}
|
||||||
elif ("Have you seen this child?" in driver.title):
|
# Redirection to valid URL? -> Duplicate
|
||||||
# Status valid
|
|
||||||
results = {"status": "valid"}
|
|
||||||
elif (driver.current_url != url):
|
elif (driver.current_url != url):
|
||||||
# Redirection (duplicate)
|
# Redirection (duplicate)
|
||||||
results = {"status": "duplicate", "redirection": driver.current_url}
|
results = {"status": "duplicate", "redirection": driver.current_url}
|
||||||
|
# Valid
|
||||||
|
elif ("Have you seen this child?" in driver.title):
|
||||||
|
# Status valid
|
||||||
|
results = {"status": "valid"}
|
||||||
else:
|
else:
|
||||||
results = {"status": "unknown"}
|
results = {"status": "unknown"}
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
@@ -270,22 +270,37 @@ class DB_Handler():
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("Exception processing error URLs: {}\n{}".format(e, traceback.format_exc()))
|
logger.warning("Exception processing error URLs: {}\n{}".format(e, traceback.format_exc()))
|
||||||
|
|
||||||
def process_missing_kids_urls(self, batch_size=None):
|
def process_missing_kids_urls(self, batch_size=None, process_status_only=None):
|
||||||
try:
|
try:
|
||||||
logger.debug("Processing MissingKids URLs - batch_size={}".format(batch_size))
|
logger.info("Processing MissingKids URLs - batch_size={} process_status_only={}".format(batch_size, process_status_only))
|
||||||
|
|
||||||
|
if (process_status_only is not None):
|
||||||
|
filter = (Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.UNKNOWN) | Q(status=Urls.STATUS_ENUM.ERROR))
|
||||||
|
else:
|
||||||
|
if (process_status_only == "valid"):
|
||||||
|
filter = Q(status=Urls.STATUS_ENUM.VALID)
|
||||||
|
elif (process_status_only == "invalid"):
|
||||||
|
filter = Q(status=Urls.STATUS_ENUM.INVALID)
|
||||||
|
elif (process_status_only == "error"):
|
||||||
|
filter = Q(status=Urls.STATUS_ENUM.ERROR)
|
||||||
|
elif (process_status_only == "unknown"):
|
||||||
|
filter = Q(status=Urls.STATUS_ENUM.UNKNOWN)
|
||||||
|
elif (process_status_only == "raw"):
|
||||||
|
filter = Q(status=Urls.STATUS_ENUM.RAW)
|
||||||
|
elif (process_status_only == "duplicate"):
|
||||||
|
filter = Q(status=Urls.STATUS_ENUM.DUPLICATE)
|
||||||
|
else:
|
||||||
|
logger.info("Unknown status to filter: {}".format(process_status_only))
|
||||||
|
|
||||||
# Get batch of URLs, %missingkids.org/poster% AND (status='valid' OR status='invalid')
|
# Get batch of URLs, %missingkids.org/poster% AND (status='valid' OR status='invalid')
|
||||||
missingkids_urls = Urls.objects.order_by("-ts_fetch").filter(
|
missingkids_urls = Urls.objects.order_by("-ts_fetch").filter(
|
||||||
(Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster"))
|
filter & (Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster"))
|
||||||
&
|
|
||||||
(Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.UNKNOWN) | Q(status=Urls.STATUS_ENUM.ERROR))
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Get batch size
|
# Get batch size
|
||||||
if (batch_size is not None):
|
if (batch_size is not None):
|
||||||
missingkids_urls = missingkids_urls[:batch_size]
|
missingkids_urls = missingkids_urls[:batch_size]
|
||||||
|
|
||||||
# TODO: Cache processed during last X hours, filter them...
|
|
||||||
|
|
||||||
# Per URL
|
# Per URL
|
||||||
for obj_url in missingkids_urls:
|
for obj_url in missingkids_urls:
|
||||||
try:
|
try:
|
||||||
@@ -296,6 +311,7 @@ class DB_Handler():
|
|||||||
r = requests.post(missingkids_fetch_endpoint, json=data, timeout=120)
|
r = requests.post(missingkids_fetch_endpoint, json=data, timeout=120)
|
||||||
# Jsonify
|
# Jsonify
|
||||||
results = r.json()
|
results = r.json()
|
||||||
|
logger.debug("Selenium results for URL {}: {}".format(obj_url.url, str(results)))
|
||||||
|
|
||||||
if (results.get("status") == "valid"):
|
if (results.get("status") == "valid"):
|
||||||
self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
|
self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
|
||||||
|
|||||||
@@ -60,17 +60,10 @@ def process_error_urls(batch_size=50):
|
|||||||
logger.info("Task completed: {}".format(task))
|
logger.info("Task completed: {}".format(task))
|
||||||
|
|
||||||
@job('default')
|
@job('default')
|
||||||
def process_missing_kids_urls(batch_size=50):
|
def process_missing_kids_urls(batch_size=None, process_status_only=None):
|
||||||
task = "Process Missing Kids URLs"
|
task = "Process Missing Kids URLs - batch_size={} process_status_only={}".format(batch_size, process_status_only)
|
||||||
logger.info("Task triggered: {}".format(task))
|
logger.info("Task triggered: {}".format(task))
|
||||||
DB_Handler().process_missing_kids_urls(batch_size=batch_size)
|
DB_Handler().process_missing_kids_urls(batch_size=batch_size, process_status_only=process_status_only)
|
||||||
logger.info("Task completed: {}".format(task))
|
|
||||||
|
|
||||||
@job('default')
|
|
||||||
def process_missing_kids_urls_all(batch_size=None):
|
|
||||||
task = "Process Missing Kids URLs ALL"
|
|
||||||
logger.info("Task triggered: {}".format(task))
|
|
||||||
DB_Handler().process_missing_kids_urls(batch_size=batch_size)
|
|
||||||
logger.info("Task completed: {}".format(task))
|
logger.info("Task completed: {}".format(task))
|
||||||
|
|
||||||
@job('default')
|
@job('default')
|
||||||
|
|||||||
@@ -46,12 +46,18 @@
|
|||||||
"name": "Process MissingKids URLs",
|
"name": "Process MissingKids URLs",
|
||||||
"callable": "fetcher.tasks.process_missing_kids_urls",
|
"callable": "fetcher.tasks.process_missing_kids_urls",
|
||||||
"callable_args": [],
|
"callable_args": [],
|
||||||
"callable_kwargs": [],
|
"callable_kwargs": [
|
||||||
|
{
|
||||||
|
"arg_type": "int",
|
||||||
|
"key": "batch_size",
|
||||||
|
"val": 50
|
||||||
|
}
|
||||||
|
],
|
||||||
"enabled": false,
|
"enabled": false,
|
||||||
"queue": "default",
|
"queue": "default",
|
||||||
"repeat": null,
|
"repeat": null,
|
||||||
"at_front": false,
|
"at_front": false,
|
||||||
"timeout": 1800,
|
"timeout": 10800,
|
||||||
"result_ttl": 86400,
|
"result_ttl": 86400,
|
||||||
"cron_string": null,
|
"cron_string": null,
|
||||||
"scheduled_time": "2025-01-01T00:00:00+00:00",
|
"scheduled_time": "2025-01-01T00:00:00+00:00",
|
||||||
@@ -64,19 +70,79 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"model": "RepeatableTaskType",
|
"model": "RepeatableTaskType",
|
||||||
"name": "Process MissingKids URLs ALL",
|
"name": "Process MissingKids URLs ALL - unknown",
|
||||||
"callable": "fetcher.tasks.process_missing_kids_urls_all",
|
"callable": "fetcher.tasks.process_missing_kids_urls",
|
||||||
"callable_args": [],
|
"callable_args": [],
|
||||||
"callable_kwargs": [],
|
"callable_kwargs": [
|
||||||
|
{
|
||||||
|
"arg_type": "str",
|
||||||
|
"key": "process_status_only",
|
||||||
|
"val": "unknown"
|
||||||
|
}
|
||||||
|
],
|
||||||
"enabled": false,
|
"enabled": false,
|
||||||
"queue": "default",
|
"queue": "default",
|
||||||
"repeat": null,
|
"repeat": null,
|
||||||
"at_front": false,
|
"at_front": false,
|
||||||
"timeout": 43200,
|
"timeout": 86400,
|
||||||
"result_ttl": 86400,
|
"result_ttl": 86400,
|
||||||
"cron_string": null,
|
"cron_string": null,
|
||||||
"scheduled_time": "2025-01-01T00:00:00+00:00",
|
"scheduled_time": "2025-01-01T00:00:00+00:00",
|
||||||
"interval": 1,
|
"interval": 1,
|
||||||
|
"interval_unit": "days",
|
||||||
|
"successful_runs": 0,
|
||||||
|
"failed_runs": 0,
|
||||||
|
"last_successful_run": null,
|
||||||
|
"last_failed_run": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "RepeatableTaskType",
|
||||||
|
"name": "Process MissingKids URLs ALL - valid",
|
||||||
|
"callable": "fetcher.tasks.process_missing_kids_urls",
|
||||||
|
"callable_args": [],
|
||||||
|
"callable_kwargs": [
|
||||||
|
{
|
||||||
|
"arg_type": "str",
|
||||||
|
"key": "process_status_only",
|
||||||
|
"val": "valid"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"enabled": false,
|
||||||
|
"queue": "default",
|
||||||
|
"repeat": null,
|
||||||
|
"at_front": false,
|
||||||
|
"timeout": 86400,
|
||||||
|
"result_ttl": 86400,
|
||||||
|
"cron_string": null,
|
||||||
|
"scheduled_time": "2025-01-01T00:00:00+00:00",
|
||||||
|
"interval": 2,
|
||||||
|
"interval_unit": "days",
|
||||||
|
"successful_runs": 0,
|
||||||
|
"failed_runs": 0,
|
||||||
|
"last_successful_run": null,
|
||||||
|
"last_failed_run": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"model": "RepeatableTaskType",
|
||||||
|
"name": "Process MissingKids URLs ALL - invalid",
|
||||||
|
"callable": "fetcher.tasks.process_missing_kids_urls",
|
||||||
|
"callable_args": [],
|
||||||
|
"callable_kwargs": [
|
||||||
|
{
|
||||||
|
"arg_type": "str",
|
||||||
|
"key": "process_status_only",
|
||||||
|
"val": "invalid"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"enabled": false,
|
||||||
|
"queue": "default",
|
||||||
|
"repeat": null,
|
||||||
|
"at_front": false,
|
||||||
|
"timeout": 86400,
|
||||||
|
"result_ttl": 86400,
|
||||||
|
"cron_string": null,
|
||||||
|
"scheduled_time": "2025-01-01T00:00:00+00:00",
|
||||||
|
"interval": 4,
|
||||||
"interval_unit": "weeks",
|
"interval_unit": "weeks",
|
||||||
"successful_runs": 0,
|
"successful_runs": 0,
|
||||||
"failed_runs": 0,
|
"failed_runs": 0,
|
||||||
|
|||||||
@@ -1,5 +1,3 @@
|
|||||||
version: '3.9'
|
|
||||||
|
|
||||||
services:
|
services:
|
||||||
|
|
||||||
fetcher_app_selenium:
|
fetcher_app_selenium:
|
||||||
|
|||||||
@@ -1,5 +1,3 @@
|
|||||||
version: '3.9'
|
|
||||||
|
|
||||||
services:
|
services:
|
||||||
|
|
||||||
fetcher_app_selenium:
|
fetcher_app_selenium:
|
||||||
|
|||||||
@@ -1,5 +1,3 @@
|
|||||||
version: '3.9'
|
|
||||||
|
|
||||||
services:
|
services:
|
||||||
|
|
||||||
fetcher_app_selenium:
|
fetcher_app_selenium:
|
||||||
|
|||||||
Reference in New Issue
Block a user