Rename .env to .env.sample and blank the API keys it carried, reorder the
MissingKids status detection so a redirect is reported as a duplicate before
the page-title check, add a process_status_only filter to the MissingKids URL
processing job, replace the single "ALL" scheduled task with per-status tasks,
and drop the top-level `version` key from the Compose files.

Reviewer notes:
- process_status_only=None now selects the default status set (valid, invalid,
  unknown, error); the original hunk had this condition inverted.
- An unrecognised process_status_only value is logged and the job returns
  before building the query, instead of falling through with no filter bound.
- The status filter local is named status_filter so it does not shadow the
  builtin `filter`.
- PEXELS_API_KEY is blanked in .env.sample alongside GHOST_ADMIN_API_KEY.
- NOTE(review): line breaks, blank context lines and indentation were
  reconstructed from the hunk line counts; `index` blob hashes are those of the
  original patch. Verify against the working tree before applying.

diff --git a/.env b/.env.sample
similarity index 88%
rename from .env
rename to .env.sample
index d7f987e..847d0e8 100644
--- a/.env
+++ b/.env.sample
@@ -22,9 +22,6 @@ PATH_LOGS_DIRECTORY=/opt/logs
 DB_NAME=matitos
 DB_PASSWORD=supermatitos
 DB_USER=supermatitos
-PATH_DB_DATA=.
-
-# Database: Django
 DB_HOST=fetcher_db
 DB_PORT=5432
 REDIS_HOST=fetcher_redis
@@ -46,7 +43,6 @@ FETCHER_ERROR_URL_CACHE_TIME=172800
 SELENIUM_ENDPOINT=http://fetcher_app_selenium:80
 ARCH=amd64 # arm64, amd64
 SELENIUM_SLEEP_PER_PAGE=4
-PATH_LOGS_DIRECTORY=/opt/logs
 
 # Deploy resources per App
 DEPLOY_CPUS=2
@@ -54,7 +50,7 @@ DEPLOY_RAM=4G
 
 # Ghost
 GHOST_ADMIN_API_URL=https://news.matitos.org/ghost/api/admin/
-GHOST_ADMIN_API_KEY=67fffe1b8a57a80001ecec5b:59f580020c196f92e05e208d288702082f8edad6366e2b2c8940b54e41cc355a
-PEXELS_API_KEY=Y6clJkY32eihf34ukX4JsINYu9lzxh3xDdNq2HMAmGwXp0a0tt6vr6S9
+GHOST_ADMIN_API_KEY=
+PEXELS_API_KEY=
 # Ollama
 ENDPOINT_OLLAMA=https://ollamamodelnpu.matitos.org
diff --git a/app_selenium/missing_kids.py b/app_selenium/missing_kids.py
index b18c906..c0e8929 100644
--- a/app_selenium/missing_kids.py
+++ b/app_selenium/missing_kids.py
@@ -57,15 +57,18 @@ class MissingKidsFetcher():
             # Find all tags with src attributes. Extract src URLs
             image_urls = [img.get_attribute("src") for img in driver.find_elements(By.XPATH, "//img[@src]")]
 
+            # Redirects to 404?
             if ("missingkids.org/404" in driver.current_url) or (any(["thumb-404.png" in i for i in image_urls])):
                 # Status invalid
                 results = {"status": "invalid"}
-            elif ("Have you seen this child?" in driver.title):
-                # Status valid
-                results = {"status": "valid"}
+            # Redirection to valid URL? -> Duplicate
             elif (driver.current_url != url):
                 # Redirection (duplicate)
                 results = {"status": "duplicate", "redirection": driver.current_url}
+            # Valid
+            elif ("Have you seen this child?" in driver.title):
+                # Status valid
+                results = {"status": "valid"}
             else:
                 results = {"status": "unknown"}
         except Exception as e:
diff --git a/app_urls/fetcher/src/db_utils.py b/app_urls/fetcher/src/db_utils.py
index 7d4a259..11b77d2 100644
--- a/app_urls/fetcher/src/db_utils.py
+++ b/app_urls/fetcher/src/db_utils.py
@@ -270,22 +270,37 @@ class DB_Handler():
         except Exception as e:
             logger.warning("Exception processing error URLs: {}\n{}".format(e, traceback.format_exc()))
 
-    def process_missing_kids_urls(self, batch_size=None):
+    def process_missing_kids_urls(self, batch_size=None, process_status_only=None):
         try:
-            logger.debug("Processing MissingKids URLs - batch_size={}".format(batch_size))
+            logger.info("Processing MissingKids URLs - batch_size={} process_status_only={}".format(batch_size, process_status_only))
+
+            if (process_status_only is None):
+                status_filter = (Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.UNKNOWN) | Q(status=Urls.STATUS_ENUM.ERROR))
+            elif (process_status_only == "valid"):
+                status_filter = Q(status=Urls.STATUS_ENUM.VALID)
+            elif (process_status_only == "invalid"):
+                status_filter = Q(status=Urls.STATUS_ENUM.INVALID)
+            elif (process_status_only == "error"):
+                status_filter = Q(status=Urls.STATUS_ENUM.ERROR)
+            elif (process_status_only == "unknown"):
+                status_filter = Q(status=Urls.STATUS_ENUM.UNKNOWN)
+            elif (process_status_only == "raw"):
+                status_filter = Q(status=Urls.STATUS_ENUM.RAW)
+            elif (process_status_only == "duplicate"):
+                status_filter = Q(status=Urls.STATUS_ENUM.DUPLICATE)
+            else:
+                logger.warning("Unknown status to filter: {}".format(process_status_only))
+                return
+
             # Get batch of URLs, %missingkids.org/poster% AND (status='valid' OR status='invalid')
             missingkids_urls = Urls.objects.order_by("-ts_fetch").filter(
-                (Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster"))
-                &
-                (Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.UNKNOWN) | Q(status=Urls.STATUS_ENUM.ERROR))
+                status_filter & (Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster"))
             )
 
             # Get batch size
             if (batch_size is not None):
                 missingkids_urls = missingkids_urls[:batch_size]
 
-            # TODO: Cache processed during last X hours, filter them...
-
             # Per URL
             for obj_url in missingkids_urls:
                 try:
@@ -296,6 +311,7 @@ class DB_Handler():
                     r = requests.post(missingkids_fetch_endpoint, json=data, timeout=120)
                     # Jsonify
                     results = r.json()
+                    logger.debug("Selenium results for URL {}: {}".format(obj_url.url, str(results)))
 
                     if (results.get("status") == "valid"):
                         self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
diff --git a/app_urls/fetcher/tasks.py b/app_urls/fetcher/tasks.py
index 08ae8ed..7f0a3cd 100644
--- a/app_urls/fetcher/tasks.py
+++ b/app_urls/fetcher/tasks.py
@@ -60,17 +60,10 @@ def process_error_urls(batch_size=50):
     logger.info("Task completed: {}".format(task))
 
 @job('default')
-def process_missing_kids_urls(batch_size=50):
-    task = "Process Missing Kids URLs"
+def process_missing_kids_urls(batch_size=None, process_status_only=None):
+    task = "Process Missing Kids URLs - batch_size={} process_status_only={}".format(batch_size, process_status_only)
     logger.info("Task triggered: {}".format(task))
-    DB_Handler().process_missing_kids_urls(batch_size=batch_size)
-    logger.info("Task completed: {}".format(task))
-
-@job('default')
-def process_missing_kids_urls_all(batch_size=None):
-    task = "Process Missing Kids URLs ALL"
-    logger.info("Task triggered: {}".format(task))
-    DB_Handler().process_missing_kids_urls(batch_size=batch_size)
+    DB_Handler().process_missing_kids_urls(batch_size=batch_size, process_status_only=process_status_only)
     logger.info("Task completed: {}".format(task))
 
 @job('default')
diff --git a/app_urls/scheduled_tasks.json b/app_urls/scheduled_tasks.json
index af70cb5..0901e12 100644
--- a/app_urls/scheduled_tasks.json
+++ b/app_urls/scheduled_tasks.json
@@ -46,12 +46,18 @@
     "name": "Process MissingKids URLs",
     "callable": "fetcher.tasks.process_missing_kids_urls",
     "callable_args": [],
-    "callable_kwargs": [],
+    "callable_kwargs": [
+      {
+        "arg_type": "int",
+        "key": "batch_size",
+        "val": 50
+      }
+    ],
     "enabled": false,
     "queue": "default",
     "repeat": null,
     "at_front": false,
-    "timeout": 1800,
+    "timeout": 10800,
     "result_ttl": 86400,
     "cron_string": null,
     "scheduled_time": "2025-01-01T00:00:00+00:00",
@@ -64,19 +70,79 @@
   },
   {
     "model": "RepeatableTaskType",
-    "name": "Process MissingKids URLs ALL",
-    "callable": "fetcher.tasks.process_missing_kids_urls_all",
+    "name": "Process MissingKids URLs ALL - unknown",
+    "callable": "fetcher.tasks.process_missing_kids_urls",
     "callable_args": [],
-    "callable_kwargs": [],
+    "callable_kwargs": [
+      {
+        "arg_type": "str",
+        "key": "process_status_only",
+        "val": "unknown"
+      }
+    ],
     "enabled": false,
     "queue": "default",
     "repeat": null,
     "at_front": false,
-    "timeout": 43200,
+    "timeout": 86400,
     "result_ttl": 86400,
     "cron_string": null,
     "scheduled_time": "2025-01-01T00:00:00+00:00",
     "interval": 1,
+    "interval_unit": "days",
+    "successful_runs": 0,
+    "failed_runs": 0,
+    "last_successful_run": null,
+    "last_failed_run": null
+  },
+  {
+    "model": "RepeatableTaskType",
+    "name": "Process MissingKids URLs ALL - valid",
+    "callable": "fetcher.tasks.process_missing_kids_urls",
+    "callable_args": [],
+    "callable_kwargs": [
+      {
+        "arg_type": "str",
+        "key": "process_status_only",
+        "val": "valid"
+      }
+    ],
+    "enabled": false,
+    "queue": "default",
+    "repeat": null,
+    "at_front": false,
+    "timeout": 86400,
+    "result_ttl": 86400,
+    "cron_string": null,
+    "scheduled_time": "2025-01-01T00:00:00+00:00",
+    "interval": 2,
+    "interval_unit": "days",
+    "successful_runs": 0,
+    "failed_runs": 0,
+    "last_successful_run": null,
+    "last_failed_run": null
+  },
+  {
+    "model": "RepeatableTaskType",
+    "name": "Process MissingKids URLs ALL - invalid",
+    "callable": "fetcher.tasks.process_missing_kids_urls",
+    "callable_args": [],
+    "callable_kwargs": [
+      {
+        "arg_type": "str",
+        "key": "process_status_only",
+        "val": "invalid"
+      }
+    ],
+    "enabled": false,
+    "queue": "default",
+    "repeat": null,
+    "at_front": false,
+    "timeout": 86400,
+    "result_ttl": 86400,
+    "cron_string": null,
+    "scheduled_time": "2025-01-01T00:00:00+00:00",
+    "interval": 4,
     "interval_unit": "weeks",
     "successful_runs": 0,
     "failed_runs": 0,
diff --git a/docker-compose-base.yml b/docker-compose-base.yml
index e7b2e59..88eeac0 100644
--- a/docker-compose-base.yml
+++ b/docker-compose-base.yml
@@ -1,5 +1,3 @@
-version: '3.9'
-
 services:
 
   fetcher_app_selenium:
diff --git a/docker-compose-dev.yml b/docker-compose-dev.yml
index def8ac3..8acb44f 100644
--- a/docker-compose-dev.yml
+++ b/docker-compose-dev.yml
@@ -1,5 +1,3 @@
-version: '3.9'
-
 services:
 
   fetcher_app_selenium:
diff --git a/docker-compose-prod.yml b/docker-compose-prod.yml
index e61d4fa..1992b1a 100644
--- a/docker-compose-prod.yml
+++ b/docker-compose-prod.yml
@@ -1,5 +1,3 @@
-version: '3.9'
-
 services:
 
   fetcher_app_selenium: