django tasks scheduler update, .env and docker compose towards fetcher sca

This commit is contained in:
Luciano Gervasoni
2025-06-20 00:35:48 +02:00
parent 490f01d66c
commit 03a2949b2b
8 changed files with 149 additions and 51 deletions

4
.env
View File

@@ -1,3 +1,7 @@
# AutoSSH DB
REMOTE_HOST=''
REMOTE_USERNAME=''
# Initialization
INITIALIZE_DB=true
DJANGO_SUPERUSER_USERNAME=matitos

1
.gitignore vendored
View File

@@ -1,3 +1,4 @@
.env
__pycache__/
*.pyc
**/credentials.py

View File

@@ -12,6 +12,9 @@ https://docs.djangoproject.com/en/5.1/ref/settings/
from pathlib import Path
import os
from typing import Dict
from scheduler.types import SchedulerConfiguration, Broker, QueueConfiguration
# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent
@@ -107,57 +110,27 @@ CACHES = {
}
}
'''
from scheduler.types import SchedulerConfiguration, QueueConfiguration, Broker
from typing import Dict
# https://django-tasks-scheduler.readthedocs.io/en/latest/configuration/
SCHEDULER_CONFIG = SchedulerConfiguration(
DEFAULT_JOB_TIMEOUT = os.environ.get("JOB_DEFAULT_TIMEOUT", 60*30), # 30 minutes
EXECUTIONS_IN_PAGE=20,
SCHEDULER_INTERVAL=10,
BROKER=Broker.REDIS,
CALLBACK_TIMEOUT=60, # Callback timeout in seconds (success/failure/stopped)
# Default values, can be overridden per task/job
DEFAULT_SUCCESS_TTL=10 * 60, # Time To Live (TTL) in seconds to keep successful job results
DEFAULT_FAILURE_TTL=365 * 24 * 60 * 60, # Time To Live (TTL) in seconds to keep job failure information
DEFAULT_JOB_TTL=10 * 60, # Time To Live (TTL) in seconds to keep job information
DEFAULT_JOB_TIMEOUT=30 * 60, # timeout (seconds) for a job
# General configuration values
DEFAULT_WORKER_TTL=10 * 60, # Time To Live (TTL) in seconds to keep worker information after last heartbeat
DEFAULT_MAINTENANCE_TASK_INTERVAL=10 * 60, # The interval to run maintenance tasks in seconds. 10 minutes.
DEFAULT_JOB_MONITORING_INTERVAL=30, # The interval to monitor jobs in seconds.
SCHEDULER_FALLBACK_PERIOD_SECS=120, # Period (secs) to wait before requiring to reacquire locks
)
SCHEDULER_QUEUES: Dict[str, QueueConfiguration] = {
'default': QueueConfiguration(
HOST = os.environ.get("REDIS_HOST", "localhost"),
PORT = os.environ.get("REDIS_PORT", 6379),
DB = os.environ.get("REDIS_DB", 0),
),
'high': QueueConfiguration(
HOST = os.environ.get("REDIS_HOST", "localhost"),
PORT = os.environ.get("REDIS_PORT", 6379),
DB = os.environ.get("REDIS_DB", 0),
),
'low': QueueConfiguration(
HOST = os.environ.get("REDIS_HOST", "localhost"),
PORT = os.environ.get("REDIS_PORT", 6379),
DB = os.environ.get("REDIS_DB", 0),
),
}
'''
SCHEDULER_QUEUES = {
'default': {
'HOST': os.environ.get("REDIS_HOST", "localhost"),
'PORT': os.environ.get("REDIS_PORT", 6379),
'DB': os.environ.get("REDIS_DB", 0),
},
'high': {
'HOST': os.environ.get("REDIS_HOST", "localhost"),
'PORT': os.environ.get("REDIS_PORT", 6379),
'DB': os.environ.get("REDIS_DB", 0),
},
'low': {
'HOST': os.environ.get("REDIS_HOST", "localhost"),
'PORT': os.environ.get("REDIS_PORT", 6379),
'DB': os.environ.get("REDIS_DB", 0),
}
}
SCHEDULER_CONFIG = {
'DEFAULT_TIMEOUT': os.environ.get("JOB_DEFAULT_TIMEOUT", 60*30), # 30 minutes
'DEFAULT_RESULT_TTL': 60*60*12, # 12 hours
'EXECUTIONS_IN_PAGE': 20,
'SCHEDULER_INTERVAL': 10, # 10 seconds
# 'default': QueueConfiguration(URL='redis://localhost:6379/0'),
'default': QueueConfiguration(URL='redis://{}:{}/{}'.format(os.environ.get("REDIS_HOST", "localhost"), os.environ.get("REDIS_PORT", 6379), os.environ.get("REDIS_DB", 0))),
'high': QueueConfiguration(URL='redis://{}:{}/{}'.format(os.environ.get("REDIS_HOST", "localhost"), os.environ.get("REDIS_PORT", 6379), os.environ.get("REDIS_DB", 0))),
'low': QueueConfiguration(URL='redis://{}:{}/{}'.format(os.environ.get("REDIS_HOST", "localhost"), os.environ.get("REDIS_PORT", 6379), os.environ.get("REDIS_DB", 0))),
}

View File

@@ -43,7 +43,6 @@ class DB_Handler():
UrlsSourceSearch.objects.get_or_create(id_url=obj_url, id_source=obj_source, id_search=obj_search)
else:
# Add object to insert
# url_object_to_insert.append(Urls(url=url))
urls_to_insert.append(url)
### Insert URLs & (URL_id, source_id)

View File

@@ -9,4 +9,6 @@ else
python manage.py createsuperuser --noinput
python manage.py collectstatic --no-input
python manage.py import --filename scheduled_tasks.json
#
# python manage.py inspectdb # Debugging model
fi

View File

@@ -1,5 +1,5 @@
django==5.1
django-tasks-scheduler==3.0.1
django-tasks-scheduler==4.0.4
django-redis
psycopg[binary]
gunicorn

View File

@@ -2,7 +2,7 @@
if [ "${DJANGO_DEBUG}" = true ] | [ "${DJANGO_DEBUG}" == "True" ]; then
echo "Running in DEBUG mode"
gunicorn core.wsgi:application --reload --log-level debug --bind 0.0.0.0:8000 --timeout 600 & python manage.py rqworker high default low
gunicorn core.wsgi:application --reload --log-level debug --bind 0.0.0.0:8000 --timeout 600 & python manage.py scheduler_worker high default low
else
gunicorn core.wsgi:application --bind 0.0.0.0:8000 --timeout 600 & python manage.py rqworker high default low
gunicorn core.wsgi:application --bind 0.0.0.0:8000 --timeout 600 & python manage.py scheduler_worker high default low
fi

119
docker-compose-prod.yml Normal file
View File

@@ -0,0 +1,119 @@
version: '3.9'  # NOTE(review): `version` is obsolete in Compose v2 (ignored); harmless for older binaries

services:
  # Headless browser used by the URL fetcher.
  fetcher_app_selenium:
    image: fetcher_app_selenium
    build:
      context: ./app_selenium
      args:
        - ARCH=${ARCH}  # arm64, amd64
    container_name: fetcher_app_selenium
    restart: unless-stopped
    shm_size: 512mb
    environment:
      - SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE}
      - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
    ports:
      # Container port 80, published on an ephemeral host port
      - "80"
    dns:
      - 1.1.1.1
      - 1.0.0.1
    deploy:
      resources:
        limits:
          cpus: '${DEPLOY_CPUS}'
          memory: ${DEPLOY_RAM}

  # Django app (scheduler + workers).
  fetcher_app_urls:
    image: fetcher_app_urls
    build:
      context: ./app_urls
    container_name: fetcher_app_urls
    restart: unless-stopped
    environment:
      # Initialization
      - INITIALIZE_DB=${INITIALIZE_DB}  # Related to DB persistence
      - DJANGO_SUPERUSER_USERNAME=${DJANGO_SUPERUSER_USERNAME}
      - DJANGO_SUPERUSER_PASSWORD=${DJANGO_SUPERUSER_PASSWORD}
      - DJANGO_SUPERUSER_EMAIL=${DJANGO_SUPERUSER_EMAIL}
      # Django
      - DJANGO_ALLOWED_HOSTS=${DJANGO_ALLOWED_HOSTS}  # host1,host2
      - DJANGO_ALLOWED_ORIGINS=${DJANGO_ALLOWED_ORIGINS}  # Reverse proxy
      - DJANGO_SECRET_KEY=${DJANGO_SECRET_KEY}
      - DJANGO_DEBUG=${DJANGO_DEBUG}
      - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
      # Database
      - DB_NAME=${DB_NAME}
      - DB_USER=${DB_USER}
      - DB_PASSWORD=${DB_PASSWORD}
      - DB_HOST=${DB_HOST}
      - DB_PORT=${DB_PORT}
      - REDIS_HOST=${REDIS_HOST}
      - REDIS_PORT=${REDIS_PORT}
      # Job timeout: 30 min
      - JOB_DEFAULT_TIMEOUT=${JOB_DEFAULT_TIMEOUT}
      # Fetcher
      - FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP}
      - FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP}
      - FETCHER_BETWEEN_SEARCHES_SLEEP=${FETCHER_BETWEEN_SEARCHES_SLEEP}  # Sleep time between each search
      - FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP}  # Sleep time between requests to same URL host
      - FETCHER_LANGUAGE_DETECTION_MIN_CHAR=${FETCHER_LANGUAGE_DETECTION_MIN_CHAR}  # Min amount of characters to run language detection
      - FETCHER_INSERT_URL_CACHE_TIME=${FETCHER_INSERT_URL_CACHE_TIME}  # Cache time: Insert raw URL
      - FETCHER_ERROR_URL_CACHE_TIME=${FETCHER_ERROR_URL_CACHE_TIME}  # Cache time: Error on processing URL
      # Selenium
      - SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT}
      - ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA}
      # Ghost
      - GHOST_ADMIN_API_KEY=${GHOST_ADMIN_API_KEY}
      - GHOST_ADMIN_API_URL=${GHOST_ADMIN_API_URL}
      - PEXELS_API_KEY=${PEXELS_API_KEY}
      - OLLAMA_MODEL_DEFAULT=${OLLAMA_MODEL_DEFAULT}
    ########################
    # volumes:  # Development mode
    #   - ./app_urls:/opt/app
    ########################
    ports:
      - "8000:8000"
    depends_on:
      - fetcher_db
      - fetcher_redis
    dns:
      - 1.1.1.1
      - 1.0.0.1
    deploy:
      resources:
        limits:
          cpus: '${DEPLOY_CPUS}'
          memory: ${DEPLOY_RAM}

  # SSH tunnel to the remote Postgres; exposes it as localhost:5432 on the
  # host network (no local database is run).
  fetcher_db:
    container_name: fetcher_db
    image: alpine:latest
    restart: unless-stopped
    deploy:
      resources:
        limits:
          memory: 256M
    volumes:
      # REQUIREMENT: add the fetcher's SSH public key to .ssh/authorized_keys
      # on the DB machine.
      - ~/.ssh:/root/.ssh:ro
    command:
      - sh
      - -c
      - |
        apk add --update openssh autossh
        autossh -M 15885 -N -o 'GatewayPorts yes' -L 0.0.0.0:5432:127.0.0.1:5432 ${REMOTE_USERNAME}@${REMOTE_HOST}
        ### Alternative:
        ### autossh -M 0 -o "ServerAliveInterval 30" -o "ServerAliveCountMax 3" -o 'GatewayPorts yes' -L 15882:127.0.0.1:15882 matitos@matitos.org
        ### -M 15882 monitors on port X; if that port is already in use it conflicts!
        ###autossh -M 15882 -N -o 'GatewayPorts yes' -L 15882:127.0.0.1:15882 matitos@matitos.org
        ###ssh -N -o "StrictHostKeyChecking no" -o "ServerAliveInterval 60" -o "ServerAliveCountMax 3" -o 'PasswordAuthentication no' -o 'GatewayPorts yes' -L 15882:127.0.0.1:15882 matitos@matitos.org
    network_mode: "host"

  fetcher_redis:
    image: redis:alpine
    container_name: fetcher_redis
    restart: unless-stopped
    ports:
      # Container port only; use "6379:6379" to also publish on the host
      - "6379"