django tasks scheduler update, .env and docker compose towards fetcher sca

This commit is contained in:
Luciano Gervasoni
2025-06-20 00:35:48 +02:00
parent 490f01d66c
commit 03a2949b2b
8 changed files with 149 additions and 51 deletions

4
.env
View File

@@ -1,3 +1,7 @@
# AutoSSH DB
REMOTE_HOST=''
REMOTE_USERNAME=''
# Initialization
INITIALIZE_DB=true
DJANGO_SUPERUSER_USERNAME=matitos

1
.gitignore vendored
View File

@@ -1,3 +1,4 @@
.env
__pycache__/
*.pyc
**/credentials.py

View File

@@ -12,6 +12,9 @@ https://docs.djangoproject.com/en/5.1/ref/settings/
from pathlib import Path
import os
from typing import Dict
from scheduler.types import SchedulerConfiguration, Broker, QueueConfiguration
# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent
@@ -107,57 +110,27 @@ CACHES = {
}
}
'''
from scheduler.types import SchedulerConfiguration, QueueConfiguration, Broker
from typing import Dict
# https://django-tasks-scheduler.readthedocs.io/en/latest/configuration/
SCHEDULER_CONFIG = SchedulerConfiguration(
DEFAULT_JOB_TIMEOUT = os.environ.get("JOB_DEFAULT_TIMEOUT", 60*30), # 30 minutes
EXECUTIONS_IN_PAGE=20,
SCHEDULER_INTERVAL=10,
BROKER=Broker.REDIS,
CALLBACK_TIMEOUT=60, # Callback timeout in seconds (success/failure/stopped)
# Default values, can be overridden per task/job
DEFAULT_SUCCESS_TTL=10 * 60, # Time To Live (TTL) in seconds to keep successful job results
DEFAULT_FAILURE_TTL=365 * 24 * 60 * 60, # Time To Live (TTL) in seconds to keep job failure information
DEFAULT_JOB_TTL=10 * 60, # Time To Live (TTL) in seconds to keep job information
DEFAULT_JOB_TIMEOUT=30 * 60, # timeout (seconds) for a job
# General configuration values
DEFAULT_WORKER_TTL=10 * 60, # Time To Live (TTL) in seconds to keep worker information after last heartbeat
DEFAULT_MAINTENANCE_TASK_INTERVAL=10 * 60, # The interval to run maintenance tasks in seconds. 10 minutes.
DEFAULT_JOB_MONITORING_INTERVAL=30, # The interval to monitor jobs in seconds.
SCHEDULER_FALLBACK_PERIOD_SECS=120, # Period (secs) to wait before requiring to reacquire locks
)
SCHEDULER_QUEUES: Dict[str, QueueConfiguration] = {
'default': QueueConfiguration(
HOST = os.environ.get("REDIS_HOST", "localhost"),
PORT = os.environ.get("REDIS_PORT", 6379),
DB = os.environ.get("REDIS_DB", 0),
),
'high': QueueConfiguration(
HOST = os.environ.get("REDIS_HOST", "localhost"),
PORT = os.environ.get("REDIS_PORT", 6379),
DB = os.environ.get("REDIS_DB", 0),
),
'low': QueueConfiguration(
HOST = os.environ.get("REDIS_HOST", "localhost"),
PORT = os.environ.get("REDIS_PORT", 6379),
DB = os.environ.get("REDIS_DB", 0),
),
}
'''
SCHEDULER_QUEUES = {
'default': {
'HOST': os.environ.get("REDIS_HOST", "localhost"),
'PORT': os.environ.get("REDIS_PORT", 6379),
'DB': os.environ.get("REDIS_DB", 0),
},
'high': {
'HOST': os.environ.get("REDIS_HOST", "localhost"),
'PORT': os.environ.get("REDIS_PORT", 6379),
'DB': os.environ.get("REDIS_DB", 0),
},
'low': {
'HOST': os.environ.get("REDIS_HOST", "localhost"),
'PORT': os.environ.get("REDIS_PORT", 6379),
'DB': os.environ.get("REDIS_DB", 0),
}
}
SCHEDULER_CONFIG = {
'DEFAULT_TIMEOUT': os.environ.get("JOB_DEFAULT_TIMEOUT", 60*30), # 30 minutes
'DEFAULT_RESULT_TTL': 60*60*12, # 12 hours
'EXECUTIONS_IN_PAGE': 20,
'SCHEDULER_INTERVAL': 10, # 10 seconds
# 'default': QueueConfiguration(URL='redis://localhost:6379/0'),
'default': QueueConfiguration(URL='redis://{}:{}/{}'.format(os.environ.get("REDIS_HOST", "localhost"), os.environ.get("REDIS_PORT", 6379), os.environ.get("REDIS_DB", 0))),
'high': QueueConfiguration(URL='redis://{}:{}/{}'.format(os.environ.get("REDIS_HOST", "localhost"), os.environ.get("REDIS_PORT", 6379), os.environ.get("REDIS_DB", 0))),
'low': QueueConfiguration(URL='redis://{}:{}/{}'.format(os.environ.get("REDIS_HOST", "localhost"), os.environ.get("REDIS_PORT", 6379), os.environ.get("REDIS_DB", 0))),
}

View File

@@ -43,7 +43,6 @@ class DB_Handler():
UrlsSourceSearch.objects.get_or_create(id_url=obj_url, id_source=obj_source, id_search=obj_search)
else:
# Add object to insert
# url_object_to_insert.append(Urls(url=url))
urls_to_insert.append(url)
### Insert URLs & (URL_id, source_id)

View File

@@ -9,4 +9,6 @@ else
python manage.py createsuperuser --noinput
python manage.py collectstatic --no-input
python manage.py import --filename scheduled_tasks.json
#
# python manage.py inspectdb # Debugging model
fi

View File

@@ -1,5 +1,5 @@
django==5.1
django-tasks-scheduler==3.0.1
django-tasks-scheduler==4.0.4
django-redis
psycopg[binary]
gunicorn

View File

@@ -2,7 +2,7 @@
if [ "${DJANGO_DEBUG}" = true ] | [ "${DJANGO_DEBUG}" == "True" ]; then
echo "Running in DEBUG mode"
gunicorn core.wsgi:application --reload --log-level debug --bind 0.0.0.0:8000 --timeout 600 & python manage.py rqworker high default low
gunicorn core.wsgi:application --reload --log-level debug --bind 0.0.0.0:8000 --timeout 600 & python manage.py scheduler_worker high default low
else
gunicorn core.wsgi:application --bind 0.0.0.0:8000 --timeout 600 & python manage.py rqworker high default low
gunicorn core.wsgi:application --bind 0.0.0.0:8000 --timeout 600 & python manage.py scheduler_worker high default low
fi

119
docker-compose-prod.yml Normal file
View File

@@ -0,0 +1,119 @@
version: '3.9'  # NOTE(review): `version` is obsolete in Compose v2 (ignored); harmless for older binaries

services:
  # Headless browser used by the URL fetcher.
  fetcher_app_selenium:
    image: fetcher_app_selenium
    build:
      context: ./app_selenium
      args:
        - ARCH=${ARCH}  # arm64, amd64
    container_name: fetcher_app_selenium
    restart: unless-stopped
    shm_size: 512mb
    environment:
      - SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE}
      - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
    ports:
      # Container port 80, published on an ephemeral host port
      - "80"
    dns:
      - 1.1.1.1
      - 1.0.0.1
    deploy:
      resources:
        limits:
          cpus: '${DEPLOY_CPUS}'
          memory: ${DEPLOY_RAM}

  # Django app (scheduler + workers).
  fetcher_app_urls:
    image: fetcher_app_urls
    build:
      context: ./app_urls
    container_name: fetcher_app_urls
    restart: unless-stopped
    environment:
      # Initialization
      - INITIALIZE_DB=${INITIALIZE_DB}  # Related to DB persistence
      - DJANGO_SUPERUSER_USERNAME=${DJANGO_SUPERUSER_USERNAME}
      - DJANGO_SUPERUSER_PASSWORD=${DJANGO_SUPERUSER_PASSWORD}
      - DJANGO_SUPERUSER_EMAIL=${DJANGO_SUPERUSER_EMAIL}
      # Django
      - DJANGO_ALLOWED_HOSTS=${DJANGO_ALLOWED_HOSTS}  # host1,host2
      - DJANGO_ALLOWED_ORIGINS=${DJANGO_ALLOWED_ORIGINS}  # Reverse proxy
      - DJANGO_SECRET_KEY=${DJANGO_SECRET_KEY}
      - DJANGO_DEBUG=${DJANGO_DEBUG}
      - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
      # Database
      - DB_NAME=${DB_NAME}
      - DB_USER=${DB_USER}
      - DB_PASSWORD=${DB_PASSWORD}
      - DB_HOST=${DB_HOST}
      - DB_PORT=${DB_PORT}
      - REDIS_HOST=${REDIS_HOST}
      - REDIS_PORT=${REDIS_PORT}
      # Job timeout: 30 min
      - JOB_DEFAULT_TIMEOUT=${JOB_DEFAULT_TIMEOUT}
      # Fetcher
      - FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP}
      - FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP}
      - FETCHER_BETWEEN_SEARCHES_SLEEP=${FETCHER_BETWEEN_SEARCHES_SLEEP}  # Sleep time between each search
      - FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP}  # Sleep time between requests to same URL host
      - FETCHER_LANGUAGE_DETECTION_MIN_CHAR=${FETCHER_LANGUAGE_DETECTION_MIN_CHAR}  # Min amount of characters to run language detection
      - FETCHER_INSERT_URL_CACHE_TIME=${FETCHER_INSERT_URL_CACHE_TIME}  # Cache time: Insert raw URL
      - FETCHER_ERROR_URL_CACHE_TIME=${FETCHER_ERROR_URL_CACHE_TIME}  # Cache time: Error on processing URL
      # Selenium
      - SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT}
      - ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA}
      # Ghost
      - GHOST_ADMIN_API_KEY=${GHOST_ADMIN_API_KEY}
      - GHOST_ADMIN_API_URL=${GHOST_ADMIN_API_URL}
      - PEXELS_API_KEY=${PEXELS_API_KEY}
      - OLLAMA_MODEL_DEFAULT=${OLLAMA_MODEL_DEFAULT}
    ########################
    # volumes:  # Development mode
    #   - ./app_urls:/opt/app
    ########################
    ports:
      - "8000:8000"
    depends_on:
      - fetcher_db
      - fetcher_redis
    dns:
      - 1.1.1.1
      - 1.0.0.1
    deploy:
      resources:
        limits:
          cpus: '${DEPLOY_CPUS}'
          memory: ${DEPLOY_RAM}

  # SSH tunnel to the remote Postgres; exposes it as localhost:5432 on the
  # host network (no local database is run).
  fetcher_db:
    container_name: fetcher_db
    image: alpine:latest
    restart: unless-stopped
    deploy:
      resources:
        limits:
          memory: 256M
    volumes:
      # REQUIREMENT: add the fetcher's SSH public key to .ssh/authorized_keys
      # on the DB machine.
      - ~/.ssh:/root/.ssh:ro
    command:
      - sh
      - -c
      - |
        apk add --update openssh autossh
        autossh -M 15885 -N -o 'GatewayPorts yes' -L 0.0.0.0:5432:127.0.0.1:5432 ${REMOTE_USERNAME}@${REMOTE_HOST}
        ### Alternative:
        ### autossh -M 0 -o "ServerAliveInterval 30" -o "ServerAliveCountMax 3" -o 'GatewayPorts yes' -L 15882:127.0.0.1:15882 matitos@matitos.org
        ### -M 15882 monitors on port X; if that port is already in use it conflicts!
        ###autossh -M 15882 -N -o 'GatewayPorts yes' -L 15882:127.0.0.1:15882 matitos@matitos.org
        ###ssh -N -o "StrictHostKeyChecking no" -o "ServerAliveInterval 60" -o "ServerAliveCountMax 3" -o 'PasswordAuthentication no' -o 'GatewayPorts yes' -L 15882:127.0.0.1:15882 matitos@matitos.org
    network_mode: "host"

  fetcher_redis:
    image: redis:alpine
    container_name: fetcher_redis
    restart: unless-stopped
    ports:
      # Container port only; use "6379:6379" to also publish on the host
      - "6379"