diff --git a/.env b/.env
index af61fa0..d7f987e 100644
--- a/.env
+++ b/.env
@@ -1,3 +1,7 @@
+# AutoSSH DB
+REMOTE_HOST=''
+REMOTE_USERNAME=''
+
 # Initialization
 INITIALIZE_DB=true
 DJANGO_SUPERUSER_USERNAME=matitos
diff --git a/.gitignore b/.gitignore
index ea65850..2986183 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+.env
 __pycache__/
 *.pyc
 **/credentials.py
diff --git a/app_urls/core/settings.py b/app_urls/core/settings.py
index dff2726..40f839a 100644
--- a/app_urls/core/settings.py
+++ b/app_urls/core/settings.py
@@ -12,6 +12,9 @@ https://docs.djangoproject.com/en/5.1/ref/settings/
 from pathlib import Path
 import os
 
+from typing import Dict
+from scheduler.types import SchedulerConfiguration, Broker, QueueConfiguration
+
 # Build paths inside the project like this: BASE_DIR / 'subdir'.
 BASE_DIR = Path(__file__).resolve().parent.parent
 
@@ -107,57 +110,27 @@ CACHES = {
     }
 }
 
-'''
-from scheduler.types import SchedulerConfiguration, QueueConfiguration, Broker
-from typing import Dict
-
-# https://django-tasks-scheduler.readthedocs.io/en/latest/configuration/
 SCHEDULER_CONFIG = SchedulerConfiguration(
-    DEFAULT_JOB_TIMEOUT = os.environ.get("JOB_DEFAULT_TIMEOUT", 60*30), # 30 minutes
+    EXECUTIONS_IN_PAGE=20,
+    SCHEDULER_INTERVAL=10,
     BROKER=Broker.REDIS,
+    CALLBACK_TIMEOUT=60, # Callback timeout in seconds (success/failure/stopped)
+    # Default values, can be overridden per task/job
+    DEFAULT_SUCCESS_TTL=10 * 60, # Time To Live (TTL) in seconds to keep successful job results
+    DEFAULT_FAILURE_TTL=365 * 24 * 60 * 60, # Time To Live (TTL) in seconds to keep job failure information
+    DEFAULT_JOB_TTL=10 * 60, # Time To Live (TTL) in seconds to keep job information
+    DEFAULT_JOB_TIMEOUT=30 * 60, # timeout (seconds) for a job
+    # General configuration values
+    DEFAULT_WORKER_TTL=10 * 60, # Time To Live (TTL) in seconds to keep worker information after last heartbeat
+    DEFAULT_MAINTENANCE_TASK_INTERVAL=10 * 60, # The interval to run maintenance tasks in seconds. 10 minutes.
+    DEFAULT_JOB_MONITORING_INTERVAL=30, # The interval to monitor jobs in seconds.
+    SCHEDULER_FALLBACK_PERIOD_SECS=120, # Period (secs) to wait before requiring to reacquire locks
 )
-
 SCHEDULER_QUEUES: Dict[str, QueueConfiguration] = {
-    'default': QueueConfiguration(
-        HOST = os.environ.get("REDIS_HOST", "localhost"),
-        PORT = os.environ.get("REDIS_PORT", 6379),
-        DB = os.environ.get("REDIS_DB", 0),
-    ),
-    'high': QueueConfiguration(
-        HOST = os.environ.get("REDIS_HOST", "localhost"),
-        PORT = os.environ.get("REDIS_PORT", 6379),
-        DB = os.environ.get("REDIS_DB", 0),
-    ),
-    'low': QueueConfiguration(
-        HOST = os.environ.get("REDIS_HOST", "localhost"),
-        PORT = os.environ.get("REDIS_PORT", 6379),
-        DB = os.environ.get("REDIS_DB", 0),
-    ),
-}
-'''
-
-SCHEDULER_QUEUES = {
-    'default': {
-        'HOST': os.environ.get("REDIS_HOST", "localhost"),
-        'PORT': os.environ.get("REDIS_PORT", 6379),
-        'DB': os.environ.get("REDIS_DB", 0),
-    },
-    'high': {
-        'HOST': os.environ.get("REDIS_HOST", "localhost"),
-        'PORT': os.environ.get("REDIS_PORT", 6379),
-        'DB': os.environ.get("REDIS_DB", 0),
-    },
-    'low': {
-        'HOST': os.environ.get("REDIS_HOST", "localhost"),
-        'PORT': os.environ.get("REDIS_PORT", 6379),
-        'DB': os.environ.get("REDIS_DB", 0),
-    }
-}
-SCHEDULER_CONFIG = {
-    'DEFAULT_TIMEOUT': os.environ.get("JOB_DEFAULT_TIMEOUT", 60*30), # 30 minutes
-    'DEFAULT_RESULT_TTL': 60*60*12, # 12 hours
-    'EXECUTIONS_IN_PAGE': 20,
-    'SCHEDULER_INTERVAL': 10, # 10 seconds
+    # 'default': QueueConfiguration(URL='redis://localhost:6379/0'),
+    'default': QueueConfiguration(URL='redis://{}:{}/{}'.format(os.environ.get("REDIS_HOST", "localhost"), os.environ.get("REDIS_PORT", 6379), os.environ.get("REDIS_DB", 0))),
+    'high': QueueConfiguration(URL='redis://{}:{}/{}'.format(os.environ.get("REDIS_HOST", "localhost"), os.environ.get("REDIS_PORT", 6379), os.environ.get("REDIS_DB", 0))),
+    'low': QueueConfiguration(URL='redis://{}:{}/{}'.format(os.environ.get("REDIS_HOST", "localhost"), os.environ.get("REDIS_PORT", 6379), os.environ.get("REDIS_DB", 0))),
 }
diff --git a/app_urls/fetcher/src/db_utils.py b/app_urls/fetcher/src/db_utils.py
index af06bc1..de8956a 100644
--- a/app_urls/fetcher/src/db_utils.py
+++ b/app_urls/fetcher/src/db_utils.py
@@ -43,7 +43,6 @@ class DB_Handler():
                 UrlsSourceSearch.objects.get_or_create(id_url=obj_url, id_source=obj_source, id_search=obj_search)
             else:
                 # Add object to insert
-                # url_object_to_insert.append(Urls(url=url))
                 urls_to_insert.append(url)
 
         ### Insert URLs & (URL_id, source_id)
diff --git a/app_urls/initialize.sh b/app_urls/initialize.sh
index 4e8e109..80d0651 100755
--- a/app_urls/initialize.sh
+++ b/app_urls/initialize.sh
@@ -9,4 +9,6 @@ else
     python manage.py createsuperuser --noinput
     python manage.py collectstatic --no-input
     python manage.py import --filename scheduled_tasks.json
+    #
+    # python manage.py inspectdb # Debugging model
 fi
diff --git a/app_urls/requirements.txt b/app_urls/requirements.txt
index d5f2786..fc0fe9b 100644
--- a/app_urls/requirements.txt
+++ b/app_urls/requirements.txt
@@ -1,5 +1,5 @@
 django==5.1
-django-tasks-scheduler==3.0.1
+django-tasks-scheduler==4.0.4
 django-redis
 psycopg[binary]
 gunicorn
diff --git a/app_urls/run.sh b/app_urls/run.sh
index f16eb3c..fe3db2f 100755
--- a/app_urls/run.sh
+++ b/app_urls/run.sh
@@ -2,7 +2,7 @@
 
 if [ "${DJANGO_DEBUG}" = true ] | [ "${DJANGO_DEBUG}" == "True" ]; then
     echo "Running in DEBUG mode"
-    gunicorn core.wsgi:application --reload --log-level debug --bind 0.0.0.0:8000 --timeout 600 & python manage.py rqworker high default low
+    gunicorn core.wsgi:application --reload --log-level debug --bind 0.0.0.0:8000 --timeout 600 & python manage.py scheduler_worker high default low
 else
-    gunicorn core.wsgi:application --bind 0.0.0.0:8000 --timeout 600 & python manage.py rqworker high default low
+    gunicorn core.wsgi:application --bind 0.0.0.0:8000 --timeout 600 & python manage.py scheduler_worker high default low
 fi
diff --git a/docker-compose-prod.yml b/docker-compose-prod.yml
new file mode 100644
index 0000000..c18d30b
--- /dev/null
+++ b/docker-compose-prod.yml
@@ -0,0 +1,119 @@
+version: '3.9'
+
+services:
+
+  fetcher_app_selenium:
+    image: fetcher_app_selenium
+    build:
+      context: ./app_selenium
+      args:
+        - ARCH=${ARCH} # arm64, amd64
+    container_name: fetcher_app_selenium
+    restart: unless-stopped
+    shm_size: 512mb
+    environment:
+      - SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE}
+      - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
+    ports:
+      - 80
+    dns:
+      - 1.1.1.1
+      - 1.0.0.1
+    deploy:
+      resources:
+        limits:
+          cpus: '${DEPLOY_CPUS}'
+          memory: ${DEPLOY_RAM}
+
+  fetcher_app_urls:
+    image: fetcher_app_urls
+    build:
+      context: ./app_urls
+    container_name: fetcher_app_urls
+    restart: unless-stopped
+    environment:
+      # Initialization
+      - INITIALIZE_DB=${INITIALIZE_DB} # Related to DB persistence
+      - DJANGO_SUPERUSER_USERNAME=${DJANGO_SUPERUSER_USERNAME}
+      - DJANGO_SUPERUSER_PASSWORD=${DJANGO_SUPERUSER_PASSWORD}
+      - DJANGO_SUPERUSER_EMAIL=${DJANGO_SUPERUSER_EMAIL}
+      # Django
+      - DJANGO_ALLOWED_HOSTS=${DJANGO_ALLOWED_HOSTS} # host1,host2
+      - DJANGO_ALLOWED_ORIGINS=${DJANGO_ALLOWED_ORIGINS} # Reverse proxy
+      - DJANGO_SECRET_KEY=${DJANGO_SECRET_KEY}
+      - DJANGO_DEBUG=${DJANGO_DEBUG}
+      - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
+      # Database
+      - DB_NAME=${DB_NAME}
+      - DB_USER=${DB_USER}
+      - DB_PASSWORD=${DB_PASSWORD}
+      - DB_HOST=${DB_HOST}
+      - DB_PORT=${DB_PORT}
+      - REDIS_HOST=${REDIS_HOST}
+      - REDIS_PORT=${REDIS_PORT}
+      # Job timeout: 30 min
+      - JOB_DEFAULT_TIMEOUT=${JOB_DEFAULT_TIMEOUT}
+      # Fetcher
+      - FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP}
+      - FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP}
+      - FETCHER_BETWEEN_SEARCHES_SLEEP=${FETCHER_BETWEEN_SEARCHES_SLEEP} # Sleep time between each search
+      - FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP} # Sleep time between requests to same URL host
+      - FETCHER_LANGUAGE_DETECTION_MIN_CHAR=${FETCHER_LANGUAGE_DETECTION_MIN_CHAR} # Min amount of characters to run language detection
+      - FETCHER_INSERT_URL_CACHE_TIME=${FETCHER_INSERT_URL_CACHE_TIME} # Cache time: Insert raw URL
+      - FETCHER_ERROR_URL_CACHE_TIME=${FETCHER_ERROR_URL_CACHE_TIME} # Cache time: Error on processing URL
+      # Selenium
+      - SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT}
+      - ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA}
+      # Ghost
+      - GHOST_ADMIN_API_KEY=${GHOST_ADMIN_API_KEY}
+      - GHOST_ADMIN_API_URL=${GHOST_ADMIN_API_URL}
+      - PEXELS_API_KEY=${PEXELS_API_KEY}
+      - OLLAMA_MODEL_DEFAULT=${OLLAMA_MODEL_DEFAULT}
+    ########################
+    #volumes: # Development mode
+    #  - ./app_urls:/opt/app
+    ########################
+    ports:
+      - 8000:8000
+    depends_on:
+      - fetcher_db
+      - fetcher_redis
+    dns:
+      - 1.1.1.1
+      - 1.0.0.1
+    deploy:
+      resources:
+        limits:
+          cpus: '${DEPLOY_CPUS}'
+          memory: ${DEPLOY_RAM}
+
+  fetcher_db:
+    container_name: fetcher_db
+    image: alpine:latest
+    restart: unless-stopped
+    deploy:
+      resources:
+        limits:
+          memory: 256M
+    volumes:
+      # REQUIREMENT: Add the fetcher's SSH public key to the DB machine's .ssh/authorized_keys
+      - ~/.ssh:/root/.ssh:ro
+    command:
+      - sh
+      - -c
+      - |
+        apk add --update openssh autossh
+        autossh -M 15885 -N -o 'GatewayPorts yes' -L 0.0.0.0:5432:127.0.0.1:5432 ${REMOTE_USERNAME}@${REMOTE_HOST}
+        ### Alternative:
+        ### autossh -M 0 -o "ServerAliveInterval 30" -o "ServerAliveCountMax 3" -o 'GatewayPorts yes' -L 15882:127.0.0.1:15882 matitos@matitos.org
+        ### -M 15882 monitors on port X, conflicts if that port is already in use!
+        ###autossh -M 15882 -N -o 'GatewayPorts yes' -L 15882:127.0.0.1:15882 matitos@matitos.org
+        ###ssh -N -o "StrictHostKeyChecking no" -o "ServerAliveInterval 60" -o "ServerAliveCountMax 3" -o 'PasswordAuthentication no' -o 'GatewayPorts yes' -L 15882:127.0.0.1:15882 matitos@matitos.org
+    network_mode: "host"
+
+  fetcher_redis:
+    image: redis:alpine
+    container_name: fetcher_redis
+    restart: unless-stopped
+    ports:
+      - 6379 #:6379
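For readability, a minimal sketch of what the scheduler settings in app_urls/core/settings.py amount to after this patch (django-tasks-scheduler 4.x style, exactly the names used in the diff above). The redis_url() helper and the dict comprehension are illustrative shorthand, not part of the patch itself:

import os
from typing import Dict
from scheduler.types import SchedulerConfiguration, Broker, QueueConfiguration

def redis_url() -> str:
    # Illustrative helper: the same redis://HOST:PORT/DB URL the patch builds inline per queue
    host = os.environ.get("REDIS_HOST", "localhost")
    port = os.environ.get("REDIS_PORT", 6379)
    db = os.environ.get("REDIS_DB", 0)
    return 'redis://{}:{}/{}'.format(host, port, db)

SCHEDULER_CONFIG = SchedulerConfiguration(
    BROKER=Broker.REDIS,
    DEFAULT_JOB_TIMEOUT=30 * 60,  # seconds; overridable per task/job
    SCHEDULER_INTERVAL=10,        # seconds
)

# All three queues point at the same Redis instance/database
SCHEDULER_QUEUES: Dict[str, QueueConfiguration] = {
    name: QueueConfiguration(URL=redis_url()) for name in ('default', 'high', 'low')
}

With 4.x, queues are configured by URL instead of the 3.x HOST/PORT/DB dicts, and the worker is started with python manage.py scheduler_worker high default low rather than rqworker, which is why run.sh changes alongside the requirements bump.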