diff --git a/README.md b/README.md index b877ee0..c8220ce 100644 --- a/README.md +++ b/README.md @@ -55,3 +55,9 @@ docker compose -f docker-compose-dev.yml down -v docker compose -f docker-compose-dev.yml build --progress=plain docker compose -f docker-compose-dev.yml up ``` +* Prod mode +``` +docker compose -f docker-compose-prod.yml down -v +docker compose -f docker-compose-prod.yml build --progress=plain +docker compose -f docker-compose-prod.yml up -d +``` \ No newline at end of file diff --git a/app_urls/fetcher/src/fetch_search.py b/app_urls/fetcher/src/fetch_search.py index 6aadb78..1e25d1f 100644 --- a/app_urls/fetcher/src/fetch_search.py +++ b/app_urls/fetcher/src/fetch_search.py @@ -54,6 +54,7 @@ class FetchSearcher(): for SearchInstance in ListSearchInstances: # Sleep between requests, avoid too many requests... time.sleep(float(os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", 5))) + # TODO: Random proxy / VPN SearchInstance(args).fetch_articles(db_writer, obj_search) # TODO: https://github.com/tasos-py/Search-Engines-Scraper/tree/master diff --git a/app_urls/fetcher/src/fetch_search_instances.py b/app_urls/fetcher/src/fetch_search_instances.py index d188976..ba87104 100644 --- a/app_urls/fetcher/src/fetch_search_instances.py +++ b/app_urls/fetcher/src/fetch_search_instances.py @@ -1,8 +1,6 @@ import time import feedparser import os -from django.utils import timezone -from datetime import timedelta from ..models import Search, Source from .fetch_utils_gnews import decode_gnews_urls from .logger import get_logger diff --git a/app_urls/fetcher/src/logger.py b/app_urls/fetcher/src/logger.py index 03aa8c3..b2f7e7b 100644 --- a/app_urls/fetcher/src/logger.py +++ b/app_urls/fetcher/src/logger.py @@ -11,7 +11,7 @@ logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(messa logger = logging.getLogger("fetcher") logger.setLevel(logging.DEBUG) -# To file log: INFO / WARNING / ERROR / CRITICAL +# To file log: DEBUG / INFO / WARNING / ERROR / 
CRITICAL fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "debug.log"), mode="a", maxBytes=10000000, backupCount=1) fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s')) fh.setLevel(logging.DEBUG) diff --git a/app_urls/fetcher/tasks.py b/app_urls/fetcher/tasks.py index 7fca626..08ae8ed 100644 --- a/app_urls/fetcher/tasks.py +++ b/app_urls/fetcher/tasks.py @@ -74,7 +74,7 @@ def process_missing_kids_urls_all(batch_size=None): logger.info("Task completed: {}".format(task)) @job('default') -def clean_old_url_content(older_than_days=60): +def clean_old_url_content(older_than_days=14): task = "Clean old URL content" logger.info("Task triggered: {}".format(task)) DB_Handler().clean_old_url_content(older_than_days=older_than_days) diff --git a/app_urls/init_data.json b/app_urls/init_data.json index 0ed7168..8a766e3 100644 --- a/app_urls/init_data.json +++ b/app_urls/init_data.json @@ -24,11 +24,12 @@ [".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50], [".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75], [".*foxnews\\.com\\/(video|category)\\/.*", "invalid", 75], - [".*radio.foxnews\\.com\\/.*", "invalid", 75], + [".*radio\\.foxnews\\.com\\/.*", "invalid", 75], [".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75], [".*zerohedge\\.com\\/(user)\\/.*", "invalid", 75], [".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50], [".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50], - [".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50] + [".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50], + [".*missingkids\\.org\\/poster\\/.*", "valid", 50] ] } diff --git a/app_urls/init_db.py b/app_urls/init_db.py index 33ddaeb..920042a 100644 --- a/app_urls/init_db.py +++ b/app_urls/init_db.py @@ -29,13 +29,15 @@ def wait_connection(): connected = True except psycopg.OperationalError as e: + print(str(e)) # Connection not ready... 
# print(".", end="") - time.sleep(2) + time.sleep(15) except Exception as e: + print(str(e)) # Connection not ready... # print("e", end="") - time.sleep(2) + time.sleep(15) print("DB connection ready") @@ -57,7 +59,8 @@ def initialize_tables(): ts_fetch TIMESTAMPTZ NOT NULL DEFAULT NOW(), status URL_STATUS NOT NULL DEFAULT 'raw' -- , -- status_wendy WENDY_STATUS DEFAULT NULL, - -- ts_wendy TIMESTAMPTZ DEFAULT NULL + -- ts_wendy TIMESTAMPTZ DEFAULT NULL, + -- child_abuse BOOLEAN DEFAULT NULL, ); CREATE INDEX idx_urls_status ON urls(status); CREATE INDEX idx_urls_ts_fetch ON urls(ts_fetch); diff --git a/app_urls/scheduled_tasks.json b/app_urls/scheduled_tasks.json index 23849e4..a6ed718 100644 --- a/app_urls/scheduled_tasks.json +++ b/app_urls/scheduled_tasks.json @@ -13,7 +13,7 @@ "result_ttl": 86400, "cron_string": null, "scheduled_time": "2025-01-01T00:00:00+00:00", - "interval": 4, + "interval": 8, "interval_unit": "hours", "successful_runs": 0, "failed_runs": 0, @@ -139,7 +139,7 @@ "result_ttl": 86400, "cron_string": null, "scheduled_time": "2025-01-01T00:00:00+00:00", - "interval": 2, + "interval": 4, "interval_unit": "hours", "successful_runs": 0, "failed_runs": 0, diff --git a/docker-compose.yml b/docker-compose-base.yml similarity index 70% rename from docker-compose.yml rename to docker-compose-base.yml index 57e3d3b..8d9c73a 100644 --- a/docker-compose.yml +++ b/docker-compose-base.yml @@ -19,11 +19,6 @@ services: dns: - 1.1.1.1 - 1.0.0.1 - deploy: - resources: - limits: - cpus: '${DEPLOY_CPUS}' - memory: ${DEPLOY_RAM} fetcher_app_urls: image: fetcher_app_urls @@ -70,55 +65,22 @@ services: - PEXELS_API_KEY=${PEXELS_API_KEY} - OLLAMA_MODEL_DEFAULT=${OLLAMA_MODEL_DEFAULT} ######################## - #volumes: # Development mode - # - ./app_urls:/opt/app - ######################## ports: - - 8000 # :8000 + - 8000 depends_on: - fetcher_db - fetcher_redis dns: - 1.1.1.1 - 1.0.0.1 - deploy: - resources: - limits: - cpus: '${DEPLOY_CPUS}' - memory: ${DEPLOY_RAM} - 
labels: # Reverse proxy sample - - "traefik.enable=true" - - "traefik.http.routers.fetcher.rule=Host(`${REVERSE_PROXY_URL}`)" - - "traefik.http.routers.fetcher.entrypoints=websecure" - - "traefik.http.routers.fetcher.tls.certresolver=myresolvercd" - - "traefik.http.services.fetcher.loadbalancer.server.port=8000" - networks: - - default # This network - - docker_default # Reverse proxy network - - fetcher_db: - image: postgres:17 - container_name: fetcher_db - restart: unless-stopped - # Set shared memory limit when using docker-compose - shm_size: 128mb - environment: - POSTGRES_DB: ${DB_NAME} - POSTGRES_PASSWORD: ${DB_PASSWORD} - POSTGRES_USER: ${DB_USER} - POSTGRES_INITDB_ARGS: '--data-checksums' - volumes: # Persistent DB? - - ${PATH_DB_DATA}/postgres:/var/lib/postgresql/data - ports: - - 5432 #:5432 fetcher_redis: image: redis:alpine container_name: fetcher_redis restart: unless-stopped ports: - - 6379 #:6379 + - 6379 -networks: - docker_default: - external: true + fetcher_db: + container_name: fetcher_db + restart: unless-stopped diff --git a/docker-compose-dev.yml b/docker-compose-dev.yml index ecdc4bc..def8ac3 100644 --- a/docker-compose-dev.yml +++ b/docker-compose-dev.yml @@ -3,22 +3,9 @@ version: '3.9' services: fetcher_app_selenium: - image: fetcher_app_selenium - build: - context: ./app_selenium - args: - - ARCH=${ARCH} # arm64, amd64 - container_name: fetcher_app_selenium - restart: unless-stopped - shm_size: 512mb - environment: - - SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE} - - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY} - ports: - - 80:80 - dns: - - 1.1.1.1 - - 1.0.0.1 + extends: + file: docker-compose-base.yml + service: fetcher_app_selenium deploy: resources: limits: @@ -26,66 +13,11 @@ services: memory: ${DEPLOY_RAM} fetcher_app_urls: - image: fetcher_app_urls - build: - context: ./app_urls - container_name: fetcher_app_urls - restart: unless-stopped - environment: - # Initialization - - INITIALIZE_DB=${INITIALIZE_DB} # Related to DB 
persistence - - DJANGO_SUPERUSER_USERNAME=${DJANGO_SUPERUSER_USERNAME} - - DJANGO_SUPERUSER_PASSWORD=${DJANGO_SUPERUSER_PASSWORD} - - DJANGO_SUPERUSER_EMAIL=${DJANGO_SUPERUSER_EMAIL} - # Django - - DJANGO_ALLOWED_HOSTS=${DJANGO_ALLOWED_HOSTS} # host1,host2 - - DJANGO_ALLOWED_ORIGINS=${DJANGO_ALLOWED_ORIGINS} # Reverse proxy - - DJANGO_SECRET_KEY=${DJANGO_SECRET_KEY} - - DJANGO_DEBUG=${DJANGO_DEBUG} - - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY} - # Database - - DB_NAME=${DB_NAME} - - DB_USER=${DB_USER} - - DB_PASSWORD=${DB_PASSWORD} - - DB_HOST=${DB_HOST} - - DB_PORT=${DB_PORT} - - REDIS_HOST=${REDIS_HOST} - - REDIS_PORT=${REDIS_PORT} - # Job timeout: 30 min - - JOB_DEFAULT_TIMEOUT=${JOB_DEFAULT_TIMEOUT} - # Fetcher - - FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP} - - FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP} - - FETCHER_BETWEEN_SEARCHES_SLEEP=${FETCHER_BETWEEN_SEARCHES_SLEEP} # Sleep time between each search - - FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP} # Sleep time between requests to same URL host - - FETCHER_LANGUAGE_DETECTION_MIN_CHAR=${FETCHER_LANGUAGE_DETECTION_MIN_CHAR} # Min amonut of characters to run language detection - - FETCHER_INSERT_URL_CACHE_TIME=${FETCHER_INSERT_URL_CACHE_TIME} # Cache time: Insert raw URL - - FETCHER_ERROR_URL_CACHE_TIME=${FETCHER_ERROR_URL_CACHE_TIME} # Cache time: Error on processing URL - # Selenium - - SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT} - - ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA} - # Ghost - - GHOST_ADMIN_API_KEY=${GHOST_ADMIN_API_KEY} - - GHOST_ADMIN_API_URL=${GHOST_ADMIN_API_URL} - - PEXELS_API_KEY=${PEXELS_API_KEY} - - OLLAMA_MODEL_DEFAULT=${OLLAMA_MODEL_DEFAULT} - ######################## - volumes: # Development mode - - ./app_urls:/opt/app - ######################## - ports: - - 8000:8000 - depends_on: - - fetcher_db - - fetcher_redis - dns: - - 1.1.1.1 - - 1.0.0.1 - deploy: - resources: - limits: - cpus: '${DEPLOY_CPUS}' - memory: ${DEPLOY_RAM} + extends: + file: 
docker-compose-base.yml + service: fetcher_app_urls + #env_files: + # - .env.dev #labels: # Reverse proxy sample # - "traefik.enable=true" # - "traefik.http.routers.fetcher.rule=Host(`urls.yourdomain.com`)" @@ -95,11 +27,21 @@ services: #networks: # - default # This network # - docker_default # Reverse proxy network + ports: + - 8000:8000 + volumes: # Development mode + - ./app_urls:/opt/app + deploy: + resources: + limits: + cpus: '${DEPLOY_CPUS}' + memory: ${DEPLOY_RAM} fetcher_db: + extends: + file: docker-compose-base.yml + service: fetcher_db image: postgres:17 - container_name: fetcher_db - restart: unless-stopped # Set shared memory limit when using docker-compose shm_size: 128mb environment: @@ -107,18 +49,14 @@ services: POSTGRES_PASSWORD: ${DB_PASSWORD} POSTGRES_USER: ${DB_USER} POSTGRES_INITDB_ARGS: '--data-checksums' - #volumes: # Persistent DB? - # - ${PATH_DB_DATA}/postgres:/var/lib/postgresql/data ports: - 5432 #:5432 + #volumes: # Persistent DB? + # - ./postgres:/var/lib/postgresql/data fetcher_redis: - image: redis:alpine - container_name: fetcher_redis - restart: unless-stopped + extends: + file: docker-compose-base.yml + service: fetcher_redis ports: - - 6379 #:6379 - -#networks: -# docker_default: -# external: true + - 6379:6379 diff --git a/docker-compose-prod.yml b/docker-compose-prod.yml index c18d30b..e61d4fa 100644 --- a/docker-compose-prod.yml +++ b/docker-compose-prod.yml @@ -3,22 +3,9 @@ version: '3.9' services: fetcher_app_selenium: - image: fetcher_app_selenium - build: - context: ./app_selenium - args: - - ARCH=${ARCH} # arm64, amd64 - container_name: fetcher_app_selenium - restart: unless-stopped - shm_size: 512mb - environment: - - SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE} - - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY} - ports: - - 80 - dns: - - 1.1.1.1 - - 1.0.0.1 + extends: + file: docker-compose-base.yml + service: fetcher_app_selenium deploy: resources: limits: @@ -26,61 +13,11 @@ services: memory: ${DEPLOY_RAM} 
fetcher_app_urls: - image: fetcher_app_urls - build: - context: ./app_urls - container_name: fetcher_app_urls - restart: unless-stopped - environment: - # Initialization - - INITIALIZE_DB=${INITIALIZE_DB} # Related to DB persistence - - DJANGO_SUPERUSER_USERNAME=${DJANGO_SUPERUSER_USERNAME} - - DJANGO_SUPERUSER_PASSWORD=${DJANGO_SUPERUSER_PASSWORD} - - DJANGO_SUPERUSER_EMAIL=${DJANGO_SUPERUSER_EMAIL} - # Django - - DJANGO_ALLOWED_HOSTS=${DJANGO_ALLOWED_HOSTS} # host1,host2 - - DJANGO_ALLOWED_ORIGINS=${DJANGO_ALLOWED_ORIGINS} # Reverse proxy - - DJANGO_SECRET_KEY=${DJANGO_SECRET_KEY} - - DJANGO_DEBUG=${DJANGO_DEBUG} - - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY} - # Database - - DB_NAME=${DB_NAME} - - DB_USER=${DB_USER} - - DB_PASSWORD=${DB_PASSWORD} - - DB_HOST=${DB_HOST} - - DB_PORT=${DB_PORT} - - REDIS_HOST=${REDIS_HOST} - - REDIS_PORT=${REDIS_PORT} - # Job timeout: 30 min - - JOB_DEFAULT_TIMEOUT=${JOB_DEFAULT_TIMEOUT} - # Fetcher - - FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP} - - FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP} - - FETCHER_BETWEEN_SEARCHES_SLEEP=${FETCHER_BETWEEN_SEARCHES_SLEEP} # Sleep time between each search - - FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP} # Sleep time between requests to same URL host - - FETCHER_LANGUAGE_DETECTION_MIN_CHAR=${FETCHER_LANGUAGE_DETECTION_MIN_CHAR} # Min amonut of characters to run language detection - - FETCHER_INSERT_URL_CACHE_TIME=${FETCHER_INSERT_URL_CACHE_TIME} # Cache time: Insert raw URL - - FETCHER_ERROR_URL_CACHE_TIME=${FETCHER_ERROR_URL_CACHE_TIME} # Cache time: Error on processing URL - # Selenium - - SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT} - - ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA} - # Ghost - - GHOST_ADMIN_API_KEY=${GHOST_ADMIN_API_KEY} - - GHOST_ADMIN_API_URL=${GHOST_ADMIN_API_URL} - - PEXELS_API_KEY=${PEXELS_API_KEY} - - OLLAMA_MODEL_DEFAULT=${OLLAMA_MODEL_DEFAULT} - ######################## - #volumes: # Development mode - # - ./app_urls:/opt/app - 
######################## + extends: + file: docker-compose-base.yml + service: fetcher_app_urls ports: - 8000:8000 - depends_on: - - fetcher_db - - fetcher_redis - dns: - - 1.1.1.1 - - 1.0.0.1 deploy: resources: limits: @@ -88,7 +25,9 @@ services: memory: ${DEPLOY_RAM} fetcher_db: - container_name: fetcher_db + extends: + file: docker-compose-base.yml + service: fetcher_db image: alpine:latest restart: unless-stopped deploy: @@ -98,22 +37,21 @@ services: volumes: # REQUIREMENT: Add fetcher's SSH public key into the DB's .ssh/authorized_keys machine - ~/.ssh:/root/.ssh:ro + ports: + - 15885:15885 + - 5432:5432 command: - sh - -c - | apk add --update openssh autossh - autossh -M 15885 -N -o 'GatewayPorts yes' -L 0.0.0.0:5432:127.0.0.1:5432 ${REMOTE_USERNAME}@${REMOTE_HOST} - ### Alternative: - ### autossh -M 0 -o "ServerAliveInterval 30" -o "ServerAliveCountMax 3" -o 'GatewayPorts yes' -L 15882:127.0.0.1:15882 matitos@matitos.org - ### -M 15882 monitors on port X, if already being used conflict! 
- ###autossh -M 15882 -N -o 'GatewayPorts yes' -L 15882:127.0.0.1:15882 matitos@matitos.org - ###ssh -N -o "StrictHostKeyChecking no" -o "ServerAliveInterval 60" -o "ServerAliveCountMax 3" -o 'PasswordAuthentication no' -o 'GatewayPorts yes' -L 15882:127.0.0.1:15882 matitos@matitos.org - network_mode: "host" + # Monitor status on port 15885 + autossh -M 15885 -N -L 0.0.0.0:5432:127.0.0.1:5432 ${REMOTE_USERNAME}@${REMOTE_HOST} + # autossh -M 15885 -N -o 'GatewayPorts yes' -L 0.0.0.0:5432:127.0.0.1:5432 ${REMOTE_USERNAME}@${REMOTE_HOST} fetcher_redis: - image: redis:alpine - container_name: fetcher_redis - restart: unless-stopped + extends: + file: docker-compose-base.yml + service: fetcher_redis ports: - - 6379 #:6379 + - 6379:6379 \ No newline at end of file diff --git a/utils/DB-Dev.ipynb b/utils/DB-Dev.ipynb new file mode 100644 index 0000000..1e0cc13 --- /dev/null +++ b/utils/DB-Dev.ipynb @@ -0,0 +1,79 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install python-dotenv\n", + "from dotenv import load_dotenv\n", + "\n", + "# Specify the path to your .env file (optional if in the current dir)\n", + "load_dotenv(dotenv_path=\".env\", override=True)\n", + "\n", + "import os\n", + "import psycopg\n", + "from sshtunnel import SSHTunnelForwarder\n", + "\n", + "if (os.environ.get(\"SSH_TUNNEL_BASED\") == \"true\"):\n", + " print(\"SSH tunnel: True\")\n", + "else:\n", + " print(\"SSH tunnel: False\")\n", + "\n", + "connect_info = \"host={} port={} user={} password={} dbname={}\".format(os.environ.get(\"DB_HOST\"), os.environ.get(\"DB_PORT\"), os.environ.get(\"DB_USER\"), os.environ.get(\"DB_PASSWORD\"), os.environ.get(\"DB_NAME\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "if (os.environ.get(\"SSH_TUNNEL_BASED\") == \"true\"):\n", + " ssh_tunnel = SSHTunnelForwarder(\n", + " (os.environ.get(\"REMOTE_HOST\"), 
int(os.environ.get(\"REMOTE_SSH_PORT\"))), \n", + " ssh_username=os.environ.get(\"REMOTE_USERNAME\"), ssh_password=os.environ.get(\"REMOTE_PASSWORD\"), \n", + " remote_bind_address=('localhost', int(os.environ.get(\"REMOTE_PORT\"))), local_bind_address=('localhost', int(os.environ.get(\"DB_PORT\"))) \n", + " )\n", + " ssh_tunnel.start()\n", + "\n", + "try:\n", + " with psycopg.connect(connect_info) as conn:\n", + " if True:\n", + " for t in conn.execute(\"\"\"\n", + " SELECT * from URLS WHERE id IN (SELECT id_url FROM URLS_SOURCE_SEARCH INNER JOIN SEARCH ON URLS_SOURCE_SEARCH.id_search = SEARCH.id WHERE SEARCH.search LIKE '%child abuse%') LIMIT 5;\n", + " \"\"\").fetchall():\n", + " print(t)\n", + " \n", + "except Exception as e:\n", + " print(\"Err:\", str(e))\n", + "\n", + "if (os.environ.get(\"SSH_TUNNEL_BASED\") == \"true\"):\n", + " ssh_tunnel.stop()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "matitos_urls", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}