Docker and deployment to fetcher server

This commit is contained in:
Luciano Gervasoni
2025-06-27 09:14:44 +02:00
parent f659d4adb3
commit 8b689729bf
12 changed files with 148 additions and 222 deletions

View File

@@ -55,3 +55,9 @@ docker compose -f docker-compose-dev.yml down -v
docker compose -f docker-compose-dev.yml build --progress=plain
docker compose -f docker-compose-dev.yml up
```
* Prod mode
```
docker compose -f docker-compose-prod.yml down -v
docker compose -f docker-compose-prod.yml build --progress=plain
docker compose -f docker-compose-prod.yml up -d
```

View File

@@ -54,6 +54,7 @@ class FetchSearcher():
for SearchInstance in ListSearchInstances:
# Sleep between requests, avoid too many requests...
time.sleep(float(os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", 5)))
# TODO: Random proxy / VPN
SearchInstance(args).fetch_articles(db_writer, obj_search)
# TODO: https://github.com/tasos-py/Search-Engines-Scraper/tree/master

View File

@@ -1,8 +1,6 @@
import time
import feedparser
import os
from django.utils import timezone
from datetime import timedelta
from ..models import Search, Source
from .fetch_utils_gnews import decode_gnews_urls
from .logger import get_logger

View File

@@ -11,7 +11,7 @@ logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(messa
logger = logging.getLogger("fetcher")
logger.setLevel(logging.DEBUG)
# To file log: INFO / WARNING / ERROR / CRITICAL
# To file log: DEBUG / INFO / WARNING / ERROR / CRITICAL
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "debug.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.DEBUG)

View File

@@ -74,7 +74,7 @@ def process_missing_kids_urls_all(batch_size=None):
logger.info("Task completed: {}".format(task))
@job('default')
def clean_old_url_content(older_than_days=60):
def clean_old_url_content(older_than_days=14):
task = "Clean old URL content"
logger.info("Task triggered: {}".format(task))
DB_Handler().clean_old_url_content(older_than_days=older_than_days)

View File

@@ -24,11 +24,12 @@
[".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50],
[".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75],
[".*foxnews\\.com\\/(video|category)\\/.*", "invalid", 75],
[".*radio.foxnews\\.com\\/.*", "invalid", 75],
[".*radio\\.foxnews\\.com\\/.*", "invalid", 75],
[".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75],
[".*zerohedge\\.com\\/(user)\\/.*", "invalid", 75],
[".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50],
[".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50],
[".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50]
[".*missingkids\\.org\\/poster\\/.*", "valid", 50]
]
}

View File

@@ -29,13 +29,15 @@ def wait_connection():
connected = True
except psycopg.OperationalError as e:
print(str(e))
# Connection not ready...
# print(".", end="")
time.sleep(2)
time.sleep(15)
except Exception as e:
print(str(e))
# Connection not ready...
# print("e", end="")
time.sleep(2)
time.sleep(15)
print("DB connection ready")
@@ -57,7 +59,8 @@ def initialize_tables():
ts_fetch TIMESTAMPTZ NOT NULL DEFAULT NOW(),
status URL_STATUS NOT NULL DEFAULT 'raw' -- ,
-- status_wendy WENDY_STATUS DEFAULT NULL,
-- ts_wendy TIMESTAMPTZ DEFAULT NULL
-- ts_wendy TIMESTAMPTZ DEFAULT NULL,
-- child_abuse BOOLEAN DEFAULT NULL,
);
CREATE INDEX idx_urls_status ON urls(status);
CREATE INDEX idx_urls_ts_fetch ON urls(ts_fetch);

View File

@@ -13,7 +13,7 @@
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-01-01T00:00:00+00:00",
"interval": 4,
"interval": 8,
"interval_unit": "hours",
"successful_runs": 0,
"failed_runs": 0,
@@ -139,7 +139,7 @@
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-01-01T00:00:00+00:00",
"interval": 2,
"interval": 4,
"interval_unit": "hours",
"successful_runs": 0,
"failed_runs": 0,

View File

@@ -19,11 +19,6 @@ services:
dns:
- 1.1.1.1
- 1.0.0.1
deploy:
resources:
limits:
cpus: '${DEPLOY_CPUS}'
memory: ${DEPLOY_RAM}
fetcher_app_urls:
image: fetcher_app_urls
@@ -70,55 +65,22 @@ services:
- PEXELS_API_KEY=${PEXELS_API_KEY}
- OLLAMA_MODEL_DEFAULT=${OLLAMA_MODEL_DEFAULT}
########################
#volumes: # Development mode
# - ./app_urls:/opt/app
########################
ports:
- 8000 # :8000
- 8000
depends_on:
- fetcher_db
- fetcher_redis
dns:
- 1.1.1.1
- 1.0.0.1
deploy:
resources:
limits:
cpus: '${DEPLOY_CPUS}'
memory: ${DEPLOY_RAM}
labels: # Reverse proxy sample
- "traefik.enable=true"
- "traefik.http.routers.fetcher.rule=Host(`${REVERSE_PROXY_URL}`)"
- "traefik.http.routers.fetcher.entrypoints=websecure"
- "traefik.http.routers.fetcher.tls.certresolver=myresolvercd"
- "traefik.http.services.fetcher.loadbalancer.server.port=8000"
networks:
- default # This network
- docker_default # Reverse proxy network
fetcher_db:
image: postgres:17
container_name: fetcher_db
restart: unless-stopped
# Set shared memory limit when using docker-compose
shm_size: 128mb
environment:
POSTGRES_DB: ${DB_NAME}
POSTGRES_PASSWORD: ${DB_PASSWORD}
POSTGRES_USER: ${DB_USER}
POSTGRES_INITDB_ARGS: '--data-checksums'
volumes: # Persistent DB?
- ${PATH_DB_DATA}/postgres:/var/lib/postgresql/data
ports:
- 5432 #:5432
fetcher_redis:
image: redis:alpine
container_name: fetcher_redis
restart: unless-stopped
ports:
- 6379 #:6379
- 6379
networks:
docker_default:
external: true
fetcher_db:
container_name: fetcher_db
restart: unless-stopped

View File

@@ -3,22 +3,9 @@ version: '3.9'
services:
fetcher_app_selenium:
image: fetcher_app_selenium
build:
context: ./app_selenium
args:
- ARCH=${ARCH} # arm64, amd64
container_name: fetcher_app_selenium
restart: unless-stopped
shm_size: 512mb
environment:
- SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE}
- PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
ports:
- 80:80
dns:
- 1.1.1.1
- 1.0.0.1
extends:
file: docker-compose-base.yml
service: fetcher_app_selenium
deploy:
resources:
limits:
@@ -26,66 +13,11 @@ services:
memory: ${DEPLOY_RAM}
fetcher_app_urls:
image: fetcher_app_urls
build:
context: ./app_urls
container_name: fetcher_app_urls
restart: unless-stopped
environment:
# Initialization
- INITIALIZE_DB=${INITIALIZE_DB} # Related to DB persistence
- DJANGO_SUPERUSER_USERNAME=${DJANGO_SUPERUSER_USERNAME}
- DJANGO_SUPERUSER_PASSWORD=${DJANGO_SUPERUSER_PASSWORD}
- DJANGO_SUPERUSER_EMAIL=${DJANGO_SUPERUSER_EMAIL}
# Django
- DJANGO_ALLOWED_HOSTS=${DJANGO_ALLOWED_HOSTS} # host1,host2
- DJANGO_ALLOWED_ORIGINS=${DJANGO_ALLOWED_ORIGINS} # Reverse proxy
- DJANGO_SECRET_KEY=${DJANGO_SECRET_KEY}
- DJANGO_DEBUG=${DJANGO_DEBUG}
- PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
# Database
- DB_NAME=${DB_NAME}
- DB_USER=${DB_USER}
- DB_PASSWORD=${DB_PASSWORD}
- DB_HOST=${DB_HOST}
- DB_PORT=${DB_PORT}
- REDIS_HOST=${REDIS_HOST}
- REDIS_PORT=${REDIS_PORT}
# Job timeout: 30 min
- JOB_DEFAULT_TIMEOUT=${JOB_DEFAULT_TIMEOUT}
# Fetcher
- FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP}
- FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP}
- FETCHER_BETWEEN_SEARCHES_SLEEP=${FETCHER_BETWEEN_SEARCHES_SLEEP} # Sleep time between each search
- FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP} # Sleep time between requests to same URL host
- FETCHER_LANGUAGE_DETECTION_MIN_CHAR=${FETCHER_LANGUAGE_DETECTION_MIN_CHAR} # Min amount of characters to run language detection
- FETCHER_INSERT_URL_CACHE_TIME=${FETCHER_INSERT_URL_CACHE_TIME} # Cache time: Insert raw URL
- FETCHER_ERROR_URL_CACHE_TIME=${FETCHER_ERROR_URL_CACHE_TIME} # Cache time: Error on processing URL
# Selenium
- SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT}
- ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA}
# Ghost
- GHOST_ADMIN_API_KEY=${GHOST_ADMIN_API_KEY}
- GHOST_ADMIN_API_URL=${GHOST_ADMIN_API_URL}
- PEXELS_API_KEY=${PEXELS_API_KEY}
- OLLAMA_MODEL_DEFAULT=${OLLAMA_MODEL_DEFAULT}
########################
volumes: # Development mode
- ./app_urls:/opt/app
########################
ports:
- 8000:8000
depends_on:
- fetcher_db
- fetcher_redis
dns:
- 1.1.1.1
- 1.0.0.1
deploy:
resources:
limits:
cpus: '${DEPLOY_CPUS}'
memory: ${DEPLOY_RAM}
extends:
file: docker-compose-base.yml
service: fetcher_app_urls
#env_files:
# - .env.dev
#labels: # Reverse proxy sample
# - "traefik.enable=true"
# - "traefik.http.routers.fetcher.rule=Host(`urls.yourdomain.com`)"
@@ -95,11 +27,21 @@ services:
#networks:
# - default # This network
# - docker_default # Reverse proxy network
ports:
- 8000:8000
volumes: # Development mode
- ./app_urls:/opt/app
deploy:
resources:
limits:
cpus: '${DEPLOY_CPUS}'
memory: ${DEPLOY_RAM}
fetcher_db:
extends:
file: docker-compose-base.yml
service: fetcher_db
image: postgres:17
container_name: fetcher_db
restart: unless-stopped
# Set shared memory limit when using docker-compose
shm_size: 128mb
environment:
@@ -107,18 +49,14 @@ services:
POSTGRES_PASSWORD: ${DB_PASSWORD}
POSTGRES_USER: ${DB_USER}
POSTGRES_INITDB_ARGS: '--data-checksums'
#volumes: # Persistent DB?
# - ${PATH_DB_DATA}/postgres:/var/lib/postgresql/data
ports:
- 5432 #:5432
#volumes: # Persistent DB?
# - ./postgres:/var/lib/postgresql/data
fetcher_redis:
image: redis:alpine
container_name: fetcher_redis
restart: unless-stopped
extends:
file: docker-compose-base.yml
service: fetcher_redis
ports:
- 6379 #:6379
#networks:
# docker_default:
# external: true
- 6379:6379

View File

@@ -3,22 +3,9 @@ version: '3.9'
services:
fetcher_app_selenium:
image: fetcher_app_selenium
build:
context: ./app_selenium
args:
- ARCH=${ARCH} # arm64, amd64
container_name: fetcher_app_selenium
restart: unless-stopped
shm_size: 512mb
environment:
- SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE}
- PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
ports:
- 80
dns:
- 1.1.1.1
- 1.0.0.1
extends:
file: docker-compose-base.yml
service: fetcher_app_selenium
deploy:
resources:
limits:
@@ -26,61 +13,11 @@ services:
memory: ${DEPLOY_RAM}
fetcher_app_urls:
image: fetcher_app_urls
build:
context: ./app_urls
container_name: fetcher_app_urls
restart: unless-stopped
environment:
# Initialization
- INITIALIZE_DB=${INITIALIZE_DB} # Related to DB persistence
- DJANGO_SUPERUSER_USERNAME=${DJANGO_SUPERUSER_USERNAME}
- DJANGO_SUPERUSER_PASSWORD=${DJANGO_SUPERUSER_PASSWORD}
- DJANGO_SUPERUSER_EMAIL=${DJANGO_SUPERUSER_EMAIL}
# Django
- DJANGO_ALLOWED_HOSTS=${DJANGO_ALLOWED_HOSTS} # host1,host2
- DJANGO_ALLOWED_ORIGINS=${DJANGO_ALLOWED_ORIGINS} # Reverse proxy
- DJANGO_SECRET_KEY=${DJANGO_SECRET_KEY}
- DJANGO_DEBUG=${DJANGO_DEBUG}
- PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
# Database
- DB_NAME=${DB_NAME}
- DB_USER=${DB_USER}
- DB_PASSWORD=${DB_PASSWORD}
- DB_HOST=${DB_HOST}
- DB_PORT=${DB_PORT}
- REDIS_HOST=${REDIS_HOST}
- REDIS_PORT=${REDIS_PORT}
# Job timeout: 30 min
- JOB_DEFAULT_TIMEOUT=${JOB_DEFAULT_TIMEOUT}
# Fetcher
- FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP}
- FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP}
- FETCHER_BETWEEN_SEARCHES_SLEEP=${FETCHER_BETWEEN_SEARCHES_SLEEP} # Sleep time between each search
- FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP} # Sleep time between requests to same URL host
- FETCHER_LANGUAGE_DETECTION_MIN_CHAR=${FETCHER_LANGUAGE_DETECTION_MIN_CHAR} # Min amount of characters to run language detection
- FETCHER_INSERT_URL_CACHE_TIME=${FETCHER_INSERT_URL_CACHE_TIME} # Cache time: Insert raw URL
- FETCHER_ERROR_URL_CACHE_TIME=${FETCHER_ERROR_URL_CACHE_TIME} # Cache time: Error on processing URL
# Selenium
- SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT}
- ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA}
# Ghost
- GHOST_ADMIN_API_KEY=${GHOST_ADMIN_API_KEY}
- GHOST_ADMIN_API_URL=${GHOST_ADMIN_API_URL}
- PEXELS_API_KEY=${PEXELS_API_KEY}
- OLLAMA_MODEL_DEFAULT=${OLLAMA_MODEL_DEFAULT}
########################
#volumes: # Development mode
# - ./app_urls:/opt/app
########################
extends:
file: docker-compose-base.yml
service: fetcher_app_urls
ports:
- 8000:8000
depends_on:
- fetcher_db
- fetcher_redis
dns:
- 1.1.1.1
- 1.0.0.1
deploy:
resources:
limits:
@@ -88,7 +25,9 @@ services:
memory: ${DEPLOY_RAM}
fetcher_db:
container_name: fetcher_db
extends:
file: docker-compose-base.yml
service: fetcher_db
image: alpine:latest
restart: unless-stopped
deploy:
@@ -98,22 +37,21 @@ services:
volumes:
# REQUIREMENT: Add fetcher's SSH public key into the DB's .ssh/authorized_keys machine
- ~/.ssh:/root/.ssh:ro
ports:
- 15885:15885
- 5432:5432
command:
- sh
- -c
- |
apk add --update openssh autossh
autossh -M 15885 -N -o 'GatewayPorts yes' -L 0.0.0.0:5432:127.0.0.1:5432 ${REMOTE_USERNAME}@${REMOTE_HOST}
### Alternative:
### autossh -M 0 -o "ServerAliveInterval 30" -o "ServerAliveCountMax 3" -o 'GatewayPorts yes' -L 15882:127.0.0.1:15882 matitos@matitos.org
### -M 15882 monitors on port X, if already being used conflict!
###autossh -M 15882 -N -o 'GatewayPorts yes' -L 15882:127.0.0.1:15882 matitos@matitos.org
###ssh -N -o "StrictHostKeyChecking no" -o "ServerAliveInterval 60" -o "ServerAliveCountMax 3" -o 'PasswordAuthentication no' -o 'GatewayPorts yes' -L 15882:127.0.0.1:15882 matitos@matitos.org
network_mode: "host"
# Monitor status on port 15885
autossh -M 15885 -N -L 0.0.0.0:5432:127.0.0.1:5432 ${REMOTE_USERNAME}@${REMOTE_HOST}
# autossh -M 15885 -N -o 'GatewayPorts yes' -L 0.0.0.0:5432:127.0.0.1:5432 ${REMOTE_USERNAME}@${REMOTE_HOST}
fetcher_redis:
image: redis:alpine
container_name: fetcher_redis
restart: unless-stopped
extends:
file: docker-compose-base.yml
service: fetcher_redis
ports:
- 6379 #:6379
- 6379:6379

79
utils/DB-Dev.ipynb Normal file
View File

@@ -0,0 +1,79 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#!pip install python-dotenv\n",
"from dotenv import load_dotenv\n",
"\n",
"# Specify the path to your .env file (optional if in the current dir)\n",
"load_dotenv(dotenv_path=\".env\", override=True)\n",
"\n",
"import os\n",
"import psycopg\n",
"from sshtunnel import SSHTunnelForwarder\n",
"\n",
"if (os.environ.get(\"SSH_TUNNEL_BASED\") == \"true\"):\n",
" print(\"SSH tunnel: True\")\n",
"else:\n",
" print(\"SSH tunnel: False\")\n",
"\n",
"connect_info = \"host={} port={} user={} password={} dbname={}\".format(os.environ.get(\"DB_HOST\"), os.environ.get(\"DB_PORT\"), os.environ.get(\"DB_USER\"), os.environ.get(\"DB_PASSWORD\"), os.environ.get(\"DB_NAME\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"if (os.environ.get(\"SSH_TUNNEL_BASED\") == \"true\"):\n",
" ssh_tunnel = SSHTunnelForwarder(\n",
" (os.environ.get(\"REMOTE_HOST\"), int(os.environ.get(\"REMOTE_SSH_PORT\"))), \n",
" ssh_username=os.environ.get(\"REMOTE_USERNAME\"), ssh_password=os.environ.get(\"REMOTE_PASSWORD\"), \n",
" remote_bind_address=('localhost', int(os.environ.get(\"REMOTE_PORT\"))), local_bind_address=('localhost', int(os.environ.get(\"DB_PORT\"))) \n",
" )\n",
" ssh_tunnel.start()\n",
"\n",
"try:\n",
" with psycopg.connect(connect_info) as conn:\n",
" if True:\n",
" for t in conn.execute(\"\"\"\n",
" SELECT * from URLS WHERE id IN (SELECT id_url FROM URLS_SOURCE_SEARCH INNER JOIN SEARCH ON URLS_SOURCE_SEARCH.id_search = SEARCH.id WHERE SEARCH.search LIKE '%child abuse%') LIMIT 5;\n",
" \"\"\").fetchall():\n",
" print(t)\n",
" \n",
"except Exception as e:\n",
" print(\"Err:\", str(e))\n",
"\n",
"if (os.environ.get(\"SSH_TUNNEL_BASED\") == \"true\"):\n",
" ssh_tunnel.stop()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "matitos_urls",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}