Docker and deployment to fetcher server
This commit is contained in:
@@ -55,3 +55,9 @@ docker compose -f docker-compose-dev.yml down -v
|
|||||||
docker compose -f docker-compose-dev.yml build --progress=plain
|
docker compose -f docker-compose-dev.yml build --progress=plain
|
||||||
docker compose -f docker-compose-dev.yml up
|
docker compose -f docker-compose-dev.yml up
|
||||||
```
|
```
|
||||||
|
* Prod mode
|
||||||
|
```
|
||||||
|
docker compose -f docker-compose-prod.yml down -v
|
||||||
|
docker compose -f docker-compose-prod.yml build --progress=plain
|
||||||
|
docker compose -f docker-compose-prod.yml up -d
|
||||||
|
```
|
||||||
@@ -54,6 +54,7 @@ class FetchSearcher():
|
|||||||
for SearchInstance in ListSearchInstances:
|
for SearchInstance in ListSearchInstances:
|
||||||
# Sleep between requests, avoid too many requests...
|
# Sleep between requests, avoid too many requests...
|
||||||
time.sleep(float(os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", 5)))
|
time.sleep(float(os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", 5)))
|
||||||
|
# TODO: Random proxy / VPN
|
||||||
SearchInstance(args).fetch_articles(db_writer, obj_search)
|
SearchInstance(args).fetch_articles(db_writer, obj_search)
|
||||||
|
|
||||||
# TODO: https://github.com/tasos-py/Search-Engines-Scraper/tree/master
|
# TODO: https://github.com/tasos-py/Search-Engines-Scraper/tree/master
|
||||||
|
|||||||
@@ -1,8 +1,6 @@
|
|||||||
import time
|
import time
|
||||||
import feedparser
|
import feedparser
|
||||||
import os
|
import os
|
||||||
from django.utils import timezone
|
|
||||||
from datetime import timedelta
|
|
||||||
from ..models import Search, Source
|
from ..models import Search, Source
|
||||||
from .fetch_utils_gnews import decode_gnews_urls
|
from .fetch_utils_gnews import decode_gnews_urls
|
||||||
from .logger import get_logger
|
from .logger import get_logger
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(messa
|
|||||||
logger = logging.getLogger("fetcher")
|
logger = logging.getLogger("fetcher")
|
||||||
logger.setLevel(logging.DEBUG)
|
logger.setLevel(logging.DEBUG)
|
||||||
|
|
||||||
# To file log: INFO / WARNING / ERROR / CRITICAL
|
# To file log: DEBUG / INFO / WARNING / ERROR / CRITICAL
|
||||||
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "debug.log"), mode="a", maxBytes=10000000, backupCount=1)
|
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "debug.log"), mode="a", maxBytes=10000000, backupCount=1)
|
||||||
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
|
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
|
||||||
fh.setLevel(logging.DEBUG)
|
fh.setLevel(logging.DEBUG)
|
||||||
|
|||||||
@@ -74,7 +74,7 @@ def process_missing_kids_urls_all(batch_size=None):
|
|||||||
logger.info("Task completed: {}".format(task))
|
logger.info("Task completed: {}".format(task))
|
||||||
|
|
||||||
@job('default')
|
@job('default')
|
||||||
def clean_old_url_content(older_than_days=60):
|
def clean_old_url_content(older_than_days=14):
|
||||||
task = "Clean old URL content"
|
task = "Clean old URL content"
|
||||||
logger.info("Task triggered: {}".format(task))
|
logger.info("Task triggered: {}".format(task))
|
||||||
DB_Handler().clean_old_url_content(older_than_days=older_than_days)
|
DB_Handler().clean_old_url_content(older_than_days=older_than_days)
|
||||||
|
|||||||
@@ -24,11 +24,12 @@
|
|||||||
[".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50],
|
[".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50],
|
||||||
[".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75],
|
[".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75],
|
||||||
[".*foxnews\\.com\\/(video|category)\\/.*", "invalid", 75],
|
[".*foxnews\\.com\\/(video|category)\\/.*", "invalid", 75],
|
||||||
[".*radio.foxnews\\.com\\/.*", "invalid", 75],
|
[".*radio\\.foxnews\\.com\\/.*", "invalid", 75],
|
||||||
[".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75],
|
[".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75],
|
||||||
[".*zerohedge\\.com\\/(user)\\/.*", "invalid", 75],
|
[".*zerohedge\\.com\\/(user)\\/.*", "invalid", 75],
|
||||||
[".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50],
|
[".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50],
|
||||||
[".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50],
|
[".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50],
|
||||||
[".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50]
|
[".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50]
|
||||||
|
[".*missingkids\\.org\\/poster\\/.*", "valid", 50]
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -29,13 +29,15 @@ def wait_connection():
|
|||||||
connected = True
|
connected = True
|
||||||
|
|
||||||
except psycopg.OperationalError as e:
|
except psycopg.OperationalError as e:
|
||||||
|
print(str(e))
|
||||||
# Connection not ready...
|
# Connection not ready...
|
||||||
# print(".", end="")
|
# print(".", end="")
|
||||||
time.sleep(2)
|
time.sleep(15)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
print(str(e))
|
||||||
# Connection not ready...
|
# Connection not ready...
|
||||||
# print("e", end="")
|
# print("e", end="")
|
||||||
time.sleep(2)
|
time.sleep(15)
|
||||||
|
|
||||||
print("DB connection ready")
|
print("DB connection ready")
|
||||||
|
|
||||||
@@ -57,7 +59,8 @@ def initialize_tables():
|
|||||||
ts_fetch TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
ts_fetch TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
status URL_STATUS NOT NULL DEFAULT 'raw' -- ,
|
status URL_STATUS NOT NULL DEFAULT 'raw' -- ,
|
||||||
-- status_wendy WENDY_STATUS DEFAULT NULL,
|
-- status_wendy WENDY_STATUS DEFAULT NULL,
|
||||||
-- ts_wendy TIMESTAMPTZ DEFAULT NULL
|
-- ts_wendy TIMESTAMPTZ DEFAULT NULL,
|
||||||
|
-- child_abuse BOOLEAN DEFAULT NULL,
|
||||||
);
|
);
|
||||||
CREATE INDEX idx_urls_status ON urls(status);
|
CREATE INDEX idx_urls_status ON urls(status);
|
||||||
CREATE INDEX idx_urls_ts_fetch ON urls(ts_fetch);
|
CREATE INDEX idx_urls_ts_fetch ON urls(ts_fetch);
|
||||||
|
|||||||
@@ -13,7 +13,7 @@
|
|||||||
"result_ttl": 86400,
|
"result_ttl": 86400,
|
||||||
"cron_string": null,
|
"cron_string": null,
|
||||||
"scheduled_time": "2025-01-01T00:00:00+00:00",
|
"scheduled_time": "2025-01-01T00:00:00+00:00",
|
||||||
"interval": 4,
|
"interval": 8,
|
||||||
"interval_unit": "hours",
|
"interval_unit": "hours",
|
||||||
"successful_runs": 0,
|
"successful_runs": 0,
|
||||||
"failed_runs": 0,
|
"failed_runs": 0,
|
||||||
@@ -139,7 +139,7 @@
|
|||||||
"result_ttl": 86400,
|
"result_ttl": 86400,
|
||||||
"cron_string": null,
|
"cron_string": null,
|
||||||
"scheduled_time": "2025-01-01T00:00:00+00:00",
|
"scheduled_time": "2025-01-01T00:00:00+00:00",
|
||||||
"interval": 2,
|
"interval": 4,
|
||||||
"interval_unit": "hours",
|
"interval_unit": "hours",
|
||||||
"successful_runs": 0,
|
"successful_runs": 0,
|
||||||
"failed_runs": 0,
|
"failed_runs": 0,
|
||||||
|
|||||||
@@ -19,11 +19,6 @@ services:
|
|||||||
dns:
|
dns:
|
||||||
- 1.1.1.1
|
- 1.1.1.1
|
||||||
- 1.0.0.1
|
- 1.0.0.1
|
||||||
deploy:
|
|
||||||
resources:
|
|
||||||
limits:
|
|
||||||
cpus: '${DEPLOY_CPUS}'
|
|
||||||
memory: ${DEPLOY_RAM}
|
|
||||||
|
|
||||||
fetcher_app_urls:
|
fetcher_app_urls:
|
||||||
image: fetcher_app_urls
|
image: fetcher_app_urls
|
||||||
@@ -70,55 +65,22 @@ services:
|
|||||||
- PEXELS_API_KEY=${PEXELS_API_KEY}
|
- PEXELS_API_KEY=${PEXELS_API_KEY}
|
||||||
- OLLAMA_MODEL_DEFAULT=${OLLAMA_MODEL_DEFAULT}
|
- OLLAMA_MODEL_DEFAULT=${OLLAMA_MODEL_DEFAULT}
|
||||||
########################
|
########################
|
||||||
#volumes: # Development mode
|
|
||||||
# - ./app_urls:/opt/app
|
|
||||||
########################
|
|
||||||
ports:
|
ports:
|
||||||
- 8000 # :8000
|
- 8000
|
||||||
depends_on:
|
depends_on:
|
||||||
- fetcher_db
|
- fetcher_db
|
||||||
- fetcher_redis
|
- fetcher_redis
|
||||||
dns:
|
dns:
|
||||||
- 1.1.1.1
|
- 1.1.1.1
|
||||||
- 1.0.0.1
|
- 1.0.0.1
|
||||||
deploy:
|
|
||||||
resources:
|
|
||||||
limits:
|
|
||||||
cpus: '${DEPLOY_CPUS}'
|
|
||||||
memory: ${DEPLOY_RAM}
|
|
||||||
labels: # Reverse proxy sample
|
|
||||||
- "traefik.enable=true"
|
|
||||||
- "traefik.http.routers.fetcher.rule=Host(`${REVERSE_PROXY_URL}`)"
|
|
||||||
- "traefik.http.routers.fetcher.entrypoints=websecure"
|
|
||||||
- "traefik.http.routers.fetcher.tls.certresolver=myresolvercd"
|
|
||||||
- "traefik.http.services.fetcher.loadbalancer.server.port=8000"
|
|
||||||
networks:
|
|
||||||
- default # This network
|
|
||||||
- docker_default # Reverse proxy network
|
|
||||||
|
|
||||||
fetcher_db:
|
|
||||||
image: postgres:17
|
|
||||||
container_name: fetcher_db
|
|
||||||
restart: unless-stopped
|
|
||||||
# Set shared memory limit when using docker-compose
|
|
||||||
shm_size: 128mb
|
|
||||||
environment:
|
|
||||||
POSTGRES_DB: ${DB_NAME}
|
|
||||||
POSTGRES_PASSWORD: ${DB_PASSWORD}
|
|
||||||
POSTGRES_USER: ${DB_USER}
|
|
||||||
POSTGRES_INITDB_ARGS: '--data-checksums'
|
|
||||||
volumes: # Persistent DB?
|
|
||||||
- ${PATH_DB_DATA}/postgres:/var/lib/postgresql/data
|
|
||||||
ports:
|
|
||||||
- 5432 #:5432
|
|
||||||
|
|
||||||
fetcher_redis:
|
fetcher_redis:
|
||||||
image: redis:alpine
|
image: redis:alpine
|
||||||
container_name: fetcher_redis
|
container_name: fetcher_redis
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
ports:
|
ports:
|
||||||
- 6379 #:6379
|
- 6379
|
||||||
|
|
||||||
networks:
|
fetcher_db:
|
||||||
docker_default:
|
container_name: fetcher_db
|
||||||
external: true
|
restart: unless-stopped
|
||||||
@@ -3,22 +3,9 @@ version: '3.9'
|
|||||||
services:
|
services:
|
||||||
|
|
||||||
fetcher_app_selenium:
|
fetcher_app_selenium:
|
||||||
image: fetcher_app_selenium
|
extends:
|
||||||
build:
|
file: docker-compose-base.yml
|
||||||
context: ./app_selenium
|
service: fetcher_app_selenium
|
||||||
args:
|
|
||||||
- ARCH=${ARCH} # arm64, amd64
|
|
||||||
container_name: fetcher_app_selenium
|
|
||||||
restart: unless-stopped
|
|
||||||
shm_size: 512mb
|
|
||||||
environment:
|
|
||||||
- SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE}
|
|
||||||
- PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
|
|
||||||
ports:
|
|
||||||
- 80:80
|
|
||||||
dns:
|
|
||||||
- 1.1.1.1
|
|
||||||
- 1.0.0.1
|
|
||||||
deploy:
|
deploy:
|
||||||
resources:
|
resources:
|
||||||
limits:
|
limits:
|
||||||
@@ -26,66 +13,11 @@ services:
|
|||||||
memory: ${DEPLOY_RAM}
|
memory: ${DEPLOY_RAM}
|
||||||
|
|
||||||
fetcher_app_urls:
|
fetcher_app_urls:
|
||||||
image: fetcher_app_urls
|
extends:
|
||||||
build:
|
file: docker-compose-base.yml
|
||||||
context: ./app_urls
|
service: fetcher_app_urls
|
||||||
container_name: fetcher_app_urls
|
#env_files:
|
||||||
restart: unless-stopped
|
# - .env.dev
|
||||||
environment:
|
|
||||||
# Initialization
|
|
||||||
- INITIALIZE_DB=${INITIALIZE_DB} # Related to DB persistence
|
|
||||||
- DJANGO_SUPERUSER_USERNAME=${DJANGO_SUPERUSER_USERNAME}
|
|
||||||
- DJANGO_SUPERUSER_PASSWORD=${DJANGO_SUPERUSER_PASSWORD}
|
|
||||||
- DJANGO_SUPERUSER_EMAIL=${DJANGO_SUPERUSER_EMAIL}
|
|
||||||
# Django
|
|
||||||
- DJANGO_ALLOWED_HOSTS=${DJANGO_ALLOWED_HOSTS} # host1,host2
|
|
||||||
- DJANGO_ALLOWED_ORIGINS=${DJANGO_ALLOWED_ORIGINS} # Reverse proxy
|
|
||||||
- DJANGO_SECRET_KEY=${DJANGO_SECRET_KEY}
|
|
||||||
- DJANGO_DEBUG=${DJANGO_DEBUG}
|
|
||||||
- PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
|
|
||||||
# Database
|
|
||||||
- DB_NAME=${DB_NAME}
|
|
||||||
- DB_USER=${DB_USER}
|
|
||||||
- DB_PASSWORD=${DB_PASSWORD}
|
|
||||||
- DB_HOST=${DB_HOST}
|
|
||||||
- DB_PORT=${DB_PORT}
|
|
||||||
- REDIS_HOST=${REDIS_HOST}
|
|
||||||
- REDIS_PORT=${REDIS_PORT}
|
|
||||||
# Job timeout: 30 min
|
|
||||||
- JOB_DEFAULT_TIMEOUT=${JOB_DEFAULT_TIMEOUT}
|
|
||||||
# Fetcher
|
|
||||||
- FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP}
|
|
||||||
- FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP}
|
|
||||||
- FETCHER_BETWEEN_SEARCHES_SLEEP=${FETCHER_BETWEEN_SEARCHES_SLEEP} # Sleep time between each search
|
|
||||||
- FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP} # Sleep time between requests to same URL host
|
|
||||||
- FETCHER_LANGUAGE_DETECTION_MIN_CHAR=${FETCHER_LANGUAGE_DETECTION_MIN_CHAR} # Min amount of characters to run language detection
|
|
||||||
- FETCHER_INSERT_URL_CACHE_TIME=${FETCHER_INSERT_URL_CACHE_TIME} # Cache time: Insert raw URL
|
|
||||||
- FETCHER_ERROR_URL_CACHE_TIME=${FETCHER_ERROR_URL_CACHE_TIME} # Cache time: Error on processing URL
|
|
||||||
# Selenium
|
|
||||||
- SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT}
|
|
||||||
- ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA}
|
|
||||||
# Ghost
|
|
||||||
- GHOST_ADMIN_API_KEY=${GHOST_ADMIN_API_KEY}
|
|
||||||
- GHOST_ADMIN_API_URL=${GHOST_ADMIN_API_URL}
|
|
||||||
- PEXELS_API_KEY=${PEXELS_API_KEY}
|
|
||||||
- OLLAMA_MODEL_DEFAULT=${OLLAMA_MODEL_DEFAULT}
|
|
||||||
########################
|
|
||||||
volumes: # Development mode
|
|
||||||
- ./app_urls:/opt/app
|
|
||||||
########################
|
|
||||||
ports:
|
|
||||||
- 8000:8000
|
|
||||||
depends_on:
|
|
||||||
- fetcher_db
|
|
||||||
- fetcher_redis
|
|
||||||
dns:
|
|
||||||
- 1.1.1.1
|
|
||||||
- 1.0.0.1
|
|
||||||
deploy:
|
|
||||||
resources:
|
|
||||||
limits:
|
|
||||||
cpus: '${DEPLOY_CPUS}'
|
|
||||||
memory: ${DEPLOY_RAM}
|
|
||||||
#labels: # Reverse proxy sample
|
#labels: # Reverse proxy sample
|
||||||
# - "traefik.enable=true"
|
# - "traefik.enable=true"
|
||||||
# - "traefik.http.routers.fetcher.rule=Host(`urls.yourdomain.com`)"
|
# - "traefik.http.routers.fetcher.rule=Host(`urls.yourdomain.com`)"
|
||||||
@@ -95,11 +27,21 @@ services:
|
|||||||
#networks:
|
#networks:
|
||||||
# - default # This network
|
# - default # This network
|
||||||
# - docker_default # Reverse proxy network
|
# - docker_default # Reverse proxy network
|
||||||
|
ports:
|
||||||
|
- 8000:8000
|
||||||
|
volumes: # Development mode
|
||||||
|
- ./app_urls:/opt/app
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
cpus: '${DEPLOY_CPUS}'
|
||||||
|
memory: ${DEPLOY_RAM}
|
||||||
|
|
||||||
fetcher_db:
|
fetcher_db:
|
||||||
|
extends:
|
||||||
|
file: docker-compose-base.yml
|
||||||
|
service: fetcher_db
|
||||||
image: postgres:17
|
image: postgres:17
|
||||||
container_name: fetcher_db
|
|
||||||
restart: unless-stopped
|
|
||||||
# Set shared memory limit when using docker-compose
|
# Set shared memory limit when using docker-compose
|
||||||
shm_size: 128mb
|
shm_size: 128mb
|
||||||
environment:
|
environment:
|
||||||
@@ -107,18 +49,14 @@ services:
|
|||||||
POSTGRES_PASSWORD: ${DB_PASSWORD}
|
POSTGRES_PASSWORD: ${DB_PASSWORD}
|
||||||
POSTGRES_USER: ${DB_USER}
|
POSTGRES_USER: ${DB_USER}
|
||||||
POSTGRES_INITDB_ARGS: '--data-checksums'
|
POSTGRES_INITDB_ARGS: '--data-checksums'
|
||||||
#volumes: # Persistent DB?
|
|
||||||
# - ${PATH_DB_DATA}/postgres:/var/lib/postgresql/data
|
|
||||||
ports:
|
ports:
|
||||||
- 5432 #:5432
|
- 5432 #:5432
|
||||||
|
#volumes: # Persistent DB?
|
||||||
|
# - ./postgres:/var/lib/postgresql/data
|
||||||
|
|
||||||
fetcher_redis:
|
fetcher_redis:
|
||||||
image: redis:alpine
|
extends:
|
||||||
container_name: fetcher_redis
|
file: docker-compose-base.yml
|
||||||
restart: unless-stopped
|
service: fetcher_redis
|
||||||
ports:
|
ports:
|
||||||
- 6379 #:6379
|
- 6379:6379
|
||||||
|
|
||||||
#networks:
|
|
||||||
# docker_default:
|
|
||||||
# external: true
|
|
||||||
|
|||||||
@@ -3,22 +3,9 @@ version: '3.9'
|
|||||||
services:
|
services:
|
||||||
|
|
||||||
fetcher_app_selenium:
|
fetcher_app_selenium:
|
||||||
image: fetcher_app_selenium
|
extends:
|
||||||
build:
|
file: docker-compose-base.yml
|
||||||
context: ./app_selenium
|
service: fetcher_app_selenium
|
||||||
args:
|
|
||||||
- ARCH=${ARCH} # arm64, amd64
|
|
||||||
container_name: fetcher_app_selenium
|
|
||||||
restart: unless-stopped
|
|
||||||
shm_size: 512mb
|
|
||||||
environment:
|
|
||||||
- SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE}
|
|
||||||
- PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
|
|
||||||
ports:
|
|
||||||
- 80
|
|
||||||
dns:
|
|
||||||
- 1.1.1.1
|
|
||||||
- 1.0.0.1
|
|
||||||
deploy:
|
deploy:
|
||||||
resources:
|
resources:
|
||||||
limits:
|
limits:
|
||||||
@@ -26,61 +13,11 @@ services:
|
|||||||
memory: ${DEPLOY_RAM}
|
memory: ${DEPLOY_RAM}
|
||||||
|
|
||||||
fetcher_app_urls:
|
fetcher_app_urls:
|
||||||
image: fetcher_app_urls
|
extends:
|
||||||
build:
|
file: docker-compose-base.yml
|
||||||
context: ./app_urls
|
service: fetcher_app_urls
|
||||||
container_name: fetcher_app_urls
|
|
||||||
restart: unless-stopped
|
|
||||||
environment:
|
|
||||||
# Initialization
|
|
||||||
- INITIALIZE_DB=${INITIALIZE_DB} # Related to DB persistence
|
|
||||||
- DJANGO_SUPERUSER_USERNAME=${DJANGO_SUPERUSER_USERNAME}
|
|
||||||
- DJANGO_SUPERUSER_PASSWORD=${DJANGO_SUPERUSER_PASSWORD}
|
|
||||||
- DJANGO_SUPERUSER_EMAIL=${DJANGO_SUPERUSER_EMAIL}
|
|
||||||
# Django
|
|
||||||
- DJANGO_ALLOWED_HOSTS=${DJANGO_ALLOWED_HOSTS} # host1,host2
|
|
||||||
- DJANGO_ALLOWED_ORIGINS=${DJANGO_ALLOWED_ORIGINS} # Reverse proxy
|
|
||||||
- DJANGO_SECRET_KEY=${DJANGO_SECRET_KEY}
|
|
||||||
- DJANGO_DEBUG=${DJANGO_DEBUG}
|
|
||||||
- PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
|
|
||||||
# Database
|
|
||||||
- DB_NAME=${DB_NAME}
|
|
||||||
- DB_USER=${DB_USER}
|
|
||||||
- DB_PASSWORD=${DB_PASSWORD}
|
|
||||||
- DB_HOST=${DB_HOST}
|
|
||||||
- DB_PORT=${DB_PORT}
|
|
||||||
- REDIS_HOST=${REDIS_HOST}
|
|
||||||
- REDIS_PORT=${REDIS_PORT}
|
|
||||||
# Job timeout: 30 min
|
|
||||||
- JOB_DEFAULT_TIMEOUT=${JOB_DEFAULT_TIMEOUT}
|
|
||||||
# Fetcher
|
|
||||||
- FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP}
|
|
||||||
- FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP}
|
|
||||||
- FETCHER_BETWEEN_SEARCHES_SLEEP=${FETCHER_BETWEEN_SEARCHES_SLEEP} # Sleep time between each search
|
|
||||||
- FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP} # Sleep time between requests to same URL host
|
|
||||||
- FETCHER_LANGUAGE_DETECTION_MIN_CHAR=${FETCHER_LANGUAGE_DETECTION_MIN_CHAR} # Min amount of characters to run language detection
|
|
||||||
- FETCHER_INSERT_URL_CACHE_TIME=${FETCHER_INSERT_URL_CACHE_TIME} # Cache time: Insert raw URL
|
|
||||||
- FETCHER_ERROR_URL_CACHE_TIME=${FETCHER_ERROR_URL_CACHE_TIME} # Cache time: Error on processing URL
|
|
||||||
# Selenium
|
|
||||||
- SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT}
|
|
||||||
- ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA}
|
|
||||||
# Ghost
|
|
||||||
- GHOST_ADMIN_API_KEY=${GHOST_ADMIN_API_KEY}
|
|
||||||
- GHOST_ADMIN_API_URL=${GHOST_ADMIN_API_URL}
|
|
||||||
- PEXELS_API_KEY=${PEXELS_API_KEY}
|
|
||||||
- OLLAMA_MODEL_DEFAULT=${OLLAMA_MODEL_DEFAULT}
|
|
||||||
########################
|
|
||||||
#volumes: # Development mode
|
|
||||||
# - ./app_urls:/opt/app
|
|
||||||
########################
|
|
||||||
ports:
|
ports:
|
||||||
- 8000:8000
|
- 8000:8000
|
||||||
depends_on:
|
|
||||||
- fetcher_db
|
|
||||||
- fetcher_redis
|
|
||||||
dns:
|
|
||||||
- 1.1.1.1
|
|
||||||
- 1.0.0.1
|
|
||||||
deploy:
|
deploy:
|
||||||
resources:
|
resources:
|
||||||
limits:
|
limits:
|
||||||
@@ -88,7 +25,9 @@ services:
|
|||||||
memory: ${DEPLOY_RAM}
|
memory: ${DEPLOY_RAM}
|
||||||
|
|
||||||
fetcher_db:
|
fetcher_db:
|
||||||
container_name: fetcher_db
|
extends:
|
||||||
|
file: docker-compose-base.yml
|
||||||
|
service: fetcher_db
|
||||||
image: alpine:latest
|
image: alpine:latest
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
deploy:
|
deploy:
|
||||||
@@ -98,22 +37,21 @@ services:
|
|||||||
volumes:
|
volumes:
|
||||||
# REQUIREMENT: Add the fetcher's SSH public key to .ssh/authorized_keys on the DB machine
|
# REQUIREMENT: Add the fetcher's SSH public key to .ssh/authorized_keys on the DB machine
|
||||||
- ~/.ssh:/root/.ssh:ro
|
- ~/.ssh:/root/.ssh:ro
|
||||||
|
ports:
|
||||||
|
- 15885:15885
|
||||||
|
- 5432:5432
|
||||||
command:
|
command:
|
||||||
- sh
|
- sh
|
||||||
- -c
|
- -c
|
||||||
- |
|
- |
|
||||||
apk add --update openssh autossh
|
apk add --update openssh autossh
|
||||||
autossh -M 15885 -N -o 'GatewayPorts yes' -L 0.0.0.0:5432:127.0.0.1:5432 ${REMOTE_USERNAME}@${REMOTE_HOST}
|
# Monitor status on port 15885
|
||||||
### Alternative:
|
autossh -M 15885 -N -L 0.0.0.0:5432:127.0.0.1:5432 ${REMOTE_USERNAME}@${REMOTE_HOST}
|
||||||
### autossh -M 0 -o "ServerAliveInterval 30" -o "ServerAliveCountMax 3" -o 'GatewayPorts yes' -L 15882:127.0.0.1:15882 matitos@matitos.org
|
# autossh -M 15885 -N -o 'GatewayPorts yes' -L 0.0.0.0:5432:127.0.0.1:5432 ${REMOTE_USERNAME}@${REMOTE_HOST}
|
||||||
### -M 15882 monitors on port X, if already being used conflict!
|
|
||||||
###autossh -M 15882 -N -o 'GatewayPorts yes' -L 15882:127.0.0.1:15882 matitos@matitos.org
|
|
||||||
###ssh -N -o "StrictHostKeyChecking no" -o "ServerAliveInterval 60" -o "ServerAliveCountMax 3" -o 'PasswordAuthentication no' -o 'GatewayPorts yes' -L 15882:127.0.0.1:15882 matitos@matitos.org
|
|
||||||
network_mode: "host"
|
|
||||||
|
|
||||||
fetcher_redis:
|
fetcher_redis:
|
||||||
image: redis:alpine
|
extends:
|
||||||
container_name: fetcher_redis
|
file: docker-compose-base.yml
|
||||||
restart: unless-stopped
|
service: fetcher_redis
|
||||||
ports:
|
ports:
|
||||||
- 6379 #:6379
|
- 6379:6379
|
||||||
79
utils/DB-Dev.ipynb
Normal file
79
utils/DB-Dev.ipynb
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"#!pip install python-dotenv\n",
|
||||||
|
"from dotenv import load_dotenv\n",
|
||||||
|
"\n",
|
||||||
|
"# Specify the path to your .env file (optional if in the current dir)\n",
|
||||||
|
"load_dotenv(dotenv_path=\".env\", override=True)\n",
|
||||||
|
"\n",
|
||||||
|
"import os\n",
|
||||||
|
"import psycopg\n",
|
||||||
|
"from sshtunnel import SSHTunnelForwarder\n",
|
||||||
|
"\n",
|
||||||
|
"if (os.environ.get(\"SSH_TUNNEL_BASED\") == \"true\"):\n",
|
||||||
|
" print(\"SSH tunnel: True\")\n",
|
||||||
|
"else:\n",
|
||||||
|
" print(\"SSH tunnel: False\")\n",
|
||||||
|
"\n",
|
||||||
|
"connect_info = \"host={} port={} user={} password={} dbname={}\".format(os.environ.get(\"DB_HOST\"), os.environ.get(\"DB_PORT\"), os.environ.get(\"DB_USER\"), os.environ.get(\"DB_PASSWORD\"), os.environ.get(\"DB_NAME\"))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"\n",
|
||||||
|
"if (os.environ.get(\"SSH_TUNNEL_BASED\") == \"true\"):\n",
|
||||||
|
" ssh_tunnel = SSHTunnelForwarder(\n",
|
||||||
|
" (os.environ.get(\"REMOTE_HOST\"), int(os.environ.get(\"REMOTE_SSH_PORT\"))), \n",
|
||||||
|
" ssh_username=os.environ.get(\"REMOTE_USERNAME\"), ssh_password=os.environ.get(\"REMOTE_PASSWORD\"), \n",
|
||||||
|
" remote_bind_address=('localhost', int(os.environ.get(\"REMOTE_PORT\"))), local_bind_address=('localhost', int(os.environ.get(\"DB_PORT\"))) \n",
|
||||||
|
" )\n",
|
||||||
|
" ssh_tunnel.start()\n",
|
||||||
|
"\n",
|
||||||
|
"try:\n",
|
||||||
|
" with psycopg.connect(connect_info) as conn:\n",
|
||||||
|
" if True:\n",
|
||||||
|
" for t in conn.execute(\"\"\"\n",
|
||||||
|
" SELECT * from URLS WHERE id IN (SELECT id_url FROM URLS_SOURCE_SEARCH INNER JOIN SEARCH ON URLS_SOURCE_SEARCH.id_search = SEARCH.id WHERE SEARCH.search LIKE '%child abuse%') LIMIT 5;\n",
|
||||||
|
" \"\"\").fetchall():\n",
|
||||||
|
" print(t)\n",
|
||||||
|
" \n",
|
||||||
|
"except Exception as e:\n",
|
||||||
|
" print(\"Err:\", str(e))\n",
|
||||||
|
"\n",
|
||||||
|
"if (os.environ.get(\"SSH_TUNNEL_BASED\") == \"true\"):\n",
|
||||||
|
" ssh_tunnel.stop()"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "matitos_urls",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.12.9"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user