Docker and deployment to fetcher server

This commit is contained in:
Luciano Gervasoni
2025-06-27 09:14:44 +02:00
parent f659d4adb3
commit 8b689729bf
12 changed files with 148 additions and 222 deletions

View File

@@ -54,6 +54,7 @@ class FetchSearcher():
for SearchInstance in ListSearchInstances:
# Sleep between requests, avoid too many requests...
time.sleep(float(os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", 5)))
# TODO: Random proxy / VPN
SearchInstance(args).fetch_articles(db_writer, obj_search)
# TODO: https://github.com/tasos-py/Search-Engines-Scraper/tree/master

View File

@@ -1,8 +1,6 @@
import time
import feedparser
import os
from django.utils import timezone
from datetime import timedelta
from ..models import Search, Source
from .fetch_utils_gnews import decode_gnews_urls
from .logger import get_logger

View File

@@ -11,7 +11,7 @@ logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(messa
logger = logging.getLogger("fetcher")
logger.setLevel(logging.DEBUG)
# To file log: INFO / WARNING / ERROR / CRITICAL
# To file log: DEBUG / INFO / WARNING / ERROR / CRITICAL
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "debug.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.DEBUG)

View File

@@ -74,7 +74,7 @@ def process_missing_kids_urls_all(batch_size=None):
logger.info("Task completed: {}".format(task))
@job('default')
def clean_old_url_content(older_than_days=60):
def clean_old_url_content(older_than_days=14):
task = "Clean old URL content"
logger.info("Task triggered: {}".format(task))
DB_Handler().clean_old_url_content(older_than_days=older_than_days)

View File

@@ -24,11 +24,12 @@
[".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50],
[".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75],
[".*foxnews\\.com\\/(video|category)\\/.*", "invalid", 75],
[".*radio.foxnews\\.com\\/.*", "invalid", 75],
[".*radio\\.foxnews\\.com\\/.*", "invalid", 75],
[".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75],
[".*zerohedge\\.com\\/(user)\\/.*", "invalid", 75],
[".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50],
[".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50],
[".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50],
[".*missingkids\\.org\\/poster\\/.*", "valid", 50]
]
}

View File

@@ -29,13 +29,15 @@ def wait_connection():
connected = True
except psycopg.OperationalError as e:
print(str(e))
# Connection not ready...
# print(".", end="")
time.sleep(2)
time.sleep(15)
except Exception as e:
print(str(e))
# Connection not ready...
# print("e", end="")
time.sleep(2)
time.sleep(15)
print("DB connection ready")
@@ -57,7 +59,8 @@ def initialize_tables():
ts_fetch TIMESTAMPTZ NOT NULL DEFAULT NOW(),
status URL_STATUS NOT NULL DEFAULT 'raw' -- ,
-- status_wendy WENDY_STATUS DEFAULT NULL,
-- ts_wendy TIMESTAMPTZ DEFAULT NULL
-- ts_wendy TIMESTAMPTZ DEFAULT NULL,
-- child_abuse BOOLEAN DEFAULT NULL,
);
CREATE INDEX idx_urls_status ON urls(status);
CREATE INDEX idx_urls_ts_fetch ON urls(ts_fetch);

View File

@@ -13,7 +13,7 @@
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-01-01T00:00:00+00:00",
"interval": 4,
"interval": 8,
"interval_unit": "hours",
"successful_runs": 0,
"failed_runs": 0,
@@ -139,7 +139,7 @@
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-01-01T00:00:00+00:00",
"interval": 2,
"interval": 4,
"interval_unit": "hours",
"successful_runs": 0,
"failed_runs": 0,