Docker and deployment to fetcher server
This commit is contained in:
@@ -54,6 +54,7 @@ class FetchSearcher():
|
||||
for SearchInstance in ListSearchInstances:
|
||||
# Sleep between requests, avoid too many requests...
|
||||
time.sleep(float(os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", 5)))
|
||||
# TODO: Random proxy / VPN
|
||||
SearchInstance(args).fetch_articles(db_writer, obj_search)
|
||||
|
||||
# TODO: https://github.com/tasos-py/Search-Engines-Scraper/tree/master
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
import time
|
||||
import feedparser
|
||||
import os
|
||||
from django.utils import timezone
|
||||
from datetime import timedelta
|
||||
from ..models import Search, Source
|
||||
from .fetch_utils_gnews import decode_gnews_urls
|
||||
from .logger import get_logger
|
||||
|
||||
@@ -11,7 +11,7 @@ logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(messa
|
||||
logger = logging.getLogger("fetcher")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
# To file log: INFO / WARNING / ERROR / CRITICAL
|
||||
# To file log: DEBUG / INFO / WARNING / ERROR / CRITICAL
|
||||
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "debug.log"), mode="a", maxBytes=10000000, backupCount=1)
|
||||
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
|
||||
fh.setLevel(logging.DEBUG)
|
||||
|
||||
@@ -74,7 +74,7 @@ def process_missing_kids_urls_all(batch_size=None):
|
||||
logger.info("Task completed: {}".format(task))
|
||||
|
||||
@job('default')
|
||||
def clean_old_url_content(older_than_days=60):
|
||||
def clean_old_url_content(older_than_days=14):
|
||||
task = "Clean old URL content"
|
||||
logger.info("Task triggered: {}".format(task))
|
||||
DB_Handler().clean_old_url_content(older_than_days=older_than_days)
|
||||
|
||||
@@ -24,11 +24,12 @@
|
||||
[".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50],
|
||||
[".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75],
|
||||
[".*foxnews\\.com\\/(video|category)\\/.*", "invalid", 75],
|
||||
[".*radio.foxnews\\.com\\/.*", "invalid", 75],
|
||||
[".*radio\\.foxnews\\.com\\/.*", "invalid", 75],
|
||||
[".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75],
|
||||
[".*zerohedge\\.com\\/(user)\\/.*", "invalid", 75],
|
||||
[".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50],
|
||||
[".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50],
|
||||
[".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50],
|
||||
[".*missingkids\\.org\\/poster\\/.*", "valid", 50]
|
||||
]
|
||||
}
|
||||
|
||||
@@ -29,13 +29,15 @@ def wait_connection():
|
||||
connected = True
|
||||
|
||||
except psycopg.OperationalError as e:
|
||||
print(str(e))
|
||||
# Connection not ready...
|
||||
# print(".", end="")
|
||||
time.sleep(2)
|
||||
time.sleep(15)
|
||||
except Exception as e:
|
||||
print(str(e))
|
||||
# Connection not ready...
|
||||
# print("e", end="")
|
||||
time.sleep(2)
|
||||
time.sleep(15)
|
||||
|
||||
print("DB connection ready")
|
||||
|
||||
@@ -57,7 +59,8 @@ def initialize_tables():
|
||||
ts_fetch TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
status URL_STATUS NOT NULL DEFAULT 'raw' -- ,
|
||||
-- status_wendy WENDY_STATUS DEFAULT NULL,
|
||||
-- ts_wendy TIMESTAMPTZ DEFAULT NULL
|
||||
-- ts_wendy TIMESTAMPTZ DEFAULT NULL,
|
||||
-- child_abuse BOOLEAN DEFAULT NULL,
|
||||
);
|
||||
CREATE INDEX idx_urls_status ON urls(status);
|
||||
CREATE INDEX idx_urls_ts_fetch ON urls(ts_fetch);
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
"result_ttl": 86400,
|
||||
"cron_string": null,
|
||||
"scheduled_time": "2025-01-01T00:00:00+00:00",
|
||||
"interval": 4,
|
||||
"interval": 8,
|
||||
"interval_unit": "hours",
|
||||
"successful_runs": 0,
|
||||
"failed_runs": 0,
|
||||
@@ -139,7 +139,7 @@
|
||||
"result_ttl": 86400,
|
||||
"cron_string": null,
|
||||
"scheduled_time": "2025-01-01T00:00:00+00:00",
|
||||
"interval": 2,
|
||||
"interval": 4,
|
||||
"interval_unit": "hours",
|
||||
"successful_runs": 0,
|
||||
"failed_runs": 0,
|
||||
|
||||
Reference in New Issue
Block a user