Clean old URL content; use Django DB connection
This commit is contained in:
5
.env
5
.env
@@ -5,13 +5,12 @@ DJANGO_SUPERUSER_PASSWORD=matitos
|
||||
DJANGO_SUPERUSER_EMAIL=matitos@matitos.org
|
||||
|
||||
# Reverse proxy
|
||||
REVERSE_PROXY_URL=fetcher.matitos.org
|
||||
REVERSE_PROXY_URL=sample.url.com
|
||||
|
||||
# Django
|
||||
DJANGO_ALLOWED_ORIGINS=https://fetcher.matitos.org # Reverse proxy
|
||||
DJANGO_ALLOWED_ORIGINS=https://sample.url.com # Reverse proxy
|
||||
DJANGO_ALLOWED_HOSTS=* # host1,host2
|
||||
DJANGO_SECRET_KEY=EtKpy7t84GvU4gBwX9z3xKPBXMS75IAV0dkzN7dXVUsMSqy6a5rjY6WNCw3CcRH5
|
||||
# DJANGO_DEBUG=False
|
||||
DJANGO_DEBUG=True
|
||||
PATH_LOGS_DIRECTORY=/opt/logs
|
||||
|
||||
|
||||
@@ -2,6 +2,8 @@ from ..models import Urls, UrlContent, UrlsSourceSearch, UrlsDuplicate, StatusPa
|
||||
from django.db.models import Q
|
||||
from django.core.cache import cache
|
||||
from django.db import IntegrityError
|
||||
from django.utils import timezone
|
||||
from datetime import timedelta
|
||||
from .url_processor import process_url, get_with_protocol
|
||||
import re
|
||||
import traceback
|
||||
@@ -271,4 +273,14 @@ class DB_Handler():
|
||||
logger.info("Verified status of #{} missingkids.org/poster URLs".format(len(missingkids_urls)))
|
||||
except Exception as e:
|
||||
logger.warning("Exception processing MissingKids URLs: {}\n{}".format(e, traceback.format_exc()))
|
||||
|
||||
|
||||
def clean_old_url_content(self, older_than_days=60):
    """Delete stored URL content for URLs fetched before a cutoff date.

    Removes every ``UrlContent`` row whose related URL (``id_url``) has a
    ``ts_fetch`` earlier than ``now - older_than_days``.

    Args:
        older_than_days: Age threshold in days (int or float). ``None`` is
            treated as the default of 60, because callers may pass ``None``
            when the value could not be parsed.

    Any exception is caught and logged as a warning; nothing is raised.
    """
    try:
        # Fall back to the default instead of letting timedelta(days=None)
        # raise TypeError and silently skip the cleanup.
        if older_than_days is None:
            older_than_days = 60
        # Get cut off date
        cutoff_date = timezone.now() - timedelta(days=older_than_days)
        # Select old UrlContent objects (lazy queryset, nothing fetched yet)
        old_url_content = UrlContent.objects.filter(id_url__ts_fetch__lt=cutoff_date)
        # count() issues a COUNT query instead of loading every row just to
        # report how many there are.
        logger.info("Cleaning URL content older than {} days: #{}".format(older_than_days, old_url_content.count()))
        old_url_content.delete()
    except Exception as e:
        logger.warning("Exception cleaning old URL content: {}\n{}".format(e, traceback.format_exc()))
|
||||
|
||||
@@ -45,7 +45,7 @@ def fetch_missing_kids_all(number_pages=-1):
|
||||
logger.info("Task completed: {}".format(task))
|
||||
|
||||
@job('default')
|
||||
def process_raw_urls(batch_size=50):
|
||||
def process_raw_urls(batch_size=100):
|
||||
task = "Process raw URLs"
|
||||
logger.info("Task triggered: {}".format(task))
|
||||
DB_Handler().process_raw_urls(batch_size=batch_size)
|
||||
@@ -72,6 +72,13 @@ def process_missing_kids_urls_all(batch_size=None):
|
||||
DB_Handler().process_missing_kids_urls(batch_size=batch_size)
|
||||
logger.info("Task completed: {}".format(task))
|
||||
|
||||
@job('default')
def clean_old_url_content(older_than_days=60):
    """Queue job: remove stored URL content older than ``older_than_days`` days.

    Logs when the task is triggered and when it completes, and delegates the
    actual work to ``DB_Handler.clean_old_url_content``.
    """
    task_name = "Clean old URL content"
    logger.info("Task triggered: {}".format(task_name))
    handler = DB_Handler()
    handler.clean_old_url_content(older_than_days=older_than_days)
    logger.info("Task completed: {}".format(task_name))
|
||||
|
||||
|
||||
@job('default')
|
||||
def background_task(process_type: str):
|
||||
@@ -86,6 +93,7 @@ def background_task(process_type: str):
|
||||
FetchSearcher().run()
|
||||
elif (process_type == "fetch_missingkids_all"):
|
||||
FetchMissingKids().run(number_pages=-1)
|
||||
|
||||
elif ("fetch_missingkids" in process_type):
|
||||
# number_pages encoded in URL
|
||||
try:
|
||||
@@ -93,6 +101,7 @@ def background_task(process_type: str):
|
||||
except Exception as e:
|
||||
number_pages = -1
|
||||
FetchMissingKids().run(number_pages=number_pages)
|
||||
|
||||
elif ("process_" in process_type):
|
||||
# Batch size encoded in URL
|
||||
try:
|
||||
@@ -106,6 +115,15 @@ def background_task(process_type: str):
|
||||
DB_Handler().process_error_urls(batch_size=batch_size)
|
||||
elif ("process_missing_kids_urls" in process_type):
|
||||
DB_Handler().process_missing_kids_urls(batch_size=batch_size)
|
||||
|
||||
elif ( "clean_old_url_content" in process_type ):
|
||||
# Older than X days encoded in URL
|
||||
try:
|
||||
older_than_days = float(process_type.split("_")[-1])
|
||||
except Exception as e:
|
||||
older_than_days = None
|
||||
DB_Handler().clean_old_url_content(older_than_days=older_than_days)
|
||||
|
||||
else:
|
||||
logger.info("Task unknown!: {}".format(process_type))
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@ import os
|
||||
import psycopg
|
||||
from .tasks import background_task
|
||||
from django.http import JsonResponse, HttpResponse
|
||||
from django.db import connection
|
||||
|
||||
####################################################################################################
|
||||
def trigger_task(request, task):
|
||||
@@ -15,7 +16,7 @@ def link_list(request):
|
||||
app_url = request.build_absolute_uri()
|
||||
# Tasks
|
||||
links_fetch = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all"]
|
||||
links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"]
|
||||
links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all", "clean_old_url_content_60"]
|
||||
# List of links
|
||||
list_links = \
|
||||
[ os.path.join(app_url, "admin"), os.path.join(app_url, "urls") ] + \
|
||||
@@ -32,7 +33,6 @@ def link_list(request):
|
||||
|
||||
return HttpResponse(html)
|
||||
|
||||
|
||||
####################################################################################################
|
||||
def logs(request, log_type):
|
||||
# Capture output: python manage.py rqstats
|
||||
@@ -45,39 +45,28 @@ def logs(request, log_type):
|
||||
|
||||
####################################################################################################
|
||||
def log_db(request):
    """Return a plain-text report of the largest database relations.

    Runs a PostgreSQL catalog query through Django's database connection and
    responds with one line per row, each line being the ``str()`` of a
    ``(relation, total_size)`` row tuple. Up to 100 relations are listed,
    largest first.

    Args:
        request: The incoming HTTP request (unused).

    Returns:
        HttpResponse whose body is the newline-joined rows.
    """
    # Relation sizes, excluding system schemas, indexes and TOAST schemas
    query = """
        SELECT
            relname AS "relation",
            pg_size_pretty (
                pg_total_relation_size (C .oid)
            ) AS "total_size"
        FROM
            pg_class C
        LEFT JOIN pg_namespace N ON (N.oid = C .relnamespace)
        WHERE
            nspname NOT IN (
                'pg_catalog',
                'information_schema'
            )
            AND C .relkind <> 'i'
            AND nspname !~ '^pg_toast'
        ORDER BY
            pg_total_relation_size (C .oid) DESC
        LIMIT 100;
    """
    with connection.cursor() as cursor:
        # Call execute() and fetchall() separately: the return value of
        # execute() is driver-dependent, so chaining on it is not portable.
        cursor.execute(query)
        r = cursor.fetchall()

    return HttpResponse("\n".join([str(e) for e in r]))
|
||||
|
||||
####################################################################################################
|
||||
@@ -187,5 +187,26 @@
|
||||
"failed_runs": 0,
|
||||
"last_successful_run": null,
|
||||
"last_failed_run": null
|
||||
},
|
||||
{
|
||||
"model": "RepeatableTaskType",
|
||||
"name": "Clean old URL content",
|
||||
"callable": "fetcher.tasks.clean_old_url_content",
|
||||
"callable_args": [],
|
||||
"callable_kwargs": [],
|
||||
"enabled": false,
|
||||
"queue": "default",
|
||||
"repeat": null,
|
||||
"at_front": false,
|
||||
"timeout": null,
|
||||
"result_ttl": 86400,
|
||||
"cron_string": null,
|
||||
"scheduled_time": "2025-01-01T00:00:00+00:00",
|
||||
"interval": 1,
|
||||
"interval_unit": "weeks",
|
||||
"successful_runs": 0,
|
||||
"failed_runs": 0,
|
||||
"last_successful_run": null,
|
||||
"last_failed_run": null
|
||||
}
|
||||
]
|
||||
|
||||
Reference in New Issue
Block a user