Clean old URL content, use Django DB connection

This commit is contained in:
Luciano Gervasoni
2025-04-14 10:48:33 +02:00
parent 0cd84496cf
commit 43c6c3aabf
5 changed files with 80 additions and 41 deletions

5
.env
View File

@@ -5,13 +5,12 @@ DJANGO_SUPERUSER_PASSWORD=matitos
DJANGO_SUPERUSER_EMAIL=matitos@matitos.org
# Reverse proxy
REVERSE_PROXY_URL=fetcher.matitos.org
REVERSE_PROXY_URL=sample.url.com
# Django
DJANGO_ALLOWED_ORIGINS=https://fetcher.matitos.org # Reverse proxy
DJANGO_ALLOWED_ORIGINS=https://sample.url.com # Reverse proxy
DJANGO_ALLOWED_HOSTS=* # host1,host2
DJANGO_SECRET_KEY=EtKpy7t84GvU4gBwX9z3xKPBXMS75IAV0dkzN7dXVUsMSqy6a5rjY6WNCw3CcRH5
# DJANGO_DEBUG=False
DJANGO_DEBUG=True
PATH_LOGS_DIRECTORY=/opt/logs

View File

@@ -2,6 +2,8 @@ from ..models import Urls, UrlContent, UrlsSourceSearch, UrlsDuplicate, StatusPa
from django.db.models import Q
from django.core.cache import cache
from django.db import IntegrityError
from django.utils import timezone
from datetime import timedelta
from .url_processor import process_url, get_with_protocol
import re
import traceback
@@ -271,4 +273,14 @@ class DB_Handler():
logger.info("Verified status of #{} missingkids.org/poster URLs".format(len(missingkids_urls)))
except Exception as e:
logger.warning("Exception processing MissingKids URLs: {}\n{}".format(e, traceback.format_exc()))
def clean_old_url_content(self, older_than_days=60):
    """Delete UrlContent rows whose parent Urls row was fetched more than
    `older_than_days` days ago.

    Args:
        older_than_days: Age threshold in days. `None` is accepted and falls
            back to 60 — the background-task dispatcher passes None when it
            cannot parse the value out of the request URL.
    """
    try:
        # Guard: timedelta(days=None) raises TypeError, which the broad
        # except below would swallow and the cleanup would silently no-op.
        if older_than_days is None:
            older_than_days = 60
        # Anything fetched before this timestamp is considered stale.
        cutoff_date = timezone.now() - timedelta(days=older_than_days)
        # Delete old UrlContent objects
        old_url_content = UrlContent.objects.filter(id_url__ts_fetch__lt=cutoff_date)
        # count() issues a COUNT(*) query instead of materializing the whole
        # queryset (which len() would do) just to log its size.
        logger.info("Cleaning URL content older than {} days: #{}".format(older_than_days, old_url_content.count()))
        old_url_content.delete()
    except Exception as e:
        logger.warning("Exception cleaning old URL content: {}\n{}".format(e, traceback.format_exc()))

View File

@@ -45,7 +45,7 @@ def fetch_missing_kids_all(number_pages=-1):
logger.info("Task completed: {}".format(task))
@job('default')
def process_raw_urls(batch_size=50):
def process_raw_urls(batch_size=100):
task = "Process raw URLs"
logger.info("Task triggered: {}".format(task))
DB_Handler().process_raw_urls(batch_size=batch_size)
@@ -72,6 +72,13 @@ def process_missing_kids_urls_all(batch_size=None):
DB_Handler().process_missing_kids_urls(batch_size=batch_size)
logger.info("Task completed: {}".format(task))
@job('default')
def clean_old_url_content(older_than_days=60):
    """RQ job: delegate old-URL-content cleanup to DB_Handler."""
    task = "Clean old URL content"
    logger.info("Task triggered: {}".format(task))
    handler = DB_Handler()
    handler.clean_old_url_content(older_than_days=older_than_days)
    logger.info("Task completed: {}".format(task))
@job('default')
def background_task(process_type: str):
@@ -86,6 +93,7 @@ def background_task(process_type: str):
FetchSearcher().run()
elif (process_type == "fetch_missingkids_all"):
FetchMissingKids().run(number_pages=-1)
elif ("fetch_missingkids" in process_type):
# number_pages encoded in URL
try:
@@ -93,6 +101,7 @@ def background_task(process_type: str):
except Exception as e:
number_pages = -1
FetchMissingKids().run(number_pages=number_pages)
elif ("process_" in process_type):
# Batch size encoded in URL
try:
@@ -106,6 +115,15 @@ def background_task(process_type: str):
DB_Handler().process_error_urls(batch_size=batch_size)
elif ("process_missing_kids_urls" in process_type):
DB_Handler().process_missing_kids_urls(batch_size=batch_size)
elif ( "clean_old_url_content" in process_type ):
# Older than X days encoded in URL
try:
older_than_days = float(process_type.split("_")[-1])
except Exception as e:
older_than_days = None
DB_Handler().clean_old_url_content(older_than_days=older_than_days)
else:
logger.info("Task unknown!: {}".format(process_type))

View File

@@ -2,6 +2,7 @@ import os
import psycopg
from .tasks import background_task
from django.http import JsonResponse, HttpResponse
from django.db import connection
####################################################################################################
def trigger_task(request, task):
@@ -15,7 +16,7 @@ def link_list(request):
app_url = request.build_absolute_uri()
# Tasks
links_fetch = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all"]
links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"]
links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all", "clean_old_url_content_60"]
# List of links
list_links = \
[ os.path.join(app_url, "admin"), os.path.join(app_url, "urls") ] + \
@@ -32,7 +33,6 @@ def link_list(request):
return HttpResponse(html)
####################################################################################################
def logs(request, log_type):
# Capture output: python manage.py rqstats
@@ -45,39 +45,28 @@ def logs(request, log_type):
####################################################################################################
def log_db(request):
    """Return the 100 largest non-index relations (name + pretty-printed
    total size) of the default database, one row per line.

    Uses Django's configured DB connection instead of building a separate
    psycopg connection from environment variables.
    """
    with connection.cursor() as cursor:
        # List user-schema relations ordered by total on-disk size.
        # NOTE: execute() is not chained with fetchall() — Django's cursor
        # follows the DB-API, where execute() has no guaranteed return value
        # (psycopg2 returns None; only psycopg 3 happens to return the cursor).
        cursor.execute("""
            SELECT
            relname AS "relation",
            pg_size_pretty (
            pg_total_relation_size (C .oid)
            ) AS "total_size"
            FROM
            pg_class C
            LEFT JOIN pg_namespace N ON (N.oid = C .relnamespace)
            WHERE
            nspname NOT IN (
            'pg_catalog',
            'information_schema'
            )
            AND C .relkind <> 'i'
            AND nspname !~ '^pg_toast'
            ORDER BY
            pg_total_relation_size (C .oid) DESC
            LIMIT 100;
            """)
        r = cursor.fetchall()
    return HttpResponse( "\n".join([str(e) for e in r]) )
####################################################################################################

View File

@@ -187,5 +187,26 @@
"failed_runs": 0,
"last_successful_run": null,
"last_failed_run": null
},
{
"model": "RepeatableTaskType",
"name": "Clean old URL content",
"callable": "fetcher.tasks.clean_old_url_content",
"callable_args": [],
"callable_kwargs": [],
"enabled": false,
"queue": "default",
"repeat": null,
"at_front": false,
"timeout": null,
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-01-01T00:00:00+00:00",
"interval": 1,
"interval_unit": "weeks",
"successful_runs": 0,
"failed_runs": 0,
"last_successful_run": null,
"last_failed_run": null
}
]