From 43c6c3aabf196492bff0e1c988d2ad2d79b3f2bd Mon Sep 17 00:00:00 2001 From: Luciano Gervasoni Date: Mon, 14 Apr 2025 10:48:33 +0200 Subject: [PATCH] Clean old url content, use django db connection --- .env | 5 ++- app_urls/fetcher/src/db_utils.py | 14 +++++++- app_urls/fetcher/tasks.py | 20 ++++++++++- app_urls/fetcher/views_base.py | 61 +++++++++++++------------------- app_urls/scheduled_tasks.json | 21 +++++++++++ 5 files changed, 80 insertions(+), 41 deletions(-) diff --git a/.env b/.env index 55139be..a8c2eae 100644 --- a/.env +++ b/.env @@ -5,13 +5,12 @@ DJANGO_SUPERUSER_PASSWORD=matitos DJANGO_SUPERUSER_EMAIL=matitos@matitos.org # Reverse proxy -REVERSE_PROXY_URL=fetcher.matitos.org +REVERSE_PROXY_URL=sample.url.com # Django -DJANGO_ALLOWED_ORIGINS=https://fetcher.matitos.org # Reverse proxy +DJANGO_ALLOWED_ORIGINS=https://sample.url.com # Reverse proxy DJANGO_ALLOWED_HOSTS=* # host1,host2 DJANGO_SECRET_KEY=EtKpy7t84GvU4gBwX9z3xKPBXMS75IAV0dkzN7dXVUsMSqy6a5rjY6WNCw3CcRH5 -# DJANGO_DEBUG=False DJANGO_DEBUG=True PATH_LOGS_DIRECTORY=/opt/logs diff --git a/app_urls/fetcher/src/db_utils.py b/app_urls/fetcher/src/db_utils.py index d7a6c6e..ae5d59f 100644 --- a/app_urls/fetcher/src/db_utils.py +++ b/app_urls/fetcher/src/db_utils.py @@ -2,6 +2,8 @@ from ..models import Urls, UrlContent, UrlsSourceSearch, UrlsDuplicate, StatusPa from django.db.models import Q from django.core.cache import cache from django.db import IntegrityError +from django.utils import timezone +from datetime import timedelta from .url_processor import process_url, get_with_protocol import re import traceback @@ -271,4 +273,14 @@ class DB_Handler(): logger.info("Verified status of #{} missingkids.org/poster URLs".format(len(missingkids_urls))) except Exception as e: logger.warning("Exception processing MissingKids URLs: {}\n{}".format(e, traceback.format_exc())) - \ No newline at end of file + + def clean_old_url_content(self, older_than_days=60): + try: + # Get cut off date + cutoff_date = timezone.now() - timedelta(days=older_than_days) + # Delete old UrlContent objects + old_url_content = UrlContent.objects.filter(id_url__ts_fetch__lt=cutoff_date) + logger.info("Cleaning URL content older than {} days: #{}".format(older_than_days, len(old_url_content))) + old_url_content.delete() + except Exception as e: + logger.warning("Exception cleaning old URL content: {}\n{}".format(e, traceback.format_exc())) diff --git a/app_urls/fetcher/tasks.py b/app_urls/fetcher/tasks.py index b22dc5c..21fed3d 100644 --- a/app_urls/fetcher/tasks.py +++ b/app_urls/fetcher/tasks.py @@ -45,7 +45,7 @@ def fetch_missing_kids_all(number_pages=-1): logger.info("Task completed: {}".format(task)) @job('default') -def process_raw_urls(batch_size=50): +def process_raw_urls(batch_size=100): task = "Process raw URLs" logger.info("Task triggered: {}".format(task)) DB_Handler().process_raw_urls(batch_size=batch_size) @@ -72,6 +72,13 @@ def process_missing_kids_urls_all(batch_size=None): DB_Handler().process_missing_kids_urls(batch_size=batch_size) logger.info("Task completed: {}".format(task)) +@job('default') +def clean_old_url_content(older_than_days=60): + task = "Clean old URL content" + logger.info("Task triggered: {}".format(task)) + DB_Handler().clean_old_url_content(older_than_days=older_than_days) + logger.info("Task completed: {}".format(task)) + @job('default') def background_task(process_type: str): @@ -86,6 +93,7 @@ def background_task(process_type: str): FetchSearcher().run() elif (process_type == "fetch_missingkids_all"): FetchMissingKids().run(number_pages=-1) + elif ("fetch_missingkids" in process_type): # number_pages encoded in URL try: @@ -93,6 +101,7 @@ def background_task(process_type: str): except Exception as e: number_pages = -1 FetchMissingKids().run(number_pages=number_pages) + elif ("process_" in process_type): # Batch size encoded in URL try: @@ -106,6 +115,15 @@ def background_task(process_type: str): DB_Handler().process_error_urls(batch_size=batch_size) elif ("process_missing_kids_urls" in process_type): DB_Handler().process_missing_kids_urls(batch_size=batch_size) + + elif ( "clean_old_url_content" in process_type ): + # Older than X days encoded in URL + try: + older_than_days = float(process_type.split("_")[-1]) + except Exception as e: + older_than_days = None + DB_Handler().clean_old_url_content(older_than_days=older_than_days) + else: logger.info("Task unknown!: {}".format(process_type)) diff --git a/app_urls/fetcher/views_base.py b/app_urls/fetcher/views_base.py index b5e560e..b728be0 100644 --- a/app_urls/fetcher/views_base.py +++ b/app_urls/fetcher/views_base.py @@ -2,6 +2,7 @@ import os import psycopg from .tasks import background_task from django.http import JsonResponse, HttpResponse +from django.db import connection #################################################################################################### def trigger_task(request, task): @@ -15,7 +16,7 @@ def link_list(request): app_url = request.build_absolute_uri() # Tasks links_fetch = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all"] - links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"] + links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all", "clean_old_url_content_60"] # List of links list_links = \ [ os.path.join(app_url, "admin"), os.path.join(app_url, "urls") ] + \ @@ -32,7 +33,6 @@ def link_list(request): return HttpResponse(html) - #################################################################################################### def logs(request, log_type): # Capture output: python manage.py rqstats @@ -45,39 +45,28 @@ def logs(request, log_type): #################################################################################################### def log_db(request): - # TODO: Django connection - connection_info = "host={} port={} dbname={} user={} password={} connect_timeout=60".format( - os.environ.get("DB_HOST", "localhost"), - os.environ.get("DB_PORT", "5432"), - os.environ.get("DB_NAME", "matitos"), - os.environ.get("DB_USER", "supermatitos"), - os.environ.get("DB_PASSWORD", "supermatitos") - ) - - # Connect to an existing database - with psycopg.connect(connection_info) as conn: - # Open a cursor to perform database operations - with conn.cursor() as cur: - # Create URLs table - r = cur.execute(""" - SELECT - relname AS "relation", - pg_size_pretty ( - pg_total_relation_size (C .oid) - ) AS "total_size" - FROM - pg_class C - LEFT JOIN pg_namespace N ON (N.oid = C .relnamespace) - WHERE - nspname NOT IN ( - 'pg_catalog', - 'information_schema' - ) - AND C .relkind <> 'i' - AND nspname !~ '^pg_toast' - ORDER BY - pg_total_relation_size (C .oid) DESC - LIMIT 100; - """).fetchall() + with connection.cursor() as cursor: + # Create URLs table + r = cursor.execute(""" + SELECT + relname AS "relation", + pg_size_pretty ( + pg_total_relation_size (C .oid) + ) AS "total_size" + FROM + pg_class C + LEFT JOIN pg_namespace N ON (N.oid = C .relnamespace) + WHERE + nspname NOT IN ( + 'pg_catalog', + 'information_schema' + ) + AND C .relkind <> 'i' + AND nspname !~ '^pg_toast' + ORDER BY + pg_total_relation_size (C .oid) DESC + LIMIT 100; + """).fetchall() return HttpResponse( "\n".join([str(e) for e in r]) ) + #################################################################################################### \ No newline at end of file diff --git a/app_urls/scheduled_tasks.json b/app_urls/scheduled_tasks.json index abf1654..3e39d54 100644 --- a/app_urls/scheduled_tasks.json +++ b/app_urls/scheduled_tasks.json @@ -187,5 +187,26 @@ "failed_runs": 0, "last_successful_run": null, "last_failed_run": null + }, + { + "model": "RepeatableTaskType", + "name": "Clean old URL content", + "callable": "fetcher.tasks.clean_old_url_content", + "callable_args": [], + "callable_kwargs": [], + "enabled": false, + "queue": "default", + "repeat": null, + "at_front": false, + "timeout": null, + "result_ttl": 86400, + "cron_string": null, + "scheduled_time": "2025-01-01T00:00:00+00:00", + "interval": 1, + "interval_unit": "weeks", + "successful_runs": 0, + "failed_runs": 0, + "last_successful_run": null, + "last_failed_run": null } ]