Clean old URL content, use Django DB connection

This commit is contained in:
Luciano Gervasoni
2025-04-14 10:48:33 +02:00
parent 0cd84496cf
commit 43c6c3aabf
5 changed files with 80 additions and 41 deletions

5
.env
View File

@@ -5,13 +5,12 @@ DJANGO_SUPERUSER_PASSWORD=matitos
DJANGO_SUPERUSER_EMAIL=matitos@matitos.org
# Reverse proxy
REVERSE_PROXY_URL=fetcher.matitos.org
REVERSE_PROXY_URL=sample.url.com
# Django
DJANGO_ALLOWED_ORIGINS=https://fetcher.matitos.org # Reverse proxy
DJANGO_ALLOWED_ORIGINS=https://sample.url.com # Reverse proxy
DJANGO_ALLOWED_HOSTS=* # host1,host2
DJANGO_SECRET_KEY=EtKpy7t84GvU4gBwX9z3xKPBXMS75IAV0dkzN7dXVUsMSqy6a5rjY6WNCw3CcRH5
# DJANGO_DEBUG=False
DJANGO_DEBUG=True
PATH_LOGS_DIRECTORY=/opt/logs

View File

@@ -2,6 +2,8 @@ from ..models import Urls, UrlContent, UrlsSourceSearch, UrlsDuplicate, StatusPa
from django.db.models import Q
from django.core.cache import cache
from django.db import IntegrityError
from django.utils import timezone
from datetime import timedelta
from .url_processor import process_url, get_with_protocol
import re
import traceback
@@ -271,4 +273,14 @@ class DB_Handler():
logger.info("Verified status of #{} missingkids.org/poster URLs".format(len(missingkids_urls)))
except Exception as e:
logger.warning("Exception processing MissingKids URLs: {}\n{}".format(e, traceback.format_exc()))
def clean_old_url_content(self, older_than_days=60):
    """Delete UrlContent rows whose parent Urls row was fetched more than
    `older_than_days` days ago.

    Args:
        older_than_days: Age threshold in days. `None` is accepted and falls
            back to 60 — the background-task dispatcher passes None when it
            cannot parse the value out of the request URL.
    """
    try:
        # Guard: timedelta(days=None) raises TypeError, which the broad
        # except below would swallow and the cleanup would silently no-op.
        if older_than_days is None:
            older_than_days = 60
        # Anything fetched before this timestamp is considered stale.
        cutoff_date = timezone.now() - timedelta(days=older_than_days)
        # Delete old UrlContent objects
        old_url_content = UrlContent.objects.filter(id_url__ts_fetch__lt=cutoff_date)
        # count() issues a COUNT(*) query instead of materializing the whole
        # queryset (which len() would do) just to log its size.
        logger.info("Cleaning URL content older than {} days: #{}".format(older_than_days, old_url_content.count()))
        old_url_content.delete()
    except Exception as e:
        logger.warning("Exception cleaning old URL content: {}\n{}".format(e, traceback.format_exc()))

View File

@@ -45,7 +45,7 @@ def fetch_missing_kids_all(number_pages=-1):
logger.info("Task completed: {}".format(task))
@job('default')
def process_raw_urls(batch_size=50):
def process_raw_urls(batch_size=100):
task = "Process raw URLs"
logger.info("Task triggered: {}".format(task))
DB_Handler().process_raw_urls(batch_size=batch_size)
@@ -72,6 +72,13 @@ def process_missing_kids_urls_all(batch_size=None):
DB_Handler().process_missing_kids_urls(batch_size=batch_size)
logger.info("Task completed: {}".format(task))
@job('default')
def clean_old_url_content(older_than_days=60):
    """RQ job: delegate old-URL-content cleanup to DB_Handler."""
    task = "Clean old URL content"
    logger.info("Task triggered: {}".format(task))
    handler = DB_Handler()
    handler.clean_old_url_content(older_than_days=older_than_days)
    logger.info("Task completed: {}".format(task))
@job('default')
def background_task(process_type: str):
@@ -86,6 +93,7 @@ def background_task(process_type: str):
FetchSearcher().run()
elif (process_type == "fetch_missingkids_all"):
FetchMissingKids().run(number_pages=-1)
elif ("fetch_missingkids" in process_type):
# number_pages encoded in URL
try:
@@ -93,6 +101,7 @@ def background_task(process_type: str):
except Exception as e:
number_pages = -1
FetchMissingKids().run(number_pages=number_pages)
elif ("process_" in process_type):
# Batch size encoded in URL
try:
@@ -106,6 +115,15 @@ def background_task(process_type: str):
DB_Handler().process_error_urls(batch_size=batch_size)
elif ("process_missing_kids_urls" in process_type):
DB_Handler().process_missing_kids_urls(batch_size=batch_size)
elif ( "clean_old_url_content" in process_type ):
# Older than X days encoded in URL
try:
older_than_days = float(process_type.split("_")[-1])
except Exception as e:
older_than_days = None
DB_Handler().clean_old_url_content(older_than_days=older_than_days)
else:
logger.info("Task unknown!: {}".format(process_type))

View File

@@ -2,6 +2,7 @@ import os
import psycopg
from .tasks import background_task
from django.http import JsonResponse, HttpResponse
from django.db import connection
####################################################################################################
def trigger_task(request, task):
@@ -15,7 +16,7 @@ def link_list(request):
app_url = request.build_absolute_uri()
# Tasks
links_fetch = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all"]
links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"]
links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all", "clean_old_url_content_60"]
# List of links
list_links = \
[ os.path.join(app_url, "admin"), os.path.join(app_url, "urls") ] + \
@@ -32,7 +33,6 @@ def link_list(request):
return HttpResponse(html)
####################################################################################################
def logs(request, log_type):
# Capture output: python manage.py rqstats
@@ -45,39 +45,28 @@ def logs(request, log_type):
####################################################################################################
def log_db(request):
    """Return the 100 largest non-index relations (name + pretty-printed
    total size) of the default database, one row per line.

    Uses Django's configured DB connection instead of building a separate
    psycopg connection from environment variables.
    """
    with connection.cursor() as cursor:
        # List user-schema relations ordered by total on-disk size.
        # NOTE: execute() is not chained with fetchall() — Django's cursor
        # follows the DB-API, where execute() has no guaranteed return value
        # (psycopg2 returns None; only psycopg 3 happens to return the cursor).
        cursor.execute("""
            SELECT
            relname AS "relation",
            pg_size_pretty (
            pg_total_relation_size (C .oid)
            ) AS "total_size"
            FROM
            pg_class C
            LEFT JOIN pg_namespace N ON (N.oid = C .relnamespace)
            WHERE
            nspname NOT IN (
            'pg_catalog',
            'information_schema'
            )
            AND C .relkind <> 'i'
            AND nspname !~ '^pg_toast'
            ORDER BY
            pg_total_relation_size (C .oid) DESC
            LIMIT 100;
            """)
        r = cursor.fetchall()
    return HttpResponse( "\n".join([str(e) for e in r]) )
####################################################################################################

View File

@@ -187,5 +187,26 @@
"failed_runs": 0,
"last_successful_run": null,
"last_failed_run": null
},
{
"model": "RepeatableTaskType",
"name": "Clean old URL content",
"callable": "fetcher.tasks.clean_old_url_content",
"callable_args": [],
"callable_kwargs": [],
"enabled": false,
"queue": "default",
"repeat": null,
"at_front": false,
"timeout": null,
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-01-01T00:00:00+00:00",
"interval": 1,
"interval_unit": "weeks",
"successful_runs": 0,
"failed_runs": 0,
"last_successful_run": null,
"last_failed_run": null
}
]