Clean old url content, use django db connection
.env
@@ -5,13 +5,12 @@ DJANGO_SUPERUSER_PASSWORD=matitos
 DJANGO_SUPERUSER_EMAIL=matitos@matitos.org
 
 # Reverse proxy
-REVERSE_PROXY_URL=fetcher.matitos.org
+REVERSE_PROXY_URL=sample.url.com
 
 # Django
-DJANGO_ALLOWED_ORIGINS=https://fetcher.matitos.org # Reverse proxy
+DJANGO_ALLOWED_ORIGINS=https://sample.url.com # Reverse proxy
 DJANGO_ALLOWED_HOSTS=* # host1,host2
 DJANGO_SECRET_KEY=EtKpy7t84GvU4gBwX9z3xKPBXMS75IAV0dkzN7dXVUsMSqy6a5rjY6WNCw3CcRH5
-# DJANGO_DEBUG=False
 DJANGO_DEBUG=True
 PATH_LOGS_DIRECTORY=/opt/logs
 
@@ -2,6 +2,8 @@ from ..models import Urls, UrlContent, UrlsSourceSearch, UrlsDuplicate, StatusPa
 from django.db.models import Q
 from django.core.cache import cache
 from django.db import IntegrityError
+from django.utils import timezone
+from datetime import timedelta
 from .url_processor import process_url, get_with_protocol
 import re
 import traceback
@@ -271,4 +273,14 @@ class DB_Handler():
             logger.info("Verified status of #{} missingkids.org/poster URLs".format(len(missingkids_urls)))
         except Exception as e:
             logger.warning("Exception processing MissingKids URLs: {}\n{}".format(e, traceback.format_exc()))
 
+    def clean_old_url_content(self, older_than_days=60):
+        try:
+            # Get cut off date
+            cutoff_date = timezone.now() - timedelta(days=older_than_days)
+            # Delete old UrlContent objects
+            old_url_content = UrlContent.objects.filter(id_url__ts_fetch__lt=cutoff_date)
+            logger.info("Cleaning URL content older than {} days: #{}".format(older_than_days, len(old_url_content)))
+            old_url_content.delete()
+        except Exception as e:
+            logger.warning("Exception cleaning old URL content: {}\n{}".format(e, traceback.format_exc()))
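Note on the new cleanup method: the `id_url__ts_fetch__lt` lookup spans the foreign key from UrlContent to Urls, so the cutoff comparison runs as a JOIN in a single query, and `.delete()` issues one bulk DELETE. A minimal standalone sketch using the field names shown in this diff (`cutoff` and `stale` are illustrative):

    from datetime import timedelta
    from django.utils import timezone

    cutoff = timezone.now() - timedelta(days=60)
    stale = UrlContent.objects.filter(id_url__ts_fetch__lt=cutoff)
    stale.count()    # SELECT COUNT(*): cheaper than len(stale), which
                     # evaluates the queryset and loads every row
    stale.delete()   # bulk DELETE, plus Django's cascade handling

Since the handler logs the size with `len(old_url_content)`, every doomed row is pulled into memory first; `.count()` would avoid that.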
@@ -45,7 +45,7 @@ def fetch_missing_kids_all(number_pages=-1):
     logger.info("Task completed: {}".format(task))
 
 @job('default')
-def process_raw_urls(batch_size=50):
+def process_raw_urls(batch_size=100):
     task = "Process raw URLs"
     logger.info("Task triggered: {}".format(task))
     DB_Handler().process_raw_urls(batch_size=batch_size)
@@ -72,6 +72,13 @@ def process_missing_kids_urls_all(batch_size=None):
     DB_Handler().process_missing_kids_urls(batch_size=batch_size)
     logger.info("Task completed: {}".format(task))
 
+@job('default')
+def clean_old_url_content(older_than_days=60):
+    task = "Clean old URL content"
+    logger.info("Task triggered: {}".format(task))
+    DB_Handler().clean_old_url_content(older_than_days=older_than_days)
+    logger.info("Task completed: {}".format(task))
+
 
 @job('default')
 def background_task(process_type: str):
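The `@job('default')` decorator is django-rq's: the decorated function gains a `.delay()` helper that enqueues it on the named queue instead of running inline. A minimal sketch of how the new task would be enqueued (calling code is illustrative; the dotted path matches the fixture below):

    import django_rq
    from fetcher.tasks import clean_old_url_content

    # Either form pushes the job onto the "default" queue; a worker
    # started with `python manage.py rqworker default` executes it.
    clean_old_url_content.delay(older_than_days=30)
    django_rq.get_queue("default").enqueue(clean_old_url_content, older_than_days=30)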
@@ -86,6 +93,7 @@ def background_task(process_type: str):
         FetchSearcher().run()
     elif (process_type == "fetch_missingkids_all"):
         FetchMissingKids().run(number_pages=-1)
+
     elif ("fetch_missingkids" in process_type):
         # number_pages encoded in URL
         try:
@@ -93,6 +101,7 @@ def background_task(process_type: str):
         except Exception as e:
             number_pages = -1
         FetchMissingKids().run(number_pages=number_pages)
+
     elif ("process_" in process_type):
         # Batch size encoded in URL
         try:
@@ -106,6 +115,15 @@ def background_task(process_type: str):
             DB_Handler().process_error_urls(batch_size=batch_size)
         elif ("process_missing_kids_urls" in process_type):
             DB_Handler().process_missing_kids_urls(batch_size=batch_size)
 
+    elif ("clean_old_url_content" in process_type):
+        # Older than X days encoded in URL
+        try:
+            older_than_days = float(process_type.split("_")[-1])
+        except Exception as e:
+            older_than_days = None
+        DB_Handler().clean_old_url_content(older_than_days=older_than_days)
+
     else:
         logger.info("Task unknown!: {}".format(process_type))
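The process_type string doubles as a tiny wire format: a trailing `_<n>` carries the parameter, so `clean_old_url_content_60` dispatches with `older_than_days=60.0`, while a non-numeric suffix falls through to the except branch. A quick illustration of the parsing rule (helper name is illustrative):

    def parse_suffix(process_type):
        # Mirrors the try/except above: the last "_"-separated token,
        # parsed as a number, or None when it is not numeric.
        try:
            return float(process_type.split("_")[-1])
        except (ValueError, TypeError):
            return None

    parse_suffix("clean_old_url_content_60")   # -> 60.0
    parse_suffix("clean_old_url_content")      # -> None

One caveat: the explicit `older_than_days=None` fallback overrides the handler's default of 60, and `timedelta(days=None)` raises a TypeError that the handler only logs, so a suffix-less trigger deletes nothing.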
@@ -2,6 +2,7 @@ import os
 import psycopg
 from .tasks import background_task
 from django.http import JsonResponse, HttpResponse
+from django.db import connection
 
 ####################################################################################################
 def trigger_task(request, task):
@@ -15,7 +16,7 @@ def link_list(request):
     app_url = request.build_absolute_uri()
     # Tasks
     links_fetch = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all"]
-    links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"]
+    links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all", "clean_old_url_content_60"]
     # List of links
     list_links = \
         [ os.path.join(app_url, "admin"), os.path.join(app_url, "urls") ] + \
@@ -32,7 +33,6 @@ def link_list(request):
 
     return HttpResponse(html)
 
-
 ####################################################################################################
 def logs(request, log_type):
     # Capture output: python manage.py rqstats
@@ -45,39 +45,28 @@ def logs(request, log_type):
 
 ####################################################################################################
 def log_db(request):
-    # TODO: Django connection
-    connection_info = "host={} port={} dbname={} user={} password={} connect_timeout=60".format(
-        os.environ.get("DB_HOST", "localhost"),
-        os.environ.get("DB_PORT", "5432"),
-        os.environ.get("DB_NAME", "matitos"),
-        os.environ.get("DB_USER", "supermatitos"),
-        os.environ.get("DB_PASSWORD", "supermatitos")
-    )
-
-    # Connect to an existing database
-    with psycopg.connect(connection_info) as conn:
-        # Open a cursor to perform database operations
-        with conn.cursor() as cur:
-            # Create URLs table
-            r = cur.execute("""
-                SELECT
-                    relname AS "relation",
-                    pg_size_pretty (
-                        pg_total_relation_size (C .oid)
-                    ) AS "total_size"
-                FROM
-                    pg_class C
-                    LEFT JOIN pg_namespace N ON (N.oid = C .relnamespace)
-                WHERE
-                    nspname NOT IN (
-                        'pg_catalog',
-                        'information_schema'
-                    )
-                    AND C .relkind <> 'i'
-                    AND nspname !~ '^pg_toast'
-                ORDER BY
-                    pg_total_relation_size (C .oid) DESC
-                LIMIT 100;
-            """).fetchall()
+    with connection.cursor() as cursor:
+        # Create URLs table
+        r = cursor.execute("""
+            SELECT
+                relname AS "relation",
+                pg_size_pretty (
+                    pg_total_relation_size (C .oid)
+                ) AS "total_size"
+            FROM
+                pg_class C
+                LEFT JOIN pg_namespace N ON (N.oid = C .relnamespace)
+            WHERE
+                nspname NOT IN (
+                    'pg_catalog',
+                    'information_schema'
+                )
+                AND C .relkind <> 'i'
+                AND nspname !~ '^pg_toast'
+            ORDER BY
+                pg_total_relation_size (C .oid) DESC
+            LIMIT 100;
+        """).fetchall()
 
     return HttpResponse( "\n".join([str(e) for e in r]) )
 
 ####################################################################################################
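The rewrite replaces the hand-assembled psycopg DSN with Django's own `connection`, which reuses the credentials from `settings.DATABASES` (the inherited `# Create URLs table` comment is stale; the query lists relation sizes). One portability caveat: chaining `.fetchall()` onto `cursor.execute(...)` works because psycopg 3's `execute` returns the cursor; under psycopg2 it returns None. A driver-agnostic sketch (view body trimmed for illustration):

    from django.db import connection
    from django.http import HttpResponse

    def log_db(request):
        # Credentials come from settings.DATABASES; no DSN assembly.
        with connection.cursor() as cursor:
            cursor.execute("SELECT relname FROM pg_class LIMIT 10;")
            rows = cursor.fetchall()  # separate call: safe on psycopg2 and 3
        return HttpResponse("\n".join(str(r) for r in rows))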
@@ -187,5 +187,26 @@
         "failed_runs": 0,
         "last_successful_run": null,
         "last_failed_run": null
+    },
+    {
+        "model": "RepeatableTaskType",
+        "name": "Clean old URL content",
+        "callable": "fetcher.tasks.clean_old_url_content",
+        "callable_args": [],
+        "callable_kwargs": [],
+        "enabled": false,
+        "queue": "default",
+        "repeat": null,
+        "at_front": false,
+        "timeout": null,
+        "result_ttl": 86400,
+        "cron_string": null,
+        "scheduled_time": "2025-01-01T00:00:00+00:00",
+        "interval": 1,
+        "interval_unit": "weeks",
+        "successful_runs": 0,
+        "failed_runs": 0,
+        "last_successful_run": null,
+        "last_failed_run": null
     }
 ]
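The new scheduler entry ships disabled (`"enabled": false`) and would repeat every 1 week from `scheduled_time` once switched on. Loading it follows the standard Django fixture flow; a sketch (the fixture path is an assumption, use the project's actual file):

    from django.core.management import call_command

    # Equivalent to: python manage.py loaddata <fixture>.json
    call_command("loaddata", "fixtures/tasks.json")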