Update missing kids poster status

This commit is contained in:
Luciano Gervasoni
2025-03-18 17:38:01 +01:00
parent fb4b30f05e
commit 83f76232b2
4 changed files with 58 additions and 24 deletions

View File

@@ -157,6 +157,9 @@
" cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.fox35orlando.com/news/tavares-police-florida-boys-10-9-abused-sheer-brutality', 'valid')\")\n",
" cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.google.com', 'invalid')\")\n",
"\n",
" cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.missingkids.org/poster/USVA/VA25-0820/1', 'valid')\")\n",
" cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.missingkids.org/poster/NCMC/2045193/1', 'valid')\")\n",
"\n",
" cur.execute(\"INSERT INTO SOURCE (source) values ('news.google.com')\")\n",
" cur.execute(\"INSERT INTO SOURCE (source) values ('qwant.com')\")\n",
"\n",

View File

@@ -1,4 +1,5 @@
from ..models import Urls, UrlContent, UrlsSource, UrlsDuplicate, Source, StatusPatternMatching
from django.db.models import Q
from .url_processor import process_url
from django.core.cache import cache
from django.db import IntegrityError
@@ -135,13 +136,25 @@ class DB_Handler():
cache.set("processed_{}".format(url_host), time.time(), timeout=60*5) # Expire after 5 minutes
def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error):
def set_status(obj_url, status):
    """Persist *status* on obj_url, skipping the write when it is unchanged."""
    if obj_url.status == status:
        # Already at the requested status; avoid a redundant UPDATE query.
        return
    obj_url.status = status
    obj_url.save()
# updating_urls.append(obj_url)
# TODO: Fix enum type issue. Bulk update instead of .save() for each object
# List of objects to bulk update
# updating_urls = []
# ... general processing, append to updating_urls
# Urls.objects.bulk_update(updating_urls, ['status'])
##### Filter URL? -> Invalid
if (status_pattern_match == "invalid"):
logger.debug("Domain filter applied to input URL: {}".format(obj_url.url))
# Update status
obj_url.status = Urls.STATUS_ENUM.INVALID
obj_url.save()
# updating_urls.append(obj_url)
set_status(obj_url, Urls.STATUS_ENUM.INVALID)
# Next URL
return
@@ -161,18 +174,14 @@ class DB_Handler():
# Set status to error
logger.debug("Error processing URL: {}\n{}\n".format(obj_url.url, str(e), traceback.format_exc()))
# Update status
obj_url.status = Urls.STATUS_ENUM.ERROR
obj_url.save()
# updating_urls.append(obj_url)
set_status(obj_url, Urls.STATUS_ENUM.ERROR)
# Next URL
return
##### Canonical URL different? -> Duplicate
if (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
# Update status
obj_url.status = Urls.STATUS_ENUM.DUPLICATE
obj_url.save()
# updating_urls.append(obj_url)
set_status(obj_url, Urls.STATUS_ENUM.DUPLICATE)
# Get or create URL with canonical form
obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
@@ -190,9 +199,7 @@ class DB_Handler():
##### Valid URL
# Update status
obj_url.status = Urls.STATUS_ENUM.VALID
obj_url.save()
# updating_urls.append(obj_url)
set_status(obj_url, Urls.STATUS_ENUM.VALID)
# Create or update extracted URL data
UrlContent.objects.update_or_create(
@@ -240,8 +247,6 @@ class DB_Handler():
# Get list of (pattern, priority, status) tuples to override status if required
list_pattern_status_tuple = list(StatusPatternMatching.objects.values_list("pattern", "priority", "status"))
# List of objects to bulk update
# updating_urls = []
# Per URL
for obj_url in raw_urls:
@@ -250,9 +255,6 @@ class DB_Handler():
# Process URL
self._process_single_url(obj_url, status_pattern_match, raise_exception_on_error=False)
# TODO: Fix enum type issue. Bulk update instead of .save() for each object
# Urls.objects.bulk_update(updating_urls, ['status'])
logger.info("Updated #{} raw URLs".format(len(raw_urls)))
except Exception as e:
logger.warning("Exception processing raw URLs: {}\n{}".format(e, traceback.format_exc()))
@@ -289,3 +291,28 @@ class DB_Handler():
logger.info("Updated #{}, skipped #{} error URLs".format(num_urls_processed, num_urls_skipped))
except Exception as e:
logger.warning("Exception processing error URLs: {}\n{}".format(e, traceback.format_exc()))
def process_missing_kids_urls(self, batch_size):
    """Re-verify the status of stored missingkids.org poster URLs.

    Fetches up to ``batch_size`` of the most recently fetched URLs whose
    url contains "missingkids.org/poster" or "missingkids.org/new-poster"
    and whose status is currently VALID or INVALID, then re-processes each
    one. A clean run leaves/marks the URL valid; any processing error
    (e.g. the poster page was taken down -> 404) marks it invalid.

    :param batch_size: maximum number of poster URLs to re-verify.
    """
    try:
        logger.debug("Processing MissingKids URLs")
        # Batch of candidate poster URLs, newest fetches first:
        # %missingkids.org/poster% AND (status='valid' OR status='invalid')
        missingkids_urls = Urls.objects.order_by("-ts_fetch").filter(
            (Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster"))
            &
            (Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID))
        )[:batch_size]
        # Per URL
        for obj_url in missingkids_urls:
            try:
                # Process URL. If no exception is raised -> Valid
                self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True)
            except Exception as e:
                # Raised exception -> Invalid (e.g. 404: poster removed).
                # Log the reason instead of silently discarding it.
                logger.debug("Marking URL invalid: {}\n{}".format(obj_url.url, e))
                obj_url.status = Urls.STATUS_ENUM.INVALID
                obj_url.save()
        logger.info("Verified status of #{} missingkids.org/poster URLs".format(len(missingkids_urls)))
    except Exception as e:
        logger.warning("Exception processing MissingKids URLs: {}\n{}".format(e, traceback.format_exc()))

View File

@@ -4,18 +4,14 @@ from .src.fetch_feed import FetchFeeds
from .src.fetch_parser import FetchParser
from .src.db_utils import DB_Handler
'''
from src.fetch_parser import FetchParser
from src.fetch_search import FetchSearcher
from src.missing_kids_fetch import MissingKidsFetch
from src.missing_kids_status import MissingKidsStatus
from src.url_status import UpdateErrorURLs
'''
from .src.logger import get_logger
logger = get_logger()
# TODO: Queues with priorities, process_raw_urls least priority due to slowdown...
@job
def background_task(process_type: str):
logger.info("Task triggered: {}".format(process_type))
@@ -25,10 +21,16 @@ def background_task(process_type: str):
FetchFeeds().run()
elif (process_type == "fetch_parser"):
FetchParser().run()
# TODO: Encode batch_size in process_type for the remaining task types.
elif (process_type == "process_raw_urls"):
DB_Handler().process_raw_urls(batch_size=50)
elif (process_type == "process_error_urls"):
DB_Handler().process_error_urls(batch_size=50)
elif (process_type == "process_missing_kids_urls"):
DB_Handler().process_missing_kids_urls(batch_size=50)
elif ("process_missing_kids_urls" in process_type):
batch_size = int(process_type.split("_")[-1])
DB_Handler().process_missing_kids_urls(batch_size=batch_size)
else:
logger.info("Task unknown!: {}".format(process_type))

View File

@@ -5,6 +5,8 @@ import os
from .src.logger import get_logger
logger = get_logger()
# TODO: Queues with priorities, process_raw_urls, process_error_urls least priority due to slowdown logic
def trigger_task(request, task):
"""View that enqueues a task."""
queue = django_rq.get_queue('default') # Get the default queue
@@ -13,5 +15,5 @@ def trigger_task(request, task):
def link_list(request):
    """Return a JSON payload of helper links: the DB admin UI plus the task-trigger endpoints.

    Note: the original body assigned ``links`` twice; the first assignment was
    dead code (immediately overwritten) and has been removed.
    """
    prefix = "http://localhost:8000/api"
    # Task endpoints; the missing-kids entries encode the batch size as a numeric suffix.
    links = ["fetch_feeds", "fetch_parser", "process_raw_urls", "process_error_urls", "process_missing_kids_urls_50", "process_missing_kids_urls_500000"]
    return JsonResponse({"links": ["http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id"] + [os.path.join(prefix, l) for l in links]})