From b3f896b35a82ee75f63617bfa84788ae5948138c Mon Sep 17 00:00:00 2001 From: Luciano Gervasoni Date: Fri, 28 Mar 2025 16:21:35 +0100 Subject: [PATCH] Process all missing kids task, urls views cleaner, adding language filter WIP --- app_urls/api/src/db_utils.py | 16 +- app_urls/api/src/url_processor.py | 2 +- app_urls/api/tasks.py | 12 +- .../{urls.html => OBSOLETE_urls.html} | 0 ...artial.html => OBSOLETE_urls_partial.html} | 0 app_urls/api/templates/charts.html | 9 +- app_urls/api/templates/filtered_urls.html | 110 +++++++-- app_urls/api/templates/url_detail.html | 106 +++++++-- app_urls/api/urls.py | 6 +- app_urls/api/views.py | 217 +++++++----------- app_urls/core/urls.py | 2 +- 11 files changed, 284 insertions(+), 196 deletions(-) rename app_urls/api/templates/{urls.html => OBSOLETE_urls.html} (100%) rename app_urls/api/templates/{urls_partial.html => OBSOLETE_urls_partial.html} (100%) diff --git a/app_urls/api/src/db_utils.py b/app_urls/api/src/db_utils.py index d34c075..79a02c8 100644 --- a/app_urls/api/src/db_utils.py +++ b/app_urls/api/src/db_utils.py @@ -61,9 +61,9 @@ class DB_Handler(): # URL obj_url, created = Urls.objects.get_or_create(url=url) if (created): - logger.info("CREATED: {}".format(obj_url.url)) + logger.debug("Inserted: {}".format(obj_url.url)) else: - logger.info("NOT CREATED: {}".format(obj_url.url)) + logger.debug("Not inserted: {}".format(obj_url.url)) # (URL, source, search) UrlsSourceSearch.objects.get_or_create(id_url=obj_url, id_source=obj_source, id_search=obj_search) except Exception as e: @@ -76,7 +76,7 @@ class DB_Handler(): cache.set("insert_{}".format(url), True, timeout=self._cache_timeout_insert_url) cache.set("insert_{}{}{}".format(url, obj_source.source, obj_search.search), True, timeout=self._cache_timeout_insert_url) - logger.info("Inserted #{} raw URLs".format(len(urls_to_insert))) + logger.info("Inserted #{} raw URLs, Source-Search {}-{}".format(len(urls_to_insert), obj_source.source, obj_search.search)) except Exception as e: logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc())) @@ -243,15 +243,19 @@ class DB_Handler(): except Exception as e: logger.warning("Exception processing error URLs: {}\n{}".format(e, traceback.format_exc())) - def process_missing_kids_urls(self, batch_size): + def process_missing_kids_urls(self, batch_size=None): try: - logger.debug("Processing MissingKids URLs") + logger.debug("Processing MissingKids URLs - batch_size={}".format(batch_size)) # Get batch of URLs, %missingkids.org/poster% AND (status='valid' OR status='invalid') missingkids_urls = Urls.objects.order_by("-ts_fetch").filter( (Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster")) & (Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.ERROR)) - )[:batch_size] + ) + + # Get batch size + if (batch_size is not None): + missingkids_urls = missingkids_urls[:batch_size] # Per URL for obj_url in missingkids_urls: diff --git a/app_urls/api/src/url_processor.py b/app_urls/api/src/url_processor.py index 56b5a33..04f5e5e 100644 --- a/app_urls/api/src/url_processor.py +++ b/app_urls/api/src/url_processor.py @@ -64,7 +64,7 @@ def process_url(url): if ("Website protected with PerimeterX" in str(e.args)): logger.debug("TODO: process_url Implement bypass PerimeterX") - logger.warning("ArticleException for input URL {}\n{}".format(url, str(e.args))) + logger.debug("ArticleException for input URL {}\n{}".format(url, str(e.args))) return None except Exception as e: logger.warning("Exception for input URL {}\n{}".format(url, str(e))) diff --git a/app_urls/api/tasks.py b/app_urls/api/tasks.py index d7ea90f..4193bab 100644 --- a/app_urls/api/tasks.py +++ b/app_urls/api/tasks.py @@ -56,6 +56,12 @@ def process_missing_kids_urls(batch_size=50): DB_Handler().process_missing_kids_urls(batch_size=batch_size) logger.info("Task completed: {}".format(task)) +@job('default') +def process_missing_kids_urls_all(batch_size=None): + task = "Process Missing Kids URLs ALL" + logger.info("Task triggered: {}".format(task)) + DB_Handler().process_missing_kids_urls(batch_size=batch_size) + logger.info("Task completed: {}".format(task)) @@ -76,7 +82,10 @@ def background_task(process_type: str): # FetchMissingKids().run() elif ("process_" in process_type): # Batch size encoded in URL - batch_size = int(process_type.split("_")[-1]) + try: + batch_size = int(process_type.split("_")[-1]) + except Exception as e: + batch_size = None # Task type if ("process_raw_urls" in process_type): DB_Handler().process_raw_urls(batch_size=batch_size) @@ -87,7 +96,6 @@ def background_task(process_type: str): else: logger.info("Task unknown!: {}".format(process_type)) - ''' # Selenium based elif (process_type == "fetch_missing_kids_reduced"): diff --git a/app_urls/api/templates/urls.html b/app_urls/api/templates/OBSOLETE_urls.html similarity index 100% rename from app_urls/api/templates/urls.html rename to app_urls/api/templates/OBSOLETE_urls.html diff --git a/app_urls/api/templates/urls_partial.html b/app_urls/api/templates/OBSOLETE_urls_partial.html similarity index 100% rename from app_urls/api/templates/urls_partial.html rename to app_urls/api/templates/OBSOLETE_urls_partial.html diff --git a/app_urls/api/templates/charts.html b/app_urls/api/templates/charts.html index f39445d..9cffde6 100644 --- a/app_urls/api/templates/charts.html +++ b/app_urls/api/templates/charts.html @@ -60,6 +60,7 @@
-

+

Fetch Date

@@ -249,10 +287,23 @@ input[type="checkbox"] { -

+

Status

+
{% for status in statuses %}
{% endfor %} -

+

Search

@@ -270,10 +321,10 @@ input[type="checkbox"] {
{% endfor %} -

+

Source

@@ -282,10 +333,21 @@ input[type="checkbox"] {
{% endfor %} -

+ + +

Language

+
+ {% for lang in languages %} +
+ {% endfor %} +
@@ -300,6 +362,8 @@ input[type="checkbox"] { Fetch Date Search Source + Valid content? + Language @@ -349,7 +413,18 @@ input[type="checkbox"] { {% endif %} {% endwith %} + + {% with url_content_map|dict_get:url.id as content %} + {{ content.valid_content }} + {% endwith %} + + + {% with url_content_map|dict_get:url.id as content %} + {{ content.language }} + {% endwith %} + + {% empty %} No URLs found for the selected filters. @@ -360,7 +435,8 @@ input[type="checkbox"] {