Valid content filter, language detect on min chars, fetch missingkids.org

This commit is contained in:
Luciano Gervasoni
2025-04-03 09:44:46 +02:00
parent 3b54e247e7
commit 5addfa5ba9
18 changed files with 533 additions and 66 deletions

View File

@@ -15,7 +15,7 @@ def trigger_task(request, task):
####################################################################################################
def link_list(request):
prefix = "http://localhost:8000/task"
links = ["fetch_feeds", "fetch_parser", "fetch_search", "process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"]
links = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all", "process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"]
list_links = [
# DB
@@ -212,21 +212,26 @@ def filtered_urls(request):
# TODO: Cache languages, update once every N
languages = list(UrlContent.objects.distinct('language').values_list('language', flat=True))
# Placeholder entry for URLs whose language is NULL (shown in the filter list for visualization)
languages = ["Null"] + [l for l in languages if l is not None]
languages = ["Unknown"] + [l for l in languages if l is not None]
valid_contents = ["True", "False", "Unknown"]
# Get selected parameters
selected_status = request.GET.getlist('status', ["null"])
selected_search = request.GET.getlist('search', ["null"])
selected_source = request.GET.getlist('source', ["null"])
selected_language = request.GET.getlist('language', ["null"])
selected_valid_contents = request.GET.getlist('valid_content', ["null"])
selected_days = request.GET.get("days", 30)
per_page = request.GET.get('per_page', 100) # Default is X URLs per page
page_number = request.GET.get('page') # Get the current page number
all_status = [str(status[0]) for status in statuses]
all_search = [str(search.id) for search in searches]
all_source = [str(source.id) for source in sources]
all_languages = languages
all_valid_contents = valid_contents
# Override with default filters? [Case: no params update on URL] -> Only on "Home" click, or "Next page"
if (len(request.GET.keys()) == 0) or ((len(request.GET.keys()) == 1) and ("page" in request.GET.keys())):
@@ -234,23 +239,22 @@ def filtered_urls(request):
selected_search = ["all"]
selected_source = ["all"]
selected_language = ["all"]
# print(set(selected_status), set(all_status))
"""
# List of TODO remove...
if (set(selected_status) == set(all_status)):
selected_status = ["all"]
if (set(selected_search) == set(all_search)):
selected_search = ["all"]
if (set(selected_source) == set(all_source)):
selected_source = ["all"]
if (set(selected_language) == set(languages)):
selected_language = ["all"]"
"""
selected_valid_contents = ["all"]
else:
# Non-default parameters: if a list contains all elements, replace it with "all" to avoid a heavy query
if (set(selected_status) == set(all_status)):
selected_status = ["all"]
if (set(selected_search) == set(all_search)):
selected_search = ["all"]
if (set(selected_source) == set(all_source)):
selected_source = ["all"]
if (set(selected_language) == set(all_languages)):
selected_language = ["all"]
if (set(selected_valid_contents) == set(all_valid_contents)):
selected_valid_contents = ["all"]
# Filter URLs based on selected filters
if ('null' in selected_status) or ('null' in selected_search) or ('null' in selected_source) or ('null' in selected_language):
if ('null' in selected_status) or ('null' in selected_search) or ('null' in selected_source) or ('null' in selected_language) or ('null' in selected_valid_contents):
urls = []
else:
# Filter by date
@@ -262,18 +266,36 @@ def filtered_urls(request):
query &= Q(urlssourcesearch__id_source__in=selected_source)
if ("all" not in selected_search):
query &= Q(urlssourcesearch__id_search__in=selected_search)
if ("all" not in selected_language):
if ("all" not in selected_language):
# URLs with selected languages
subquery = Q(urlcontent__language__in=selected_language)
if ("Null" in selected_language):
if ("Unknown" in selected_language):
# URLs with NULL language
subquery |= Q(urlcontent__language__isnull=True)
# URLs with no UrlContent record at all (similar to URLs with NULL language)
subquery |= Q(urlcontent__id_url__isnull=True)
# Update query
query &= (subquery)
if ("all" not in selected_valid_contents):
# Boolean array
bool_array = []
if ('True' in selected_valid_contents):
bool_array.append(True)
if ('False' in selected_valid_contents):
bool_array.append(False)
# URLs with selected valid_contents
subquery = Q(urlcontent__valid_content__in=bool_array)
if ("Unknown" in selected_valid_contents):
# URLs with NULL valid_content
subquery |= Q(urlcontent__valid_content__isnull=True)
# URLs with no UrlContent record at all (similar to URLs with NULL valid_content)
subquery |= Q(urlcontent__id_url__isnull=True)
# Update query
query &= (subquery)
# Run query
urls = Urls.objects.filter(query).distinct() # .order_by('-ts_fetch')
# print(urls.query)
# Pagination
paginator = Paginator(urls, per_page) # Paginate the filtered URLs
@@ -300,11 +322,13 @@ def filtered_urls(request):
'searches': sorted(searches, key=lambda x: (x.type, x.search)),
'sources': sorted(sources, key=lambda x: x.source),
'languages': sorted(languages, key=lambda x: (x is None, x)),
'valid_contents': valid_contents,
# Selection
'selected_status': selected_status,
'selected_search': selected_search,
'selected_source': selected_source,
'selected_language': selected_language,
'selected_valid_contents': selected_valid_contents,
"selected_days": selected_days,
# Map
"sources_map": sources_map,