Valid content filter, language detect on min chars, fetch missingkids.org
This commit is contained in:
@@ -15,7 +15,7 @@ def trigger_task(request, task):
|
||||
####################################################################################################
|
||||
def link_list(request):
|
||||
prefix = "http://localhost:8000/task"
|
||||
links = ["fetch_feeds", "fetch_parser", "fetch_search", "process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"]
|
||||
links = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all", "process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"]
|
||||
|
||||
list_links = [
|
||||
# DB
|
||||
@@ -212,21 +212,26 @@ def filtered_urls(request):
|
||||
# TODO: Cache languages, update once every N
|
||||
languages = list(UrlContent.objects.distinct('language').values_list('language', flat=True))
|
||||
# Null for visualization
|
||||
languages = ["Null"] + [l for l in languages if l is not None]
|
||||
languages = ["Unknown"] + [l for l in languages if l is not None]
|
||||
valid_contents = ["True", "False", "Unknown"]
|
||||
|
||||
# Get selected parameters
|
||||
selected_status = request.GET.getlist('status', ["null"])
|
||||
selected_search = request.GET.getlist('search', ["null"])
|
||||
selected_source = request.GET.getlist('source', ["null"])
|
||||
selected_language = request.GET.getlist('language', ["null"])
|
||||
selected_valid_contents = request.GET.getlist('valid_content', ["null"])
|
||||
selected_days = request.GET.get("days", 30)
|
||||
per_page = request.GET.get('per_page', 100) # Default is X URLs per page
|
||||
page_number = request.GET.get('page') # Get the current page number
|
||||
|
||||
|
||||
all_status = [str(status[0]) for status in statuses]
|
||||
all_search = [str(search.id) for search in searches]
|
||||
all_source = [str(source.id) for source in sources]
|
||||
all_languages = languages
|
||||
all_valid_contents = valid_contents
|
||||
|
||||
|
||||
# Override with default filters? [Case: no params update on URL] -> Only on "Home" click, or "Next page"
|
||||
if (len(request.GET.keys()) == 0) or ((len(request.GET.keys()) == 1) and ("page" in request.GET.keys())):
|
||||
@@ -234,23 +239,22 @@ def filtered_urls(request):
|
||||
selected_search = ["all"]
|
||||
selected_source = ["all"]
|
||||
selected_language = ["all"]
|
||||
|
||||
|
||||
# print(set(selected_status), set(all_status))
|
||||
"""
|
||||
# List of TODO remove...
|
||||
if (set(selected_status) == set(all_status)):
|
||||
selected_status = ["all"]
|
||||
if (set(selected_search) == set(all_search)):
|
||||
selected_search = ["all"]
|
||||
if (set(selected_source) == set(all_source)):
|
||||
selected_source = ["all"]
|
||||
if (set(selected_language) == set(languages)):
|
||||
selected_language = ["all"]"
|
||||
"""
|
||||
selected_valid_contents = ["all"]
|
||||
else:
|
||||
# Non-default parameters: if a list contains all elements, replace it with "all" to avoid a heavy query
|
||||
if (set(selected_status) == set(all_status)):
|
||||
selected_status = ["all"]
|
||||
if (set(selected_search) == set(all_search)):
|
||||
selected_search = ["all"]
|
||||
if (set(selected_source) == set(all_source)):
|
||||
selected_source = ["all"]
|
||||
if (set(selected_language) == set(all_languages)):
|
||||
selected_language = ["all"]
|
||||
if (set(selected_valid_contents) == set(all_valid_contents)):
|
||||
selected_valid_contents = ["all"]
|
||||
|
||||
# Filter URLs based on selected filters
|
||||
if ('null' in selected_status) or ('null' in selected_search) or ('null' in selected_source) or ('null' in selected_language):
|
||||
if ('null' in selected_status) or ('null' in selected_search) or ('null' in selected_source) or ('null' in selected_language) or ('null' in selected_valid_contents):
|
||||
urls = []
|
||||
else:
|
||||
# Filter by date
|
||||
@@ -262,18 +266,36 @@ def filtered_urls(request):
|
||||
query &= Q(urlssourcesearch__id_source__in=selected_source)
|
||||
if ("all" not in selected_search):
|
||||
query &= Q(urlssourcesearch__id_search__in=selected_search)
|
||||
if ("all" not in selected_language):
|
||||
if ("all" not in selected_language):
|
||||
# URLs with selected languages
|
||||
subquery = Q(urlcontent__language__in=selected_language)
|
||||
if ("Null" in selected_language):
|
||||
if ("Unknown" in selected_language):
|
||||
# URLs with NULL language
|
||||
subquery |= Q(urlcontent__language__isnull=True)
|
||||
# URLs with no UrlContent record at all (similar to URLs with NULL language)
|
||||
subquery |= Q(urlcontent__id_url__isnull=True)
|
||||
# Update query
|
||||
query &= (subquery)
|
||||
if ("all" not in selected_valid_contents):
|
||||
# Boolean array
|
||||
bool_array = []
|
||||
if ('True' in selected_valid_contents):
|
||||
bool_array.append(True)
|
||||
if ('False' in selected_valid_contents):
|
||||
bool_array.append(False)
|
||||
# URLs with selected valid_contents
|
||||
subquery = Q(urlcontent__valid_content__in=bool_array)
|
||||
if ("Unknown" in selected_valid_contents):
|
||||
# URLs with NULL valid_content
|
||||
subquery |= Q(urlcontent__valid_content__isnull=True)
|
||||
# URLs with no UrlContent record at all (similar to URLs with NULL valid_content)
|
||||
subquery |= Q(urlcontent__id_url__isnull=True)
|
||||
# Update query
|
||||
query &= (subquery)
|
||||
|
||||
# Run query
|
||||
urls = Urls.objects.filter(query).distinct() # .order_by('-ts_fetch')
|
||||
# print(urls.query)
|
||||
|
||||
# Pagination
|
||||
paginator = Paginator(urls, per_page) # Paginate the filtered URLs
|
||||
@@ -300,11 +322,13 @@ def filtered_urls(request):
|
||||
'searches': sorted(searches, key=lambda x: (x.type, x.search)),
|
||||
'sources': sorted(sources, key=lambda x: x.source),
|
||||
'languages': sorted(languages, key=lambda x: (x is None, x)),
|
||||
'valid_contents': valid_contents,
|
||||
# Selection
|
||||
'selected_status': selected_status,
|
||||
'selected_search': selected_search,
|
||||
'selected_source': selected_source,
|
||||
'selected_language': selected_language,
|
||||
'selected_valid_contents': selected_valid_contents,
|
||||
"selected_days": selected_days,
|
||||
# Map
|
||||
"sources_map": sources_map,
|
||||
|
||||
Reference in New Issue
Block a user