# matitos_news/app_urls/fetcher/views.py
from .tasks import background_task
from django.core.paginator import Paginator
from django.shortcuts import render, get_object_or_404
from django.http import StreamingHttpResponse, JsonResponse, HttpResponse
from django.contrib.auth.decorators import login_required
import ollama
from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch, UrlsDuplicate
import os
from .src.logger import get_logger
logger = get_logger()
####################################################################################################
def trigger_task(request, task):
# Enqueue function in "default" queue
background_task.delay(task)
return JsonResponse({"message": "Task has been enqueued!", "task": task})
####################################################################################################
def link_list(request):
# Base URL path
app_url = request.build_absolute_uri()
# Tasks
links_fetch = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all"]
links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"]
# List of links
list_links = \
[ os.path.join(app_url, "admin"), os.path.join(app_url, "urls") ] + \
[ os.path.join(app_url, "logs", log_type) for log_type in ["debug", "info", "error"] ] + \
[ os.path.join(app_url, "task", l) for l in links_fetch + links_process ]
# Json
return JsonResponse({"links": list_links })
####################################################################################################
def logs(request, log_type):
    # TODO: also capture the output of "python manage.py rqstats"
    try:
        with open(os.path.join(os.getenv("PATH_LOGS_DIRECTORY", "logs"), "{}.log".format(log_type)), "r") as f:
            file_content = f.read()
    except Exception:
        file_content = "Error reading logs for log type: {}".format(log_type)
    return HttpResponse(file_content, content_type="text/plain")
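# Example (illustrative): with PATH_LOGS_DIRECTORY=/var/log/matitos, a GET to the
# "logs/error" link exposed by link_list() above returns the contents of
# /var/log/matitos/error.log as plain text.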
####################################################################################################
class OllamaClient:
def __init__(self):
self.client = ollama.Client(host=os.getenv("ENDPOINT_OLLAMA", "https://ollamamodel.matitos.org"))
def _get_default_model(self):
return "llama3.2:3b"
def get_models(self):
models = sorted([m.model for m in self.client.list().models])
if (self._get_default_model() in models):
return [self._get_default_model()] + [m for m in models if m != self._get_default_model()]
else:
return models
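    # Illustrative result: get_models() -> ["llama3.2:3b", "gemma2:9b", ...],
    # with the default model listed first when the server offers it
    # (model names here are examples, not the server's actual inventory)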
def get_prompt(self):
return "Rewrite the text below into a clear and concise summary, presenting the key points as if they are newly written insights. Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states'. Instead, write in a natural, standalone format that feels like an original explanation. Keep it brief, engaging, informative, in the style of a news article, and no longer than a paragraph:"
#return "Provide a summary of the content below, avoid mentioning the source of information, and only answer with the summary. The summary needs to be brief and compact, consisting of one paragraph."
#return "Explain in a single and compact paragraph the what, why, when, where, who, and how of the content below. Also provide a single paragraph summary of the content:"
#return "Provide in one paragraph the what, why, when, where, who, and how of the content below. Also provide a one paragraph summary of the content:"
#return "Provide two summaries of the content below, and avoid mentioning the source of information. First, provide a very brief and compact paragraph summary. Second, provide a larger and more detailed summary, which describe the what, why, when, where, who, and how of the content:"
# return "Imagine you are a journalist, TLDR in a paragraph. Only answer with the summary:"
#return "Below you will find the whole content of a news article:\n{}\nProvide a concise summary of one paragraph maximum of the content.".format(content)
def fetch_details(request, id):
    url_item = get_object_or_404(Urls, id=id)  # 404 guard; the object itself is not used below
    url_param = request.GET.get("url", "")  # Get URL (currently unused)
    model = request.GET.get("model", "")  # Get LLM model
    # TODO: accept the prompt via a POST body instead of a GET parameter (see the sketch after this view)
    text = request.GET.get("text", "")  # Get LLM prompt
def stream_response():
msg_content = {
"role": "user",
"content": text,
}
response = OllamaClient().client.chat(model=model, messages=[msg_content], stream=True)
for chunk in response:
yield chunk["message"]["content"] # Stream each chunk of text
return StreamingHttpResponse(stream_response(), content_type="text/plain")
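# A minimal sketch of the POST-with-body variant the TODO above refers to.
# Illustrative only: the view name and JSON field names are assumptions, and
# the view is not wired into any URLconf here.
def fetch_details_post(request, id):
    import json
    get_object_or_404(Urls, id=id)  # 404 guard
    payload = json.loads(request.body or b"{}")  # e.g. {"model": "...", "text": "..."}
    def stream_response():
        msg_content = {"role": "user", "content": payload.get("text", "")}
        response = OllamaClient().client.chat(model=payload.get("model", ""), messages=[msg_content], stream=True)
        for chunk in response:
            yield chunk["message"]["content"]
    return StreamingHttpResponse(stream_response(), content_type="text/plain")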
def url_detail_view(request, id):
url_item = get_object_or_404(Urls, id=id)
url_sources = list(Source.objects.filter(urlssourcesearch__id_url=url_item).distinct())
url_searches = list(Search.objects.filter(urlssourcesearch__id_url=url_item).distinct())
if (url_item.status == Urls.STATUS_ENUM.DUPLICATE):
url_canonical = UrlsDuplicate.objects.get(id_url_duplicated=url_item).id_url_canonical
else:
url_canonical = None
try:
url_content = UrlContent.objects.get(pk=id)
except UrlContent.DoesNotExist:
url_content = {}
# TODO: https://github.com/ollama/ollama-python?tab=readme-ov-file#async-client
    ollama_client = OllamaClient()  # local name avoids shadowing the imported ollama module
    context = {
        'url_item': url_item,
        'sources': url_sources,
        'searches': url_searches,
        'models': ollama_client.get_models(),
        'prompt': ollama_client.get_prompt(),
        'url_content': url_content,
        'url_canonical': url_canonical,
    }
return render(request, 'url_detail.html', context)
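# A minimal sketch of the async variant the TODO above points to, using
# ollama.AsyncClient from the ollama-python README. Illustrative only: the
# helper name is an assumption and nothing in this module calls it yet.
async def _summarize_async(model, text):
    client = ollama.AsyncClient(host=os.getenv("ENDPOINT_OLLAMA", "https://ollamamodel.matitos.org"))
    response = await client.chat(model=model, messages=[{"role": "user", "content": text}])
    return response["message"]["content"]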
####################################################################################################
from django.db.models import Count
from datetime import timedelta
from django.utils import timezone
def charts(request):
return render(request, 'charts.html')
def urls_by_fetch_date(request):
# Get the filtering date parameter
days = float(request.GET.get('days', 30)) # Default is 30 days
start_date = timezone.now() - timedelta(days=days)
# Count the number of URLs grouped by fetch date
urls_data = Urls.objects.filter(ts_fetch__gte=start_date) \
.values('ts_fetch__date') \
.annotate(count=Count('id')) \
.order_by('ts_fetch__date')
# Format data to return as JSON
data = {
'labels': [item['ts_fetch__date'] for item in urls_data],
'values': [item['count'] for item in urls_data],
}
return JsonResponse(data)
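# All four chart endpoints in this section return the same chart-friendly
# shape, e.g. (values illustrative):
#   {"labels": ["2025-04-01", "2025-04-02"], "values": [12, 7]}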
def urls_per_status(request):
# Get the filtering date parameter
days = float(request.GET.get('days', 30)) # Default is 30 days
start_date = timezone.now() - timedelta(days=days)
# Count the number of URLs grouped by status within the date range
urls_data = Urls.objects.filter(ts_fetch__gte=start_date) \
.values('status') \
.annotate(count=Count('id')) \
.order_by('status')
# Format data for JSON
data = {
'labels': [item['status'] for item in urls_data],
'values': [item['count'] for item in urls_data],
}
return JsonResponse(data)
def urls_per_source(request):
# Get the filtering date parameter
days = float(request.GET.get('days', 30)) # Default is 30 days
start_date = timezone.now() - timedelta(days=days)
# Count the number of URLs grouped by source
urls_data = UrlsSourceSearch.objects \
.filter(id_url__ts_fetch__gte=start_date) \
.values('id_source__source') \
.annotate(count=Count('id_url')) \
.order_by('id_source__source')
# Format data for JSON
data = {
'labels': [item['id_source__source'] for item in urls_data],
'values': [item['count'] for item in urls_data],
}
return JsonResponse(data)
def urls_per_search(request):
# Get the filtering date parameter
days = float(request.GET.get('days', 30)) # Default is 30 days
start_date = timezone.now() - timedelta(days=days)
# Count the number of URLs grouped by search
urls_data = UrlsSourceSearch.objects \
.filter(id_url__ts_fetch__gte=start_date) \
.values('id_search__search') \
.annotate(count=Count('id_url')) \
.order_by('id_search__search')
# Format data for JSON
data = {
'labels': [item['id_search__search'] for item in urls_data],
'values': [item['count'] for item in urls_data],
}
return JsonResponse(data)
####################################################################################################
from django.db.models import Q
from django.utils.timezone import now
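# Example (illustrative) request handled by filtered_urls below; the parameter
# names match the GET handling in the view, while the path and values are assumptions:
#   GET /urls?status=2&search=1&source=3&language=en&valid_content=True&min_sources=2&days=7&per_page=100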
def filtered_urls(request):
statuses = Urls.STATUS_ENUM.choices
searches = Search.objects.all()
sources = Source.objects.all()
languages = list(UrlContent.objects.distinct('language').values_list('language', flat=True)) # TODO: Cache languages
languages = ["Unknown"] + [l for l in languages if l is not None]
valid_contents = ["True", "False", "Unknown"]
# Get selected parameters
selected_status = request.GET.getlist('status', ["null"])
selected_search = request.GET.getlist('search', ["null"])
selected_source = request.GET.getlist('source', ["null"])
selected_language = request.GET.getlist('language', ["null"])
selected_valid_contents = request.GET.getlist('valid_content', ["null"])
selected_min_sources = int(request.GET.get('min_sources', 1))
selected_days = request.GET.get("days", 30)
per_page = request.GET.get('per_page', 100) # Default is X URLs per page
page_number = request.GET.get('page') # Get the current page number
    # No filter params on the URL (plain "Home" click, or "Next page" carrying only ?page=): fall back to the "all" defaults
    if (len(request.GET.keys()) == 0) or ((len(request.GET.keys()) == 1) and ("page" in request.GET.keys())):
selected_status = ["all"]
selected_search = ["all"]
selected_source = ["all"]
selected_language = ["all"]
selected_valid_contents = ["all"]
else:
# All elements
all_status = [str(status[0]) for status in statuses]
all_search = [str(search.id) for search in searches]
all_source = [str(source.id) for source in sources]
all_languages = languages
all_valid_contents = valid_contents
        # Non-default parameters: if a list contains every possible element, replace it with "all" to avoid a heavy query
selected_status = ["all"] if (set(selected_status) == set(all_status)) else selected_status
selected_search = ["all"] if (set(selected_search) == set(all_search)) else selected_search
selected_source = ["all"] if (set(selected_source) == set(all_source)) else selected_source
selected_language = ["all"] if (set(selected_language) == set(all_languages)) else selected_language
selected_valid_contents = ["all"] if (set(selected_valid_contents) == set(all_valid_contents)) else selected_valid_contents
    # Filter URLs based on selected filters; "null" (a filter missing from a non-empty query string) selects nothing
    if any( 'null' in l for l in [selected_status, selected_search, selected_source, selected_language, selected_valid_contents] ):
        urls = []
else:
# Filter by date
query = Q(ts_fetch__gte=now() - timedelta(days=float(selected_days)))
# Additional filters
if ("all" not in selected_status):
query &= Q(status__in=selected_status)
if ("all" not in selected_source):
query &= Q(urlssourcesearch__id_source__in=selected_source)
if ("all" not in selected_search):
query &= Q(urlssourcesearch__id_search__in=selected_search)
if ("all" not in selected_language):
# URLs with selected languages
subquery = Q(urlcontent__language__in=selected_language)
if ("Unknown" in selected_language):
# URLs with NULL language
subquery |= Q(urlcontent__language__isnull=True)
# URLs with no UrlContent record at all (similar to URLs with NULL language)
subquery |= Q(urlcontent__id_url__isnull=True)
# Update query
query &= (subquery)
if ("all" not in selected_valid_contents):
# Boolean array
bool_array = []
if ('True' in selected_valid_contents):
bool_array.append(True)
if ('False' in selected_valid_contents):
bool_array.append(False)
# URLs with selected valid_contents
subquery = Q(urlcontent__valid_content__in=bool_array)
if ("Unknown" in selected_valid_contents):
# URLs with NULL valid_content
subquery |= Q(urlcontent__valid_content__isnull=True)
# URLs with no UrlContent record at all (similar to URLs with NULL valid_content)
subquery |= Q(urlcontent__id_url__isnull=True)
# Update query
query &= (subquery)
        if (selected_min_sources > 1):
            # Keep only URLs referenced by at least the requested number of distinct sources
            query &= Q(pk__in=UrlsSourceSearch.objects
                       .values('id_url')
                       .annotate(source_count=Count('id_source', distinct=True))
                       .filter(source_count__gte=selected_min_sources)
                       .values('id_url'))
        # Run query (note: paginating an unordered queryset can yield inconsistent pages across requests)
        urls = Urls.objects.filter(query).distinct() # .order_by('-ts_fetch')
# Pagination
paginator = Paginator(urls, per_page) # Paginate the filtered URLs
page_obj = paginator.get_page(page_number) # Get the current page object
# Map URL IDs to their sources & searches, only for subset of URLs (page of interest)
sources_map = {
url.id: list(Source.objects.filter(urlssourcesearch__id_url=url).distinct()) for url in page_obj.object_list
}
searches_map = {
url.id: list(Search.objects.filter(urlssourcesearch__id_url=url).distinct()) for url in page_obj.object_list
}
url_content_map = {
url.id: UrlContent.objects.filter(pk=url).first() for url in page_obj.object_list
}
    # Shorten search type labels for display
for s in searches:
s.type = s.type.replace("rss_feed", "rss").replace("url_host", "url").replace("keyword_search", "keyword")
context = {
'urls': page_obj, # Pass the paginated URLs
'per_page': per_page, # Send per_page value for dynamic pagination
'statuses': statuses,
'searches': sorted(searches, key=lambda x: (x.type, x.search)),
'sources': sorted(sources, key=lambda x: x.source),
'languages': sorted(languages, key=lambda x: (x is None, x)),
'valid_contents': valid_contents,
# Selection
'selected_status': selected_status,
'selected_search': selected_search,
'selected_source': selected_source,
'selected_language': selected_language,
'selected_valid_contents': selected_valid_contents,
"selected_min_sources": selected_min_sources,
"selected_days": selected_days,
# Map
"sources_map": sources_map,
"searches_map": searches_map,
"url_content_map": url_content_map,
# "charts": charts,
# "list_per_page": [15, 100, 500],
# "list_days_text": ([0.25, 1, 7, 30, 365], ["Last 6 hours", "Last 24 hours", "Last 7 days", "Last 30 days", "Last 365 days"])
}
return render(request, 'filtered_urls.html', context)
####################################################################################################