import os

import ollama

from django.core.paginator import Paginator
from django.http import StreamingHttpResponse, JsonResponse, HttpResponse
from django.shortcuts import render, get_object_or_404

from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch
from .tasks import background_task

####################################################################################################

def trigger_task(request, task):
    # Enqueue the task function in the "default" queue
    background_task.delay(task)
    return JsonResponse({"message": "Task has been enqueued!", "task": task})
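
# Hypothetical usage, assuming the /task/<task> routing implied by link_list below:
#   curl http://localhost:8000/task/fetch_feeds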

####################################################################################################

def link_list(request):
    prefix = "http://localhost:8000/task"
    links = [
        "fetch_feeds",
        "fetch_parser",
        "fetch_search",
        "fetch_missingkids_5",
        "fetch_missingkids_all",
        "process_raw_urls_50",
        "process_error_urls_50",
        "process_missing_kids_urls_50",
        "process_missing_kids_urls_all",
    ]

    list_links = [
        # DB
        "http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id&limit=500",
        # Admin panel
        "http://localhost:8000/admin",
        # Logs
        "http://localhost:8000/logs/debug",
        "http://localhost:8000/logs/info",
        "http://localhost:8000/logs/error",
        # URLs
        "http://localhost:8000/urls",
        # Charts
        "http://localhost:8000/charts",
        # API tasks
    ] + ["{}/{}".format(prefix, l) for l in links]  # Join URLs explicitly; os.path.join is for filesystem paths

    # Json
    return JsonResponse({"links": list_links})

####################################################################################################

def logs(request, log_type):
    # Capture output: python manage.py rqstats
    # Note: if PATH_LOGS_DEBUG is set, it overrides the file path for every log_type
    try:
        with open(os.getenv("PATH_LOGS_DEBUG", "logs/log_app_fetcher_{}.log".format(log_type)), "r") as f:
            file_content = f.read()
    except Exception:
        file_content = "Error reading logs for log type: {}".format(log_type)
    return HttpResponse(file_content, content_type="text/plain")
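
# Example (routing implied by the links in link_list above):
#   GET /logs/error -> serves logs/log_app_fetcher_error.log as text/plain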

####################################################################################################

class OllamaClient:

    def __init__(self):
        self.client = ollama.Client(host=os.getenv("ENDPOINT_OLLAMA", "https://ollamamodel.matitos.org"))

    def _get_default_model(self):
        return "llama3.2:3b"

    def get_models(self):
        # List the available models, putting the default model first when present
        models = sorted([m.model for m in self.client.list().models])
        if self._get_default_model() in models:
            return [self._get_default_model()] + [m for m in models if m != self._get_default_model()]
        else:
            return models

    def get_prompt(self):
        return "Rewrite the text below into a clear and concise summary, presenting the key points as if they are newly written insights. Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states'. Instead, write in a natural, standalone format that feels like an original explanation. Keep it brief, engaging, informative, in the style of a news article, and no longer than a paragraph:"
        # Earlier prompt drafts, kept for reference:
        # return "Provide a summary of the content below, avoid mentioning the source of information, and only answer with the summary. The summary needs to be brief and compact, consisting of one paragraph."
        # return "Explain in a single and compact paragraph the what, why, when, where, who, and how of the content below. Also provide a single paragraph summary of the content:"
        # return "Provide in one paragraph the what, why, when, where, who, and how of the content below. Also provide a one paragraph summary of the content:"
        # return "Provide two summaries of the content below, and avoid mentioning the source of information. First, provide a very brief and compact paragraph summary. Second, provide a larger and more detailed summary, which describe the what, why, when, where, who, and how of the content:"
        # return "Imagine you are a journalist, TLDR in a paragraph. Only answer with the summary:"
        # return "Below you will find the whole content of a news article:\n{}\nProvide a concise summary of one paragraph maximum of the content.".format(content)
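
# A minimal sketch for the async-client TODO in url_detail_view below, assuming the
# AsyncClient shipped by the ollama-python package (untested here; the host fallback
# mirrors OllamaClient, and stream_chat is a hypothetical helper name):
#
#   async def stream_chat(text, model="llama3.2:3b"):
#       client = ollama.AsyncClient(host=os.getenv("ENDPOINT_OLLAMA", "https://ollamamodel.matitos.org"))
#       message = {"role": "user", "content": text}
#       async for chunk in await client.chat(model=model, messages=[message], stream=True):
#           yield chunk["message"]["content"]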

# TODO: move to ollamajs...
def fetch_details(request, id):
    url_item = get_object_or_404(Urls, id=id)  # 404 on unknown URL id
    url_param = request.GET.get("url", "")  # Get URL (currently unused)
    model = request.GET.get("model", "")  # Get LLM model
    text = request.GET.get("text", "")  # Get LLM prompt

    # LLM client (local name avoids shadowing the imported ollama module)
    ollama_client = OllamaClient()

    def stream_response():
        msg_content = {
            "role": "user",
            "content": text,
        }
        response = ollama_client.client.chat(model=model, messages=[msg_content], stream=True)
        for chunk in response:
            yield chunk["message"]["content"]  # Stream each chunk of text

    return StreamingHttpResponse(stream_response(), content_type="text/plain")


def url_detail_view(request, id):
    url_item = get_object_or_404(Urls, id=id)
    url_sources = list(Source.objects.filter(urlssourcesearch__id_url=url_item).distinct())
    url_searches = list(Search.objects.filter(urlssourcesearch__id_url=url_item).distinct())
    # url_source_search = UrlsSourceSearch.objects.filter(id_url=url_item)

    try:
        url_content = UrlContent.objects.get(pk=id)
    except UrlContent.DoesNotExist:
        url_content = {}

    # TODO: https://github.com/ollama/ollama-python?tab=readme-ov-file#async-client
    ollama_client = OllamaClient()

    context = {
        'url_item': url_item,
        'sources': url_sources,
        'searches': url_searches,
        'models': ollama_client.get_models(),
        'prompt': ollama_client.get_prompt(),
        'url_content': url_content,
    }
    return render(request, 'url_detail.html', context)

####################################################################################################
from django.shortcuts import render
from django.http import JsonResponse
from django.db.models import Count
from datetime import timedelta
from django.utils import timezone
from .models import Urls, UrlsSourceSearch


def charts(request):
    return render(request, 'charts.html')


def urls_by_fetch_date(request):
    # Get the date for 30 days ago
    start_date = timezone.now() - timedelta(days=30)

    # Count the number of URLs grouped by fetch date
    urls_data = Urls.objects.filter(ts_fetch__gte=start_date) \
        .values('ts_fetch__date') \
        .annotate(count=Count('id')) \
        .order_by('ts_fetch__date')

    # Format data to return as JSON
    data = {
        'dates': [item['ts_fetch__date'] for item in urls_data],
        'counts': [item['count'] for item in urls_data],
    }

    return JsonResponse(data)
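
# Each chart endpoint returns parallel arrays, e.g. (illustrative values):
#   {"dates": ["2025-01-01", "2025-01-02"], "counts": [12, 7]}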

def urls_per_status(request):
    # Get the filtering date parameter (default: 30 days; fall back on malformed input)
    try:
        days = float(request.GET.get('days', 30))
    except ValueError:
        days = 30
    start_date = timezone.now() - timedelta(days=days)

    # Count the number of URLs grouped by status within the date range
    urls_data = Urls.objects.filter(ts_fetch__gte=start_date) \
        .values('status') \
        .annotate(count=Count('id')) \
        .order_by('status')

    # Format data for JSON
    data = {
        'statuses': [item['status'] for item in urls_data],
        'counts': [item['count'] for item in urls_data],
    }

    return JsonResponse(data)

def urls_per_source(request):
    # Count the number of URLs grouped by source
    # (counts UrlsSourceSearch rows, so a URL reached via several searches counts once per row;
    #  Count('id_url', distinct=True) would count unique URLs instead)
    urls_data = UrlsSourceSearch.objects \
        .values('id_source__source') \
        .annotate(count=Count('id_url')) \
        .order_by('id_source__source')

    # Format data for JSON
    data = {
        'sources': [item['id_source__source'] for item in urls_data],
        'counts': [item['count'] for item in urls_data],
    }

    return JsonResponse(data)

def urls_per_search(request):
    # Count the number of URLs grouped by search
    urls_data = UrlsSourceSearch.objects \
        .values('id_search__search') \
        .annotate(count=Count('id_url')) \
        .order_by('id_search__search')

    # Format data for JSON
    data = {
        'searches': [item['id_search__search'] for item in urls_data],
        'counts': [item['count'] for item in urls_data],
    }

    return JsonResponse(data)

####################################################################################################
from django.shortcuts import render
from .models import Urls, Search, Source
from django.db.models import Q
from django.utils.timezone import now
from datetime import timedelta  # import from datetime directly; the timezone re-export is not public API

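
# Example request (illustrative parameter values; the /urls route comes from link_list above):
#   GET /urls?status=all&language=Unknown&days=7&per_page=100&page=2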

def filtered_urls(request):
    statuses = Urls.STATUS_ENUM.choices
    searches = Search.objects.all()
    sources = Source.objects.all()
    # TODO: Cache languages, update once every N
    languages = list(UrlContent.objects.distinct('language').values_list('language', flat=True))
    # Map NULL to "Unknown" for visualization
    languages = ["Unknown"] + [l for l in languages if l is not None]
    valid_contents = ["True", "False", "Unknown"]

    # Get selected parameters
    selected_status = request.GET.getlist('status', ["null"])
    selected_search = request.GET.getlist('search', ["null"])
    selected_source = request.GET.getlist('source', ["null"])
    selected_language = request.GET.getlist('language', ["null"])
    selected_valid_contents = request.GET.getlist('valid_content', ["null"])
    selected_days = request.GET.get("days", 30)
    per_page = request.GET.get('per_page', 100)  # Default: 100 URLs per page
    page_number = request.GET.get('page')  # Get the current page number

    all_status = [str(status[0]) for status in statuses]
    all_search = [str(search.id) for search in searches]
    all_source = [str(source.id) for source in sources]
    all_languages = languages
    all_valid_contents = valid_contents

    # Override with default filters? [Case: no params on URL] -> Only on "Home" click or "Next page"
    if (len(request.GET.keys()) == 0) or ((len(request.GET.keys()) == 1) and ("page" in request.GET.keys())):
        selected_status = ["all"]
        selected_search = ["all"]
        selected_source = ["all"]
        selected_language = ["all"]
        selected_valid_contents = ["all"]
    else:
        # Non-default parameters: if a list contains every element, replace it with "all" to avoid a heavy query
        if set(selected_status) == set(all_status):
            selected_status = ["all"]
        if set(selected_search) == set(all_search):
            selected_search = ["all"]
        if set(selected_source) == set(all_source):
            selected_source = ["all"]
        if set(selected_language) == set(all_languages):
            selected_language = ["all"]
        if set(selected_valid_contents) == set(all_valid_contents):
            selected_valid_contents = ["all"]

    # Filter URLs based on selected filters
    if ('null' in selected_status) or ('null' in selected_search) or ('null' in selected_source) or ('null' in selected_language) or ('null' in selected_valid_contents):
        urls = []
    else:
        # Filter by date
        query = Q(ts_fetch__gte=now() - timedelta(days=float(selected_days)))

        # Additional filters
        if "all" not in selected_status:
            query &= Q(status__in=selected_status)
        if "all" not in selected_source:
            query &= Q(urlssourcesearch__id_source__in=selected_source)
        if "all" not in selected_search:
            query &= Q(urlssourcesearch__id_search__in=selected_search)
        if "all" not in selected_language:
            # URLs with selected languages
            subquery = Q(urlcontent__language__in=selected_language)
            if "Unknown" in selected_language:
                # URLs with NULL language
                subquery |= Q(urlcontent__language__isnull=True)
                # URLs with no UrlContent record at all (similar to URLs with NULL language)
                subquery |= Q(urlcontent__id_url__isnull=True)
            # Update query
            query &= subquery
        if "all" not in selected_valid_contents:
            # Boolean array
            bool_array = []
            if 'True' in selected_valid_contents:
                bool_array.append(True)
            if 'False' in selected_valid_contents:
                bool_array.append(False)
            # URLs with selected valid_contents
            subquery = Q(urlcontent__valid_content__in=bool_array)
            if "Unknown" in selected_valid_contents:
                # URLs with NULL valid_content
                subquery |= Q(urlcontent__valid_content__isnull=True)
                # URLs with no UrlContent record at all (similar to URLs with NULL valid_content)
                subquery |= Q(urlcontent__id_url__isnull=True)
            # Update query
            query &= subquery

        # Run query
        urls = Urls.objects.filter(query).distinct()  # .order_by('-ts_fetch')
        # print(urls.query)
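
    # Note: Paginator accepts both the plain list from the "null" branch above and the queryset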

    # Pagination
    paginator = Paginator(urls, per_page)  # Paginate the filtered URLs
    page_obj = paginator.get_page(page_number)  # Get the current page object

    # Map URL IDs to their sources & searches, only for the subset of URLs on the current page
    sources_map = {
        url.id: list(Source.objects.filter(urlssourcesearch__id_url=url).distinct()) for url in page_obj.object_list
    }
    searches_map = {
        url.id: list(Search.objects.filter(urlssourcesearch__id_url=url).distinct()) for url in page_obj.object_list
    }
    url_content_map = {
        url.id: UrlContent.objects.filter(pk=url.id).first() for url in page_obj.object_list
    }

    # Custom replacement of the search type text
    for s in searches:
        s.type = s.type.replace("rss_feed", "rss").replace("url_host", "url").replace("keyword_search", "keyword")

    context = {
        'urls': page_obj,  # Pass the paginated URLs
        'per_page': per_page,  # Send per_page value for dynamic pagination
        'statuses': statuses,
        'searches': sorted(searches, key=lambda x: (x.type, x.search)),
        'sources': sorted(sources, key=lambda x: x.source),
        'languages': sorted(languages, key=lambda x: (x is None, x)),
        'valid_contents': valid_contents,
        # Selection
        'selected_status': selected_status,
        'selected_search': selected_search,
        'selected_source': selected_source,
        'selected_language': selected_language,
        'selected_valid_contents': selected_valid_contents,
        "selected_days": selected_days,
        # Map
        "sources_map": sources_map,
        "searches_map": searches_map,
        "url_content_map": url_content_map,
        # "charts": charts,
        # "list_per_page": [15, 100, 500],
        # "list_days_text": ([0.25, 1, 7, 30, 365], ["Last 6 hours", "Last 24 hours", "Last 7 days", "Last 30 days", "Last 365 days"])
    }

    return render(request, 'filtered_urls.html', context)

####################################################################################################