Process all missing kids task, urls views cleaner, adding language filter WIP

This commit is contained in:
Luciano Gervasoni
2025-03-28 16:21:35 +01:00
parent e34284abbe
commit b3f896b35a
11 changed files with 284 additions and 196 deletions

View File

@@ -61,9 +61,9 @@ class DB_Handler():
# URL
obj_url, created = Urls.objects.get_or_create(url=url)
if (created):
logger.info("CREATED: {}".format(obj_url.url))
logger.debug("Inserted: {}".format(obj_url.url))
else:
logger.info("NOT CREATED: {}".format(obj_url.url))
logger.debug("Not inserted: {}".format(obj_url.url))
# (URL, source, search)
UrlsSourceSearch.objects.get_or_create(id_url=obj_url, id_source=obj_source, id_search=obj_search)
except Exception as e:
@@ -76,7 +76,7 @@ class DB_Handler():
cache.set("insert_{}".format(url), True, timeout=self._cache_timeout_insert_url)
cache.set("insert_{}{}{}".format(url, obj_source.source, obj_search.search), True, timeout=self._cache_timeout_insert_url)
logger.info("Inserted #{} raw URLs".format(len(urls_to_insert)))
logger.info("Inserted #{} raw URLs, Source-Search {}-{}".format(len(urls_to_insert), obj_source.source, obj_search.search))
except Exception as e:
logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
@@ -243,15 +243,19 @@ class DB_Handler():
except Exception as e:
logger.warning("Exception processing error URLs: {}\n{}".format(e, traceback.format_exc()))
def process_missing_kids_urls(self, batch_size):
def process_missing_kids_urls(self, batch_size=None):
try:
logger.debug("Processing MissingKids URLs")
logger.debug("Processing MissingKids URLs - batch_size={}".format(batch_size))
# Get batch of URLs, %missingkids.org/poster% AND (status='valid' OR status='invalid')
missingkids_urls = Urls.objects.order_by("-ts_fetch").filter(
(Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster"))
&
(Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.ERROR))
)[:batch_size]
)
# Get batch size
if (batch_size is not None):
missingkids_urls = missingkids_urls[:batch_size]
# Per URL
for obj_url in missingkids_urls:

View File

@@ -64,7 +64,7 @@ def process_url(url):
if ("Website protected with PerimeterX" in str(e.args)):
logger.debug("TODO: process_url Implement bypass PerimeterX")
logger.warning("ArticleException for input URL {}\n{}".format(url, str(e.args)))
logger.debug("ArticleException for input URL {}\n{}".format(url, str(e.args)))
return None
except Exception as e:
logger.warning("Exception for input URL {}\n{}".format(url, str(e)))

View File

@@ -56,6 +56,12 @@ def process_missing_kids_urls(batch_size=50):
DB_Handler().process_missing_kids_urls(batch_size=batch_size)
logger.info("Task completed: {}".format(task))
@job('default')
def process_missing_kids_urls_all(batch_size=None):
    """Run the MissingKids URL processing with no batch size by default.

    Logs when the task starts and finishes, and forwards ``batch_size``
    unchanged to ``DB_Handler().process_missing_kids_urls``.

    Args:
        batch_size: Value passed through as the handler's ``batch_size``
            keyword argument; defaults to ``None``.
    """
    task_name = "Process Missing Kids URLs ALL"
    started_msg = "Task triggered: {}".format(task_name)
    logger.info(started_msg)
    handler = DB_Handler()
    handler.process_missing_kids_urls(batch_size=batch_size)
    finished_msg = "Task completed: {}".format(task_name)
    logger.info(finished_msg)
@@ -76,7 +82,10 @@ def background_task(process_type: str):
# FetchMissingKids().run()
elif ("process_" in process_type):
# Batch size encoded in URL
batch_size = int(process_type.split("_")[-1])
try:
batch_size = int(process_type.split("_")[-1])
except Exception as e:
batch_size = None
# Task type
if ("process_raw_urls" in process_type):
DB_Handler().process_raw_urls(batch_size=batch_size)
@@ -87,7 +96,6 @@ def background_task(process_type: str):
else:
logger.info("Task unknown!: {}".format(process_type))
'''
# Selenium based
elif (process_type == "fetch_missing_kids_reduced"):

View File

@@ -60,6 +60,7 @@
<div class="filter-container">
<label for="daysFilter">Select Number of Days:</label>
<select id="daysFilter">
<option value="0.25">Last 6 Hours</option>
<option value="1">Last 24 Hours</option>
<option value="3">Last 3 Days</option>
<option value="7" selected>Last 7 Days</option>
@@ -102,22 +103,22 @@
function fetchDataAndRenderCharts(days) {
// Fetch and render the URL Fetch Date chart
$.getJSON(`/api/urls-by-fetch-date/?days=${days}`, function (data) {
$.getJSON(`/urls-by-fetch-date/?days=${days}`, function (data) {
renderUrlFetchDateChart(data);
});
// Fetch and render the URL Status chart (with dynamic date filtering)
$.getJSON(`/api/urls-per-status/?days=${days}`, function (data) {
$.getJSON(`/urls-per-status/?days=${days}`, function (data) {
renderUrlStatusChart(data);
});
// Fetch and render the URLs per Source chart
$.getJSON(`/api/urls-per-source/?days=${days}`, function (data) {
$.getJSON(`/urls-per-source/?days=${days}`, function (data) {
renderUrlsPerSourceChart(data);
});
// Fetch and render the URLs per Search chart
$.getJSON(`/api/urls-per-search/?days=${days}`, function (data) {
$.getJSON(`/urls-per-search/?days=${days}`, function (data) {
renderUrlsPerSearchChart(data);
});
}

View File

@@ -48,15 +48,17 @@ a:hover {
/* Sidebar */
.sidebar {
width: 250px;
padding: 10px;
min-width: 110px; /* Minimum width */
max-width: 200px; /* Maximum width */
width: 100%; /* Make it take full width within the defined min and max */
padding: 5px;
box-sizing: border-box; /* Ensure padding doesn't increase the overall width */
transition: width 0.3s ease-in-out; /* Smooth transition for resizing */
background-color: #f4f4f4;
margin-right: 20px;
overflow-x: hidden;
white-space: normal;
word-wrap: break-word;
word-break: break-word;
transition: background 0.1s ease, color 0.1s ease;
box-sizing: border-box;
word-wrap: break-word; /* Allow wrapping of long words */
overflow-wrap: break-word; /* Ensures wrapping across browsers */
white-space: normal; /* Ensure normal word wrapping */
}
.dark-mode .sidebar {
@@ -65,7 +67,8 @@ a:hover {
/* Sidebar Headers */
.sidebar h3 {
margin-top: 5px;
margin-top: 15px;
margin-bottom: 2px;
font-size: 16px;
}
@@ -141,6 +144,41 @@ input[type="checkbox"] {
/* PAGINATION */
.pagination-container {
display: flex;
justify-content: center;
align-items: center;
gap: 10px;
font-family: Arial, sans-serif;
}
.pagination-link {
padding: 8px 15px;
background-color: #007bff;
color: white;
text-decoration: none;
border-radius: 25px;
font-size: 14px;
display: inline-block;
transition: background-color 0.3s ease, transform 0.2s ease;
}
.pagination-link:hover {
background-color: #0056b3;
transform: scale(1.1);
}
.pagination-link:active {
background-color: #003366;
transform: scale(0.95);
}
.first-page, .last-page {
font-weight: bold;
}
.prev-page, .next-page {
font-weight: normal;
}
/* ROUNDED SWITCH*/
/* Hide the default checkbox */
.checkbox-slider {
@@ -232,7 +270,7 @@ input[type="checkbox"] {
<option value="100" {% if per_page|stringformat:"s" == '100' %}selected{% endif %}>100</option>
<option value="500" {% if per_page|stringformat:"s" == '500' %}selected{% endif %}>500</option>
</select>
<br><br>
<br>
<!-- Filter by Time Range -->
<h3>Fetch Date</h3>
@@ -249,10 +287,23 @@ input[type="checkbox"] {
<option value="90" {% if selected_days|stringformat:"s" == '90' %}selected{% endif %}>Last 90 days</option>
<option value="365" {% if selected_days|stringformat:"s" == '365' %}selected{% endif %}>Last 365 days</option>
</select>
<br><br>
<br>
<!-- Filter by Status -->
<h3>Status</h3>
<!--
<label for="toggle-all-checkbox">
<input type="checkbox" id="toggle-all-checkbox" class="toggle-all-checkbox"> Toggle All
</label><br>
{% for status in statuses %}
<label>
<input type="checkbox" name="status" value="{{ status.0 }}"
{% if status.0 in selected_status %}checked{% endif %}
class="status-checkbox">
{{ status.1 }}
</label><br>
{% endfor %}
-->
<button type="button" class="toggle-all-btn" data-toggle="status">Toggle All</button><br>
{% for status in statuses %}
<label>
@@ -261,7 +312,7 @@ input[type="checkbox"] {
{{ status.1 }}
</label><br>
{% endfor %}
<br><br>
<!-- Filter by Search -->
<h3>Search</h3>
@@ -270,10 +321,10 @@ input[type="checkbox"] {
<label>
<input type="checkbox" name="search" value="{{ search.id }}"
{% if search.id|stringformat:"s" in selected_search %}checked{% endif %}>
[{{ search.type }}] {{ search.search|truncatechars:70 }}
[{{ search.type }}] {{ search.search|truncatechars:50 }}
</label><br>
{% endfor %}
<br><br>
<!-- Filter by Source -->
<h3>Source</h3>
@@ -282,10 +333,21 @@ input[type="checkbox"] {
<label>
<input type="checkbox" name="source" value="{{ source.id }}"
{% if source.id|stringformat:"s" in selected_source %}checked{% endif %}>
{{ source.source|truncatechars:70 }}
{{ source.source|truncatechars:50 }}
</label><br>
{% endfor %}
<br><br>
<!-- Filter by language -->
<h3>Language</h3>
<button type="button" class="toggle-all-btn" data-toggle="language">Toggle All</button><br>
{# One checkbox per language; "selected_language" is the context key holding the active selection #}
{% for lang in languages %}
<label>
<input type="checkbox" name="language" value="{{ lang }}"
{% if lang|stringformat:"s" in selected_language %}checked{% endif %}>
{{ lang|truncatechars:50 }}
</label><br>
{% endfor %}
</form>
</div>
@@ -300,6 +362,8 @@ input[type="checkbox"] {
<th>Fetch Date</th>
<th>Search</th>
<th>Source</th>
<th>Valid content?</th>
<th>Language</th>
</tr>
</thead>
<tbody>
@@ -349,7 +413,18 @@ input[type="checkbox"] {
{% endif %}
{% endwith %}
</td>
<td>
{% with url_content_map|dict_get:url.id as content %}
{{ content.valid_content }}
{% endwith %}
</td>
<td>
{% with url_content_map|dict_get:url.id as content %}
{{ content.language }}
{% endwith %}
</td>
</tr>
{% empty %}
<tr>
<td colspan="5">No URLs found for the selected filters.</td>
@@ -360,7 +435,8 @@ input[type="checkbox"] {
<!-- Pagination Controls -->
<div class="pagination">
<div class="pagination-controls">
<!-- <div class="pagination-controls"> -->
<div class="pagination-container" style="margin-top: 20px;margin-bottom: 20px;">
{% if urls.has_previous %}
<a href="#" class="pagination-link" data-page="1">« First</a>
<a href="#" class="pagination-link" data-page="{{ urls.previous_page_number }}">Previous</a>

View File

@@ -11,6 +11,8 @@
<script src="https://code.jquery.com/jquery-3.6.4.min.js"></script>
<!-- Markdown -->
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
<!-- Bootstrap JS -->
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>
<!-- Custom Styles -->
<style>
@@ -34,10 +36,63 @@
word-break: break-word;
}
</style>
.table {
table-layout: auto;
width: 100%;
}
th {
white-space: nowrap;
}
td {
word-wrap: break-word;
overflow-wrap: break-word;
}
/* Sidebar */
.sidebar {
min-width: 110px; /* Minimum width */
max-width: 200px; /* Maximum width */
width: 100%; /* Make it take full width within the defined min and max */
padding: 5px;
box-sizing: border-box; /* Ensure padding doesn't increase the overall width */
transition: width 0.3s ease-in-out; /* Smooth transition for resizing */
background-color: #f4f4f4;
box-sizing: border-box;
word-wrap: break-word; /* Allow wrapping of long words */
overflow-wrap: break-word; /* Ensures wrapping across browsers */
white-space: normal; /* Ensure normal word wrapping */
}
.dark-mode .sidebar {
background-color: #1e1e1e;
}
</style>
</head>
<script>
//////////////////////////////////////////////////////////////////////
document.addEventListener("DOMContentLoaded", function () {
    //////////////////////////////////////////////
    // Timestamp to local timezone
    // Fetch-date and publish-date cells both carry their raw value in a
    // data-ts attribute, so one pass over both selectors converts them all.
    const formatOptions = { year: 'numeric', month: 'numeric', day: 'numeric', hour: '2-digit', minute: '2-digit', second: '2-digit', hour12: false };
    document.querySelectorAll(".ts-fetch, .ts-publish").forEach(element => {
        const rawTimestamp = element.getAttribute("data-ts"); // Get timestamp from data attribute
        // Leave the element untouched when no timestamp was rendered into data-ts
        if (!rawTimestamp) {
            return;
        }
        // Convert to the browser's local timezone and update the text content
        element.textContent = new Date(rawTimestamp).toLocaleString("en-GB", formatOptions);
    });
});
function fetchDetails(urlId, url) {
// Show the loading spinner
@@ -54,15 +109,13 @@
}
// Fetch URL
let fetchUrl = `/api/url/${urlId}/fetch/?url=${encodeURIComponent(url)}&model=${encodeURIComponent(selectedModel)}&text=${encodeURIComponent(inputText)}`;
let fetchUrl = `/urls/${urlId}/fetch/?url=${encodeURIComponent(url)}&model=${encodeURIComponent(selectedModel)}&text=${encodeURIComponent(inputText)}`;
let resultContainer = $("#chat-output");
resultContainer.html(""); // Clear previous content before fetching
let fetchButton = $("button[onclick^='fetchDetails']"); // Select the button
fetchButton.prop("disabled", true); // Disable button
fetch(fetchUrl/*, {
method: "POST",
body: JSON.stringify({
@@ -89,7 +142,6 @@
fetchButton.prop("disabled", false); // Re-enable button when done
return;
}
// Decode the streamed chunk
let chunk = decoder.decode(value);
// Append to the accumulated text
@@ -111,14 +163,21 @@
// Hide the loading spinner after request is complete
document.getElementById("loading-spinner").style.display = "none";
});
;
}
</script>
<body>
<div class="sidebar">
<div class="button-container">
<button id="homeButton" class="home-button">🏠</button>
<button id="themeToggle" class="theme-button">🌙</button>
</div>
</div>
<!-- Main Content -->
<div class="container mt-4">
<h2>URL Details</h2>
<!-- <h2>URL Details</h2> -->
<table class="table table-bordered">
<tr>
<th>URL</th>
@@ -126,7 +185,7 @@
</tr>
<tr>
<th>Fetch Date</th>
<td>{{ url_item.ts_fetch }} UTC</td>
<td> <span class="ts-fetch" data-ts="{{ url_item.ts_fetch|date:'c' }}"></span> </td>
</tr>
<tr>
<th>Source</th>
@@ -142,59 +201,59 @@
</tr>
<tr>
<th>URL host</th>
<td><a href="{{ url_content.url_host|safe }}" target="_blank">{{ url_content.url_host }}</a></td>
<td> <a href="{{ url_content.url_host|safe }}" target="_blank">{{ url_content.url_host }}</a> </td>
</tr>
<tr>
<th>Site name</th>
<td>{{ url_content.site_name }}</td>
<td>{{ url_content.site_name|default:"" }}</td>
</tr>
<tr>
<th>Published Date</th>
<td>{{ url_content.date_published }} UTC</td>
<td> <span class="ts-publish" data-ts="{{ url_content.date_published|date:'c' }}"></span> </td>
</tr>
<tr>
<th>Valid news article content?</th>
<th>Valid news content?</th>
<td>{{ url_content.valid_content }}</td>
</tr>
<tr>
<th>Tags</th>
<td>{{ url_content.tags }}</td>
<td>{{ url_content.tags|default:"" }}</td>
</tr>
<tr>
<th>Authors</th>
<td>{{ url_content.authors }}</td>
<td>{{ url_content.authors|default:"" }}</td>
</tr>
<tr>
<th>Keywords</th>
<td>{{ url_content.keywords }}</td>
<td>{{ url_content.keywords|default:"" }}</td>
</tr>
<tr>
<th>Language</th>
<td>{{ url_content.language }}</td>
<td>{{ url_content.language|default:"" }}</td>
</tr>
<tr>
<th>Main image</th>
<td><a href="{{ url_content.image_main_url|safe }}" target="_blank">{{ url_content.image_main_url }}</a></td>
<td><a href="{{ url_content.image_main_url|safe }}" target="_blank">{{ url_content.image_main_url|default:"" }}</a></td>
</tr>
<tr>
<th>Image URLs</th>
<td>{{ url_content.image_urls }}</td>
<td>{{ url_content.image_urls|default:"" }}</td>
</tr>
<tr>
<th>Video URLs</th>
<td>{{ url_content.videos_url }}</td>
<td>{{ url_content.videos_url|default:"" }}</td>
</tr>
<tr>
<th>Title</th>
<td>{{ url_content.title }}</td>
<td>{{ url_content.title|default:"" }}</td>
</tr>
<tr>
<th>Description</th>
<td>{{ url_content.description }}</td>
<td>{{ url_content.description|default:"" }}</td>
</tr>
<tr>
<th>Content</th>
<td>{{ url_content.content }}</td>
<td>{{ url_content.content|default:"" }}</td>
</tr>
</table>
@@ -232,9 +291,6 @@
</div>
<!-- Bootstrap JS -->
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>
{% block extra_js %}{% endblock %}
</body>
</html>

View File

@@ -18,9 +18,9 @@ urlpatterns = [
path('urls/<int:id>/', views.url_detail_view, name='url_detail'),
path('urls/<int:id>/fetch/', views.fetch_details, name='fetch_details'),
#
path('url/', views.urls, name='url_detail'),
path('url/<int:id>/', views.url_detail_view, name='url_detail'),
path('url/<int:id>/fetch/', views.fetch_details, name='fetch_details'),
#path('url/', views.urls, name='url_detail'),
#path('url/<int:id>/', views.url_detail_view, name='url_detail'),
#path('url/<int:id>/fetch/', views.fetch_details, name='fetch_details'),
#
path('task/<str:task>', views.trigger_task, name='trigger_task'),
]

View File

@@ -1,108 +1,53 @@
# import django_rq
from .tasks import background_task
from django.http import JsonResponse
from django.core.paginator import Paginator
from django.shortcuts import render, get_object_or_404
from django.http import StreamingHttpResponse, JsonResponse, HttpResponse
import ollama
from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch
import os
####################################################################################################
def trigger_task(request, task):
    """Enqueue ``background_task`` for ``task`` and acknowledge with JSON.

    Args:
        request: Incoming HTTP request (not inspected).
        task: Task identifier, passed as-is to ``background_task.delay``.

    Returns:
        JsonResponse with a confirmation message and the task identifier.
    """
    # Hand the work to the background queue; the view itself returns right away
    background_task.delay(task)
    payload = {"message": "Task has been enqueued!", "task": task}
    return JsonResponse(payload)
    # Alternative kept for reference: enqueue explicitly through django_rq
    # queue = django_rq.get_queue('default') # Get the default queue
    # job = queue.enqueue(background_task, task, job_timeout="30m")
    # return JsonResponse({"message": "Task has been enqueued!", "job_id": job.id})
####################################################################################################
def link_list(request):
prefix = "http://localhost:8000/api/task"
links = ["fetch_feeds", "fetch_parser", "fetch_search", "process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_500000"]
prefix = "http://localhost:8000/task"
links = ["fetch_feeds", "fetch_parser", "fetch_search", "process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"]
list_links = [
# DB
"http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id&limit=500",
# Admin panel
"http://localhost:8000/admin",
# URLs
"http://localhost:8000/api/url",
# Charts
"http://localhost:8000/api/charts",
# Logs
"http://localhost:8000/api/logs_debug",
"http://localhost:8000/api/logs_info",
"http://localhost:8000/api/logs_error",
"http://localhost:8000/logs_debug",
"http://localhost:8000/logs_info",
"http://localhost:8000/logs_error",
# URLs
"http://localhost:8000/urls",
# Charts
"http://localhost:8000/charts",
# API tasks
] + [os.path.join(prefix, l) for l in links]
# Json
return JsonResponse({"links": list_links })
from django.http import StreamingHttpResponse, JsonResponse
from django.shortcuts import render, get_object_or_404
from django.core.paginator import Paginator
import ollama
from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch
# Create your views here.
def _filter_by_csv(queryset, raw_value, lookup):
    """Narrow ``queryset`` using a comma-separated request parameter.

    A missing/empty value or ``"all"`` leaves the queryset untouched,
    ``"none"`` yields an empty queryset, and anything else is split on commas
    and applied through the ``lookup`` field lookup.
    """
    if not raw_value or raw_value == "all":
        return queryset
    if raw_value == "none":
        # Must remain a QuerySet (not a list) so later .filter() calls still work
        return queryset.none()
    return queryset.filter(**{lookup: raw_value.split(",")})


def urls(request):
    """List URLs with optional status/source/search filters and pagination.

    Query parameters read from ``request.GET``:
        page: Page number to display (default ``1``).
        items: Number of URLs per page (default ``15``).
        sources: Comma-separated source ids, ``"all"`` or ``"none"``
            (default: every known source id).
        searches: Comma-separated search ids, ``"all"`` or ``"none"``
            (default: every known search id).
        status: Comma-separated status values, ``"all"`` or ``"none"``
            (default: no status filtering).

    Returns:
        For requests carrying ``X-Requested-With: XMLHttpRequest``, a
        JsonResponse whose ``urls`` entry is the rendered
        ``urls_partial.html``; otherwise the rendered ``urls.html`` page.
    """
    # Sources / searches offered as filter options
    sources = Source.objects.all()
    searches = Search.objects.all()

    # Parameters
    params = request.GET
    page_number = params.get("page", 1)
    num_items = params.get("items", 15)
    source_ids = params.get("sources", ','.join([str(s.id) for s in sources]))
    search_ids = params.get("searches", ','.join([str(s.id) for s in searches]))
    status_filters = params.get("status", None)

    # Filters, applied in sequence; a "none" selection empties the result but
    # keeps it a queryset, so combining it with other filters no longer raises
    url_queryset = Urls.objects.all()
    url_queryset = _filter_by_csv(url_queryset, status_filters, "status__in")
    url_queryset = _filter_by_csv(url_queryset, source_ids, "urlssourcesearch__id_source__in")  # .distinct()
    url_queryset = _filter_by_csv(url_queryset, search_ids, "urlssourcesearch__id_search__in")  # .distinct()

    # Pagination
    paginator = Paginator(url_queryset, num_items)
    page_obj = paginator.get_page(page_number)

    # Map URL IDs to their sources & searches, only for subset of URLs (page of interest)
    sources_map = {
        url.id: list(Source.objects.filter(urlssourcesearch__id_url=url).distinct())
        for url in page_obj.object_list
    }
    searches_map = {
        url.id: list(Search.objects.filter(urlssourcesearch__id_url=url).distinct())
        for url in page_obj.object_list
    }

    context = {
        "page_obj": page_obj,
        "sources": sources,
        "searches": searches,
        "sources_map": sources_map,
        "searches_map": searches_map,
        "list_status": Urls.STATUS_ENUM.values,
        "list_urls_per_page": [15, 100, 500],
    }

    # If request is AJAX, return JSON response
    if request.headers.get("X-Requested-With") == "XMLHttpRequest":
        partial_html = render(request, 'urls_partial.html', context).content.decode('utf-8')
        return JsonResponse({'urls': partial_html})

    return render(request, "urls.html", context)
####################################################################################################
def logs_error(request):
    """Return the error log file contents as a plain-text HTTP response.

    The path comes from the ``PATH_LOGS_ERROR`` environment variable and
    falls back to ``logs/log_app_fetcher_error.log``.
    """
    log_path = os.getenv("PATH_LOGS_ERROR", "logs/log_app_fetcher_error.log")
    with open(log_path, "r") as log_file:
        log_text = log_file.read()
    return HttpResponse(log_text, content_type="text/plain")
def logs_info(request):
    """Return the info log file contents as a plain-text HTTP response.

    The path comes from the ``PATH_LOGS_INFO`` environment variable and
    falls back to ``logs/log_app_fetcher_info.log``.
    """
    log_path = os.getenv("PATH_LOGS_INFO", "logs/log_app_fetcher_info.log")
    with open(log_path, "r") as log_file:
        log_text = log_file.read()
    return HttpResponse(log_text, content_type="text/plain")
def logs_debug(request):
    """Return the debug log file contents as a plain-text HTTP response.

    The path comes from the ``PATH_LOGS_DEBUG`` environment variable and
    falls back to ``logs/log_app_fetcher_debug.log``.
    """
    log_path = os.getenv("PATH_LOGS_DEBUG", "logs/log_app_fetcher_debug.log")
    with open(log_path, "r") as log_file:
        log_text = log_file.read()
    return HttpResponse(log_text, content_type="text/plain")
####################################################################################################
class OllamaClient():
@@ -128,31 +73,6 @@ class OllamaClient():
# return "Imagine you are a journalist, TLDR in a paragraph. Only answer with the summary:"
#return "Below you will find the whole content of a news article:\n{}\nProvide a concise summary of one paragraph maximum of the content.".format(content)
def url_detail_view(request, id):
    """Render the detail page for a single URL record.

    Looks up the ``Urls`` row (404 when absent), the distinct sources and
    searches linked to it, its optional ``UrlContent`` row, and the model
    list and prompt provided by ``OllamaClient``.

    Args:
        request: Incoming HTTP request.
        id: Primary key of the ``Urls`` row; also used as the ``UrlContent`` pk.

    Returns:
        The rendered ``url_detail.html`` response.
    """
    url_item = get_object_or_404(Urls, id=id)
    linked_to_url = {"urlssourcesearch__id_url": url_item}
    url_sources = list(Source.objects.filter(**linked_to_url).distinct())
    url_searches = list(Search.objects.filter(**linked_to_url).distinct())
    # url_source_search = UrlsSourceSearch.objects.filter(id_url=url_item)

    # Content is optional: fall back to an empty mapping when no row exists
    try:
        url_content = UrlContent.objects.get(pk=id)
    except UrlContent.DoesNotExist:
        url_content = {}

    # TODO: https://github.com/ollama/ollama-python?tab=readme-ov-file#async-client
    client = OllamaClient()
    available_models = client.get_models()
    default_prompt = client.get_prompt()

    return render(request, 'url_detail.html', {
        'url_item': url_item,
        'sources': url_sources,
        'searches': url_searches,
        'models': available_models,
        'prompt': default_prompt,
        'url_content': url_content,
    })
# TODO: move to ollamajs...
def fetch_details(request, id):
url_item = get_object_or_404(Urls, id=id)
@@ -178,6 +98,30 @@ def fetch_details(request, id):
return StreamingHttpResponse(stream_response(), content_type="text/plain")
def url_detail_view(request, id):
    """Render the detail page for a single URL record.

    Looks up the ``Urls`` row (404 when absent), the distinct sources and
    searches linked to it, its optional ``UrlContent`` row, and the model
    list and prompt provided by ``OllamaClient``.

    Args:
        request: Incoming HTTP request.
        id: Primary key of the ``Urls`` row; also used as the ``UrlContent`` pk.

    Returns:
        The rendered ``url_detail.html`` response.
    """
    url_item = get_object_or_404(Urls, id=id)
    linked_to_url = {"urlssourcesearch__id_url": url_item}
    url_sources = list(Source.objects.filter(**linked_to_url).distinct())
    url_searches = list(Search.objects.filter(**linked_to_url).distinct())
    # url_source_search = UrlsSourceSearch.objects.filter(id_url=url_item)

    # Content is optional: fall back to an empty mapping when no row exists
    try:
        url_content = UrlContent.objects.get(pk=id)
    except UrlContent.DoesNotExist:
        url_content = {}

    # TODO: https://github.com/ollama/ollama-python?tab=readme-ov-file#async-client
    client = OllamaClient()
    available_models = client.get_models()
    default_prompt = client.get_prompt()

    return render(request, 'url_detail.html', {
        'url_item': url_item,
        'sources': url_sources,
        'searches': url_searches,
        'models': available_models,
        'prompt': default_prompt,
        'url_content': url_content,
    })
####################################################################################################
from django.shortcuts import render
from django.http import JsonResponse
@@ -256,23 +200,7 @@ def urls_per_search(request):
return JsonResponse(data)
####################################################################################################
from django.http import HttpResponse
def logs_error(request):
    """Return the error log file contents as a plain-text HTTP response.

    The path comes from the ``PATH_LOGS_ERROR`` environment variable and
    falls back to ``logs/log_app_fetcher_error.log``.
    """
    log_path = os.getenv("PATH_LOGS_ERROR", "logs/log_app_fetcher_error.log")
    with open(log_path, "r") as log_file:
        log_text = log_file.read()
    return HttpResponse(log_text, content_type="text/plain")
def logs_info(request):
    """Return the info log file contents as a plain-text HTTP response.

    The path comes from the ``PATH_LOGS_INFO`` environment variable and
    falls back to ``logs/log_app_fetcher_info.log``.
    """
    log_path = os.getenv("PATH_LOGS_INFO", "logs/log_app_fetcher_info.log")
    with open(log_path, "r") as log_file:
        log_text = log_file.read()
    return HttpResponse(log_text, content_type="text/plain")
def logs_debug(request):
    """Return the debug log file contents as a plain-text HTTP response.

    The path comes from the ``PATH_LOGS_DEBUG`` environment variable and
    falls back to ``logs/log_app_fetcher_debug.log``.
    """
    log_path = os.getenv("PATH_LOGS_DEBUG", "logs/log_app_fetcher_debug.log")
    with open(log_path, "r") as log_file:
        log_text = log_file.read()
    return HttpResponse(log_text, content_type="text/plain")
####################################################################################################
from django.shortcuts import render
@@ -284,33 +212,39 @@ def filtered_urls(request):
statuses = Urls.STATUS_ENUM.choices
searches = Search.objects.all()
sources = Source.objects.all()
# TODO: Cache languages, update once every N
languages = UrlContent.objects.distinct('language').values_list('language', flat=True)
# languages = [l for l in languages if l is not None]
# Get selected parameters
selected_status = request.GET.getlist('status')
selected_search = request.GET.getlist('search')
selected_source = request.GET.getlist('source')
selected_language = request.GET.getlist('language')
selected_days = request.GET.get("days", 30)
per_page = request.GET.get('per_page', 100) # Default is X URLs per page
page_number = request.GET.get('page') # Get the current page number
# charts = request.GET.get('charts', False)
# "Home" -> No parameters -> Override filter with default values
if ( len(request.GET.keys()) == 0 ):
# Override with default filters? [Case: no params update on URL] -> Only on "Home" click, or "Next page"
if (len(request.GET.keys()) == 0) or ((len(request.GET.keys()) == 1) and ("page" in request.GET.keys())):
selected_status = [str(status[0]) for status in statuses]
selected_search = [str(search.id) for search in searches]
selected_source = [str(source.id) for source in sources]
selected_language = languages
# Filter URLs based on selected filters
if ('' in selected_status) or ('' in selected_search) or ('' in selected_source):
urls = []
else:
urls = Urls.objects.filter(
Q(urlssourcesearch__id_source__in=selected_source) &
Q(urlssourcesearch__id_search__in=selected_search) &
Q(status__in=selected_status) &
query = Q(urlssourcesearch__id_source__in=selected_source) & \
Q(urlssourcesearch__id_search__in=selected_search) & \
Q(status__in=selected_status) & \
Q(ts_fetch__gte=now() - timedelta(days=float(selected_days)))
).distinct() # .order_by('-ts_fetch')
if selected_language:
query &= Q(urlcontent__language__in=selected_language)
urls = Urls.objects.filter(query).distinct() # .order_by('-ts_fetch')
# Custom replace search type
for s in searches:
@@ -327,22 +261,31 @@ def filtered_urls(request):
searches_map = {
url.id: list(Search.objects.filter(urlssourcesearch__id_url=url).distinct()) for url in page_obj.object_list
}
url_content_map = {
url.id: UrlContent.objects.filter(pk=url).first() for url in page_obj.object_list
}
context = {
'urls': page_obj, # Pass the paginated URLs
'per_page': per_page, # Send per_page value for dynamic pagination
'statuses': statuses,
'searches': searches,
'sources': sources,
'searches': sorted(searches, key=lambda x: (x.type, x.search)),
'sources': sorted(sources, key=lambda x: x.source),
'languages': sorted(languages, key=lambda x: (x is None, x)),
# Selection
'selected_status': selected_status,
'selected_search': selected_search,
'selected_source': selected_source,
'selected_language': selected_language,
"selected_days": selected_days,
# Map
"sources_map": sources_map,
"searches_map": searches_map,
"url_content_map": url_content_map,
# "charts": charts,
# "list_per_page": [15, 100, 500],
# "list_days_text": ([0.25, 1, 7, 30, 365], ["Last 6 hours", "Last 24 hours", "Last 7 days", "Last 30 days", "Last 365 days"])
}
return render(request, 'filtered_urls.html', context)
####################################################################################################

View File

@@ -19,6 +19,6 @@ from django.urls import path, include
urlpatterns = [
path('admin/', admin.site.urls),
path('api/', include('api.urls')),
path('scheduler/', include('scheduler.urls')),
path('', include('api.urls')),
]