URLs view refactor, article exception handling, visualize logs, charts

This commit is contained in:
Luciano Gervasoni
2025-03-26 14:28:57 +01:00
parent 9d2550b374
commit e1f4787119
8 changed files with 739 additions and 9 deletions

View File

@@ -85,6 +85,10 @@ REDIS_PORT=${REDIS_PORT:-6379}
RQ_DEFAULT_TIMEOUT=${REDIS_PORT:-900}
# Default RQ job queue TTL
RQ_DEFAULT_RESULT_TTL=${RQ_DEFAULT_RESULT_TTL:-3600}
# Logs path
PATH_LOGS_ERROR=logs/log_app_fetcher_error.log
PATH_LOGS=logs/log_app_fetcher.log
```
* Deploy

View File

@@ -62,7 +62,7 @@ def search_gnews(keyword_search, period="1d", language="en", country="US", max_r
def search_ddg(keyword_search, category="news", timelimit="d", max_results=None, region="wt-wt"):
# [source] [category] [period] [language-country] [max_results]
source = "ddg {} {} {} max_results={}".format(category, timelimit, region, max_results).replace("None", "").strip()
source = "ddg {} {} {} max_results={}".format(category, timelimit, region, max_results).replace("max_results=None", "").strip()
logger.debug("Searching: {} --- Source:{}".format(keyword_search, source))
# region="{}-{}".format(langauge, country.lower())

View File

@@ -1,6 +1,10 @@
import logging
import os
''' TODO: PATH LOGS
PATH_LOGS_ERROR=logs/log_app_fetcher_error.log
PATH_LOGS=logs/log_app_fetcher.log
'''
os.makedirs("logs", exist_ok=True)
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')

View File

@@ -50,21 +50,21 @@ def process_url(url):
except newspaper.ArticleException as e:
# Too many requests? Cool down...
if ("Status code 429" in str(e)):
if ("Status code 429" in str(e.args)):
# TODO: cool down and retry once?, proxy/VPN, ...
logger.debug("TODO: Implement code 429")
# Unavailable for legal reasons
if ("Status code 451" in str(e)):
if ("Status code 451" in str(e.args)):
# TODO: Bypass with VPN
logger.debug("TODO: Implement code 451")
# CloudFlare protection?
if ("Website protected with Cloudflare" in str(e)):
if ("Website protected with Cloudflare" in str(e.args)):
logger.debug("TODO: Implement bypass CloudFlare")
# PerimeterX protection?
if ("Website protected with PerimeterX" in str(e)):
if ("Website protected with PerimeterX" in str(e.args)):
logger.debug("TODO: Implement bypass PerimeterX")
logger.warning("ArticleException for input URL {}\n{}".format(url, str(e)))
logger.warning("ArticleException for input URL {}\n{}".format(url, str(e.args)))
return None
except Exception as e:
logger.warning("Exception for input URL {}\n{}".format(url, str(e)))

View File

@@ -0,0 +1,294 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Charts</title>
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
<style>
body {
background-color: #333;
color: #fff;
font-family: Arial, sans-serif;
}
h2 {
color: #fff;
text-align: center;
margin-bottom: 40px;
}
.chart-container {
width: 45%;
display: inline-block;
margin: 20px;
background-color: #444;
border-radius: 10px;
padding: 5px;
}
canvas {
background-color: #2c2c2c;
border-radius: 5px;
}
.container {
display: flex;
justify-content: center;
flex-wrap: wrap;
}
.filter-container {
text-align: center;
margin-bottom: 20px;
}
select {
padding: 8px;
background-color: #555;
color: white;
border: 1px solid #444;
border-radius: 5px;
}
</style>
</head>
<body>
<h2>Data Visualizations</h2>
<!-- Filter for Number of Days -->
<div class="filter-container">
<label for="daysFilter">Select Number of Days:</label>
<select id="daysFilter">
<option value="1">Last 24 Hours</option>
<option value="3">Last 3 Days</option>
<option value="7" selected>Last 7 Days</option>
<option value="30">Last 30 Days</option>
<option value="90">Last 90 Days</option>
<option value="365">Last 365 Days</option>
</select>
</div>
<div class="container">
<div class="chart-container">
<canvas id="urlFetchDateChart"></canvas>
</div>
<div class="chart-container">
<canvas id="urlStatusChart"></canvas>
</div>
<div class="chart-container">
<canvas id="urlsPerSourceChart"></canvas>
</div>
<div class="chart-container">
<canvas id="urlsPerSearchChart"></canvas>
</div>
</div>
<script>
$(document).ready(function () {
// Fetch initial data (default 30 days)
const defaultDays = 7;
fetchDataAndRenderCharts(defaultDays);
// Apply the filter automatically when the user changes the selection
$('#daysFilter').change(function () {
const selectedDays = $(this).val();
fetchDataAndRenderCharts(selectedDays);
});
});
function fetchDataAndRenderCharts(days) {
// Fetch and render the URL Fetch Date chart
$.getJSON(`/api/urls-by-fetch-date/?days=${days}`, function (data) {
renderUrlFetchDateChart(data);
});
// Fetch and render the URL Status chart (with dynamic date filtering)
$.getJSON(`/api/urls-per-status/?days=${days}`, function (data) {
renderUrlStatusChart(data);
});
// Fetch and render the URLs per Source chart
$.getJSON(`/api/urls-per-source/?days=${days}`, function (data) {
renderUrlsPerSourceChart(data);
});
// Fetch and render the URLs per Search chart
$.getJSON(`/api/urls-per-search/?days=${days}`, function (data) {
renderUrlsPerSearchChart(data);
});
}
function renderUrlFetchDateChart(data) {
new Chart(document.getElementById("urlFetchDateChart"), {
type: 'bar',
data: {
labels: data.dates,
datasets: [{
label: 'URLs by Fetch Date',
data: data.counts,
backgroundColor: 'blue',
}]
},
options: {
responsive: true,
plugins: {
legend: {
labels: {
color: '#fff' // Change the legend text color to white
}
}
},
scales: {
x: {
ticks: {
color: "#fff" // Set x-axis ticks color
},
grid: {
color: "#444" // Set grid lines color
}
},
y: {
ticks: {
color: "#fff" // Set y-axis ticks color
},
grid: {
color: "#444" // Set grid lines color
}
}
}
}
});
}
function renderUrlStatusChart(data) {
new Chart(document.getElementById("urlStatusChart"), {
type: 'bar',
data: {
labels: data.statuses,
datasets: [{
label: 'URLs by Status',
data: data.counts,
backgroundColor: 'green',
}]
},
options: {
responsive: true,
plugins: {
legend: {
labels: {
color: '#fff' // Change the legend text color to white
}
}
},
scales: {
x: {
ticks: {
color: "#fff" // Set x-axis ticks color
},
grid: {
color: "#444" // Set grid lines color
}
},
y: {
ticks: {
color: "#fff" // Set y-axis ticks color
},
grid: {
color: "#444" // Set grid lines color
}
}
}
}
});
}
function renderUrlsPerSourceChart(data) {
new Chart(document.getElementById("urlsPerSourceChart"), {
type: 'bar',
data: {
labels: data.sources,
datasets: [{
label: 'URLs by Source',
data: data.counts,
backgroundColor: 'purple',
}]
},
options: {
responsive: true,
plugins: {
legend: {
labels: {
color: '#fff' // Change the legend text color to white
}
}
},
scales: {
x: {
ticks: {
color: "#fff" // Set x-axis ticks color
},
grid: {
color: "#444" // Set grid lines color
}
},
y: {
ticks: {
color: "#fff" // Set y-axis ticks color
},
grid: {
color: "#444" // Set grid lines color
}
}
}
}
});
}
function renderUrlsPerSearchChart(data) {
new Chart(document.getElementById("urlsPerSearchChart"), {
type: 'bar',
data: {
labels: data.searches,
datasets: [{
label: 'URLs by Search',
data: data.counts,
backgroundColor: 'orange',
}]
},
options: {
responsive: true,
plugins: {
legend: {
labels: {
color: '#fff' // Change the legend text color to white
}
}
},
scales: {
x: {
ticks: {
color: "#fff" // Set x-axis ticks color
},
grid: {
color: "#444" // Set grid lines color
}
},
y: {
ticks: {
color: "#fff" // Set y-axis ticks color
},
grid: {
color: "#444" // Set grid lines color
}
}
}
}
});
}
</script>
</body>
</html>

View File

@@ -0,0 +1,277 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>URLs</title>
<!--
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
-->
<style>
/* General Styling */
body {
font-family: Arial, sans-serif;
margin: 0;
padding: 0;
background-color: #fff;
color: #333;
/*transition: background 0.3s ease, color 0.3s ease;*/
}
/* Dark Mode Styles */
.dark-mode {
background-color: #121212;
color: #e0e0e0;
}
/* Default Link Style */
a {
color: #0066cc; /* Default color for links */
text-decoration: none;
}
a:hover {
text-decoration: underline;
}
/* Dark Mode Links */
.dark-mode a {
color: #52a8ff; /* Adjust this color to make the link more visible in dark mode */
}
.dark-mode a:hover {
color: #66ccff; /* Change the hover color to something lighter or a contrasting color */
}
/* Layout */
.container {
display: flex;
}
/* Sidebar */
.sidebar {
width: 250px;
padding: 10px;
background-color: #f4f4f4;
margin-right: 20px;
overflow-x: hidden;
white-space: normal;
word-wrap: break-word;
word-break: break-word;
transition: background 0.1s ease, color 0.1s ease;
}
.dark-mode .sidebar {
background-color: #1e1e1e;
}
/* Sidebar Headers */
.sidebar h3 {
margin-top: 5px;
font-size: 16px;
}
/* Table Container */
.table-container {
flex-grow: 1;
}
/* Table */
table {
width: 97.5%;
border-collapse: collapse;
margin-top: 20px;
}
table, th, td {
border: 1px solid #ddd;
}
th, td {
padding: 10px;
text-align: left;
}
/* Dark Mode Table */
.dark-mode table {
border-color: #444;
}
.dark-mode th, .dark-mode td {
border-color: #555;
}
/* Dark Mode Checkbox Labels */
.dark-mode label {
color: #e0e0e0;
}
/* Checkbox Styling */
input[type="checkbox"] {
cursor: pointer;
}
/* Themed Toggle Button */
.theme-button {
background-color: var(--sidebar);
border: 1px solid var(--sidebar);
border-radius: 50%;
width: 45px;
height: 45px;
font-size: 25px;
display: flex;
align-items: center;
justify-content: center;
transition: background-color 0.1s, color 0.1s, transform 0.1s;
cursor: pointer;
}
.theme-button:hover {
transform: rotate(20deg);
}
.theme-button:active {
transform: scale(0.95);
}
</style>
</head>
<body>
<div class="container">
<div class="sidebar">
<button id="themeToggle" class="theme-button">🌙</button>
<form method="GET" action="" id="filterForm">
<!-- Filter by Status -->
<h3>Status</h3>
{% for status in statuses %}
<label>
<input type="checkbox" name="status" value="{{ status.0 }}"
{% if status.0 in selected_status %}checked{% endif %}>
{{ status.1 }}
</label><br>
{% endfor %}
<!-- Filter by Search -->
<h3>Search</h3>
{% for search in searches %}
<label>
<input type="checkbox" name="search" value="{{ search.id }}"
{% if search.id|stringformat:"s" in selected_search %}checked{% endif %}>
[{{ search.type }}] {{ search.search }}
</label><br>
{% endfor %}
<!-- Filter by Source -->
<h3>Source</h3>
{% for source in sources %}
<label>
<input type="checkbox" name="source" value="{{ source.id }}"
{% if source.id|stringformat:"s" in selected_source %}checked{% endif %}>
{{ source.source }}
</label><br>
{% endfor %}
</form>
</div>
<div class="table-container">
<table>
<thead>
<tr>
<th>ID</th>
<th>URL</th>
<th>Status</th>
<th>Fetch Date</th>
<th>Search</th>
<th>Source</th>
</tr>
</thead>
<tbody>
{% for url in urls %}
<tr>
<td><a href="./{{ url.id }}" class="btn btn-primary btn-sm" target="_blank">{{ url.id }}</a></td>
<td><a href="{{ url.url }}/" target="_blank">{{ url.url }}</a></td>
<td>
{% if url.status == 'raw' %}
<span class="badge bg-secondary">{{ url.status|capfirst }}</span>
{% elif url.status == 'error' %}
<span class="badge bg-danger">{{ url.status|capfirst }}</span>
{% elif url.status == 'valid' %}
<span class="badge bg-success">{{ url.status|capfirst }}</span>
{% elif url.status == 'unknown' %}
<span class="badge bg-warning">{{ url.status|capfirst }}</span>
{% elif url.status == 'invalid' %}
<span class="badge bg-danger">{{ url.status|capfirst }}</span>
{% elif url.status == 'duplicate' %}
<span class="badge bg-info">{{ url.status|capfirst }}</span>
{% else %}
<span class="badge bg-light">Unknown</span>
{% endif %}
</td>
<td>{{ url.ts_fetch }}</td>
<td>
{% for search in url.urlssourcesearch_set.all %}
{{ search.id_search.search }}<br>
{% endfor %}
</td>
<td>
{% for source in url.urlssourcesearch_set.all %}
{{ source.id_source.source }}<br>
{% endfor %}
</td>
</tr>
{% empty %}
<tr>
<td colspan="5">No URLs found for the selected filters.</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
</div>
<!-- Passing the selected filters as JavaScript variables -->
<script type="text/javascript">
// Make sure these variables are accessible in your JavaScript
var selectedStatus = {{ selected_status|safe }};
var selectedSearch = {{ selected_search|safe }};
var selectedSource = {{ selected_source|safe }};
</script>
<script>
// Automatically submit the form when any checkbox changes
document.querySelectorAll('input[type="checkbox"]').forEach(function(checkbox) {
checkbox.addEventListener('change', function() {
// Automatically submit the form when a checkbox is toggled
document.getElementById('filterForm').submit();
});
});
document.addEventListener("DOMContentLoaded", function () {
const themeToggle = document.getElementById("themeToggle");
const body = document.body;
// Load theme from localStorage
if (localStorage.getItem("theme") === "dark") {
body.classList.add("dark-mode");
themeToggle.textContent = "🌞";
}
// Toggle theme on button click
themeToggle.addEventListener("click", function () {
if (body.classList.contains("dark-mode")) {
body.classList.remove("dark-mode");
localStorage.setItem("theme", "light");
themeToggle.textContent = "🌙";
} else {
body.classList.add("dark-mode");
localStorage.setItem("theme", "dark");
themeToggle.textContent = "🌞";
}
});
});
</script>
</body>
</html>

View File

@@ -3,8 +3,20 @@ from . import views
urlpatterns = [
path('', views.link_list, name='link_list'),
#
path('logs', views.logs, name='logs'),
path('logs_error', views.logs_error, name='logs_error'),
#
path('charts/', views.charts, name='charts'),
path('urls-by-fetch-date/', views.urls_by_fetch_date, name='urls_by_fetch_date'),
path('urls-per-status/', views.urls_per_status, name='urls_per_status'),
path('urls-per-source/', views.urls_per_source, name='urls_per_source'),
path('urls-per-search/', views.urls_per_search, name='urls_per_search'),
#
path('filtered-urls/', views.filtered_urls, name='filtered_urls'),
#
path('url/', views.urls, name='url_detail'),
path('url/<int:id>/', views.url_detail_view, name='url_detail'),
path('url/<int:id>/fetch/', views.fetch_details, name='fetch_details'),
path('task/<str:task>', views.trigger_task, name='trigger_task'),
path('task/<str:task>', views.trigger_task, name='trigger_task'),
]

View File

@@ -25,6 +25,11 @@ def link_list(request):
"http://localhost:8000/admin",
# URLs
"http://localhost:8000/api/url",
# Charts
"http://localhost:8000/api/charts",
# Logs
"http://localhost:8000/api/logs",
"http://localhost:8000/api/logs_error",
# API tasks
] + [os.path.join(prefix, l) for l in links]
# Json
@@ -98,7 +103,7 @@ def urls(request):
return render(request, "urls.html", context)
####################################################################################################
class OllamaClient():
def __init__(self):
self.client = ollama.Client(host=os.getenv("ENDPOINT_OLLAMA", "https://ollamamodel.matitos.org"))
@@ -170,3 +175,137 @@ def fetch_details(request, id):
yield chunk["message"]["content"] # Stream each chunk of text
return StreamingHttpResponse(stream_response(), content_type="text/plain")
####################################################################################################
from django.shortcuts import render
from django.http import JsonResponse
from django.db.models import Count
from datetime import timedelta
from django.utils import timezone
from .models import Urls, UrlsSourceSearch
def charts(request):
return render(request, 'charts.html')
def urls_by_fetch_date(request):
# Get the date for 30 days ago
start_date = timezone.now() - timedelta(days=30)
# Count the number of URLs grouped by fetch date
urls_data = Urls.objects.filter(ts_fetch__gte=start_date) \
.values('ts_fetch__date') \
.annotate(count=Count('id')) \
.order_by('ts_fetch__date')
# Format data to return as JSON
data = {
'dates': [item['ts_fetch__date'] for item in urls_data],
'counts': [item['count'] for item in urls_data],
}
return JsonResponse(data)
def urls_per_status(request):
# Get the filtering date parameter
days = int(request.GET.get('days', 30)) # Default is 30 days
start_date = timezone.now() - timedelta(days=days)
# Count the number of URLs grouped by status within the date range
urls_data = Urls.objects.filter(ts_fetch__gte=start_date) \
.values('status') \
.annotate(count=Count('id')) \
.order_by('status')
# Format data for JSON
data = {
'statuses': [item['status'] for item in urls_data],
'counts': [item['count'] for item in urls_data],
}
return JsonResponse(data)
def urls_per_source(request):
# Count the number of URLs grouped by source
urls_data = UrlsSourceSearch.objects \
.values('id_source__source') \
.annotate(count=Count('id_url')) \
.order_by('id_source__source')
# Format data for JSON
data = {
'sources': [item['id_source__source'] for item in urls_data],
'counts': [item['count'] for item in urls_data],
}
return JsonResponse(data)
def urls_per_search(request):
# Count the number of URLs grouped by search
urls_data = UrlsSourceSearch.objects \
.values('id_search__search') \
.annotate(count=Count('id_url')) \
.order_by('id_search__search')
# Format data for JSON
data = {
'searches': [item['id_search__search'] for item in urls_data],
'counts': [item['count'] for item in urls_data],
}
return JsonResponse(data)
####################################################################################################
from django.http import HttpResponse
def logs_error(request):
with open(os.getenv("PATH_LOGS_ERROR", "logs/log_app_fetcher_error.log"), "r") as f:
file_content = f.read()
return HttpResponse(file_content, content_type="text/plain")
def logs(request):
with open(os.getenv("PATH_LOGS", "logs/log_app_fetcher.log"), "r") as f:
file_content = f.read()
return HttpResponse(file_content, content_type="text/plain")
####################################################################################################
from django.shortcuts import render
from .models import Urls, Search, Source
def filtered_urls(request):
statuses = Urls.STATUS_ENUM.choices
searches = Search.objects.all()
sources = Source.objects.all()
# Check if filters are applied; if not, select all by default
if not request.GET:
selected_status = [str(status[0]) for status in statuses]
selected_search = [str(search.id) for search in searches]
selected_source = [str(source.id) for source in sources]
else:
selected_status = request.GET.getlist('status')
selected_search = request.GET.getlist('search')
selected_source = request.GET.getlist('source')
# Filter URLs based on selected filters
urls = Urls.objects.all()
if selected_status:
urls = urls.filter(status__in=selected_status)
if selected_search:
urls = urls.filter(urlssourcesearch__id_search__in=selected_search)
if selected_source:
urls = urls.filter(urlssourcesearch__id_source__in=selected_source)
context = {
'urls': urls,
'statuses': statuses,
'searches': searches,
'sources': sources,
'selected_status': selected_status,
'selected_search': selected_search,
'selected_source': selected_source,
}
return render(request, 'filtered_urls.html', context)
####################################################################################################