diff --git a/app_urls/README.md b/app_urls/README.md
index d4162a7..cdf59ec 100644
--- a/app_urls/README.md
+++ b/app_urls/README.md
@@ -85,6 +85,10 @@ REDIS_PORT=${REDIS_PORT:-6379}
RQ_DEFAULT_TIMEOUT=${RQ_DEFAULT_TIMEOUT:-900}
# Default RQ job queue TTL
RQ_DEFAULT_RESULT_TTL=${RQ_DEFAULT_RESULT_TTL:-3600}
+
+# Log file paths (main application log and error log)
+PATH_LOGS_ERROR=logs/log_app_fetcher_error.log
+PATH_LOGS=logs/log_app_fetcher.log
```
* Deploy
diff --git a/app_urls/api/src/fetch_search_utils.py b/app_urls/api/src/fetch_search_utils.py
index e6cea03..32df8f8 100644
--- a/app_urls/api/src/fetch_search_utils.py
+++ b/app_urls/api/src/fetch_search_utils.py
@@ -62,7 +62,7 @@ def search_gnews(keyword_search, period="1d", language="en", country="US", max_r
def search_ddg(keyword_search, category="news", timelimit="d", max_results=None, region="wt-wt"):
# [source] [category] [period] [language-country] [max_results]
- source = "ddg {} {} {} max_results={}".format(category, timelimit, region, max_results).replace("None", "").strip()
+ source = "ddg {} {} {} max_results={}".format(category, timelimit, region, max_results).replace("max_results=None", "").strip()
logger.debug("Searching: {} --- Source:{}".format(keyword_search, source))
# region="{}-{}".format(langauge, country.lower())
diff --git a/app_urls/api/src/logger.py b/app_urls/api/src/logger.py
index c2cae1d..93a82a4 100644
--- a/app_urls/api/src/logger.py
+++ b/app_urls/api/src/logger.py
@@ -1,6 +1,10 @@
import logging
-
import os
+
''' TODO: read the log file paths from environment variables instead of hard-coding "logs/", e.g.:
PATH_LOGS_ERROR=logs/log_app_fetcher_error.log
PATH_LOGS=logs/log_app_fetcher.log
'''
os.makedirs("logs", exist_ok=True)
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
diff --git a/app_urls/api/src/url_processor.py b/app_urls/api/src/url_processor.py
index 1c2e04d..90a575d 100644
--- a/app_urls/api/src/url_processor.py
+++ b/app_urls/api/src/url_processor.py
@@ -50,21 +50,21 @@ def process_url(url):
except newspaper.ArticleException as e:
# Too many requests? Cool down...
- if ("Status code 429" in str(e)):
+ if ("Status code 429" in str(e.args)):
# TODO: cool down and retry once?, proxy/VPN, ...
logger.debug("TODO: Implement code 429")
# Unavailable for legal reasons
- if ("Status code 451" in str(e)):
+ if ("Status code 451" in str(e.args)):
# TODO: Bypass with VPN
logger.debug("TODO: Implement code 451")
# CloudFlare protection?
- if ("Website protected with Cloudflare" in str(e)):
+ if ("Website protected with Cloudflare" in str(e.args)):
logger.debug("TODO: Implement bypass CloudFlare")
# PerimeterX protection?
- if ("Website protected with PerimeterX" in str(e)):
+ if ("Website protected with PerimeterX" in str(e.args)):
logger.debug("TODO: Implement bypass PerimeterX")
- logger.warning("ArticleException for input URL {}\n{}".format(url, str(e)))
+ logger.warning("ArticleException for input URL {}\n{}".format(url, str(e.args)))
return None
except Exception as e:
logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
diff --git a/app_urls/api/templates/charts.html b/app_urls/api/templates/charts.html
new file mode 100644
index 0000000..f39445d
--- /dev/null
+++ b/app_urls/api/templates/charts.html
@@ -0,0 +1,294 @@
+
+
+
+
+
+ Charts
+
+
+
+
+
+ Data Visualizations
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/app_urls/api/templates/filtered_urls.html b/app_urls/api/templates/filtered_urls.html
new file mode 100644
index 0000000..2be7028
--- /dev/null
+++ b/app_urls/api/templates/filtered_urls.html
@@ -0,0 +1,277 @@
+
+
+
+
+
+ URLs
+
+
+
+
+
+
+
+
+
+
+
+
+
+ | ID |
+ URL |
+ Status |
+ Fetch Date |
+ Search |
+ Source |
+
+
+
+ {% for url in urls %}
+
+ | {{ url.id }} |
+ {{ url.url }} |
+
+ {% if url.status == 'raw' %}
+ {{ url.status|capfirst }}
+ {% elif url.status == 'error' %}
+ {{ url.status|capfirst }}
+ {% elif url.status == 'valid' %}
+ {{ url.status|capfirst }}
+ {% elif url.status == 'unknown' %}
+ {{ url.status|capfirst }}
+ {% elif url.status == 'invalid' %}
+ {{ url.status|capfirst }}
+ {% elif url.status == 'duplicate' %}
+ {{ url.status|capfirst }}
+ {% else %}
+ Unknown
+ {% endif %}
+ |
+ {{ url.ts_fetch }} |
+
+ {% for search in url.urlssourcesearch_set.all %}
+ {{ search.id_search.search }}
+ {% endfor %}
+ |
+
+ {% for source in url.urlssourcesearch_set.all %}
+ {{ source.id_source.source }}
+ {% endfor %}
+ |
+
+ {% empty %}
+
+ | No URLs found for the selected filters. |
+
+ {% endfor %}
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/app_urls/api/urls.py b/app_urls/api/urls.py
index aa416ad..34e839f 100644
--- a/app_urls/api/urls.py
+++ b/app_urls/api/urls.py
@@ -3,8 +3,20 @@ from . import views
urlpatterns = [
path('', views.link_list, name='link_list'),
+ #
+ path('logs', views.logs, name='logs'),
+ path('logs_error', views.logs_error, name='logs_error'),
+ #
+ path('charts/', views.charts, name='charts'),
+ path('urls-by-fetch-date/', views.urls_by_fetch_date, name='urls_by_fetch_date'),
+ path('urls-per-status/', views.urls_per_status, name='urls_per_status'),
+ path('urls-per-source/', views.urls_per_source, name='urls_per_source'),
+ path('urls-per-search/', views.urls_per_search, name='urls_per_search'),
+ #
+ path('filtered-urls/', views.filtered_urls, name='filtered_urls'),
+ #
path('url/', views.urls, name='url_detail'),
path('url/<int:id>/', views.url_detail_view, name='url_detail'),
path('url/<int:id>/fetch/', views.fetch_details, name='fetch_details'),
- path('task/', views.trigger_task, name='trigger_task'),
+ path('task/', views.trigger_task, name='trigger_task'),
]
diff --git a/app_urls/api/views.py b/app_urls/api/views.py
index 5d84e11..30b4411 100644
--- a/app_urls/api/views.py
+++ b/app_urls/api/views.py
@@ -25,6 +25,11 @@ def link_list(request):
"http://localhost:8000/admin",
# URLs
"http://localhost:8000/api/url",
+ # Charts
+ "http://localhost:8000/api/charts",
+ # Logs
+ "http://localhost:8000/api/logs",
+ "http://localhost:8000/api/logs_error",
# API tasks
] + [os.path.join(prefix, l) for l in links]
# Json
@@ -98,7 +103,7 @@ def urls(request):
return render(request, "urls.html", context)
-
+####################################################################################################
class OllamaClient():
def __init__(self):
self.client = ollama.Client(host=os.getenv("ENDPOINT_OLLAMA", "https://ollamamodel.matitos.org"))
@@ -170,3 +175,137 @@ def fetch_details(request, id):
yield chunk["message"]["content"] # Stream each chunk of text
return StreamingHttpResponse(stream_response(), content_type="text/plain")
+
+
+####################################################################################################
+from django.shortcuts import render
+from django.http import JsonResponse
+from django.db.models import Count
+from datetime import timedelta
+from django.utils import timezone
+from .models import Urls, UrlsSourceSearch
+
+def charts(request):
+ return render(request, 'charts.html')
+
+def urls_by_fetch_date(request):
+ # Get the date for 30 days ago
+ start_date = timezone.now() - timedelta(days=30)
+
+ # Count the number of URLs grouped by fetch date
+ urls_data = Urls.objects.filter(ts_fetch__gte=start_date) \
+ .values('ts_fetch__date') \
+ .annotate(count=Count('id')) \
+ .order_by('ts_fetch__date')
+
+ # Format data to return as JSON
+ data = {
+ 'dates': [item['ts_fetch__date'] for item in urls_data],
+ 'counts': [item['count'] for item in urls_data],
+ }
+
+ return JsonResponse(data)
+
+def urls_per_status(request):
+ # Get the filtering date parameter
+ days = int(request.GET.get('days', 30)) # Default is 30 days
+ start_date = timezone.now() - timedelta(days=days)
+
+ # Count the number of URLs grouped by status within the date range
+ urls_data = Urls.objects.filter(ts_fetch__gte=start_date) \
+ .values('status') \
+ .annotate(count=Count('id')) \
+ .order_by('status')
+
+ # Format data for JSON
+ data = {
+ 'statuses': [item['status'] for item in urls_data],
+ 'counts': [item['count'] for item in urls_data],
+ }
+
+ return JsonResponse(data)
+
+def urls_per_source(request):
+ # Count the number of URLs grouped by source
+ urls_data = UrlsSourceSearch.objects \
+ .values('id_source__source') \
+ .annotate(count=Count('id_url')) \
+ .order_by('id_source__source')
+
+ # Format data for JSON
+ data = {
+ 'sources': [item['id_source__source'] for item in urls_data],
+ 'counts': [item['count'] for item in urls_data],
+ }
+
+ return JsonResponse(data)
+
+def urls_per_search(request):
+ # Count the number of URLs grouped by search
+ urls_data = UrlsSourceSearch.objects \
+ .values('id_search__search') \
+ .annotate(count=Count('id_url')) \
+ .order_by('id_search__search')
+
+ # Format data for JSON
+ data = {
+ 'searches': [item['id_search__search'] for item in urls_data],
+ 'counts': [item['count'] for item in urls_data],
+ }
+
+ return JsonResponse(data)
+
+####################################################################################################
+from django.http import HttpResponse
+
+def logs_error(request):
+ with open(os.getenv("PATH_LOGS_ERROR", "logs/log_app_fetcher_error.log"), "r") as f:
+ file_content = f.read()
+ return HttpResponse(file_content, content_type="text/plain")
+
+def logs(request):
+ with open(os.getenv("PATH_LOGS", "logs/log_app_fetcher.log"), "r") as f:
+ file_content = f.read()
+ return HttpResponse(file_content, content_type="text/plain")
+
+####################################################################################################
+from django.shortcuts import render
+from .models import Urls, Search, Source
+
+def filtered_urls(request):
+ statuses = Urls.STATUS_ENUM.choices
+ searches = Search.objects.all()
+ sources = Source.objects.all()
+
+ # Check if filters are applied; if not, select all by default
+ if not request.GET:
+ selected_status = [str(status[0]) for status in statuses]
+ selected_search = [str(search.id) for search in searches]
+ selected_source = [str(source.id) for source in sources]
+ else:
+ selected_status = request.GET.getlist('status')
+ selected_search = request.GET.getlist('search')
+ selected_source = request.GET.getlist('source')
+
+ # Filter URLs based on selected filters
+ urls = Urls.objects.all()
+ if selected_status:
+ urls = urls.filter(status__in=selected_status)
+ if selected_search:
+ urls = urls.filter(urlssourcesearch__id_search__in=selected_search)
+ if selected_source:
+ urls = urls.filter(urlssourcesearch__id_source__in=selected_source)
+
+ context = {
+ 'urls': urls,
+ 'statuses': statuses,
+ 'searches': searches,
+ 'sources': sources,
+ 'selected_status': selected_status,
+ 'selected_search': selected_search,
+ 'selected_source': selected_source,
+ }
+
+ return render(request, 'filtered_urls.html', context)
+
+####################################################################################################
\ No newline at end of file