diff --git a/app_urls/README.md b/app_urls/README.md index d4162a7..cdf59ec 100644 --- a/app_urls/README.md +++ b/app_urls/README.md @@ -85,6 +85,10 @@ REDIS_PORT=${REDIS_PORT:-6379} RQ_DEFAULT_TIMEOUT=${RQ_DEFAULT_TIMEOUT:-900} # Default RQ job queue TTL RQ_DEFAULT_RESULT_TTL=${RQ_DEFAULT_RESULT_TTL:-3600} + +# Logs path +PATH_LOGS_ERROR=logs/log_app_fetcher_error.log +PATH_LOGS=logs/log_app_fetcher.log ``` * Deploy diff --git a/app_urls/api/src/fetch_search_utils.py b/app_urls/api/src/fetch_search_utils.py index e6cea03..32df8f8 100644 --- a/app_urls/api/src/fetch_search_utils.py +++ b/app_urls/api/src/fetch_search_utils.py @@ -62,7 +62,7 @@ def search_gnews(keyword_search, period="1d", language="en", country="US", max_r def search_ddg(keyword_search, category="news", timelimit="d", max_results=None, region="wt-wt"): # [source] [category] [period] [language-country] [max_results] - source = "ddg {} {} {} max_results={}".format(category, timelimit, region, max_results).replace("None", "").strip() + source = "ddg {} {} {} max_results={}".format(category, timelimit, region, max_results).replace("max_results=None", "").strip() logger.debug("Searching: {} --- Source:{}".format(keyword_search, source)) # region="{}-{}".format(language, country.lower()) diff --git a/app_urls/api/src/logger.py b/app_urls/api/src/logger.py index c2cae1d..93a82a4 100644 --- a/app_urls/api/src/logger.py +++ b/app_urls/api/src/logger.py @@ -1,6 +1,10 @@ import logging - import os + +''' TODO: PATH LOGS +PATH_LOGS_ERROR=logs/log_app_fetcher_error.log +PATH_LOGS=logs/log_app_fetcher.log +''' os.makedirs("logs", exist_ok=True) logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s') diff --git a/app_urls/api/src/url_processor.py b/app_urls/api/src/url_processor.py index 1c2e04d..90a575d 100644 --- a/app_urls/api/src/url_processor.py +++ b/app_urls/api/src/url_processor.py @@ -50,21 +50,21 @@ def process_url(url): except newspaper.ArticleException as e: # Too many requests? 
Cool down... - if ("Status code 429" in str(e)): + if ("Status code 429" in str(e.args)): # TODO: cool down and retry once?, proxy/VPN, ... logger.debug("TODO: Implement code 429") # Unavailable for legal reasons - if ("Status code 451" in str(e)): + if ("Status code 451" in str(e.args)): # TODO: Bypass with VPN logger.debug("TODO: Implement code 451") # CloudFlare protection? - if ("Website protected with Cloudflare" in str(e)): + if ("Website protected with Cloudflare" in str(e.args)): logger.debug("TODO: Implement bypass CloudFlare") # PerimeterX protection? - if ("Website protected with PerimeterX" in str(e)): + if ("Website protected with PerimeterX" in str(e.args)): logger.debug("TODO: Implement bypass PerimeterX") - logger.warning("ArticleException for input URL {}\n{}".format(url, str(e))) + logger.warning("ArticleException for input URL {}\n{}".format(url, str(e.args))) return None except Exception as e: logger.warning("Exception for input URL {}\n{}".format(url, str(e))) diff --git a/app_urls/api/templates/charts.html b/app_urls/api/templates/charts.html new file mode 100644 index 0000000..f39445d --- /dev/null +++ b/app_urls/api/templates/charts.html @@ -0,0 +1,294 @@ + + + + + + Charts + + + + + +

Data Visualizations

+ + +
+ + +
+ +
+
+ +
+ +
+ +
+ +
+ +
+ +
+ +
+
+ + + + diff --git a/app_urls/api/templates/filtered_urls.html b/app_urls/api/templates/filtered_urls.html new file mode 100644 index 0000000..2be7028 --- /dev/null +++ b/app_urls/api/templates/filtered_urls.html @@ -0,0 +1,277 @@ + + + + + + URLs + + + + + + +
+ + +
+ + + + + + + + + + + + + {% for url in urls %} + + + + + + + + + {% empty %} + + + + {% endfor %} + +
IDURLStatusFetch DateSearchSource
{{ url.id }}{{ url.url }} + {% if url.status == 'raw' %} + {{ url.status|capfirst }} + {% elif url.status == 'error' %} + {{ url.status|capfirst }} + {% elif url.status == 'valid' %} + {{ url.status|capfirst }} + {% elif url.status == 'unknown' %} + {{ url.status|capfirst }} + {% elif url.status == 'invalid' %} + {{ url.status|capfirst }} + {% elif url.status == 'duplicate' %} + {{ url.status|capfirst }} + {% else %} + Unknown + {% endif %} + {{ url.ts_fetch }} + {% for search in url.urlssourcesearch_set.all %} + {{ search.id_search.search }}
+ {% endfor %} +
+ {% for source in url.urlssourcesearch_set.all %} + {{ source.id_source.source }}
+ {% endfor %} +
No URLs found for the selected filters.
+
+
+ + + + + + + + diff --git a/app_urls/api/urls.py b/app_urls/api/urls.py index aa416ad..34e839f 100644 --- a/app_urls/api/urls.py +++ b/app_urls/api/urls.py @@ -3,8 +3,20 @@ from . import views urlpatterns = [ path('', views.link_list, name='link_list'), + # + path('logs', views.logs, name='logs'), + path('logs_error', views.logs_error, name='logs_error'), + # + path('charts/', views.charts, name='charts'), + path('urls-by-fetch-date/', views.urls_by_fetch_date, name='urls_by_fetch_date'), + path('urls-per-status/', views.urls_per_status, name='urls_per_status'), + path('urls-per-source/', views.urls_per_source, name='urls_per_source'), + path('urls-per-search/', views.urls_per_search, name='urls_per_search'), + # + path('filtered-urls/', views.filtered_urls, name='filtered_urls'), + # path('url/', views.urls, name='url_detail'), path('url//', views.url_detail_view, name='url_detail'), path('url//fetch/', views.fetch_details, name='fetch_details'), - path('task/', views.trigger_task, name='trigger_task'), + path('task/', views.trigger_task, name='trigger_task'), ] diff --git a/app_urls/api/views.py b/app_urls/api/views.py index 5d84e11..30b4411 100644 --- a/app_urls/api/views.py +++ b/app_urls/api/views.py @@ -25,6 +25,11 @@ def link_list(request): "http://localhost:8000/admin", # URLs "http://localhost:8000/api/url", + # Charts + "http://localhost:8000/api/charts", + # Logs + "http://localhost:8000/api/logs", + "http://localhost:8000/api/logs_error", # API tasks ] + [os.path.join(prefix, l) for l in links] # Json @@ -98,7 +103,7 @@ def urls(request): return render(request, "urls.html", context) - +#################################################################################################### class OllamaClient(): def __init__(self): self.client = ollama.Client(host=os.getenv("ENDPOINT_OLLAMA", "https://ollamamodel.matitos.org")) @@ -170,3 +175,137 @@ def fetch_details(request, id): yield chunk["message"]["content"] # Stream each chunk of text return 
StreamingHttpResponse(stream_response(), content_type="text/plain")
# ^ Expression of the `return` statement that begins at the end of the previous line.


####################################################################################################
# Chart, log and filter views.
#
# `os` is not imported here: the module already uses `os.getenv` / `os.path.join` above this
# section, so it is in scope from the file-level imports.
from datetime import timedelta

from django.db.models import Count
from django.http import HttpResponse, JsonResponse
from django.shortcuts import render
from django.utils import timezone

from .models import Search, Source, Urls, UrlsSourceSearch

# Look-back window used when `?days=` is absent or unparsable.
_DEFAULT_LOOKBACK_DAYS = 30
# Upper bound for `?days=` so that `now - timedelta(days=...)` cannot overflow the date range.
_MAX_LOOKBACK_DAYS = 36500


def _lookback_start(request, default_days=_DEFAULT_LOOKBACK_DAYS):
    """Return the datetime `days` days before now, reading `days` from the query string.

    Args:
        request: Request whose `GET` mapping may carry a `days` value.
        default_days: Window used when `days` is missing or is not an integer.

    Returns:
        `timezone.now()` minus the requested number of days, clamped to
        the range [0, _MAX_LOOKBACK_DAYS].
    """
    raw_days = request.GET.get('days', default_days)
    try:
        days = int(raw_days)
    except (TypeError, ValueError):
        # A malformed value (e.g. `?days=abc`) falls back to the default instead of a 500.
        days = default_days
    days = max(0, min(days, _MAX_LOOKBACK_DAYS))
    return timezone.now() - timedelta(days=days)


def _grouped_counts(queryset, group_field, count_field, label):
    """Group `queryset` by `group_field` and count `count_field` per group.

    Returns:
        A dict with two parallel lists: `label` (the group values, ordered by
        `group_field`) and `'counts'` (the count for each group).
    """
    rows = list(
        queryset.values(group_field)
        .annotate(count=Count(count_field))
        .order_by(group_field)
    )
    return {
        label: [row[group_field] for row in rows],
        'counts': [row['count'] for row in rows],
    }


def charts(request):
    """Render the `charts.html` template."""
    return render(request, 'charts.html')


def urls_by_fetch_date(request):
    """Return, as JSON, the number of URLs fetched per day.

    Query parameters:
        days: Optional look-back window in days (default 30).

    Returns:
        JsonResponse with parallel lists `dates` and `counts`.
    """
    recent = Urls.objects.filter(ts_fetch__gte=_lookback_start(request))
    return JsonResponse(_grouped_counts(recent, 'ts_fetch__date', 'id', 'dates'))


def urls_per_status(request):
    """Return, as JSON, the number of URLs per status within the look-back window.

    Query parameters:
        days: Optional look-back window in days (default 30).

    Returns:
        JsonResponse with parallel lists `statuses` and `counts`.
    """
    recent = Urls.objects.filter(ts_fetch__gte=_lookback_start(request))
    return JsonResponse(_grouped_counts(recent, 'status', 'id', 'statuses'))


def urls_per_source(request):
    """Return, as JSON, the number of URL links per source.

    Returns:
        JsonResponse with parallel lists `sources` and `counts`.
    """
    return JsonResponse(
        _grouped_counts(UrlsSourceSearch.objects, 'id_source__source', 'id_url', 'sources')
    )


def urls_per_search(request):
    """Return, as JSON, the number of URL links per search.

    Returns:
        JsonResponse with parallel lists `searches` and `counts`.
    """
    return JsonResponse(
        _grouped_counts(UrlsSourceSearch.objects, 'id_search__search', 'id_url', 'searches')
    )


####################################################################################################
def _plain_text_log(env_var, default_path):
    """Return the content of a log file as a `text/plain` response.

    Args:
        env_var: Environment variable that may override the log path.
        default_path: Path used when `env_var` is not set.

    Returns:
        HttpResponse with the file content, or a 404 plain-text response when
        the file does not exist.
    """
    # NOTE(review): no access check is visible in these views, and logs can contain
    # sensitive data. Confirm the routes are protected elsewhere.
    log_path = os.getenv(env_var, default_path)
    try:
        # The encoding stays at the platform default because the log writer's encoding is not
        # visible here; `errors="replace"` keeps a stray undecodable byte from raising.
        with open(log_path, "r", errors="replace") as log_file:
            content = log_file.read()
    except FileNotFoundError:
        # The log file may not have been created yet.
        return HttpResponse("Log file not found.\n", content_type="text/plain", status=404)
    return HttpResponse(content, content_type="text/plain")


def logs_error(request):
    """Serve the error log (`PATH_LOGS_ERROR`) as plain text."""
    return _plain_text_log("PATH_LOGS_ERROR", "logs/log_app_fetcher_error.log")


def logs(request):
    """Serve the application log (`PATH_LOGS`) as plain text."""
    return _plain_text_log("PATH_LOGS", "logs/log_app_fetcher.log")


####################################################################################################
def filtered_urls(request):
    """Render `filtered_urls.html` with the URLs matching the selected filters.

    Query parameters (each may be repeated):
        status: Status values to keep.
        search: Search ids to keep.
        source: Source ids to keep.

    When the query string is empty, every status, search and source is
    selected. When it is not empty, a filter with no selected values is not
    applied.
    """
    statuses = Urls.STATUS_ENUM.choices
    searches = Search.objects.all()
    sources = Source.objects.all()

    if request.GET:
        selected_status = request.GET.getlist('status')
        selected_search = request.GET.getlist('search')
        selected_source = request.GET.getlist('source')
    else:
        # No filters submitted: select everything by default.
        selected_status = [str(status[0]) for status in statuses]
        selected_search = [str(search.id) for search in searches]
        selected_source = [str(source.id) for source in sources]

    urls = Urls.objects.all()
    if selected_status:
        urls = urls.filter(status__in=selected_status)
    if selected_search:
        urls = urls.filter(urlssourcesearch__id_search__in=selected_search)
    if selected_source:
        urls = urls.filter(urlssourcesearch__id_source__in=selected_source)
    if selected_search or selected_source:
        # Filtering through the multi-valued `urlssourcesearch` relation joins one row per
        # matching link, so a URL linked to several searches/sources would be listed repeatedly.
        urls = urls.distinct()

    context = {
        'urls': urls,
        'statuses': statuses,
        'searches': searches,
        'sources': sources,
        'selected_status': selected_status,
        'selected_search': selected_search,
        'selected_source': selected_source,
    }

    return render(request, 'filtered_urls.html', context)

####################################################################################################