From f44b784715a1ac98899c1443ad520533fdc26f00 Mon Sep 17 00:00:00 2001
From: Luciano Gervasoni
Date: Tue, 9 Sep 2025 22:06:23 +0200
Subject: [PATCH] Notifier: info and warning notifications, wrap sends in
 try/except

---
 app_urls/fetcher/src/notifier.py | 228 ++++++++++++++++---------------
 1 file changed, 118 insertions(+), 110 deletions(-)

diff --git a/app_urls/fetcher/src/notifier.py b/app_urls/fetcher/src/notifier.py
index d1a4913..86d245a 100644
--- a/app_urls/fetcher/src/notifier.py
+++ b/app_urls/fetcher/src/notifier.py
@@ -4,129 +4,50 @@ from ..models import Urls, Source, Search, UrlContent, UrlsSourceSearch, UrlsDup
 from django.db.models import Count
 import requests
 import os
+from .logger import get_logger
 
+logger = get_logger()
 
 def notify_telegram_info(last_hours, channel="INFO"):
-    start_date = timezone.now() - timedelta(hours=last_hours)
-
-    # Count the number of URLs grouped by status within the date range
-    urls_data_status = Urls.objects.filter(ts_fetch__gte=start_date) \
-        .values('status') \
-        .annotate(count=Count('id')) \
-        .order_by('status')
-
-    # Count the number of URLs grouped by source
-    urls_data_source = UrlsSourceSearch.objects \
-        .filter(id_url__ts_fetch__gte=start_date) \
-        .values('id_source__source') \
-        .annotate(count=Count('id_url')) \
-        .order_by('id_source__source')
-
-    # Count the number of URLs grouped by search
-    urls_data_search = UrlsSourceSearch.objects \
-        .filter(id_url__ts_fetch__gte=start_date) \
-        .values('id_search__search') \
-        .annotate(count=Count('id_url')) \
-        .order_by('id_search__search')
+    try:
+        start_date = timezone.now() - timedelta(hours=last_hours)
+
+        # Count the number of URLs grouped by status within the date range
+        urls_data_status = Urls.objects.filter(ts_fetch__gte=start_date) \
+            .values('status') \
+            .annotate(count=Count('id')) \
+            .order_by('status')
+
+        # Count the number of URLs grouped by source
+        urls_data_source = UrlsSourceSearch.objects \
+            .filter(id_url__ts_fetch__gte=start_date) \
+            .values('id_source__source') \
+            .annotate(count=Count('id_url')) \
+            .order_by('id_source__source')
+
+        # Count the number of URLs grouped by search
+        urls_data_search = UrlsSourceSearch.objects \
+            .filter(id_url__ts_fetch__gte=start_date) \
+            .values('id_search__search') \
+            .annotate(count=Count('id_url')) \
+            .order_by('id_search__search')
 
-    bot_token = os.environ.get("TELEGRAM_{}_BOT_TOKEN".format(channel), "")
-    chat_id = os.environ.get("TELEGRAM_{}_CHAT_ID".format(channel), "")
+        bot_token = os.environ.get("TELEGRAM_{}_BOT_TOKEN".format(channel), "")
+        chat_id = os.environ.get("TELEGRAM_{}_CHAT_ID".format(channel), "")
 
-    message = "During the last {} hours:\n".format(last_hours)
-
-    message += "\nURLs per status:\n"
-    for o in urls_data_status:
-        message += "  {}: {}\n".format(o.get("status"), o.get("count"))
-    message += "\nURLs per source:\n"
-    for o in urls_data_source:
-        message += "  {}: {}\n".format(o.get("id_source__source"), o.get("count"))
-    message += "\nURLs per search:\n"
-    for o in urls_data_search:
-        message += "  {}: {}\n".format(o.get("id_search__search"), o.get("count"))
-
-
-    url = f"https://api.telegram.org/bot{bot_token}/sendMessage"
-    params = {
-        "chat_id": chat_id,
-        "text": message
-    }
-
-    # POST
-    response = requests.post(url, params=params)
-
-
-def notify_telegram_warning(last_hours, channel="WARNING"):
-    # Message appending logic
-    message = ""
-
-    start_date = timezone.now() - timedelta(hours=last_hours)
-
-    # Count the number of URLs grouped by status within the date range
-    urls_data_status = Urls.objects.filter(ts_fetch__gte=start_date) \
-        .values('status') \
-        .annotate(count=Count('id')) \
-        .order_by('status')
-
-    # Build dictionary
-    urls_data_status_dict = {}
-    for o in urls_data_status:
-        # #STATUS
-        urls_data_status_dict[o.get("status")] = o.get("count")
-        # #TOTAL
-        urls_data_status_dict["total"] = urls_data_status_dict.get("total", 0) + o.get("count")
-
-    MINIMUM_URLS_THRESHOLD = 10
-    MINIMUM_PROCESSED_URLS_RATIO = 0.5
-
-    # Minimum amount of URLs
-    if (urls_data_status_dict.get("total") < MINIMUM_URLS_THRESHOLD):
-        message += "WARNING - Total #URLS during the last {} hours: {}\n".format(last_hours, urls_data_status_dict.get("total"))
+        message = "During the last {} hours:\n".format(last_hours)
+
+        message += "\nURLs per status:\n"
         for o in urls_data_status:
             message += "  {}: {}\n".format(o.get("status"), o.get("count"))
-
-    # Minimum ratio of processed raw urls
-    if (urls_data_status_dict.get("total") > 0):
-        if (urls_data_status_dict.get("raw") / urls_data_status_dict.get("total") < MINIMUM_PROCESSED_URLS_RATIO):
-            message += "WARNING - Small ratio of processed raw URLs during the last {} hours: {}\n".format(last_hours, urls_data_status_dict.get("total"))
-            message += "\nURLs per status:\n"
-            for o in urls_data_status:
-                message += "  {}: {}\n".format(o.get("status"), o.get("count"))
-
-
-    # Count the number of URLs grouped by source
-    urls_data_source = UrlsSourceSearch.objects \
-        .filter(id_url__ts_fetch__gte=start_date) \
-        .values('id_source__source') \
-        .annotate(count=Count('id_url')) \
-        .order_by('id_source__source')
-
-    MINIMUM_SOURCES = 2
-    if (len(urls_data_source) < MINIMUM_SOURCES):
-        message += "WARNING - Very few sources found URLs during the last {} hours".format(last_hours)
         message += "\nURLs per source:\n"
         for o in urls_data_source:
             message += "  {}: {}\n".format(o.get("id_source__source"), o.get("count"))
+        message += "\nURLs per search:\n"
+        for o in urls_data_search:
+            message += "  {}: {}\n".format(o.get("id_search__search"), o.get("count"))
 
-    """
-    # TODO: URLs per search, key should be present for cnbc.com, foxnews.com, zerohedge.com, breitbart.com, child abuse, child neglect
-    # Count the number of URLs grouped by search
-    urls_data_search = UrlsSourceSearch.objects \
-        .filter(id_url__ts_fetch__gte=start_date) \
-        .values('id_search__search') \
-        .annotate(count=Count('id_url')) \
-        .order_by('id_search__search')
-
-    message += "\nURLs per search:\n"
-    for o in urls_data_search:
-        message += "  {}: {}\n".format(o.get("id_search__search"), o.get("count"))
-    """
-
-    # Valid message body?
-    if (message != ""):
-        bot_token = os.environ.get("TELEGRAM_{}_BOT_TOKEN".format(channel), "")
-        chat_id = os.environ.get("TELEGRAM_{}_CHAT_ID".format(channel), "")
-
         url = f"https://api.telegram.org/bot{bot_token}/sendMessage"
         params = {
             "chat_id": chat_id,
@@ -135,6 +56,93 @@ def notify_telegram_warning(last_hours, channel="WARNING"):
 
         # POST
         response = requests.post(url, params=params)
+    except Exception as e:
+        logger.error("Exception while sending the info notification: {}".format(str(e)))
+
+
+def notify_telegram_warning(last_hours, channel="WARNING"):
+    try:
+        # Message appending logic
+        message = ""
+
+        start_date = timezone.now() - timedelta(hours=last_hours)
+
+        # Count the number of URLs grouped by status within the date range
+        urls_data_status = Urls.objects.filter(ts_fetch__gte=start_date) \
+            .values('status') \
+            .annotate(count=Count('id')) \
+            .order_by('status')
+
+        # Build dictionary
+        urls_data_status_dict = {}
+        for o in urls_data_status:
+            # #STATUS
+            urls_data_status_dict[o.get("status")] = o.get("count")
+            # #TOTAL
+            urls_data_status_dict["total"] = urls_data_status_dict.get("total", 0) + o.get("count")
+
+        MINIMUM_URLS_THRESHOLD = 10
+        MINIMUM_PROCESSED_URLS_RATIO = 0.5
+
+        # Minimum amount of URLs (default to 0 so an empty period cannot compare None)
+        if (urls_data_status_dict.get("total", 0) < MINIMUM_URLS_THRESHOLD):
+            message += "WARNING - Total #URLs during the last {} hours: {}\n".format(last_hours, urls_data_status_dict.get("total", 0))
+            message += "\nURLs per status:\n"
+            for o in urls_data_status:
+                message += "  {}: {}\n".format(o.get("status"), o.get("count"))
+
+        # Minimum ratio of raw URLs among all fetched URLs
+        if (urls_data_status_dict.get("total", 0) > 0):
+            if (urls_data_status_dict.get("raw", 0) / urls_data_status_dict.get("total") < MINIMUM_PROCESSED_URLS_RATIO):
+                message += "WARNING - Small ratio of raw URLs during the last {} hours: {} raw out of {} total\n".format(last_hours, urls_data_status_dict.get("raw", 0), urls_data_status_dict.get("total"))
+                message += "\nURLs per status:\n"
+                for o in urls_data_status:
+                    message += "  {}: {}\n".format(o.get("status"), o.get("count"))
+
+        # Count the number of URLs grouped by source
+        urls_data_source = UrlsSourceSearch.objects \
+            .filter(id_url__ts_fetch__gte=start_date) \
+            .values('id_source__source') \
+            .annotate(count=Count('id_url')) \
+            .order_by('id_source__source')
+
+        MINIMUM_SOURCES = 2
+        if (len(urls_data_source) < MINIMUM_SOURCES):
+            message += "WARNING - Very few sources found URLs during the last {} hours\n".format(last_hours)
+            message += "\nURLs per source:\n"
+            for o in urls_data_source:
+                message += "  {}: {}\n".format(o.get("id_source__source"), o.get("count"))
+
+        """
+        # TODO: URLs per search, key should be present for cnbc.com, foxnews.com, zerohedge.com, breitbart.com, child abuse, child neglect
+        # Count the number of URLs grouped by search
+        urls_data_search = UrlsSourceSearch.objects \
+            .filter(id_url__ts_fetch__gte=start_date) \
+            .values('id_search__search') \
+            .annotate(count=Count('id_url')) \
+            .order_by('id_search__search')
+
+        message += "\nURLs per search:\n"
+        for o in urls_data_search:
+            message += "  {}: {}\n".format(o.get("id_search__search"), o.get("count"))
+        """
+
+        # Valid message body?
+        if (message != ""):
+            bot_token = os.environ.get("TELEGRAM_{}_BOT_TOKEN".format(channel), "")
+            chat_id = os.environ.get("TELEGRAM_{}_CHAT_ID".format(channel), "")
+
+            url = f"https://api.telegram.org/bot{bot_token}/sendMessage"
+            params = {
+                "chat_id": chat_id,
+                "text": message
+            }
+
+            # POST
+            response = requests.post(url, params=params)
+    except Exception as e:
+        logger.error("Exception while sending the warning notification: {}".format(str(e)))
 
 
 def notify_telegram(last_hours=12):
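
Note on the grouped counts: `values('status').annotate(count=Count('id'))` yields one dict per distinct status, which is exactly the shape the warning thresholds consume. A minimal runnable sketch of that logic, with made-up rows (the status names and counts are illustrative, not taken from the app's models); it also shows why `"raw"` needs a default of 0:

```python
# Rows mimic what .values('status').annotate(count=Count('id')) returns;
# the statuses and counts below are illustrative only.
urls_data_status = [
    {"status": "raw", "count": 3},
    {"status": "processed", "count": 9},
]

# Build the same status -> count dictionary as the patch, plus a running total.
urls_data_status_dict = {}
for o in urls_data_status:
    urls_data_status_dict[o["status"]] = o["count"]
    urls_data_status_dict["total"] = urls_data_status_dict.get("total", 0) + o["count"]

MINIMUM_URLS_THRESHOLD = 10
MINIMUM_PROCESSED_URLS_RATIO = 0.5

total = urls_data_status_dict.get("total", 0)
# Defaulting "raw" to 0 avoids a TypeError when no raw URLs were fetched.
raw = urls_data_status_dict.get("raw", 0)

if total < MINIMUM_URLS_THRESHOLD:
    print("warn: low total:", total)
if total > 0 and raw / total < MINIMUM_PROCESSED_URLS_RATIO:
    print("warn: low raw ratio:", raw / total)  # 3 / 12 = 0.25 here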
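The send path is identical in both functions, so it could be factored into one helper. A minimal sketch, assuming only the environment variable names already used in the patch (`TELEGRAM_INFO_BOT_TOKEN` / `TELEGRAM_INFO_CHAT_ID` and the `WARNING` equivalents); the helper name `send_telegram_message` and the timeout value are illustrative choices, not part of the patch:

```python
import os
import requests

def send_telegram_message(message, channel="INFO"):
    """Hypothetical helper: POST a message to the channel's Telegram chat."""
    bot_token = os.environ.get("TELEGRAM_{}_BOT_TOKEN".format(channel), "")
    chat_id = os.environ.get("TELEGRAM_{}_CHAT_ID".format(channel), "")
    if not bot_token or not chat_id:
        # Surfacing missing configuration beats silently posting to "".
        raise RuntimeError("Telegram credentials missing for channel {}".format(channel))

    url = "https://api.telegram.org/bot{}/sendMessage".format(bot_token)
    # data= sends the fields in the POST body; the timeout keeps the
    # notifier from hanging indefinitely if api.telegram.org stalls.
    response = requests.post(url, data={"chat_id": chat_id, "text": message}, timeout=10)
    response.raise_for_status()  # turn HTTP errors into exceptions the caller can log
    return response
```

With the try/except blocks already in place, anything raised here would land in the same `logger.error` call, so a failed send becomes visible instead of silently returning an ignored response.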
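On the exception handlers: if `get_logger()` returns a standard `logging.Logger` (an assumption; the patch's `.logger` module is not shown), `logger.exception` inside an `except` block logs at ERROR level and appends the full traceback, which is more useful for debugging than `str(e)` alone. A small self-contained sketch:

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("fetcher.notifier")  # stands in for the app's get_logger()

def risky():
    raise ValueError("boom")

try:
    risky()
except Exception:
    # Logs "Exception while sending the notification" at ERROR level,
    # followed by the traceback of the ValueError above.
    logger.exception("Exception while sending the notification")
```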