From 76079d7bd014b5e5166a7997fa0f909d6cf56ea8 Mon Sep 17 00:00:00 2001
From: Luciano Gervasoni <luciano.gervasoni@3dlook.me>
Date: Fri, 4 Apr 2025 12:28:22 +0200
Subject: [PATCH] Wait db connection, login required, dev mode enable

---
 README.md                                     | 13 +++--
 app_urls/Dockerfile                           |  5 +-
 app_urls/core/settings.py                     |  3 +-
 app_urls/db.py                                | 27 +++++++++
 app_urls/fetcher/middleware/login_required.py | 24 ++++++++
 app_urls/fetcher/views.py                     | 57 ++++++++-----------
 app_urls/scheduled_tasks.json                 | 22 +++----
 docker-compose.yml                            |  8 ++-
 8 files changed, 105 insertions(+), 54 deletions(-)
 create mode 100644 app_urls/fetcher/middleware/login_required.py

diff --git a/README.md b/README.md
index ae32f84..54ff357 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
     - Fetcher -> Inserts raw URLs
         - Fetch parsing URL host
         - Fetch from RSS feed
-        - Fetch searching (Google search & news, DuckDuckGo, ...)
+        - Fetch keyword search (Google search & news, DuckDuckGo, ...)
             ++ Sources -> Robustness to TooManyRequests block
                 - Selenium based
                     - Sites change their logic, request captcha, ...
@@ -13,20 +13,23 @@
                 - Bing API
                     - Subscription required
                 - Yandex. No API?
+            ++ Proxy / VPN?
+                TooManyRequests, ...
+            ++ Search per locale (nl-NL, fr-FR, en-GB)
     - Process URLs -> Updates raw URLs
         - Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date
         - Determines if it is a valid article content
+        ++ Proxy / VPN?
+            Bypass geoblock
     - Valid URLs
         - Generate summary
+            - One paragraph
+            - At most three paragraphs
         - Classification
             - 5W: Who, What, When, Where, Why of a Story
             - Related to child abuse?
             - ...
 
-Georgia Institute of Technology
-https://comm.gatech.edu › resources › writers
-
-
 - Visualization of URLs
     - Filter URLs
         - By status, search, source, language
diff --git a/app_urls/Dockerfile b/app_urls/Dockerfile
index 1a1ddd0..b1dd674 100644
--- a/app_urls/Dockerfile
+++ b/app_urls/Dockerfile
@@ -29,7 +29,6 @@ RUN echo '#!/bin/bash' > /opt/app/initialize.sh && \
    echo 'echo "Initialization not required"' >> /opt/app/initialize.sh && \
    echo 'else' >> /opt/app/initialize.sh && \
    echo 'echo "Initializating database"' >> /opt/app/initialize.sh && \
-   echo 'sleep 5' >> /opt/app/initialize.sh && \
    echo 'python db.py --initialize_tables --initialize_data' >> /opt/app/initialize.sh && \
    echo 'python manage.py makemigrations fetcher; python manage.py migrate --fake-initial' >> /opt/app/initialize.sh && \
    echo 'python manage.py createsuperuser --noinput' >> /opt/app/initialize.sh && \
@@ -40,8 +39,10 @@ RUN echo '#!/bin/bash' > /opt/app/initialize.sh && \
 
 # Serving script
 RUN echo '#!/bin/bash' > /opt/app/run.sh && \
+   # Prod mode:
    echo 'gunicorn core.wsgi:application --bind 0.0.0.0:8000 & python manage.py rqworker high default low' >> /opt/app/run.sh && \
-   #echo 'python manage.py runserver & python manage.py rqworker high default low' >> /opt/app/run.sh && \
+   # Dev mode:
+   #echo 'gunicorn core.wsgi:application --reload --bind 0.0.0.0:8000 & python manage.py rqworker high default low' >> /opt/app/run.sh && \
    chmod +x /opt/app/run.sh
 
 # Run Django’s server & workers
diff --git a/app_urls/core/settings.py b/app_urls/core/settings.py
index de1dc9c..a2357ae 100644
--- a/app_urls/core/settings.py
+++ b/app_urls/core/settings.py
@@ -24,7 +24,6 @@ SECRET_KEY = os.getenv("DJANGO_SECRET_KEY", 'django-insecure-54mqLbW5NlO8OlVDsT3
 
 # SECURITY WARNING: don't run with debug turned on in production!
 DEBUG = (os.environ.get('DJANGO_DEBUG') == "True")
-print("Django debug mode:", DEBUG)
 
 ALLOWED_HOSTS = os.environ.get('DJANGO_ALLOWED_HOSTS', "*").split(",")
 
@@ -51,6 +50,7 @@ MIDDLEWARE = [
     'django.contrib.auth.middleware.AuthenticationMiddleware',
     'django.contrib.messages.middleware.MessageMiddleware',
     'django.middleware.clickjacking.XFrameOptionsMiddleware',
+    'fetcher.middleware.login_required.LoginRequiredMiddleware',
 ]
 
 STATICFILES_STORAGE = 'whitenoise.storage.CompressedManifestStaticFilesStorage'
@@ -148,6 +148,7 @@ AUTH_PASSWORD_VALIDATORS = [
     },
 ]
 
+LOGIN_URL = '/admin/'
 
 # Internationalization
 
diff --git a/app_urls/db.py b/app_urls/db.py
index ecf7583..7f1bedf 100644
--- a/app_urls/db.py
+++ b/app_urls/db.py
@@ -2,6 +2,7 @@ import argparse
 import os
 import psycopg
 import re
+import time
 
 connection_info = "host={} port={} dbname={} user={} password={} connect_timeout=60".format(
     os.environ.get("DB_HOST", "localhost"),
@@ -11,6 +12,29 @@ connection_info = "host={} port={} dbname={} user={} password={} connect_timeout
     os.environ.get("DB_PASSWORD", "supermatitos")
 )
 
+def wait_connection(retry_delay=2, max_wait=None):
+    """Block until the database answers a trivial query.
+
+    Retries every `retry_delay` seconds. With the default `max_wait=None` it
+    retries forever (the original behaviour); otherwise the last error is
+    re-raised once `max_wait` seconds have elapsed.
+    """
+    start = time.monotonic()
+    while True:
+        try:
+            # Connecting and running a no-op query proves the server is ready
+            with psycopg.connect(connection_info) as conn:
+                with conn.cursor() as cur:
+                    cur.execute("SELECT 1;").fetchall()
+            break
+        except Exception as e:
+            # Not ready yet (typically psycopg.OperationalError): report and retry
+            if max_wait is not None and time.monotonic() - start >= max_wait:
+                raise
+            print("DB not ready ({}), retrying...".format(type(e).__name__))
+            time.sleep(retry_delay)
+    print("DB connection ready")
+
 def initialize_tables():
     # Connect to an existing database
     with psycopg.connect(connection_info) as conn:
@@ -137,6 +161,9 @@ if __name__ == '__main__':
     parser.add_argument('--initialize_data', help='Insert data', action='store_true', default=False)
     args = parser.parse_args()
 
+    # Wait for DB connection
+    wait_connection()
+
     if (args.initialize_tables):
         print("Initializing tables")
         initialize_tables()
diff --git a/app_urls/fetcher/middleware/login_required.py b/app_urls/fetcher/middleware/login_required.py
new file mode 100644
index 0000000..cf930be
--- /dev/null
+++ b/app_urls/fetcher/middleware/login_required.py
@@ -0,0 +1,24 @@
+from django.shortcuts import redirect
+from django.conf import settings
+from django.urls import reverse_lazy
+
+# URL prefixes served without authentication. reverse_lazy defers resolution
+# until a request is checked, so importing this module does not load the URLconf.
+EXEMPT_URLS = [
+    reverse_lazy('admin:index'),  # admin site root; admin:login lives under it
+    '/admin/',  # allow full access to admin
+    settings.STATIC_URL,  # allow static files
+]
+
+class LoginRequiredMiddleware:
+    """Redirect anonymous requests to settings.LOGIN_URL unless the path is exempt."""
+
+    def __init__(self, get_response):
+        self.get_response = get_response
+
+    def __call__(self, request):
+        if not request.user.is_authenticated:
+            # str() forces the lazy URLs; startswith takes a tuple of prefixes
+            if not request.path.startswith(tuple(str(url) for url in EXEMPT_URLS)):
+                return redirect(settings.LOGIN_URL)
+        return self.get_response(request)
diff --git a/app_urls/fetcher/views.py b/app_urls/fetcher/views.py
index e94fe6c..49335ce 100644
--- a/app_urls/fetcher/views.py
+++ b/app_urls/fetcher/views.py
@@ -4,7 +4,7 @@ from django.shortcuts import render, get_object_or_404
 from django.http import StreamingHttpResponse, JsonResponse, HttpResponse
 from django.contrib.auth.decorators import login_required
 import ollama
-from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch
+from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch, UrlsDuplicate
 import os
 
 ####################################################################################################
@@ -37,7 +37,6 @@ def link_list(request):
     return JsonResponse({"links": list_links })
 
 ####################################################################################################
-# @login_required(login_url='/admin')
 def logs(request, log_type):
     # Capture output: python manage.py rqstats
     try:
@@ -71,25 +70,20 @@ class OllamaClient():
         # return "Imagine you are a journalist, TLDR in a paragraph. Only answer with the summary:"
         #return "Below you will find the whole content of a news article:\n{}\nProvide a concise summary of one paragraph maximum of the content.".format(content)
 
-# TODO: move to ollamajs...
+
 def fetch_details(request, id):
     url_item = get_object_or_404(Urls, id=id)
     url_param = request.GET.get("url", "")  # Get URL
     model = request.GET.get("model", "")  # Get LLM model
+    # TODO: post with body
     text = request.GET.get("text", "")  # Get LLM prompt
 
-    # print(request)
-    # print(text)
-
-    # LLM
-    ollama = OllamaClient()
-
     def stream_response():
         msg_content = {
             "role": "user", 
             "content": text,
         }
-        response = ollama.client.chat(model=model, messages=[msg_content], stream=True)
+        response = OllamaClient().client.chat(model=model, messages=[msg_content], stream=True)
         for chunk in response:
             yield chunk["message"]["content"]  # Stream each chunk of text
     
@@ -102,6 +96,12 @@ def url_detail_view(request, id):
     url_searches = list(Search.objects.filter(urlssourcesearch__id_url=url_item).distinct())
     # url_source_search = UrlsSourceSearch.objects.filter(id_url=url_item)
     
+    # A URL without a row in the duplicates table would make .get() raise
+    # UrlsDuplicate.DoesNotExist and turn this page into an HTTP 500;
+    # filter().first() returns None in that case instead.
+    url_duplicate = UrlsDuplicate.objects.filter(id_url_duplicated=url_item).first()
+    # NOTE(review): url_duplicate is not consumed by the visible code yet
+
     try:
         url_content = UrlContent.objects.get(pk=id)
     except UrlContent.DoesNotExist:
@@ -222,9 +222,7 @@ def filtered_urls(request):
     statuses = Urls.STATUS_ENUM.choices
     searches = Search.objects.all()
     sources = Source.objects.all()
-    # TODO: Cache languages, update once every N
-    languages = list(UrlContent.objects.distinct('language').values_list('language', flat=True))
-    # Null for visualization
+    languages = list(UrlContent.objects.distinct('language').values_list('language', flat=True)) # TODO: Cache languages
     languages = ["Unknown"] + [l for l in languages if l is not None]
     valid_contents = ["True", "False", "Unknown"]
     
@@ -237,15 +235,7 @@ def filtered_urls(request):
     selected_days = request.GET.get("days", 30)
     per_page = request.GET.get('per_page', 100)  # Default is X URLs per page
     page_number = request.GET.get('page')  # Get the current page number
-
     
-    all_status = [str(status[0]) for status in statuses]
-    all_search = [str(search.id) for search in searches]
-    all_source = [str(source.id) for source in sources]
-    all_languages = languages
-    all_valid_contents = valid_contents
-    
-
     # Override with default filters? [Case: no params update on URL] -> Only on "Home" click, or "Next page" 
     if (len(request.GET.keys()) == 0) or ((len(request.GET.keys()) == 1) and ("page" in request.GET.keys())):
         selected_status = ["all"]
@@ -254,20 +244,22 @@ def filtered_urls(request):
         selected_language = ["all"]
         selected_valid_contents = ["all"]
     else:
+        # All elements
+        all_status = [str(status[0]) for status in statuses]
+        all_search = [str(search.id) for search in searches]
+        all_source = [str(source.id) for source in sources]
+        all_languages = languages
+        all_valid_contents = valid_contents
+        
         # Non-defult parameters, if list with all elements, replace with "all" and avoid heavy query
-        if (set(selected_status) == set(all_status)):
-            selected_status = ["all"]
-        if (set(selected_search) == set(all_search)):
-            selected_search = ["all"]
-        if (set(selected_source) == set(all_source)):
-            selected_source = ["all"]
-        if (set(selected_language) == set(all_languages)):
-            selected_language = ["all"]
-        if (set(selected_valid_contents) == set(all_valid_contents)):
-            selected_valid_contents = ["all"]
+        selected_status = ["all"] if (set(selected_status) == set(all_status)) else selected_status
+        selected_search = ["all"] if (set(selected_search) == set(all_search)) else selected_search
+        selected_source = ["all"] if (set(selected_source) == set(all_source)) else selected_source
+        selected_language = ["all"] if (set(selected_language) == set(all_languages)) else selected_language
+        selected_valid_contents = ["all"] if (set(selected_valid_contents) == set(all_valid_contents)) else selected_valid_contents
 
     # Filter URLs based on selected filters
-    if ('null' in selected_status) or ('null' in selected_search) or ('null' in selected_source) or ('null' in selected_language) or ('null' in selected_valid_contents):
+    if any( 'null' in l for l in [selected_status, selected_search, selected_source, selected_language, selected_valid_contents] ):
         urls = []
     else:
         # Filter by date
@@ -308,7 +300,6 @@ def filtered_urls(request):
 
         # Run query
         urls = Urls.objects.filter(query).distinct() # .order_by('-ts_fetch')
-        # print(urls.query)
 
     # Pagination
     paginator = Paginator(urls, per_page)  # Paginate the filtered URLs
diff --git a/app_urls/scheduled_tasks.json b/app_urls/scheduled_tasks.json
index 8de7cad..a76294d 100644
--- a/app_urls/scheduled_tasks.json
+++ b/app_urls/scheduled_tasks.json
@@ -12,7 +12,7 @@
     "timeout": null,
     "result_ttl": 86400,
     "cron_string": null,
-    "scheduled_time": "2025-04-01T12:36:21+00:00",
+    "scheduled_time": "2025-01-01T00:00:00+00:00",
     "interval": 4,
     "interval_unit": "hours",
     "successful_runs": 0,
@@ -33,7 +33,7 @@
     "timeout": null,
     "result_ttl": 86400,
     "cron_string": null,
-    "scheduled_time": "2025-04-01T10:20:08+00:00",
+    "scheduled_time": "2025-01-01T00:00:00+00:00",
     "interval": 10,
     "interval_unit": "minutes",
     "successful_runs": 0,
@@ -54,7 +54,7 @@
     "timeout": null,
     "result_ttl": 86400,
     "cron_string": null,
-    "scheduled_time": "2025-04-01T10:37:50+00:00",
+    "scheduled_time": "2025-01-01T00:00:00+00:00",
     "interval": 4,
     "interval_unit": "hours",
     "successful_runs": 0,
@@ -73,9 +73,9 @@
     "repeat": null,
     "at_front": false,
     "timeout": null,
-    "result_ttl": null,
+    "result_ttl": 86400,
     "cron_string": null,
-    "scheduled_time": "2025-04-07T15:59:49+00:00",
+    "scheduled_time": "2025-01-01T00:00:00+00:00",
     "interval": 1,
     "interval_unit": "weeks",
     "successful_runs": 0,
@@ -96,8 +96,8 @@
     "timeout": null,
     "result_ttl": 86400,
     "cron_string": null,
-    "scheduled_time": "2025-04-01T10:18:56+00:00",
-    "interval": 15,
+    "scheduled_time": "2025-01-01T00:00:00+00:00",
+    "interval": 10,
     "interval_unit": "minutes",
     "successful_runs": 0,
     "failed_runs": 0,
@@ -117,7 +117,7 @@
     "timeout": null,
     "result_ttl": 86400,
     "cron_string": null,
-    "scheduled_time": "2025-04-01T10:25:42+00:00",
+    "scheduled_time": "2025-01-01T00:00:00+00:00",
     "interval": 1,
     "interval_unit": "hours",
     "successful_runs": 0,
@@ -138,7 +138,7 @@
     "timeout": null,
     "result_ttl": 86400,
     "cron_string": null,
-    "scheduled_time": "2025-04-01T10:29:33+00:00",
+    "scheduled_time": "2025-01-01T00:00:00+00:00",
     "interval": 1,
     "interval_unit": "hours",
     "successful_runs": 0,
@@ -159,7 +159,7 @@
     "timeout": null,
     "result_ttl": 86400,
     "cron_string": null,
-    "scheduled_time": "2025-04-01T10:29:33+00:00",
+    "scheduled_time": "2025-01-01T00:00:00+00:00",
     "interval": 4,
     "interval_unit": "hours",
     "successful_runs": 0,
@@ -180,7 +180,7 @@
     "timeout": null,
     "result_ttl": 86400,
     "cron_string": null,
-    "scheduled_time": "2025-04-01T10:29:33+00:00",
+    "scheduled_time": "2025-01-01T00:00:00+00:00",
     "interval": 1,
     "interval_unit": "weeks",
     "successful_runs": 0,
diff --git a/docker-compose.yml b/docker-compose.yml
index 0c0ce9e..f695204 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -59,6 +59,10 @@ services:
       # Selenium
       - SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT:-http://fetcher_app_selenium:80}
       - ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA:-https://ollamamodel.matitos.org}
+    ########################
+    #volumes:   # Dev mode
+    #  - ./app_urls:/opt/app
+    ########################
     ports:
       - 8000:8000
     depends_on:
@@ -84,8 +88,8 @@ services:
       POSTGRES_PASSWORD: ${DB_PASSWORD:-supermatitos}
       POSTGRES_USER: ${DB_USER:-supermatitos}
       POSTGRES_INITDB_ARGS: '--data-checksums'
-    #volumes:   # Persistent DB?
-    #  - ${PATH_DB_DATA:-.}/postgres:/var/lib/postgresql/data
+    volumes:   # Persist DB data on the host across container restarts
+      - ${PATH_DB_DATA:-.}/postgres:/var/lib/postgresql/data
     ports:
       - 5432 #:5432