Wait db connection, login required, dev mode enable

2025-04-04 12:28:22 +02:00
parent 4dbe2e55ef
commit 76079d7bd0
8 changed files with 105 additions and 54 deletions
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
    - Fetcher -> Inserts raw URLs
        - Fetch parsing URL host
        - Fetch from RSS feed
-        - Fetch searching (Google search & news, DuckDuckGo, ...)
+        - Fetch keyword search (Google search & news, DuckDuckGo, ...)
            ++ Sources -> Robustness to TooManyRequests block
                - Selenium based
                    - Sites change their logic, request captcha, ...
@@ -13,20 +13,23 @@
                - Bing API
                    - Subscription required
                - Yandex. No API?
            ++ Proxy / VPN?
                TooManyRequests, ...
            ++ Search per locale (nl-NL, fr-FR, en-GB)
    - Process URLs -> Updates raw URLs
        - Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date
        - Determines if it is a valid article content
        ++ Proxy / VPN?
            Bypass geoblock
    - Valid URLs
        - Generate summary
            - One paragraph
            - At most three paragraphs
        - Classification
            - 5W: Who, What, When, Where, Why of a Story
            - Related to child abuse?
            - ...
 Georgia Institute of Technology
 https://comm.gatech.edu › resources › writers
 - Visualization of URLs
    - Filter URLs
        - By status, search, source, language
--- a/app_urls/Dockerfile
+++ b/app_urls/Dockerfile
@@ -29,7 +29,6 @@ RUN echo '#!/bin/bash' > /opt/app/initialize.sh && \
   echo 'echo "Initialization not required"' >> /opt/app/initialize.sh && \
   echo 'else' >> /opt/app/initialize.sh && \
   echo 'echo "Initializating database"' >> /opt/app/initialize.sh && \
   echo 'sleep 5' >> /opt/app/initialize.sh && \
   echo 'python db.py --initialize_tables --initialize_data' >> /opt/app/initialize.sh && \
   echo 'python manage.py makemigrations fetcher; python manage.py migrate --fake-initial' >> /opt/app/initialize.sh && \
   echo 'python manage.py createsuperuser --noinput' >> /opt/app/initialize.sh && \
@@ -40,8 +39,10 @@ RUN echo '#!/bin/bash' > /opt/app/initialize.sh && \
 # Serving script
 RUN echo '#!/bin/bash' > /opt/app/run.sh && \
   # Prod mode:
   echo 'gunicorn core.wsgi:application --bind 0.0.0.0:8000 & python manage.py rqworker high default low' >> /opt/app/run.sh && \
-   #echo 'python manage.py runserver & python manage.py rqworker high default low' >> /opt/app/run.sh && \
+   # Dev mode:
   #echo 'gunicorn core.wsgi:application --reload --bind 0.0.0.0:8000 & python manage.py rqworker high default low' >> /opt/app/run.sh && \
   chmod +x /opt/app/run.sh
 # Run Django’s server & workers
--- a/app_urls/core/settings.py
+++ b/app_urls/core/settings.py
@@ -24,7 +24,6 @@ SECRET_KEY = os.getenv("DJANGO_SECRET_KEY", 'django-insecure-54mqLbW5NlO8OlVDsT3
 # SECURITY WARNING: don't run with debug turned on in production!
 DEBUG = (os.environ.get('DJANGO_DEBUG') == "True")
 print("Django debug mode:", DEBUG)
 ALLOWED_HOSTS = os.environ.get('DJANGO_ALLOWED_HOSTS', "*").split(",")
@@ -51,6 +50,7 @@ MIDDLEWARE = [
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    'django.contrib.messages.middleware.MessageMiddleware',
    'django.middleware.clickjacking.XFrameOptionsMiddleware',
    'fetcher.middleware.login_required.LoginRequiredMiddleware',
 ]
 STATICFILES_STORAGE = 'whitenoise.storage.CompressedManifestStaticFilesStorage'
@@ -148,6 +148,7 @@ AUTH_PASSWORD_VALIDATORS = [
    },
 ]
 LOGIN_URL = '/admin/'
 # Internationalization
--- a/app_urls/db.py
+++ b/app_urls/db.py
@@ -2,6 +2,7 @@ import argparse
 import os
 import psycopg
 import re
 import time
 connection_info = "host={} port={} dbname={} user={} password={} connect_timeout=60".format(
    os.environ.get("DB_HOST", "localhost"),
@@ -11,6 +12,29 @@ connection_info = "host={} port={} dbname={} user={} password={} connect_timeout
    os.environ.get("DB_PASSWORD", "supermatitos")
 )
 def wait_connection(retry_delay=2, max_wait=None):
    """Block until the database accepts a connection and answers a trivial query.

    Repeatedly opens a connection using the module-level ``connection_info``
    string and runs ``SELECT 1`` as a liveness probe, sleeping between attempts.

    Args:
        retry_delay: Seconds to sleep after a failed attempt (default 2).
        max_wait: Optional upper bound, in seconds, on the total time spent
            waiting. ``None`` (the default) retries forever.

    Raises:
        TimeoutError: If ``max_wait`` is set and elapses before the probe succeeds.
    """
    started = time.monotonic()
    last_reported = None
    while True:
        try:
            # Liveness probe: open a throwaway connection and run a no-op query
            with psycopg.connect(connection_info) as conn:
                with conn.cursor() as cur:
                    cur.execute("SELECT 1;").fetchall()
            break
        except Exception as error:
            # OperationalError is the expected "server not ready yet" case and stays quiet.
            # Anything else is still retried (best effort), but reported once per distinct
            # message so that a misconfiguration does not loop silently.
            if not isinstance(error, psycopg.OperationalError):
                description = "{}: {}".format(type(error).__name__, error)
                if description != last_reported:
                    print("Unexpected error while waiting for DB:", description)
                    last_reported = description
            if max_wait is not None and (time.monotonic() - started) >= max_wait:
                raise TimeoutError(
                    "DB connection not ready after {} seconds".format(max_wait)
                ) from error
            time.sleep(retry_delay)
    print("DB connection ready")
 def initialize_tables():
    # Connect to an existing database
    with psycopg.connect(connection_info) as conn:
@@ -137,6 +161,9 @@ if __name__ == '__main__':
    parser.add_argument('--initialize_data', help='Insert data', action='store_true', default=False)
    args = parser.parse_args()
    # Wait for DB connection
    wait_connection()
    if (args.initialize_tables):
        print("Initializing tables")
        initialize_tables()
--- a/app_urls/fetcher/middleware/login_required.py
+++ b/app_urls/fetcher/middleware/login_required.py
@@ -0,0 +1,24 @@
 from django.shortcuts import redirect
 from django.conf import settings
 from django.urls import reverse
 # Path prefixes that unauthenticated requests may access without being redirected.
 # LoginRequiredMiddleware matches them against request.path with str.startswith,
 # so each entry exempts every path that begins with it.
 # The reverse() calls run when this module is imported.
 # NOTE(review): if the admin site is mounted under '/admin/', the two reverse()
 # entries are already covered by the '/admin/' prefix below — confirm against the URLconf.
 EXEMPT_URLS = [
    # reverse('login'),  # or the name of your login view
    reverse('admin:login'),
    reverse('admin:index'),
    # reverse('logout'),  # optional
    '/admin/',  # allow full access to admin
    settings.STATIC_URL,  # allow static files
    # path('scheduler/', include('scheduler.urls')),
 ]
 class LoginRequiredMiddleware:
    """Send unauthenticated requests to ``settings.LOGIN_URL`` unless the path is exempt.

    A request passes through untouched when ``request.user`` is authenticated or
    when ``request.path`` starts with any entry of ``EXEMPT_URLS``.
    """

    def __init__(self, get_response):
        # Callable that produces the response for a request that is let through
        self.get_response = get_response

    def __call__(self, request):
        # Authenticated users are never redirected
        if request.user.is_authenticated:
            return self.get_response(request)

        requested_path = request.path
        for exempt_prefix in EXEMPT_URLS:
            if requested_path.startswith(exempt_prefix):
                # Anonymous access is allowed below this prefix
                return self.get_response(request)

        # Anonymous request outside the exempt prefixes
        return redirect(settings.LOGIN_URL)
--- a/app_urls/fetcher/views.py
+++ b/app_urls/fetcher/views.py
@@ -4,7 +4,7 @@ from django.shortcuts import render, get_object_or_404
 from django.http import StreamingHttpResponse, JsonResponse, HttpResponse
 from django.contrib.auth.decorators import login_required
 import ollama
-from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch
+from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch, UrlsDuplicate
 import os
 ####################################################################################################
@@ -37,7 +37,6 @@ def link_list(request):
    return JsonResponse({"links": list_links })
 ####################################################################################################
 # @login_required(login_url='/admin')
 def logs(request, log_type):
    # Capture output: python manage.py rqstats
    try:
@@ -71,25 +70,20 @@ class OllamaClient():
        # return "Imagine you are a journalist, TLDR in a paragraph. Only answer with the summary:"
        #return "Below you will find the whole content of a news article:\n{}\nProvide a concise summary of one paragraph maximum of the content.".format(content)
-# TODO: move to ollamajs...
+
 def fetch_details(request, id):
    url_item = get_object_or_404(Urls, id=id)
    url_param = request.GET.get("url", "")  # Get URL
    model = request.GET.get("model", "")  # Get LLM model
    # TODO: post with body
    text = request.GET.get("text", "")  # Get LLM prompt
    # print(request)
    # print(text)
    # LLM
    ollama = OllamaClient()
    def stream_response():
        msg_content = {
            "role": "user", 
            "content": text,
        }
-        response = ollama.client.chat(model=model, messages=[msg_content], stream=True)
+        response = OllamaClient().client.chat(model=model, messages=[msg_content], stream=True)
        for chunk in response:
            yield chunk["message"]["content"]  # Stream each chunk of text
@@ -102,6 +96,12 @@ def url_detail_view(request, id):
    url_searches = list(Search.objects.filter(urlssourcesearch__id_url=url_item).distinct())
    # url_source_search = UrlsSourceSearch.objects.filter(id_url=url_item)
    url_duplicate = UrlsDuplicate.objects.get(id_url_duplicated=url_item)
    #id_url_canonical = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url_canonical', primary_key=True)  # The composite primary key (id_url_canonical, id_url_duplicated) found, that is not supported. The first column is selected.
    #id_url_duplicated = models.ForeignKey(Urls, models.DO_NOTHING, db_column='id_url_duplicated', related_name='urlsduplicate_id_url_duplicated_set')
    url_duplicate.id_url_duplicated
    try:
        url_content = UrlContent.objects.get(pk=id)
    except UrlContent.DoesNotExist:
@@ -222,9 +222,7 @@ def filtered_urls(request):
    statuses = Urls.STATUS_ENUM.choices
    searches = Search.objects.all()
    sources = Source.objects.all()
-    # TODO: Cache languages, update once every N
+    languages = list(UrlContent.objects.distinct('language').values_list('language', flat=True)) # TODO: Cache languages
    languages = list(UrlContent.objects.distinct('language').values_list('language', flat=True))
    # Null for visualization
    languages = ["Unknown"] + [l for l in languages if l is not None]
    valid_contents = ["True", "False", "Unknown"]
@@ -237,15 +235,7 @@ def filtered_urls(request):
    selected_days = request.GET.get("days", 30)
    per_page = request.GET.get('per_page', 100)  # Default is X URLs per page
    page_number = request.GET.get('page')  # Get the current page number
    all_status = [str(status[0]) for status in statuses]
    all_search = [str(search.id) for search in searches]
    all_source = [str(source.id) for source in sources]
    all_languages = languages
    all_valid_contents = valid_contents
    # Override with default filters? [Case: no params update on URL] -> Only on "Home" click, or "Next page" 
    if (len(request.GET.keys()) == 0) or ((len(request.GET.keys()) == 1) and ("page" in request.GET.keys())):
        selected_status = ["all"]
@@ -254,20 +244,22 @@ def filtered_urls(request):
        selected_language = ["all"]
        selected_valid_contents = ["all"]
    else:
        # All elements
        all_status = [str(status[0]) for status in statuses]
        all_search = [str(search.id) for search in searches]
        all_source = [str(source.id) for source in sources]
        all_languages = languages
        all_valid_contents = valid_contents
        # Non-default parameters: if a list contains all elements, replace it with "all" to avoid a heavy query
-        if (set(selected_status) == set(all_status)):
+        selected_status = ["all"] if (set(selected_status) == set(all_status)) else selected_status
-            selected_status = ["all"]
+        selected_search = ["all"] if (set(selected_search) == set(all_search)) else selected_search
-        if (set(selected_search) == set(all_search)):
+        selected_source = ["all"] if (set(selected_source) == set(all_source)) else selected_source
-            selected_search = ["all"]
+        selected_language = ["all"] if (set(selected_language) == set(all_languages)) else selected_language
-        if (set(selected_source) == set(all_source)):
+        selected_valid_contents = ["all"] if (set(selected_valid_contents) == set(all_valid_contents)) else selected_valid_contents
            selected_source = ["all"]
        if (set(selected_language) == set(all_languages)):
            selected_language = ["all"]
        if (set(selected_valid_contents) == set(all_valid_contents)):
            selected_valid_contents = ["all"]
    # Filter URLs based on selected filters
-    if ('null' in selected_status) or ('null' in selected_search) or ('null' in selected_source) or ('null' in selected_language) or ('null' in selected_valid_contents):
+    if any( 'null' in l for l in [selected_status, selected_search, selected_source, selected_language, selected_valid_contents] ):
        urls = []
    else:
        # Filter by date
@@ -308,7 +300,6 @@ def filtered_urls(request):
        # Run query
        urls = Urls.objects.filter(query).distinct() # .order_by('-ts_fetch')
        # print(urls.query)
    # Pagination
    paginator = Paginator(urls, per_page)  # Paginate the filtered URLs
--- a/app_urls/scheduled_tasks.json
+++ b/app_urls/scheduled_tasks.json
@@ -12,7 +12,7 @@
    "timeout": null,
    "result_ttl": 86400,
    "cron_string": null,
-    "scheduled_time": "2025-04-01T12:36:21+00:00",
+    "scheduled_time": "2025-01-01T00:00:00+00:00",
    "interval": 4,
    "interval_unit": "hours",
    "successful_runs": 0,
@@ -33,7 +33,7 @@
    "timeout": null,
    "result_ttl": 86400,
    "cron_string": null,
-    "scheduled_time": "2025-04-01T10:20:08+00:00",
+    "scheduled_time": "2025-01-01T00:00:00+00:00",
    "interval": 10,
    "interval_unit": "minutes",
    "successful_runs": 0,
@@ -54,7 +54,7 @@
    "timeout": null,
    "result_ttl": 86400,
    "cron_string": null,
-    "scheduled_time": "2025-04-01T10:37:50+00:00",
+    "scheduled_time": "2025-01-01T00:00:00+00:00",
    "interval": 4,
    "interval_unit": "hours",
    "successful_runs": 0,
@@ -73,9 +73,9 @@
    "repeat": null,
    "at_front": false,
    "timeout": null,
-    "result_ttl": null,
+    "result_ttl": 86400,
    "cron_string": null,
-    "scheduled_time": "2025-04-07T15:59:49+00:00",
+    "scheduled_time": "2025-01-01T00:00:00+00:00",
    "interval": 1,
    "interval_unit": "weeks",
    "successful_runs": 0,
@@ -96,8 +96,8 @@
    "timeout": null,
    "result_ttl": 86400,
    "cron_string": null,
-    "scheduled_time": "2025-04-01T10:18:56+00:00",
+    "scheduled_time": "2025-01-01T00:00:00+00:00",
-    "interval": 15,
+    "interval": 10,
    "interval_unit": "minutes",
    "successful_runs": 0,
    "failed_runs": 0,
@@ -117,7 +117,7 @@
    "timeout": null,
    "result_ttl": 86400,
    "cron_string": null,
-    "scheduled_time": "2025-04-01T10:25:42+00:00",
+    "scheduled_time": "2025-01-01T00:00:00+00:00",
    "interval": 1,
    "interval_unit": "hours",
    "successful_runs": 0,
@@ -138,7 +138,7 @@
    "timeout": null,
    "result_ttl": 86400,
    "cron_string": null,
-    "scheduled_time": "2025-04-01T10:29:33+00:00",
+    "scheduled_time": "2025-01-01T00:00:00+00:00",
    "interval": 1,
    "interval_unit": "hours",
    "successful_runs": 0,
@@ -159,7 +159,7 @@
    "timeout": null,
    "result_ttl": 86400,
    "cron_string": null,
-    "scheduled_time": "2025-04-01T10:29:33+00:00",
+    "scheduled_time": "2025-01-01T00:00:00+00:00",
    "interval": 4,
    "interval_unit": "hours",
    "successful_runs": 0,
@@ -180,7 +180,7 @@
    "timeout": null,
    "result_ttl": 86400,
    "cron_string": null,
-    "scheduled_time": "2025-04-01T10:29:33+00:00",
+    "scheduled_time": "2025-01-01T00:00:00+00:00",
    "interval": 1,
    "interval_unit": "weeks",
    "successful_runs": 0,
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -59,6 +59,10 @@ services:
      # Selenium
      - SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT:-http://fetcher_app_selenium:80}
      - ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA:-https://ollamamodel.matitos.org}
    ########################
    #volumes:   # Dev mode
    #  - ./app_urls:/opt/app
    ########################
    ports:
      - 8000:8000
    depends_on:
@@ -84,8 +88,8 @@ services:
      POSTGRES_PASSWORD: ${DB_PASSWORD:-supermatitos}
      POSTGRES_USER: ${DB_USER:-supermatitos}
      POSTGRES_INITDB_ARGS: '--data-checksums'
-    #volumes:   # Persistent DB?
+    volumes:   # Persistent DB?
-    #  - ${PATH_DB_DATA:-.}/postgres:/var/lib/postgresql/data
+      - ${PATH_DB_DATA:-.}/postgres:/var/lib/postgresql/data
    ports:
      - 5432 #:5432