From 76079d7bd014b5e5166a7997fa0f909d6cf56ea8 Mon Sep 17 00:00:00 2001 From: Luciano Gervasoni Date: Fri, 4 Apr 2025 12:28:22 +0200 Subject: [PATCH] Wait db connection, login required, dev mode enable --- README.md | 13 +++-- app_urls/Dockerfile | 5 +- app_urls/core/settings.py | 3 +- app_urls/db.py | 27 +++++++++ app_urls/fetcher/middleware/login_required.py | 24 ++++++++ app_urls/fetcher/views.py | 57 ++++++++----------- app_urls/scheduled_tasks.json | 22 +++---- docker-compose.yml | 8 ++- 8 files changed, 105 insertions(+), 54 deletions(-) create mode 100644 app_urls/fetcher/middleware/login_required.py diff --git a/README.md b/README.md index ae32f84..54ff357 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ - Fetcher -> Inserts raw URLs - Fetch parsing URL host - Fetch from RSS feed - - Fetch searching (Google search & news, DuckDuckGo, ...) + - Fetch keyword search (Google search & news, DuckDuckGo, ...) ++ Sources -> Robustness to TooManyRequests block - Selenium based - Sites change their logic, request captcha, ... @@ -13,20 +13,23 @@ - Bing API - Subscription required - Yandex. No API? + ++ Proxy / VPN? + TooManyRequests, ... + ++ Search per locale (nl-NL, fr-FR, en-GB) - Process URLs -> Updates raw URLs - Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date - Determines if it is a valid article content + ++ Proxy / VPN? + Bypass geoblock - Valid URLs - Generate summary + - One paragraph + - At most three paragraphs - Classification - 5W: Who, What, When, Where, Why of a Story - Related to child abuse? - ... 
-Georgia Institute of Technology -https://comm.gatech.edu › resources › writers - - - Visualization of URLs - Filter URLs - By status, search, source, language diff --git a/app_urls/Dockerfile b/app_urls/Dockerfile index 1a1ddd0..b1dd674 100644 --- a/app_urls/Dockerfile +++ b/app_urls/Dockerfile @@ -29,7 +29,6 @@ RUN echo '#!/bin/bash' > /opt/app/initialize.sh && \ echo 'echo "Initialization not required"' >> /opt/app/initialize.sh && \ echo 'else' >> /opt/app/initialize.sh && \ echo 'echo "Initializating database"' >> /opt/app/initialize.sh && \ - echo 'sleep 5' >> /opt/app/initialize.sh && \ echo 'python db.py --initialize_tables --initialize_data' >> /opt/app/initialize.sh && \ echo 'python manage.py makemigrations fetcher; python manage.py migrate --fake-initial' >> /opt/app/initialize.sh && \ echo 'python manage.py createsuperuser --noinput' >> /opt/app/initialize.sh && \ @@ -40,8 +39,10 @@ RUN echo '#!/bin/bash' > /opt/app/initialize.sh && \ # Serving script RUN echo '#!/bin/bash' > /opt/app/run.sh && \ + # Prod mode: echo 'gunicorn core.wsgi:application --bind 0.0.0.0:8000 & python manage.py rqworker high default low' >> /opt/app/run.sh && \ - #echo 'python manage.py runserver & python manage.py rqworker high default low' >> /opt/app/run.sh && \ + # Dev mode: + #echo 'gunicorn core.wsgi:application --reload --bind 0.0.0.0:8000 & python manage.py rqworker high default low' >> /opt/app/run.sh && \ chmod +x /opt/app/run.sh # Run Django’s server & workers diff --git a/app_urls/core/settings.py b/app_urls/core/settings.py index de1dc9c..a2357ae 100644 --- a/app_urls/core/settings.py +++ b/app_urls/core/settings.py @@ -24,7 +24,6 @@ SECRET_KEY = os.getenv("DJANGO_SECRET_KEY", 'django-insecure-54mqLbW5NlO8OlVDsT3 # SECURITY WARNING: don't run with debug turned on in production! 
DEBUG = (os.environ.get('DJANGO_DEBUG') == "True") -print("Django debug mode:", DEBUG) ALLOWED_HOSTS = os.environ.get('DJANGO_ALLOWED_HOSTS', "*").split(",") @@ -51,6 +50,7 @@ MIDDLEWARE = [ 'django.contrib.auth.middleware.AuthenticationMiddleware', 'django.contrib.messages.middleware.MessageMiddleware', 'django.middleware.clickjacking.XFrameOptionsMiddleware', + 'fetcher.middleware.login_required.LoginRequiredMiddleware', ] STATICFILES_STORAGE = 'whitenoise.storage.CompressedManifestStaticFilesStorage' @@ -148,6 +148,7 @@ AUTH_PASSWORD_VALIDATORS = [ }, ] +LOGIN_URL = '/admin/' # Internationalization diff --git a/app_urls/db.py b/app_urls/db.py index ecf7583..7f1bedf 100644 --- a/app_urls/db.py +++ b/app_urls/db.py @@ -2,6 +2,7 @@ import argparse import os import psycopg import re +import time connection_info = "host={} port={} dbname={} user={} password={} connect_timeout=60".format( os.environ.get("DB_HOST", "localhost"), @@ -11,6 +12,29 @@ connection_info = "host={} port={} dbname={} user={} password={} connect_timeout os.environ.get("DB_PASSWORD", "supermatitos") ) +def wait_connection(): + connected = False + while (not connected): + try: + # Connect to an existing database + with psycopg.connect(connection_info) as conn: + # Open a cursor to perform database operations + with conn.cursor() as cur: + # Run a trivial query to confirm the server accepts connections + c = cur.execute("SELECT 1;").fetchall() + connected = True + + except psycopg.OperationalError as e: + # Connection not ready... + # print(".", end="") + time.sleep(2) + except Exception as e: + # Connection not ready... 
+ # print("e", end="") + time.sleep(2) + + print("DB connection ready") + def initialize_tables(): # Connect to an existing database with psycopg.connect(connection_info) as conn: @@ -137,6 +161,9 @@ if __name__ == '__main__': parser.add_argument('--initialize_data', help='Insert data', action='store_true', default=False) args = parser.parse_args() + # Wait for DB connection + wait_connection() + if (args.initialize_tables): print("Initializing tables") initialize_tables() diff --git a/app_urls/fetcher/middleware/login_required.py b/app_urls/fetcher/middleware/login_required.py new file mode 100644 index 0000000..cf930be --- /dev/null +++ b/app_urls/fetcher/middleware/login_required.py @@ -0,0 +1,24 @@ +from django.shortcuts import redirect +from django.conf import settings +from django.urls import reverse + +EXEMPT_URLS = [ + # reverse('login'), # or the name of your login view + reverse('admin:login'), + reverse('admin:index'), + # reverse('logout'), # optional + '/admin/', # allow full access to admin + settings.STATIC_URL, # allow static files + # path('scheduler/', include('scheduler.urls')), +] + +class LoginRequiredMiddleware: + def __init__(self, get_response): + self.get_response = get_response + + def __call__(self, request): + if not request.user.is_authenticated: + path = request.path + if not any(path.startswith(url) for url in EXEMPT_URLS): + return redirect(settings.LOGIN_URL) + return self.get_response(request) diff --git a/app_urls/fetcher/views.py b/app_urls/fetcher/views.py index e94fe6c..49335ce 100644 --- a/app_urls/fetcher/views.py +++ b/app_urls/fetcher/views.py @@ -4,7 +4,7 @@ from django.shortcuts import render, get_object_or_404 from django.http import StreamingHttpResponse, JsonResponse, HttpResponse from django.contrib.auth.decorators import login_required import ollama -from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch +from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch, UrlsDuplicate import os 
#################################################################################################### @@ -37,7 +37,6 @@ def link_list(request): return JsonResponse({"links": list_links }) #################################################################################################### -# @login_required(login_url='/admin') def logs(request, log_type): # Capture output: python manage.py rqstats try: @@ -71,25 +70,20 @@ class OllamaClient(): # return "Imagine you are a journalist, TLDR in a paragraph. Only answer with the summary:" #return "Below you will find the whole content of a news article:\n{}\nProvide a concise summary of one paragraph maximum of the content.".format(content) -# TODO: move to ollamajs... + def fetch_details(request, id): url_item = get_object_or_404(Urls, id=id) url_param = request.GET.get("url", "") # Get URL model = request.GET.get("model", "") # Get LLM model + # TODO: post with body text = request.GET.get("text", "") # Get LLM prompt - # print(request) - # print(text) - - # LLM - ollama = OllamaClient() - def stream_response(): msg_content = { "role": "user", "content": text, } - response = ollama.client.chat(model=model, messages=[msg_content], stream=True) + response = OllamaClient().client.chat(model=model, messages=[msg_content], stream=True) for chunk in response: yield chunk["message"]["content"] # Stream each chunk of text @@ -102,6 +96,12 @@ def url_detail_view(request, id): url_searches = list(Search.objects.filter(urlssourcesearch__id_url=url_item).distinct()) # url_source_search = UrlsSourceSearch.objects.filter(id_url=url_item) + url_duplicate = UrlsDuplicate.objects.get(id_url_duplicated=url_item) + #id_url_canonical = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url_canonical', primary_key=True) # The composite primary key (id_url_canonical, id_url_duplicated) found, that is not supported. The first column is selected. 
+ #id_url_duplicated = models.ForeignKey(Urls, models.DO_NOTHING, db_column='id_url_duplicated', related_name='urlsduplicate_id_url_duplicated_set') + + url_duplicate.id_url_duplicated + try: url_content = UrlContent.objects.get(pk=id) except UrlContent.DoesNotExist: @@ -222,9 +222,7 @@ def filtered_urls(request): statuses = Urls.STATUS_ENUM.choices searches = Search.objects.all() sources = Source.objects.all() - # TODO: Cache languages, update once every N - languages = list(UrlContent.objects.distinct('language').values_list('language', flat=True)) - # Null for visualization + languages = list(UrlContent.objects.distinct('language').values_list('language', flat=True)) # TODO: Cache languages languages = ["Unknown"] + [l for l in languages if l is not None] valid_contents = ["True", "False", "Unknown"] @@ -237,15 +235,7 @@ def filtered_urls(request): selected_days = request.GET.get("days", 30) per_page = request.GET.get('per_page', 100) # Default is X URLs per page page_number = request.GET.get('page') # Get the current page number - - all_status = [str(status[0]) for status in statuses] - all_search = [str(search.id) for search in searches] - all_source = [str(source.id) for source in sources] - all_languages = languages - all_valid_contents = valid_contents - - # Override with default filters? 
[Case: no params update on URL] -> Only on "Home" click, or "Next page" if (len(request.GET.keys()) == 0) or ((len(request.GET.keys()) == 1) and ("page" in request.GET.keys())): selected_status = ["all"] @@ -254,20 +244,22 @@ def filtered_urls(request): selected_language = ["all"] selected_valid_contents = ["all"] else: + # All elements + all_status = [str(status[0]) for status in statuses] + all_search = [str(search.id) for search in searches] + all_source = [str(source.id) for source in sources] + all_languages = languages + all_valid_contents = valid_contents + # Non-defult parameters, if list with all elements, replace with "all" and avoid heavy query - if (set(selected_status) == set(all_status)): - selected_status = ["all"] - if (set(selected_search) == set(all_search)): - selected_search = ["all"] - if (set(selected_source) == set(all_source)): - selected_source = ["all"] - if (set(selected_language) == set(all_languages)): - selected_language = ["all"] - if (set(selected_valid_contents) == set(all_valid_contents)): - selected_valid_contents = ["all"] + selected_status = ["all"] if (set(selected_status) == set(all_status)) else selected_status + selected_search = ["all"] if (set(selected_search) == set(all_search)) else selected_search + selected_source = ["all"] if (set(selected_source) == set(all_source)) else selected_source + selected_language = ["all"] if (set(selected_language) == set(all_languages)) else selected_language + selected_valid_contents = ["all"] if (set(selected_valid_contents) == set(all_valid_contents)) else selected_valid_contents # Filter URLs based on selected filters - if ('null' in selected_status) or ('null' in selected_search) or ('null' in selected_source) or ('null' in selected_language) or ('null' in selected_valid_contents): + if any( 'null' in l for l in [selected_status, selected_search, selected_source, selected_language, selected_valid_contents] ): urls = [] else: # Filter by date @@ -308,7 +300,6 @@ def 
filtered_urls(request): # Run query urls = Urls.objects.filter(query).distinct() # .order_by('-ts_fetch') - # print(urls.query) # Pagination paginator = Paginator(urls, per_page) # Paginate the filtered URLs diff --git a/app_urls/scheduled_tasks.json b/app_urls/scheduled_tasks.json index 8de7cad..a76294d 100644 --- a/app_urls/scheduled_tasks.json +++ b/app_urls/scheduled_tasks.json @@ -12,7 +12,7 @@ "timeout": null, "result_ttl": 86400, "cron_string": null, - "scheduled_time": "2025-04-01T12:36:21+00:00", + "scheduled_time": "2025-01-01T00:00:00+00:00", "interval": 4, "interval_unit": "hours", "successful_runs": 0, @@ -33,7 +33,7 @@ "timeout": null, "result_ttl": 86400, "cron_string": null, - "scheduled_time": "2025-04-01T10:20:08+00:00", + "scheduled_time": "2025-01-01T00:00:00+00:00", "interval": 10, "interval_unit": "minutes", "successful_runs": 0, @@ -54,7 +54,7 @@ "timeout": null, "result_ttl": 86400, "cron_string": null, - "scheduled_time": "2025-04-01T10:37:50+00:00", + "scheduled_time": "2025-01-01T00:00:00+00:00", "interval": 4, "interval_unit": "hours", "successful_runs": 0, @@ -73,9 +73,9 @@ "repeat": null, "at_front": false, "timeout": null, - "result_ttl": null, + "result_ttl": 86400, "cron_string": null, - "scheduled_time": "2025-04-07T15:59:49+00:00", + "scheduled_time": "2025-01-01T00:00:00+00:00", "interval": 1, "interval_unit": "weeks", "successful_runs": 0, @@ -96,8 +96,8 @@ "timeout": null, "result_ttl": 86400, "cron_string": null, - "scheduled_time": "2025-04-01T10:18:56+00:00", - "interval": 15, + "scheduled_time": "2025-01-01T00:00:00+00:00", + "interval": 10, "interval_unit": "minutes", "successful_runs": 0, "failed_runs": 0, @@ -117,7 +117,7 @@ "timeout": null, "result_ttl": 86400, "cron_string": null, - "scheduled_time": "2025-04-01T10:25:42+00:00", + "scheduled_time": "2025-01-01T00:00:00+00:00", "interval": 1, "interval_unit": "hours", "successful_runs": 0, @@ -138,7 +138,7 @@ "timeout": null, "result_ttl": 86400, "cron_string": null, - 
"scheduled_time": "2025-04-01T10:29:33+00:00", + "scheduled_time": "2025-01-01T00:00:00+00:00", "interval": 1, "interval_unit": "hours", "successful_runs": 0, @@ -159,7 +159,7 @@ "timeout": null, "result_ttl": 86400, "cron_string": null, - "scheduled_time": "2025-04-01T10:29:33+00:00", + "scheduled_time": "2025-01-01T00:00:00+00:00", "interval": 4, "interval_unit": "hours", "successful_runs": 0, @@ -180,7 +180,7 @@ "timeout": null, "result_ttl": 86400, "cron_string": null, - "scheduled_time": "2025-04-01T10:29:33+00:00", + "scheduled_time": "2025-01-01T00:00:00+00:00", "interval": 1, "interval_unit": "weeks", "successful_runs": 0, diff --git a/docker-compose.yml b/docker-compose.yml index 0c0ce9e..f695204 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -59,6 +59,10 @@ services: # Selenium - SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT:-http://fetcher_app_selenium:80} - ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA:-https://ollamamodel.matitos.org} + ######################## + #volumes: # Dev mode + # - ./app_urls:/opt/app + ######################## ports: - 8000:8000 depends_on: @@ -84,8 +88,8 @@ services: POSTGRES_PASSWORD: ${DB_PASSWORD:-supermatitos} POSTGRES_USER: ${DB_USER:-supermatitos} POSTGRES_INITDB_ARGS: '--data-checksums' - #volumes: # Persistent DB? - # - ${PATH_DB_DATA:-.}/postgres:/var/lib/postgresql/data + volumes: # Persistent DB? + - ${PATH_DB_DATA:-.}/postgres:/var/lib/postgresql/data ports: - 5432 #:5432