Wait db connection, login required, dev mode enable

This commit is contained in:
Luciano Gervasoni
2025-04-04 12:28:22 +02:00
parent 4dbe2e55ef
commit 76079d7bd0
8 changed files with 105 additions and 54 deletions

View File

@@ -4,7 +4,7 @@
- Fetcher -> Inserts raw URLs - Fetcher -> Inserts raw URLs
- Fetch parsing URL host - Fetch parsing URL host
- Fetch from RSS feed - Fetch from RSS feed
- Fetch searching (Google search & news, DuckDuckGo, ...) - Fetch keyword search (Google search & news, DuckDuckGo, ...)
++ Sources -> Robustness to TooManyRequests block ++ Sources -> Robustness to TooManyRequests block
- Selenium based - Selenium based
- Sites change their logic, request captcha, ... - Sites change their logic, request captcha, ...
@@ -13,20 +13,23 @@
- Bing API - Bing API
- Subscription required - Subscription required
- Yandex. No API? - Yandex. No API?
++ Proxy / VPN?
TooManyRequests, ...
++ Search per locale (nl-NL, fr-FR, en-GB)
- Process URLs -> Updates raw URLs - Process URLs -> Updates raw URLs
- Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date - Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date
- Determines if it is a valid article content - Determines if it is a valid article content
++ Proxy / VPN?
Bypass geoblock
- Valid URLs - Valid URLs
- Generate summary - Generate summary
- One paragraph
- At most three paragraphs
- Classification - Classification
- 5W: Who, What, When, Where, Why of a Story - 5W: Who, What, When, Where, Why of a Story
- Related to child abuse? - Related to child abuse?
- ... - ...
Georgia Institute of Technology
https://comm.gatech.edu resources writers
- Visualization of URLs - Visualization of URLs
- Filter URLs - Filter URLs
- By status, search, source, language - By status, search, source, language

View File

@@ -29,7 +29,6 @@ RUN echo '#!/bin/bash' > /opt/app/initialize.sh && \
echo 'echo "Initialization not required"' >> /opt/app/initialize.sh && \ echo 'echo "Initialization not required"' >> /opt/app/initialize.sh && \
echo 'else' >> /opt/app/initialize.sh && \ echo 'else' >> /opt/app/initialize.sh && \
echo 'echo "Initializating database"' >> /opt/app/initialize.sh && \ echo 'echo "Initializating database"' >> /opt/app/initialize.sh && \
echo 'sleep 5' >> /opt/app/initialize.sh && \
echo 'python db.py --initialize_tables --initialize_data' >> /opt/app/initialize.sh && \ echo 'python db.py --initialize_tables --initialize_data' >> /opt/app/initialize.sh && \
echo 'python manage.py makemigrations fetcher; python manage.py migrate --fake-initial' >> /opt/app/initialize.sh && \ echo 'python manage.py makemigrations fetcher; python manage.py migrate --fake-initial' >> /opt/app/initialize.sh && \
echo 'python manage.py createsuperuser --noinput' >> /opt/app/initialize.sh && \ echo 'python manage.py createsuperuser --noinput' >> /opt/app/initialize.sh && \
@@ -40,8 +39,10 @@ RUN echo '#!/bin/bash' > /opt/app/initialize.sh && \
# Serving script # Serving script
RUN echo '#!/bin/bash' > /opt/app/run.sh && \ RUN echo '#!/bin/bash' > /opt/app/run.sh && \
# Prod mode:
echo 'gunicorn core.wsgi:application --bind 0.0.0.0:8000 & python manage.py rqworker high default low' >> /opt/app/run.sh && \ echo 'gunicorn core.wsgi:application --bind 0.0.0.0:8000 & python manage.py rqworker high default low' >> /opt/app/run.sh && \
#echo 'python manage.py runserver & python manage.py rqworker high default low' >> /opt/app/run.sh && \ # Dev mode:
#echo 'gunicorn core.wsgi:application --reload --bind 0.0.0.0:8000 & python manage.py rqworker high default low' >> /opt/app/run.sh && \
chmod +x /opt/app/run.sh chmod +x /opt/app/run.sh
# Run Django's server & workers # Run Django's server & workers

View File

@@ -24,7 +24,6 @@ SECRET_KEY = os.getenv("DJANGO_SECRET_KEY", 'django-insecure-54mqLbW5NlO8OlVDsT3
# SECURITY WARNING: don't run with debug turned on in production! # SECURITY WARNING: don't run with debug turned on in production!
DEBUG = (os.environ.get('DJANGO_DEBUG') == "True") DEBUG = (os.environ.get('DJANGO_DEBUG') == "True")
print("Django debug mode:", DEBUG)
ALLOWED_HOSTS = os.environ.get('DJANGO_ALLOWED_HOSTS', "*").split(",") ALLOWED_HOSTS = os.environ.get('DJANGO_ALLOWED_HOSTS', "*").split(",")
@@ -51,6 +50,7 @@ MIDDLEWARE = [
'django.contrib.auth.middleware.AuthenticationMiddleware', 'django.contrib.auth.middleware.AuthenticationMiddleware',
'django.contrib.messages.middleware.MessageMiddleware', 'django.contrib.messages.middleware.MessageMiddleware',
'django.middleware.clickjacking.XFrameOptionsMiddleware', 'django.middleware.clickjacking.XFrameOptionsMiddleware',
'fetcher.middleware.login_required.LoginRequiredMiddleware',
] ]
STATICFILES_STORAGE = 'whitenoise.storage.CompressedManifestStaticFilesStorage' STATICFILES_STORAGE = 'whitenoise.storage.CompressedManifestStaticFilesStorage'
@@ -148,6 +148,7 @@ AUTH_PASSWORD_VALIDATORS = [
}, },
] ]
LOGIN_URL = '/admin/'
# Internationalization # Internationalization

View File

@@ -2,6 +2,7 @@ import argparse
import os import os
import psycopg import psycopg
import re import re
import time
connection_info = "host={} port={} dbname={} user={} password={} connect_timeout=60".format( connection_info = "host={} port={} dbname={} user={} password={} connect_timeout=60".format(
os.environ.get("DB_HOST", "localhost"), os.environ.get("DB_HOST", "localhost"),
@@ -11,6 +12,29 @@ connection_info = "host={} port={} dbname={} user={} password={} connect_timeout
os.environ.get("DB_PASSWORD", "supermatitos") os.environ.get("DB_PASSWORD", "supermatitos")
) )
def wait_connection(retry_delay=2, max_attempts=None):
    """Block until the database accepts a connection and answers a probe query.

    Repeatedly opens a connection using the module-level ``connection_info``
    string and runs ``SELECT 1``. Failed attempts are retried after
    ``retry_delay`` seconds.

    Args:
        retry_delay: Seconds to sleep between attempts (default 2).
        max_attempts: Maximum number of attempts before giving up. ``None``
            (the default) retries forever.

    Raises:
        TimeoutError: Only when ``max_attempts`` is set and every attempt failed.
    """
    attempts = 0
    while True:
        attempts += 1
        try:
            # Open a connection and run a trivial query to prove the server is usable
            with psycopg.connect(connection_info) as conn:
                with conn.cursor() as cur:
                    cur.execute("SELECT 1;").fetchall()
            break
        except psycopg.OperationalError:
            # Connection not ready yet: stay quiet and retry
            pass
        except Exception as e:
            # Not a plain "server unavailable" error: keep retrying (best effort),
            # but report it so a misconfiguration does not hang silently
            print("Unexpected error while waiting for DB connection: {}".format(e))

        if (max_attempts is not None) and (attempts >= max_attempts):
            raise TimeoutError("DB connection not ready after {} attempts".format(attempts))
        time.sleep(retry_delay)

    print("DB connection ready")
def initialize_tables(): def initialize_tables():
# Connect to an existing database # Connect to an existing database
with psycopg.connect(connection_info) as conn: with psycopg.connect(connection_info) as conn:
@@ -137,6 +161,9 @@ if __name__ == '__main__':
parser.add_argument('--initialize_data', help='Insert data', action='store_true', default=False) parser.add_argument('--initialize_data', help='Insert data', action='store_true', default=False)
args = parser.parse_args() args = parser.parse_args()
# Wait for DB connection
wait_connection()
if (args.initialize_tables): if (args.initialize_tables):
print("Initializing tables") print("Initializing tables")
initialize_tables() initialize_tables()

View File

@@ -0,0 +1,24 @@
from django.shortcuts import redirect
from django.conf import settings
from django.urls import reverse
# Path prefixes that unauthenticated requests may reach without being redirected.
# LoginRequiredMiddleware compares them with str.startswith, so every entry also
# exempts all paths underneath it.
# NOTE(review): reverse() is evaluated when this module is imported, so the URLconf
# has to be resolvable at that point - confirm nothing imports this module earlier.
EXEMPT_URLS = [
    # reverse('login'), # or the name of your login view
    reverse('admin:login'),
    reverse('admin:index'),
    # reverse('logout'), # optional
    '/admin/', # allow full access to admin
    settings.STATIC_URL, # allow static files
    # path('scheduler/', include('scheduler.urls')),
]
class LoginRequiredMiddleware:
    """Redirect unauthenticated requests to ``settings.LOGIN_URL``.

    Requests whose path starts with one of the ``EXEMPT_URLS`` prefixes are
    passed through untouched, as are all requests from authenticated users.
    Reads ``request.user``, so that attribute must already be set when this
    middleware is called.
    """

    def __init__(self, get_response):
        # Next handler in the chain
        self.get_response = get_response

    def __call__(self, request):
        # Authenticated users are never intercepted
        if request.user.is_authenticated:
            return self.get_response(request)

        # Anonymous access is allowed only below the exempt prefixes
        if request.path.startswith(tuple(EXEMPT_URLS)):
            return self.get_response(request)

        return redirect(settings.LOGIN_URL)

View File

@@ -4,7 +4,7 @@ from django.shortcuts import render, get_object_or_404
from django.http import StreamingHttpResponse, JsonResponse, HttpResponse from django.http import StreamingHttpResponse, JsonResponse, HttpResponse
from django.contrib.auth.decorators import login_required from django.contrib.auth.decorators import login_required
import ollama import ollama
from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch, UrlsDuplicate
import os import os
#################################################################################################### ####################################################################################################
@@ -37,7 +37,6 @@ def link_list(request):
return JsonResponse({"links": list_links }) return JsonResponse({"links": list_links })
#################################################################################################### ####################################################################################################
# @login_required(login_url='/admin')
def logs(request, log_type): def logs(request, log_type):
# Capture output: python manage.py rqstats # Capture output: python manage.py rqstats
try: try:
@@ -71,25 +70,20 @@ class OllamaClient():
# return "Imagine you are a journalist, TLDR in a paragraph. Only answer with the summary:" # return "Imagine you are a journalist, TLDR in a paragraph. Only answer with the summary:"
#return "Below you will find the whole content of a news article:\n{}\nProvide a concise summary of one paragraph maximum of the content.".format(content) #return "Below you will find the whole content of a news article:\n{}\nProvide a concise summary of one paragraph maximum of the content.".format(content)
# TODO: move to ollamajs...
def fetch_details(request, id): def fetch_details(request, id):
url_item = get_object_or_404(Urls, id=id) url_item = get_object_or_404(Urls, id=id)
url_param = request.GET.get("url", "") # Get URL url_param = request.GET.get("url", "") # Get URL
model = request.GET.get("model", "") # Get LLM model model = request.GET.get("model", "") # Get LLM model
# TODO: post with body
text = request.GET.get("text", "") # Get LLM prompt text = request.GET.get("text", "") # Get LLM prompt
# print(request)
# print(text)
# LLM
ollama = OllamaClient()
def stream_response(): def stream_response():
msg_content = { msg_content = {
"role": "user", "role": "user",
"content": text, "content": text,
} }
response = ollama.client.chat(model=model, messages=[msg_content], stream=True) response = OllamaClient().client.chat(model=model, messages=[msg_content], stream=True)
for chunk in response: for chunk in response:
yield chunk["message"]["content"] # Stream each chunk of text yield chunk["message"]["content"] # Stream each chunk of text
@@ -102,6 +96,12 @@ def url_detail_view(request, id):
url_searches = list(Search.objects.filter(urlssourcesearch__id_url=url_item).distinct()) url_searches = list(Search.objects.filter(urlssourcesearch__id_url=url_item).distinct())
# url_source_search = UrlsSourceSearch.objects.filter(id_url=url_item) # url_source_search = UrlsSourceSearch.objects.filter(id_url=url_item)
url_duplicate = UrlsDuplicate.objects.get(id_url_duplicated=url_item)
#id_url_canonical = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url_canonical', primary_key=True) # The composite primary key (id_url_canonical, id_url_duplicated) found, that is not supported. The first column is selected.
#id_url_duplicated = models.ForeignKey(Urls, models.DO_NOTHING, db_column='id_url_duplicated', related_name='urlsduplicate_id_url_duplicated_set')
url_duplicate.id_url_duplicated
try: try:
url_content = UrlContent.objects.get(pk=id) url_content = UrlContent.objects.get(pk=id)
except UrlContent.DoesNotExist: except UrlContent.DoesNotExist:
@@ -222,9 +222,7 @@ def filtered_urls(request):
statuses = Urls.STATUS_ENUM.choices statuses = Urls.STATUS_ENUM.choices
searches = Search.objects.all() searches = Search.objects.all()
sources = Source.objects.all() sources = Source.objects.all()
# TODO: Cache languages, update once every N languages = list(UrlContent.objects.distinct('language').values_list('language', flat=True)) # TODO: Cache languages
languages = list(UrlContent.objects.distinct('language').values_list('language', flat=True))
# Null for visualization
languages = ["Unknown"] + [l for l in languages if l is not None] languages = ["Unknown"] + [l for l in languages if l is not None]
valid_contents = ["True", "False", "Unknown"] valid_contents = ["True", "False", "Unknown"]
@@ -237,15 +235,7 @@ def filtered_urls(request):
selected_days = request.GET.get("days", 30) selected_days = request.GET.get("days", 30)
per_page = request.GET.get('per_page', 100) # Default is X URLs per page per_page = request.GET.get('per_page', 100) # Default is X URLs per page
page_number = request.GET.get('page') # Get the current page number page_number = request.GET.get('page') # Get the current page number
all_status = [str(status[0]) for status in statuses]
all_search = [str(search.id) for search in searches]
all_source = [str(source.id) for source in sources]
all_languages = languages
all_valid_contents = valid_contents
# Override with default filters? [Case: no params update on URL] -> Only on "Home" click, or "Next page" # Override with default filters? [Case: no params update on URL] -> Only on "Home" click, or "Next page"
if (len(request.GET.keys()) == 0) or ((len(request.GET.keys()) == 1) and ("page" in request.GET.keys())): if (len(request.GET.keys()) == 0) or ((len(request.GET.keys()) == 1) and ("page" in request.GET.keys())):
selected_status = ["all"] selected_status = ["all"]
@@ -254,20 +244,22 @@ def filtered_urls(request):
selected_language = ["all"] selected_language = ["all"]
selected_valid_contents = ["all"] selected_valid_contents = ["all"]
else: else:
# All elements
all_status = [str(status[0]) for status in statuses]
all_search = [str(search.id) for search in searches]
all_source = [str(source.id) for source in sources]
all_languages = languages
all_valid_contents = valid_contents
# Non-default parameters, if list with all elements, replace with "all" and avoid heavy query # Non-default parameters, if list with all elements, replace with "all" and avoid heavy query
if (set(selected_status) == set(all_status)): selected_status = ["all"] if (set(selected_status) == set(all_status)) else selected_status
selected_status = ["all"] selected_search = ["all"] if (set(selected_search) == set(all_search)) else selected_search
if (set(selected_search) == set(all_search)): selected_source = ["all"] if (set(selected_source) == set(all_source)) else selected_source
selected_search = ["all"] selected_language = ["all"] if (set(selected_language) == set(all_languages)) else selected_language
if (set(selected_source) == set(all_source)): selected_valid_contents = ["all"] if (set(selected_valid_contents) == set(all_valid_contents)) else selected_valid_contents
selected_source = ["all"]
if (set(selected_language) == set(all_languages)):
selected_language = ["all"]
if (set(selected_valid_contents) == set(all_valid_contents)):
selected_valid_contents = ["all"]
# Filter URLs based on selected filters # Filter URLs based on selected filters
if ('null' in selected_status) or ('null' in selected_search) or ('null' in selected_source) or ('null' in selected_language) or ('null' in selected_valid_contents): if any( 'null' in l for l in [selected_status, selected_search, selected_source, selected_language, selected_valid_contents] ):
urls = [] urls = []
else: else:
# Filter by date # Filter by date
@@ -308,7 +300,6 @@ def filtered_urls(request):
# Run query # Run query
urls = Urls.objects.filter(query).distinct() # .order_by('-ts_fetch') urls = Urls.objects.filter(query).distinct() # .order_by('-ts_fetch')
# print(urls.query)
# Pagination # Pagination
paginator = Paginator(urls, per_page) # Paginate the filtered URLs paginator = Paginator(urls, per_page) # Paginate the filtered URLs

View File

@@ -12,7 +12,7 @@
"timeout": null, "timeout": null,
"result_ttl": 86400, "result_ttl": 86400,
"cron_string": null, "cron_string": null,
"scheduled_time": "2025-04-01T12:36:21+00:00", "scheduled_time": "2025-01-01T00:00:00+00:00",
"interval": 4, "interval": 4,
"interval_unit": "hours", "interval_unit": "hours",
"successful_runs": 0, "successful_runs": 0,
@@ -33,7 +33,7 @@
"timeout": null, "timeout": null,
"result_ttl": 86400, "result_ttl": 86400,
"cron_string": null, "cron_string": null,
"scheduled_time": "2025-04-01T10:20:08+00:00", "scheduled_time": "2025-01-01T00:00:00+00:00",
"interval": 10, "interval": 10,
"interval_unit": "minutes", "interval_unit": "minutes",
"successful_runs": 0, "successful_runs": 0,
@@ -54,7 +54,7 @@
"timeout": null, "timeout": null,
"result_ttl": 86400, "result_ttl": 86400,
"cron_string": null, "cron_string": null,
"scheduled_time": "2025-04-01T10:37:50+00:00", "scheduled_time": "2025-01-01T00:00:00+00:00",
"interval": 4, "interval": 4,
"interval_unit": "hours", "interval_unit": "hours",
"successful_runs": 0, "successful_runs": 0,
@@ -73,9 +73,9 @@
"repeat": null, "repeat": null,
"at_front": false, "at_front": false,
"timeout": null, "timeout": null,
"result_ttl": null, "result_ttl": 86400,
"cron_string": null, "cron_string": null,
"scheduled_time": "2025-04-07T15:59:49+00:00", "scheduled_time": "2025-01-01T00:00:00+00:00",
"interval": 1, "interval": 1,
"interval_unit": "weeks", "interval_unit": "weeks",
"successful_runs": 0, "successful_runs": 0,
@@ -96,8 +96,8 @@
"timeout": null, "timeout": null,
"result_ttl": 86400, "result_ttl": 86400,
"cron_string": null, "cron_string": null,
"scheduled_time": "2025-04-01T10:18:56+00:00", "scheduled_time": "2025-01-01T00:00:00+00:00",
"interval": 15, "interval": 10,
"interval_unit": "minutes", "interval_unit": "minutes",
"successful_runs": 0, "successful_runs": 0,
"failed_runs": 0, "failed_runs": 0,
@@ -117,7 +117,7 @@
"timeout": null, "timeout": null,
"result_ttl": 86400, "result_ttl": 86400,
"cron_string": null, "cron_string": null,
"scheduled_time": "2025-04-01T10:25:42+00:00", "scheduled_time": "2025-01-01T00:00:00+00:00",
"interval": 1, "interval": 1,
"interval_unit": "hours", "interval_unit": "hours",
"successful_runs": 0, "successful_runs": 0,
@@ -138,7 +138,7 @@
"timeout": null, "timeout": null,
"result_ttl": 86400, "result_ttl": 86400,
"cron_string": null, "cron_string": null,
"scheduled_time": "2025-04-01T10:29:33+00:00", "scheduled_time": "2025-01-01T00:00:00+00:00",
"interval": 1, "interval": 1,
"interval_unit": "hours", "interval_unit": "hours",
"successful_runs": 0, "successful_runs": 0,
@@ -159,7 +159,7 @@
"timeout": null, "timeout": null,
"result_ttl": 86400, "result_ttl": 86400,
"cron_string": null, "cron_string": null,
"scheduled_time": "2025-04-01T10:29:33+00:00", "scheduled_time": "2025-01-01T00:00:00+00:00",
"interval": 4, "interval": 4,
"interval_unit": "hours", "interval_unit": "hours",
"successful_runs": 0, "successful_runs": 0,
@@ -180,7 +180,7 @@
"timeout": null, "timeout": null,
"result_ttl": 86400, "result_ttl": 86400,
"cron_string": null, "cron_string": null,
"scheduled_time": "2025-04-01T10:29:33+00:00", "scheduled_time": "2025-01-01T00:00:00+00:00",
"interval": 1, "interval": 1,
"interval_unit": "weeks", "interval_unit": "weeks",
"successful_runs": 0, "successful_runs": 0,

View File

@@ -59,6 +59,10 @@ services:
# Selenium # Selenium
- SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT:-http://fetcher_app_selenium:80} - SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT:-http://fetcher_app_selenium:80}
- ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA:-https://ollamamodel.matitos.org} - ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA:-https://ollamamodel.matitos.org}
########################
#volumes: # Dev mode
# - ./app_urls:/opt/app
########################
ports: ports:
- 8000:8000 - 8000:8000
depends_on: depends_on:
@@ -84,8 +88,8 @@ services:
POSTGRES_PASSWORD: ${DB_PASSWORD:-supermatitos} POSTGRES_PASSWORD: ${DB_PASSWORD:-supermatitos}
POSTGRES_USER: ${DB_USER:-supermatitos} POSTGRES_USER: ${DB_USER:-supermatitos}
POSTGRES_INITDB_ARGS: '--data-checksums' POSTGRES_INITDB_ARGS: '--data-checksums'
#volumes: # Persistent DB? volumes: # Persistent DB?
# - ${PATH_DB_DATA:-.}/postgres:/var/lib/postgresql/data - ${PATH_DB_DATA:-.}/postgres:/var/lib/postgresql/data
ports: ports:
- 5432 #:5432 - 5432 #:5432