Wait for DB connection, require login, enable dev mode
README.md (13 lines changed)
@@ -4,7 +4,7 @@
- Fetcher -> Inserts raw URLs
- Fetch parsing URL host
- Fetch from RSS feed
- Fetch searching (Google search & news, DuckDuckGo, ...)
- Fetch keyword search (Google search & news, DuckDuckGo, ...)
++ Sources -> Robustness to TooManyRequests block
- Selenium based
- Sites change their logic, request captcha, ...
@@ -13,20 +13,23 @@
- Bing API
- Subscription required
- Yandex. No API?
++ Proxy / VPN?
TooManyRequests, ...
++ Search per locale (nl-NL, fr-FR, en-GB)
- Process URLs -> Updates raw URLs
- Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date
- Determines if it is a valid article content
++ Proxy / VPN?
Bypass geoblock
- Valid URLs
- Generate summary
- One paragraph
- At most three paragraphs
- Classification
- 5W: Who, What, When, Where, Why of a Story
- Related to child abuse?
- ...

Georgia Institute of Technology
https://comm.gatech.edu › resources › writers

- Visualization of URLs
- Filter URLs
- By status, search, source, language
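Editor's note: the "Fetch from RSS feed" item in the list above is the kind of step a short sketch can make concrete. The snippet below is illustrative only and not part of this repository: it pulls an RSS 2.0 feed with the Python standard library and returns the raw article URLs the fetcher would insert; the feed URL is a placeholder.

```python
# Illustrative sketch only -- not code from this repo; the feed URL is a placeholder.
import urllib.request
import xml.etree.ElementTree as ET

def fetch_rss_links(feed_url: str) -> list[str]:
    """Return the <link> of every <item> in an RSS 2.0 feed."""
    with urllib.request.urlopen(feed_url, timeout=30) as resp:
        tree = ET.fromstring(resp.read())
    # RSS 2.0 layout: <rss><channel><item><link>...</link></item>...</channel></rss>
    return [item.findtext("link") for item in tree.iter("item") if item.findtext("link")]

if __name__ == "__main__":
    for url in fetch_rss_links("https://example.org/feed.xml"):
        print(url)
```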
@@ -29,7 +29,6 @@ RUN echo '#!/bin/bash' > /opt/app/initialize.sh && \
echo 'echo "Initialization not required"' >> /opt/app/initialize.sh && \
echo 'else' >> /opt/app/initialize.sh && \
echo 'echo "Initializing database"' >> /opt/app/initialize.sh && \
- echo 'sleep 5' >> /opt/app/initialize.sh && \
echo 'python db.py --initialize_tables --initialize_data' >> /opt/app/initialize.sh && \
echo 'python manage.py makemigrations fetcher; python manage.py migrate --fake-initial' >> /opt/app/initialize.sh && \
echo 'python manage.py createsuperuser --noinput' >> /opt/app/initialize.sh && \
@@ -40,8 +39,10 @@ RUN echo '#!/bin/bash' > /opt/app/initialize.sh && \

# Serving script
RUN echo '#!/bin/bash' > /opt/app/run.sh && \
# Prod mode:
echo 'gunicorn core.wsgi:application --bind 0.0.0.0:8000 & python manage.py rqworker high default low' >> /opt/app/run.sh && \
#echo 'python manage.py runserver & python manage.py rqworker high default low' >> /opt/app/run.sh && \
# Dev mode:
#echo 'gunicorn core.wsgi:application --reload --bind 0.0.0.0:8000 & python manage.py rqworker high default low' >> /opt/app/run.sh && \
chmod +x /opt/app/run.sh

# Run Django’s server & workers
@@ -24,7 +24,6 @@ SECRET_KEY = os.getenv("DJANGO_SECRET_KEY", 'django-insecure-54mqLbW5NlO8OlVDsT3

# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = (os.environ.get('DJANGO_DEBUG') == "True")
print("Django debug mode:", DEBUG)

ALLOWED_HOSTS = os.environ.get('DJANGO_ALLOWED_HOSTS', "*").split(",")

@@ -51,6 +50,7 @@ MIDDLEWARE = [
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    'django.contrib.messages.middleware.MessageMiddleware',
    'django.middleware.clickjacking.XFrameOptionsMiddleware',
+   'fetcher.middleware.login_required.LoginRequiredMiddleware',
]

STATICFILES_STORAGE = 'whitenoise.storage.CompressedManifestStaticFilesStorage'

@@ -148,6 +148,7 @@ AUTH_PASSWORD_VALIDATORS = [
    },
]

+ LOGIN_URL = '/admin/'

# Internationalization
@@ -2,6 +2,7 @@ import argparse
import os
import psycopg
import re
+ import time

connection_info = "host={} port={} dbname={} user={} password={} connect_timeout=60".format(
    os.environ.get("DB_HOST", "localhost"),
@@ -11,6 +12,29 @@ connection_info = "host={} port={} dbname={} user={} password={} connect_timeout
    os.environ.get("DB_PASSWORD", "supermatitos")
)

+ def wait_connection():
+     connected = False
+     while (not connected):
+         try:
+             # Connect to an existing database
+             with psycopg.connect(connection_info) as conn:
+                 # Open a cursor to perform database operations
+                 with conn.cursor() as cur:
+                     # Run a trivial query to confirm the connection is usable
+                     c = cur.execute("SELECT 1;").fetchall()
+                     connected = True
+
+         except psycopg.OperationalError as e:
+             # Connection not ready...
+             # print(".", end="")
+             time.sleep(2)
+         except Exception as e:
+             # Connection not ready...
+             # print("e", end="")
+             time.sleep(2)
+
+     print("DB connection ready")

def initialize_tables():
    # Connect to an existing database
    with psycopg.connect(connection_info) as conn:
@@ -137,6 +161,9 @@ if __name__ == '__main__':
    parser.add_argument('--initialize_data', help='Insert data', action='store_true', default=False)
    args = parser.parse_args()

+   # Wait for DB connection
+   wait_connection()

    if (args.initialize_tables):
        print("Initializing tables")
        initialize_tables()
app_urls/fetcher/middleware/login_required.py (new file, 24 lines)
@@ -0,0 +1,24 @@
from django.shortcuts import redirect
from django.conf import settings
from django.urls import reverse

EXEMPT_URLS = [
    # reverse('login'), # or the name of your login view
    reverse('admin:login'),
    reverse('admin:index'),
    # reverse('logout'), # optional
    '/admin/', # allow full access to admin
    settings.STATIC_URL, # allow static files
    # path('scheduler/', include('scheduler.urls')),
]

class LoginRequiredMiddleware:
    def __init__(self, get_response):
        self.get_response = get_response

    def __call__(self, request):
        if not request.user.is_authenticated:
            path = request.path
            if not any(path.startswith(url) for url in EXEMPT_URLS):
                return redirect(settings.LOGIN_URL)
        return self.get_response(request)
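Editor's note: the middleware above redirects any unauthenticated request whose path does not start with one of EXEMPT_URLS to settings.LOGIN_URL ('/admin/', see the settings.py hunk earlier). A minimal sketch of how that behaviour could be exercised with Django's test client follows; it is not part of this commit, and "/urls/" is a hypothetical non-exempt path.

```python
# Illustrative only -- not part of this commit; "/urls/" is a hypothetical path.
from django.test import TestCase


class LoginRequiredMiddlewareTests(TestCase):
    def test_anonymous_request_is_redirected_to_login(self):
        # Any path outside EXEMPT_URLS should bounce anonymous users to LOGIN_URL.
        response = self.client.get("/urls/")
        self.assertEqual(response.status_code, 302)
        self.assertTrue(response.url.startswith("/admin/"))

    def test_admin_login_stays_reachable(self):
        # '/admin/' is exempt, so the admin login page is served without a redirect loop.
        response = self.client.get("/admin/login/")
        self.assertEqual(response.status_code, 200)
```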
@@ -4,7 +4,7 @@ from django.shortcuts import render, get_object_or_404
from django.http import StreamingHttpResponse, JsonResponse, HttpResponse
from django.contrib.auth.decorators import login_required
import ollama
- from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch
+ from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch, UrlsDuplicate
import os

####################################################################################################
@@ -37,7 +37,6 @@ def link_list(request):
    return JsonResponse({"links": list_links })

####################################################################################################
# @login_required(login_url='/admin')
def logs(request, log_type):
    # Capture output: python manage.py rqstats
    try:
@@ -71,25 +70,20 @@ class OllamaClient():
        # return "Imagine you are a journalist, TLDR in a paragraph. Only answer with the summary:"
        #return "Below you will find the whole content of a news article:\n{}\nProvide a concise summary of one paragraph maximum of the content.".format(content)

# TODO: move to ollamajs...

def fetch_details(request, id):
    url_item = get_object_or_404(Urls, id=id)
    url_param = request.GET.get("url", "") # Get URL
    model = request.GET.get("model", "") # Get LLM model
    # TODO: post with body
    text = request.GET.get("text", "") # Get LLM prompt

    # print(request)
    # print(text)

    # LLM
    ollama = OllamaClient()

    def stream_response():
        msg_content = {
            "role": "user",
            "content": text,
        }
-       response = ollama.client.chat(model=model, messages=[msg_content], stream=True)
+       response = OllamaClient().client.chat(model=model, messages=[msg_content], stream=True)
        for chunk in response:
            yield chunk["message"]["content"] # Stream each chunk of text
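Editor's note: fetch_details streams the model output back chunk by chunk via the stream_response generator above, presumably wrapped in a StreamingHttpResponse. A hedged client-side sketch of consuming that stream with requests follows; the route, model name and prompt are assumptions, not values taken from this commit.

```python
# Illustrative only -- the URL path, model name and prompt are assumptions.
import requests

params = {
    "url": "https://example.org/some-article",            # forwarded as the "url" GET parameter
    "model": "llama3",                                     # any model available to the Ollama endpoint
    "text": "Summarize the article in one paragraph.",     # the LLM prompt
}
with requests.get("http://localhost:8000/fetch_details/1/", params=params,
                  stream=True, timeout=300) as response:
    response.raise_for_status()
    # Print the answer as chunks arrive instead of waiting for the full body.
    for chunk in response.iter_content(chunk_size=None):
        print(chunk.decode("utf-8", errors="replace"), end="", flush=True)
```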
@@ -102,6 +96,12 @@ def url_detail_view(request, id):
    url_searches = list(Search.objects.filter(urlssourcesearch__id_url=url_item).distinct())
    # url_source_search = UrlsSourceSearch.objects.filter(id_url=url_item)

+   url_duplicate = UrlsDuplicate.objects.get(id_url_duplicated=url_item)
+   #id_url_canonical = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url_canonical', primary_key=True) # The composite primary key (id_url_canonical, id_url_duplicated) found, that is not supported. The first column is selected.
+   #id_url_duplicated = models.ForeignKey(Urls, models.DO_NOTHING, db_column='id_url_duplicated', related_name='urlsduplicate_id_url_duplicated_set')
+
+   url_duplicate.id_url_duplicated
+
    try:
        url_content = UrlContent.objects.get(pk=id)
    except UrlContent.DoesNotExist:
@@ -222,9 +222,7 @@ def filtered_urls(request):
    statuses = Urls.STATUS_ENUM.choices
    searches = Search.objects.all()
    sources = Source.objects.all()
-   # TODO: Cache languages, update once every N
-   languages = list(UrlContent.objects.distinct('language').values_list('language', flat=True))
-   # Null for visualization
+   languages = list(UrlContent.objects.distinct('language').values_list('language', flat=True)) # TODO: Cache languages
    languages = ["Unknown"] + [l for l in languages if l is not None]
    valid_contents = ["True", "False", "Unknown"]
@@ -238,14 +236,6 @@ def filtered_urls(request):
    per_page = request.GET.get('per_page', 100) # Default is X URLs per page
    page_number = request.GET.get('page') # Get the current page number

-   all_status = [str(status[0]) for status in statuses]
-   all_search = [str(search.id) for search in searches]
-   all_source = [str(source.id) for source in sources]
-   all_languages = languages
-   all_valid_contents = valid_contents

    # Override with default filters? [Case: no params update on URL] -> Only on "Home" click, or "Next page"
    if (len(request.GET.keys()) == 0) or ((len(request.GET.keys()) == 1) and ("page" in request.GET.keys())):
        selected_status = ["all"]
@@ -254,20 +244,22 @@ def filtered_urls(request):
        selected_language = ["all"]
        selected_valid_contents = ["all"]
    else:
+       # All elements
+       all_status = [str(status[0]) for status in statuses]
+       all_search = [str(search.id) for search in searches]
+       all_source = [str(source.id) for source in sources]
+       all_languages = languages
+       all_valid_contents = valid_contents

        # Non-default parameters: if a list contains all elements, replace it with "all" to avoid a heavy query
-       if (set(selected_status) == set(all_status)):
-           selected_status = ["all"]
-       if (set(selected_search) == set(all_search)):
-           selected_search = ["all"]
-       if (set(selected_source) == set(all_source)):
-           selected_source = ["all"]
-       if (set(selected_language) == set(all_languages)):
-           selected_language = ["all"]
-       if (set(selected_valid_contents) == set(all_valid_contents)):
-           selected_valid_contents = ["all"]
+       selected_status = ["all"] if (set(selected_status) == set(all_status)) else selected_status
+       selected_search = ["all"] if (set(selected_search) == set(all_search)) else selected_search
+       selected_source = ["all"] if (set(selected_source) == set(all_source)) else selected_source
+       selected_language = ["all"] if (set(selected_language) == set(all_languages)) else selected_language
+       selected_valid_contents = ["all"] if (set(selected_valid_contents) == set(all_valid_contents)) else selected_valid_contents

    # Filter URLs based on selected filters
-   if ('null' in selected_status) or ('null' in selected_search) or ('null' in selected_source) or ('null' in selected_language) or ('null' in selected_valid_contents):
+   if any( 'null' in l for l in [selected_status, selected_search, selected_source, selected_language, selected_valid_contents] ):
        urls = []
    else:
        # Filter by date
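Editor's note: the refactor above collapses a selection that covers every possible value into the single sentinel ["all"], so the view can skip building the expensive per-value query. A standalone illustration of that rule (not code from the repository) follows.

```python
# Illustrative only -- mirrors the '["all"] if set(selected) == set(all_...)' pattern above.
def collapse_to_all(selected: list[str], universe: list[str]) -> list[str]:
    """Return ["all"] when the selection covers every possible value, else keep the selection."""
    return ["all"] if set(selected) == set(universe) else selected


if __name__ == "__main__":
    assert collapse_to_all(["10", "20"], ["10", "20"]) == ["all"]   # full coverage -> sentinel
    assert collapse_to_all(["10"], ["10", "20"]) == ["10"]          # partial selection kept as-is
```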
@@ -308,7 +300,6 @@ def filtered_urls(request):

    # Run query
    urls = Urls.objects.filter(query).distinct() # .order_by('-ts_fetch')
    # print(urls.query)

    # Pagination
    paginator = Paginator(urls, per_page) # Paginate the filtered URLs
@@ -12,7 +12,7 @@
"timeout": null,
"result_ttl": 86400,
"cron_string": null,
- "scheduled_time": "2025-04-01T12:36:21+00:00",
+ "scheduled_time": "2025-01-01T00:00:00+00:00",
"interval": 4,
"interval_unit": "hours",
"successful_runs": 0,
@@ -33,7 +33,7 @@
"timeout": null,
"result_ttl": 86400,
"cron_string": null,
- "scheduled_time": "2025-04-01T10:20:08+00:00",
+ "scheduled_time": "2025-01-01T00:00:00+00:00",
"interval": 10,
"interval_unit": "minutes",
"successful_runs": 0,
@@ -54,7 +54,7 @@
"timeout": null,
"result_ttl": 86400,
"cron_string": null,
- "scheduled_time": "2025-04-01T10:37:50+00:00",
+ "scheduled_time": "2025-01-01T00:00:00+00:00",
"interval": 4,
"interval_unit": "hours",
"successful_runs": 0,
@@ -73,9 +73,9 @@
"repeat": null,
"at_front": false,
"timeout": null,
- "result_ttl": null,
+ "result_ttl": 86400,
"cron_string": null,
- "scheduled_time": "2025-04-07T15:59:49+00:00",
+ "scheduled_time": "2025-01-01T00:00:00+00:00",
"interval": 1,
"interval_unit": "weeks",
"successful_runs": 0,
@@ -96,8 +96,8 @@
"timeout": null,
"result_ttl": 86400,
"cron_string": null,
- "scheduled_time": "2025-04-01T10:18:56+00:00",
- "interval": 15,
+ "scheduled_time": "2025-01-01T00:00:00+00:00",
+ "interval": 10,
"interval_unit": "minutes",
"successful_runs": 0,
"failed_runs": 0,
@@ -117,7 +117,7 @@
"timeout": null,
"result_ttl": 86400,
"cron_string": null,
- "scheduled_time": "2025-04-01T10:25:42+00:00",
+ "scheduled_time": "2025-01-01T00:00:00+00:00",
"interval": 1,
"interval_unit": "hours",
"successful_runs": 0,
@@ -138,7 +138,7 @@
"timeout": null,
"result_ttl": 86400,
"cron_string": null,
- "scheduled_time": "2025-04-01T10:29:33+00:00",
+ "scheduled_time": "2025-01-01T00:00:00+00:00",
"interval": 1,
"interval_unit": "hours",
"successful_runs": 0,
@@ -159,7 +159,7 @@
"timeout": null,
"result_ttl": 86400,
"cron_string": null,
- "scheduled_time": "2025-04-01T10:29:33+00:00",
+ "scheduled_time": "2025-01-01T00:00:00+00:00",
"interval": 4,
"interval_unit": "hours",
"successful_runs": 0,
@@ -180,7 +180,7 @@
"timeout": null,
"result_ttl": 86400,
"cron_string": null,
- "scheduled_time": "2025-04-01T10:29:33+00:00",
+ "scheduled_time": "2025-01-01T00:00:00+00:00",
"interval": 1,
"interval_unit": "weeks",
"successful_runs": 0,
@@ -59,6 +59,10 @@ services:
# Selenium
- SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT:-http://fetcher_app_selenium:80}
- ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA:-https://ollamamodel.matitos.org}
########################
#volumes: # Dev mode
# - ./app_urls:/opt/app
########################
ports:
- 8000:8000
depends_on:
@@ -84,8 +88,8 @@ services:
POSTGRES_PASSWORD: ${DB_PASSWORD:-supermatitos}
POSTGRES_USER: ${DB_USER:-supermatitos}
POSTGRES_INITDB_ARGS: '--data-checksums'
- #volumes: # Persistent DB?
- # - ${PATH_DB_DATA:-.}/postgres:/var/lib/postgresql/data
+ volumes: # Persistent DB?
+ - ${PATH_DB_DATA:-.}/postgres:/var/lib/postgresql/data
ports:
- 5432 #:5432