URLs site with status filter, refactoring, django-tasks-scheduler low high priority queues

This commit is contained in:
Luciano Gervasoni
2025-03-25 21:44:26 +01:00
parent 24b4614049
commit 9d2550b374
9 changed files with 111 additions and 55 deletions

View File

@@ -236,6 +236,7 @@
" # Websites of interest\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('www.missingkids.org/poster', 'url_host');\" )\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('www.breitbart.com', 'url_host');\" )\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/poster', 'url_host');\" )\n",
" # Search keywords\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search');\" )\n",
" \n",

View File

@@ -4,7 +4,6 @@ conda create -n matitos_urls python=3.12
conda activate matitos_urls
# Core
pip install django psycopg[binary] django-redis django-tasks-scheduler
# django-rq
# Fetcher
pip install feedparser python-dateutil newspaper4k lxml[html_clean] googlenewsdecoder gnews duckduckgo_search GoogleNews
# News visualization
@@ -88,33 +87,41 @@ RQ_DEFAULT_TIMEOUT=${REDIS_PORT:-900}
RQ_DEFAULT_RESULT_TTL=${RQ_DEFAULT_RESULT_TTL:-3600}
```
* Django DB
* Deploy
```
# Generate content for models.py
python manage.py inspectdb
# Migrations
python manage.py makemigrations api; python manage.py migrate --fake-initial
# Create user
python manage.py createsuperuser
```
* Deploy
```
# Server
# 1) Server
python manage.py runserver
# Workers
# python manage.py rqworker high default low
# 2) Workers
python manage.py rqworker high default low
# Visualize DB
http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id
```
* Scheduled tasks
```
Names: Fetch Feeds, Fetch Parser, Fetch Search
Callable: api.tasks.fetch_feeds, api.tasks.fetch_parser, api.tasks.fetch_search
Task type: Repeatable task (or cron...)
Queue: Default
Interval: 15min, 2h, 30min
Names: Process raw URLs, Process error URLs, Process MissingKids URLs
Callable: api.tasks.process_raw_urls, api.tasks.process_error_urls, api.tasks.process_missing_kids_urls_50
Task type: Repeatable task (or cron...)
Queue: Low, Low, Default
Interval: 1h, 4h, 2h
```
* Utils
```
python manage.py rqstats
python manage.py rqstats --interval=1 # Refreshes every second
python manage.py rqstats --json # Output as JSON
python manage.py rqstats --yaml # Output as YAML
```

View File

@@ -41,13 +41,29 @@ def url_host_slowdown(url, url_host_slowdown_seconds):
def process_url(url):
try:
# Slow down if required to avoid too many requests error
url_host_slowdown(url, url_host_slowdown_seconds=2)
url_host_slowdown(url, url_host_slowdown_seconds=5)
# Process
article = newspaper.article(url)
except newspaper.ArticleBinaryDataException:
logger.warning("ArticleException for input URL {}\n{}".format(url, str(e)))
return {"override_status": "invalid"}
except newspaper.ArticleException as e:
# Too many requests? Cool down...
if ("Status code 429" in str(e)):
# TODO: cool down and retry once?, proxy/VPN, ...
logger.debug("TODO: Implement code 429")
# Unavailable for legal reasons
if ("Status code 451" in str(e)):
# TODO: Bypass with VPN
logger.debug("TODO: Implement code 451")
# CloudFlare protection?
if ("Website protected with Cloudflare" in str(e)):
logger.debug("TODO: Implement bypass CloudFlare")
# PerimeterX protection?
if ("Website protected with PerimeterX" in str(e)):
logger.debug("TODO: Implement bypass PerimeterX")
logger.warning("ArticleException for input URL {}\n{}".format(url, str(e)))
return None
except Exception as e:

View File

@@ -7,7 +7,6 @@ from .src.fetch_search import FetchSearcher
from .src.db_utils import DB_Handler
'''
from src.missing_kids_fetch import MissingKidsFetch
from src.missing_kids_status import MissingKidsStatus
'''
from .src.logger import get_logger

View File

@@ -63,44 +63,41 @@
fetchButton.prop("disabled", true); // Disable button
fetch(fetchUrl)
.then(response => {
fetch(fetchUrl/*, {
method: "POST",
body: JSON.stringify({
text: inputText
}),
headers: {
"Content-type": "application/json; charset=UTF-8"
}
}*/).then(response => {
if (!response.ok) {
throw new Error("Error on network response");
}
const reader = response.body.getReader();
const decoder = new TextDecoder();
//////////////////////////////////////
let accumulatedText = ""; // Store streamed text before rendering Markdown
// Create a temporary container for streaming response
let messageContainer = $('<div class="chat-message"></div>');
//let messageContainer = $('');
let messageContainer = $('<div class="chat-message"></div>'); // Create a temporary container for streaming response
resultContainer.append(messageContainer);
//////////////////////////////////////
function read() {
return reader.read().then(({ done, value }) => {
if (done) {
//////////////////////////////////////
messageContainer.html(marked.parse(accumulatedText));
//////////////////////////////////////
fetchButton.prop("disabled", false); // Re-enable button when done
return;
}
//////////////////////////////////////
// Decode the streamed chunk
let chunk = decoder.decode(value);
// Append to the accumulated text
accumulatedText += chunk;
// Render Markdown progressively (but safely)
messageContainer.html(marked.parse(accumulatedText));
//////////////////////////////////////
resultContainer.scrollTop(resultContainer[0].scrollHeight); // Auto-scroll to bottom
// Auto-scroll to bottom
resultContainer.scrollTop(resultContainer[0].scrollHeight);
return read();
});
}
@@ -125,7 +122,7 @@
<table class="table table-bordered">
<tr>
<th>URL</th>
<td><a href="{{ url_item.url }}" target="_blank">{{ url_item.url }}</a></td>
<td><a href="{{ url_item.url|safe }}" target="_blank">{{ url_item.url }}</a></td>
</tr>
<tr>
<th>Fetch Date</th>
@@ -144,16 +141,20 @@
<td>{{ url_item.status }}</td>
</tr>
<tr>
<th>Title</th>
<td>{{ url_content.title }}</td>
<th>URL host</th>
<td><a href="{{ url_content.url_host|safe }}" target="_blank">{{ url_content.url_host }}</a></td>
</tr>
<tr>
<th>Description</th>
<td>{{ url_content.description }}</td>
<th>Site name</th>
<td>{{ url_content.site_name }}</td>
</tr>
<tr>
<th>Content</th>
<td>{{ url_content.content }}</td>
<th>Published Date</th>
<td>{{ url_content.date_published }} UTC</td>
</tr>
<tr>
<th>Valid news article content?</th>
<td>{{ url_content.valid_content }}</td>
</tr>
<tr>
<th>Tags</th>
@@ -163,10 +164,38 @@
<th>Authors</th>
<td>{{ url_content.authors }}</td>
</tr>
<tr>
<th>Keywords</th>
<td>{{ url_content.keywords }}</td>
</tr>
<tr>
<th>Language</th>
<td>{{ url_content.language }}</td>
</tr>
<tr>
<th>Main image</th>
<td><a href="{{ url_content.image_main_url|safe }}" target="_blank">{{ url_content.image_main_url }}</a></td>
</tr>
<tr>
<th>Image URLs</th>
<td>{{ url_content.image_urls }}</td>
</tr>
<tr>
<th>Video URLs</th>
<td>{{ url_content.videos_url }}</td>
</tr>
<tr>
<th>Title</th>
<td>{{ url_content.title }}</td>
</tr>
<tr>
<th>Description</th>
<td>{{ url_content.description }}</td>
</tr>
<tr>
<th>Content</th>
<td>{{ url_content.content }}</td>
</tr>
</table>
<!-- Independent form for optional values -->
@@ -181,7 +210,8 @@
<!-- Input field with a default value -->
<label for="custom-input-{{ url_item.id }}">Prompt:</label>
<textarea id="custom-input-{{ url_item.id }}" class="form-control mb-2" rows="3">{{ prompt }} {{ url_item.url }}</textarea>
<textarea id="custom-input-{{ url_item.id }}" class="form-control mb-2" rows="5">{{ prompt }}
{{ url_item.url }}</textarea>
<div class="d-flex align-items-center">
<!-- Fetch details button -->

View File

@@ -592,7 +592,7 @@
<!-- Table -->
<div id="item-list">
{% include 'item_list_partial.html' %}
{% include 'urls_partial.html' %}
</div>
<!-- Loading... -->
<div id="loading" class="text-center mt-3" style="display:none;">

View File

@@ -94,9 +94,9 @@ def urls(request):
# If request is AJAX, return JSON response
if request.headers.get("X-Requested-With") == "XMLHttpRequest":
return JsonResponse({'items_html': render(request, 'item_list_partial.html', context).content.decode('utf-8')})
return JsonResponse({'urls': render(request, 'urls_partial.html', context).content.decode('utf-8')})
return render(request, "item_list.html", context)
return render(request, "urls.html", context)
class OllamaClient():
@@ -114,7 +114,8 @@ class OllamaClient():
return models
def get_prompt(self):
return "Provide a summary of the content below, avoid mentioning the source of information, and only answer with the summary. The summary needs to be brief and compact, consisting of one paragraph."
return "Rewrite the text below into a clear and concise summary, presenting the key points as if they are newly written insights. Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states'. Instead, write in a natural, standalone format that feels like an original explanation. Keep it brief, engaging, informative, in the style of a news article, and no longer than a paragraph:"
#return "Provide a summary of the content below, avoid mentioning the source of information, and only answer with the summary. The summary needs to be brief and compact, consisting of one paragraph."
#return "Explain in a single and compact paragraph the what, why, when, where, who, and how of the content below. Also provide a single paragraph summary of the content:"
#return "Provide in one paragraph the what, why, when, where, who, and how of the content below. Also provide a one paragraph summary of the content:"
#return "Provide two summaries of the content below, and avoid mentioning the source of information. First, provide a very brief and compact paragraph summary. Second, provide a larger and more detailed summary, which describe the what, why, when, where, who, and how of the content:"
@@ -153,6 +154,9 @@ def fetch_details(request, id):
model = request.GET.get("model", "") # Get LLM model
text = request.GET.get("text", "") # Get LLM prompt
# print(request)
# print(text)
# LLM
ollama = OllamaClient()

View File

@@ -38,7 +38,6 @@ INSTALLED_APPS = [
'django.contrib.sessions',
'django.contrib.messages',
'django.contrib.staticfiles',
# 'django_rq',
'scheduler',
'api',
]
@@ -105,21 +104,21 @@ CACHES = {
}
}
'''
RQ_QUEUES = {
# https://django-tasks-scheduler.readthedocs.io/en/latest/configuration/
SCHEDULER_QUEUES = {
'default': {
'HOST': os.environ.get("REDIS_HOST", "localhost"),
'PORT': os.environ.get("REDIS_PORT", 6379),
'DB': os.environ.get("REDIS_DB", 0),
'DEFAULT_TIMEOUT': os.environ.get("RQ_DEFAULT_TIMEOUT", 900),
# 'DEFAULT_RESULT_TTL': os.environ.get("RQ_DEFAULT_RESULT_TTL", 3600),
}
}
'''
# https://django-tasks-scheduler.readthedocs.io/en/latest/configuration/
SCHEDULER_QUEUES = {
'default': {
'DEFAULT_TIMEOUT': os.environ.get("RQ_DEFAULT_TIMEOUT", 60*15),
},
'high': {
'HOST': os.environ.get("REDIS_HOST", "localhost"),
'PORT': os.environ.get("REDIS_PORT", 6379),
'DB': os.environ.get("REDIS_DB", 0),
'DEFAULT_TIMEOUT': os.environ.get("RQ_DEFAULT_TIMEOUT", 60*15),
},
'low': {
'HOST': os.environ.get("REDIS_HOST", "localhost"),
'PORT': os.environ.get("REDIS_PORT", 6379),
'DB': os.environ.get("REDIS_DB", 0),