URLs site with status filter, refactoring, django-tasks-scheduler low high priority queues
This commit is contained in:
@@ -236,6 +236,7 @@
|
||||
" # Websites of interest\n",
|
||||
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('www.missingkids.org/poster', 'url_host');\" )\n",
|
||||
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('www.breitbart.com', 'url_host');\" )\n",
|
||||
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/poster', 'url_host');\" )\n",
|
||||
" # Search keywords\n",
|
||||
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search');\" )\n",
|
||||
" \n",
|
||||
|
||||
@@ -4,7 +4,6 @@ conda create -n matitos_urls python=3.12
|
||||
conda activate matitos_urls
|
||||
# Core
|
||||
pip install django psycopg[binary] django-redis django-tasks-scheduler
|
||||
# django-rq
|
||||
# Fetcher
|
||||
pip install feedparser python-dateutil newspaper4k lxml[html_clean] googlenewsdecoder gnews duckduckgo_search GoogleNews
|
||||
# News visualization
|
||||
@@ -88,33 +87,41 @@ RQ_DEFAULT_TIMEOUT=${RQ_DEFAULT_TIMEOUT:-900}
|
||||
RQ_DEFAULT_RESULT_TTL=${RQ_DEFAULT_RESULT_TTL:-3600}
|
||||
```
|
||||
|
||||
* Django DB
|
||||
* Deploy
|
||||
```
|
||||
# Generate content for models.py
|
||||
python manage.py inspectdb
|
||||
# Migrations
|
||||
python manage.py makemigrations api; python manage.py migrate --fake-initial
|
||||
# Create user
|
||||
python manage.py createsuperuser
|
||||
```
|
||||
|
||||
* Deploy
|
||||
```
|
||||
# Server
|
||||
# 1) Server
|
||||
python manage.py runserver
|
||||
|
||||
# Workers
|
||||
# python manage.py rqworker high default low
|
||||
# 2) Workers
|
||||
python manage.py rqworker high default low
|
||||
|
||||
# Visualize DB
|
||||
http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id
|
||||
```
|
||||
|
||||
* Scheduled tasks
|
||||
```
|
||||
Names: Fetch Feeds, Fetch Parser, Fetch Search
|
||||
Callable: api.tasks.fetch_feeds, api.tasks.fetch_parser, api.tasks.fetch_search
|
||||
Task type: Repeatable task (or cron...)
|
||||
Queue: Default
|
||||
Interval: 15min, 2h, 30min
|
||||
|
||||
Names: Process raw URLs, Process error URLs, Process MissingKids URLs
|
||||
Callable: api.tasks.process_raw_urls, api.tasks.process_error_urls, api.tasks.process_missing_kids_urls_50
|
||||
Task type: Repeatable task (or cron...)
|
||||
Queue: Low, Low, Default
|
||||
Interval: 1h, 4h, 2h
|
||||
|
||||
```
|
||||
|
||||
* Utils
|
||||
```
|
||||
python manage.py rqstats
|
||||
python manage.py rqstats --interval=1 # Refreshes every second
|
||||
python manage.py rqstats --json # Output as JSON
|
||||
python manage.py rqstats --yaml # Output as YAML
|
||||
```
|
||||
@@ -41,13 +41,29 @@ def url_host_slowdown(url, url_host_slowdown_seconds):
|
||||
def process_url(url):
|
||||
try:
|
||||
# Slow down if required to avoid too many requests error
|
||||
url_host_slowdown(url, url_host_slowdown_seconds=2)
|
||||
url_host_slowdown(url, url_host_slowdown_seconds=5)
|
||||
# Process
|
||||
article = newspaper.article(url)
|
||||
except newspaper.ArticleBinaryDataException:
|
||||
logger.warning("ArticleException for input URL {}\n{}".format(url, str(e)))
|
||||
return {"override_status": "invalid"}
|
||||
except newspaper.ArticleException as e:
|
||||
|
||||
# Too many requests? Cool down...
|
||||
if ("Status code 429" in str(e)):
|
||||
# TODO: cool down and retry once?, proxy/VPN, ...
|
||||
logger.debug("TODO: Implement code 429")
|
||||
# Unavailable for legal reasons
|
||||
if ("Status code 451" in str(e)):
|
||||
# TODO: Bypass with VPN
|
||||
logger.debug("TODO: Implement code 451")
|
||||
# CloudFlare protection?
|
||||
if ("Website protected with Cloudflare" in str(e)):
|
||||
logger.debug("TODO: Implement bypass CloudFlare")
|
||||
# PerimeterX protection?
|
||||
if ("Website protected with PerimeterX" in str(e)):
|
||||
logger.debug("TODO: Implement bypass PerimeterX")
|
||||
|
||||
logger.warning("ArticleException for input URL {}\n{}".format(url, str(e)))
|
||||
return None
|
||||
except Exception as e:
|
||||
|
||||
@@ -7,7 +7,6 @@ from .src.fetch_search import FetchSearcher
|
||||
from .src.db_utils import DB_Handler
|
||||
'''
|
||||
from src.missing_kids_fetch import MissingKidsFetch
|
||||
from src.missing_kids_status import MissingKidsStatus
|
||||
'''
|
||||
|
||||
from .src.logger import get_logger
|
||||
|
||||
@@ -63,44 +63,41 @@
|
||||
fetchButton.prop("disabled", true); // Disable button
|
||||
|
||||
|
||||
fetch(fetchUrl)
|
||||
.then(response => {
|
||||
fetch(fetchUrl/*, {
|
||||
method: "POST",
|
||||
body: JSON.stringify({
|
||||
text: inputText
|
||||
}),
|
||||
headers: {
|
||||
"Content-type": "application/json; charset=UTF-8"
|
||||
}
|
||||
}*/).then(response => {
|
||||
if (!response.ok) {
|
||||
throw new Error("Error on network response");
|
||||
}
|
||||
const reader = response.body.getReader();
|
||||
const decoder = new TextDecoder();
|
||||
|
||||
|
||||
//////////////////////////////////////
|
||||
|
||||
let accumulatedText = ""; // Store streamed text before rendering Markdown
|
||||
// Create a temporary container for streaming response
|
||||
let messageContainer = $('<div class="chat-message"></div>');
|
||||
//let messageContainer = $('');
|
||||
let messageContainer = $('<div class="chat-message"></div>'); // Create a temporary container for streaming response
|
||||
resultContainer.append(messageContainer);
|
||||
//////////////////////////////////////
|
||||
|
||||
function read() {
|
||||
return reader.read().then(({ done, value }) => {
|
||||
if (done) {
|
||||
//////////////////////////////////////
|
||||
messageContainer.html(marked.parse(accumulatedText));
|
||||
//////////////////////////////////////
|
||||
fetchButton.prop("disabled", false); // Re-enable button when done
|
||||
return;
|
||||
}
|
||||
|
||||
//////////////////////////////////////
|
||||
// Decode the streamed chunk
|
||||
let chunk = decoder.decode(value);
|
||||
// Append to the accumulated text
|
||||
accumulatedText += chunk;
|
||||
// Render Markdown progressively (but safely)
|
||||
messageContainer.html(marked.parse(accumulatedText));
|
||||
//////////////////////////////////////
|
||||
|
||||
resultContainer.scrollTop(resultContainer[0].scrollHeight); // Auto-scroll to bottom
|
||||
// Auto-scroll to bottom
|
||||
resultContainer.scrollTop(resultContainer[0].scrollHeight);
|
||||
return read();
|
||||
});
|
||||
}
|
||||
@@ -125,7 +122,7 @@
|
||||
<table class="table table-bordered">
|
||||
<tr>
|
||||
<th>URL</th>
|
||||
<td><a href="{{ url_item.url }}" target="_blank">{{ url_item.url }}</a></td>
|
||||
<td><a href="{{ url_item.url|safe }}" target="_blank">{{ url_item.url }}</a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Fetch Date</th>
|
||||
@@ -144,16 +141,20 @@
|
||||
<td>{{ url_item.status }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Title</th>
|
||||
<td>{{ url_content.title }}</td>
|
||||
<th>URL host</th>
|
||||
<td><a href="{{ url_content.url_host|safe }}" target="_blank">{{ url_content.url_host }}</a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Description</th>
|
||||
<td>{{ url_content.description }}</td>
|
||||
<th>Site name</th>
|
||||
<td>{{ url_content.site_name }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Content</th>
|
||||
<td>{{ url_content.content }}</td>
|
||||
<th>Published Date</th>
|
||||
<td>{{ url_content.date_published }} UTC</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Valid news article content?</th>
|
||||
<td>{{ url_content.valid_content }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Tags</th>
|
||||
@@ -163,10 +164,38 @@
|
||||
<th>Authors</th>
|
||||
<td>{{ url_content.authors }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Keywords</th>
|
||||
<td>{{ url_content.keywords }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Language</th>
|
||||
<td>{{ url_content.language }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Main image</th>
|
||||
<td><a href="{{ url_content.image_main_url|safe }}" target="_blank">{{ url_content.image_main_url }}</a></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Image URLs</th>
|
||||
<td>{{ url_content.image_urls }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Video URLs</th>
|
||||
<td>{{ url_content.videos_url }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Title</th>
|
||||
<td>{{ url_content.title }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Description</th>
|
||||
<td>{{ url_content.description }}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Content</th>
|
||||
<td>{{ url_content.content }}</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<!-- Independent form for optional values -->
|
||||
@@ -181,7 +210,8 @@
|
||||
|
||||
<!-- Input field with a default value -->
|
||||
<label for="custom-input-{{ url_item.id }}">Prompt:</label>
|
||||
<textarea id="custom-input-{{ url_item.id }}" class="form-control mb-2" rows="3">{{ prompt }} {{ url_item.url }}</textarea>
|
||||
<textarea id="custom-input-{{ url_item.id }}" class="form-control mb-2" rows="5">{{ prompt }}
|
||||
{{ url_item.url }}</textarea>
|
||||
|
||||
<div class="d-flex align-items-center">
|
||||
<!-- Fetch details button -->
|
||||
|
||||
@@ -592,7 +592,7 @@
|
||||
|
||||
<!-- Table -->
|
||||
<div id="item-list">
|
||||
{% include 'item_list_partial.html' %}
|
||||
{% include 'urls_partial.html' %}
|
||||
</div>
|
||||
<!-- Loading... -->
|
||||
<div id="loading" class="text-center mt-3" style="display:none;">
|
||||
@@ -94,9 +94,9 @@ def urls(request):
|
||||
|
||||
# If request is AJAX, return JSON response
|
||||
if request.headers.get("X-Requested-With") == "XMLHttpRequest":
|
||||
return JsonResponse({'items_html': render(request, 'item_list_partial.html', context).content.decode('utf-8')})
|
||||
return JsonResponse({'urls': render(request, 'urls_partial.html', context).content.decode('utf-8')})
|
||||
|
||||
return render(request, "item_list.html", context)
|
||||
return render(request, "urls.html", context)
|
||||
|
||||
|
||||
class OllamaClient():
|
||||
@@ -114,7 +114,8 @@ class OllamaClient():
|
||||
return models
|
||||
|
||||
def get_prompt(self):
|
||||
return "Provide a summary of the content below, avoid mentioning the source of information, and only answer with the summary. The summary needs to be brief and compact, consisting of one paragraph."
|
||||
return "Rewrite the text below into a clear and concise summary, presenting the key points as if they are newly written insights. Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states'. Instead, write in a natural, standalone format that feels like an original explanation. Keep it brief, engaging, informative, in the style of a news article, and no longer than a paragraph:"
|
||||
#return "Provide a summary of the content below, avoid mentioning the source of information, and only answer with the summary. The summary needs to be brief and compact, consisting of one paragraph."
|
||||
#return "Explain in a single and compact paragraph the what, why, when, where, who, and how of the content below. Also provide a single paragraph summary of the content:"
|
||||
#return "Provide in one paragraph the what, why, when, where, who, and how of the content below. Also provide a one paragraph summary of the content:"
|
||||
#return "Provide two summaries of the content below, and avoid mentioning the source of information. First, provide a very brief and compact paragraph summary. Second, provide a larger and more detailed summary, which describe the what, why, when, where, who, and how of the content:"
|
||||
@@ -153,6 +154,9 @@ def fetch_details(request, id):
|
||||
model = request.GET.get("model", "") # Get LLM model
|
||||
text = request.GET.get("text", "") # Get LLM prompt
|
||||
|
||||
# print(request)
|
||||
# print(text)
|
||||
|
||||
# LLM
|
||||
ollama = OllamaClient()
|
||||
|
||||
|
||||
@@ -38,7 +38,6 @@ INSTALLED_APPS = [
|
||||
'django.contrib.sessions',
|
||||
'django.contrib.messages',
|
||||
'django.contrib.staticfiles',
|
||||
# 'django_rq',
|
||||
'scheduler',
|
||||
'api',
|
||||
]
|
||||
@@ -105,18 +104,6 @@ CACHES = {
|
||||
}
|
||||
}
|
||||
|
||||
'''
|
||||
RQ_QUEUES = {
|
||||
'default': {
|
||||
'HOST': os.environ.get("REDIS_HOST", "localhost"),
|
||||
'PORT': os.environ.get("REDIS_PORT", 6379),
|
||||
'DB': os.environ.get("REDIS_DB", 0),
|
||||
'DEFAULT_TIMEOUT': os.environ.get("RQ_DEFAULT_TIMEOUT", 900),
|
||||
# 'DEFAULT_RESULT_TTL': os.environ.get("RQ_DEFAULT_RESULT_TTL", 3600),
|
||||
}
|
||||
}
|
||||
'''
|
||||
|
||||
# https://django-tasks-scheduler.readthedocs.io/en/latest/configuration/
|
||||
SCHEDULER_QUEUES = {
|
||||
'default': {
|
||||
@@ -124,7 +111,19 @@ SCHEDULER_QUEUES = {
|
||||
'PORT': os.environ.get("REDIS_PORT", 6379),
|
||||
'DB': os.environ.get("REDIS_DB", 0),
|
||||
'DEFAULT_TIMEOUT': os.environ.get("RQ_DEFAULT_TIMEOUT", 60*15),
|
||||
}
|
||||
},
|
||||
'high': {
|
||||
'HOST': os.environ.get("REDIS_HOST", "localhost"),
|
||||
'PORT': os.environ.get("REDIS_PORT", 6379),
|
||||
'DB': os.environ.get("REDIS_DB", 0),
|
||||
'DEFAULT_TIMEOUT': os.environ.get("RQ_DEFAULT_TIMEOUT", 60*15),
|
||||
},
|
||||
'low': {
|
||||
'HOST': os.environ.get("REDIS_HOST", "localhost"),
|
||||
'PORT': os.environ.get("REDIS_PORT", 6379),
|
||||
'DB': os.environ.get("REDIS_DB", 0),
|
||||
'DEFAULT_TIMEOUT': os.environ.get("RQ_DEFAULT_TIMEOUT", 60*15),
|
||||
}
|
||||
}
|
||||
SCHEDULER_CONFIG = {
|
||||
'EXECUTIONS_IN_PAGE': 20,
|
||||
|
||||
Reference in New Issue
Block a user