URLs site with status filter, refactoring, django-tasks-scheduler low high priority queues
This commit is contained in:
@@ -236,6 +236,7 @@
|
|||||||
" # Websites of interest\n",
|
" # Websites of interest\n",
|
||||||
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('www.missingkids.org/poster', 'url_host');\" )\n",
|
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('www.missingkids.org/poster', 'url_host');\" )\n",
|
||||||
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('www.breitbart.com', 'url_host');\" )\n",
|
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('www.breitbart.com', 'url_host');\" )\n",
|
||||||
|
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/poster', 'url_host');\" )\n",
|
||||||
" # Search keywords\n",
|
" # Search keywords\n",
|
||||||
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search');\" )\n",
|
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search');\" )\n",
|
||||||
" \n",
|
" \n",
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ conda create -n matitos_urls python=3.12
|
|||||||
conda activate matitos_urls
|
conda activate matitos_urls
|
||||||
# Core
|
# Core
|
||||||
pip install django psycopg[binary] django-redis django-tasks-scheduler
|
pip install django psycopg[binary] django-redis django-tasks-scheduler
|
||||||
# django-rq
|
|
||||||
# Fetcher
|
# Fetcher
|
||||||
pip install feedparser python-dateutil newspaper4k lxml[html_clean] googlenewsdecoder gnews duckduckgo_search GoogleNews
|
pip install feedparser python-dateutil newspaper4k lxml[html_clean] googlenewsdecoder gnews duckduckgo_search GoogleNews
|
||||||
# News visualization
|
# News visualization
|
||||||
@@ -88,33 +87,41 @@ RQ_DEFAULT_TIMEOUT=${RQ_DEFAULT_TIMEOUT:-900}
|
|||||||
RQ_DEFAULT_RESULT_TTL=${RQ_DEFAULT_RESULT_TTL:-3600}
|
RQ_DEFAULT_RESULT_TTL=${RQ_DEFAULT_RESULT_TTL:-3600}
|
||||||
```
|
```
|
||||||
|
|
||||||
* Django DB
|
* Deploy
|
||||||
```
|
```
|
||||||
# Generate content for models.py
|
|
||||||
python manage.py inspectdb
|
|
||||||
# Migrations
|
# Migrations
|
||||||
python manage.py makemigrations api; python manage.py migrate --fake-initial
|
python manage.py makemigrations api; python manage.py migrate --fake-initial
|
||||||
# Create user
|
# Create user
|
||||||
python manage.py createsuperuser
|
python manage.py createsuperuser
|
||||||
```
|
|
||||||
|
|
||||||
* Deploy
|
# 1) Server
|
||||||
```
|
|
||||||
# Server
|
|
||||||
python manage.py runserver
|
python manage.py runserver
|
||||||
|
|
||||||
# Workers
|
# 2) Workers
|
||||||
# python manage.py rqworker high default low
|
|
||||||
python manage.py rqworker high default low
|
python manage.py rqworker high default low
|
||||||
|
|
||||||
# Visualize DB
|
# Visualize DB
|
||||||
http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id
|
http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id
|
||||||
```
|
```
|
||||||
|
|
||||||
|
* Scheduled tasks
|
||||||
|
```
|
||||||
|
Names: Fetch Feeds, Fetch Parser, Fetch Search
|
||||||
|
Callable: api.tasks.fetch_feeds, api.tasks.fetch_parser, api.tasks.fetch_search
|
||||||
|
Task type: Repeatable task (or cron...)
|
||||||
|
Queue: Default
|
||||||
|
Interval: 15min, 2h, 30min
|
||||||
|
|
||||||
|
Names: Process raw URLs, Process error URLs, Process MissingKids URLs
|
||||||
|
Callable: api.tasks.process_raw_urls, api.tasks.process_error_urls, api.tasks.process_missing_kids_urls_50
|
||||||
|
Task type: Repeatable task (or cron...)
|
||||||
|
Queue: Low, Low, Default
|
||||||
|
Interval: 1h, 4h, 2h
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
* Utils
|
* Utils
|
||||||
```
|
```
|
||||||
python manage.py rqstats
|
python manage.py rqstats
|
||||||
python manage.py rqstats --interval=1 # Refreshes every second
|
python manage.py rqstats --interval=1 # Refreshes every second
|
||||||
python manage.py rqstats --json # Output as JSON
|
|
||||||
python manage.py rqstats --yaml # Output as YAML
|
|
||||||
```
|
```
|
||||||
@@ -41,13 +41,29 @@ def url_host_slowdown(url, url_host_slowdown_seconds):
|
|||||||
def process_url(url):
|
def process_url(url):
|
||||||
try:
|
try:
|
||||||
# Slow down if required to avoid too many requests error
|
# Slow down if required to avoid too many requests error
|
||||||
url_host_slowdown(url, url_host_slowdown_seconds=2)
|
url_host_slowdown(url, url_host_slowdown_seconds=5)
|
||||||
# Process
|
# Process
|
||||||
article = newspaper.article(url)
|
article = newspaper.article(url)
|
||||||
except newspaper.ArticleBinaryDataException:
|
except newspaper.ArticleBinaryDataException:
|
||||||
logger.warning("ArticleException for input URL {}\n{}".format(url, str(e)))
|
logger.warning("ArticleException for input URL {}\n{}".format(url, str(e)))
|
||||||
return {"override_status": "invalid"}
|
return {"override_status": "invalid"}
|
||||||
except newspaper.ArticleException as e:
|
except newspaper.ArticleException as e:
|
||||||
|
|
||||||
|
# Too many requests? Cool down...
|
||||||
|
if ("Status code 429" in str(e)):
|
||||||
|
# TODO: cool down and retry once?, proxy/VPN, ...
|
||||||
|
logger.debug("TODO: Implement code 429")
|
||||||
|
# Unavailable for legal reasons
|
||||||
|
if ("Status code 451" in str(e)):
|
||||||
|
# TODO: Bypass with VPN
|
||||||
|
logger.debug("TODO: Implement code 451")
|
||||||
|
# CloudFlare protection?
|
||||||
|
if ("Website protected with Cloudflare" in str(e)):
|
||||||
|
logger.debug("TODO: Implement bypass CloudFlare")
|
||||||
|
# PerimeterX protection?
|
||||||
|
if ("Website protected with PerimeterX" in str(e)):
|
||||||
|
logger.debug("TODO: Implement bypass PerimeterX")
|
||||||
|
|
||||||
logger.warning("ArticleException for input URL {}\n{}".format(url, str(e)))
|
logger.warning("ArticleException for input URL {}\n{}".format(url, str(e)))
|
||||||
return None
|
return None
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ from .src.fetch_search import FetchSearcher
|
|||||||
from .src.db_utils import DB_Handler
|
from .src.db_utils import DB_Handler
|
||||||
'''
|
'''
|
||||||
from src.missing_kids_fetch import MissingKidsFetch
|
from src.missing_kids_fetch import MissingKidsFetch
|
||||||
from src.missing_kids_status import MissingKidsStatus
|
|
||||||
'''
|
'''
|
||||||
|
|
||||||
from .src.logger import get_logger
|
from .src.logger import get_logger
|
||||||
|
|||||||
@@ -63,44 +63,41 @@
|
|||||||
fetchButton.prop("disabled", true); // Disable button
|
fetchButton.prop("disabled", true); // Disable button
|
||||||
|
|
||||||
|
|
||||||
fetch(fetchUrl)
|
fetch(fetchUrl/*, {
|
||||||
.then(response => {
|
method: "POST",
|
||||||
|
body: JSON.stringify({
|
||||||
|
text: inputText
|
||||||
|
}),
|
||||||
|
headers: {
|
||||||
|
"Content-type": "application/json; charset=UTF-8"
|
||||||
|
}
|
||||||
|
}*/).then(response => {
|
||||||
if (!response.ok) {
|
if (!response.ok) {
|
||||||
throw new Error("Error on network response");
|
throw new Error("Error on network response");
|
||||||
}
|
}
|
||||||
const reader = response.body.getReader();
|
const reader = response.body.getReader();
|
||||||
const decoder = new TextDecoder();
|
const decoder = new TextDecoder();
|
||||||
|
|
||||||
|
|
||||||
//////////////////////////////////////
|
|
||||||
|
|
||||||
let accumulatedText = ""; // Store streamed text before rendering Markdown
|
let accumulatedText = ""; // Store streamed text before rendering Markdown
|
||||||
// Create a temporary container for streaming response
|
let messageContainer = $('<div class="chat-message"></div>'); // Create a temporary container for streaming response
|
||||||
let messageContainer = $('<div class="chat-message"></div>');
|
|
||||||
//let messageContainer = $('');
|
|
||||||
resultContainer.append(messageContainer);
|
resultContainer.append(messageContainer);
|
||||||
//////////////////////////////////////
|
|
||||||
|
|
||||||
function read() {
|
function read() {
|
||||||
return reader.read().then(({ done, value }) => {
|
return reader.read().then(({ done, value }) => {
|
||||||
if (done) {
|
if (done) {
|
||||||
//////////////////////////////////////
|
|
||||||
messageContainer.html(marked.parse(accumulatedText));
|
messageContainer.html(marked.parse(accumulatedText));
|
||||||
//////////////////////////////////////
|
|
||||||
fetchButton.prop("disabled", false); // Re-enable button when done
|
fetchButton.prop("disabled", false); // Re-enable button when done
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////////////////////////////////
|
|
||||||
// Decode the streamed chunk
|
// Decode the streamed chunk
|
||||||
let chunk = decoder.decode(value);
|
let chunk = decoder.decode(value);
|
||||||
// Append to the accumulated text
|
// Append to the accumulated text
|
||||||
accumulatedText += chunk;
|
accumulatedText += chunk;
|
||||||
// Render Markdown progressively (but safely)
|
// Render Markdown progressively (but safely)
|
||||||
messageContainer.html(marked.parse(accumulatedText));
|
messageContainer.html(marked.parse(accumulatedText));
|
||||||
//////////////////////////////////////
|
// Auto-scroll to bottom
|
||||||
|
resultContainer.scrollTop(resultContainer[0].scrollHeight);
|
||||||
resultContainer.scrollTop(resultContainer[0].scrollHeight); // Auto-scroll to bottom
|
|
||||||
return read();
|
return read();
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@@ -125,7 +122,7 @@
|
|||||||
<table class="table table-bordered">
|
<table class="table table-bordered">
|
||||||
<tr>
|
<tr>
|
||||||
<th>URL</th>
|
<th>URL</th>
|
||||||
<td><a href="{{ url_item.url }}" target="_blank">{{ url_item.url }}</a></td>
|
<td><a href="{{ url_item.url|safe }}" target="_blank">{{ url_item.url }}</a></td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<th>Fetch Date</th>
|
<th>Fetch Date</th>
|
||||||
@@ -144,16 +141,20 @@
|
|||||||
<td>{{ url_item.status }}</td>
|
<td>{{ url_item.status }}</td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<th>Title</th>
|
<th>URL host</th>
|
||||||
<td>{{ url_content.title }}</td>
|
<td><a href="{{ url_content.url_host|safe }}" target="_blank">{{ url_content.url_host }}</a></td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<th>Description</th>
|
<th>Site name</th>
|
||||||
<td>{{ url_content.description }}</td>
|
<td>{{ url_content.site_name }}</td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<th>Content</th>
|
<th>Published Date</th>
|
||||||
<td>{{ url_content.content }}</td>
|
<td>{{ url_content.date_published }} UTC</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Valid news article content?</th>
|
||||||
|
<td>{{ url_content.valid_content }}</td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<th>Tags</th>
|
<th>Tags</th>
|
||||||
@@ -163,10 +164,38 @@
|
|||||||
<th>Authors</th>
|
<th>Authors</th>
|
||||||
<td>{{ url_content.authors }}</td>
|
<td>{{ url_content.authors }}</td>
|
||||||
</tr>
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Keywords</th>
|
||||||
|
<td>{{ url_content.keywords }}</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Language</th>
|
||||||
|
<td>{{ url_content.language }}</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Main image</th>
|
||||||
|
<td><a href="{{ url_content.image_main_url|safe }}" target="_blank">{{ url_content.image_main_url }}</a></td>
|
||||||
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<th>Image URLs</th>
|
<th>Image URLs</th>
|
||||||
<td>{{ url_content.image_urls }}</td>
|
<td>{{ url_content.image_urls }}</td>
|
||||||
</tr>
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Video URLs</th>
|
||||||
|
<td>{{ url_content.videos_url }}</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Title</th>
|
||||||
|
<td>{{ url_content.title }}</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Description</th>
|
||||||
|
<td>{{ url_content.description }}</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Content</th>
|
||||||
|
<td>{{ url_content.content }}</td>
|
||||||
|
</tr>
|
||||||
</table>
|
</table>
|
||||||
|
|
||||||
<!-- Independent form for optional values -->
|
<!-- Independent form for optional values -->
|
||||||
@@ -181,7 +210,8 @@
|
|||||||
|
|
||||||
<!-- Input field with a default value -->
|
<!-- Input field with a default value -->
|
||||||
<label for="custom-input-{{ url_item.id }}">Prompt:</label>
|
<label for="custom-input-{{ url_item.id }}">Prompt:</label>
|
||||||
<textarea id="custom-input-{{ url_item.id }}" class="form-control mb-2" rows="3">{{ prompt }} {{ url_item.url }}</textarea>
|
<textarea id="custom-input-{{ url_item.id }}" class="form-control mb-2" rows="5">{{ prompt }}
|
||||||
|
{{ url_item.url }}</textarea>
|
||||||
|
|
||||||
<div class="d-flex align-items-center">
|
<div class="d-flex align-items-center">
|
||||||
<!-- Fetch details button -->
|
<!-- Fetch details button -->
|
||||||
|
|||||||
@@ -592,7 +592,7 @@
|
|||||||
|
|
||||||
<!-- Table -->
|
<!-- Table -->
|
||||||
<div id="item-list">
|
<div id="item-list">
|
||||||
{% include 'item_list_partial.html' %}
|
{% include 'urls_partial.html' %}
|
||||||
</div>
|
</div>
|
||||||
<!-- Loading... -->
|
<!-- Loading... -->
|
||||||
<div id="loading" class="text-center mt-3" style="display:none;">
|
<div id="loading" class="text-center mt-3" style="display:none;">
|
||||||
@@ -94,9 +94,9 @@ def urls(request):
|
|||||||
|
|
||||||
# If request is AJAX, return JSON response
|
# If request is AJAX, return JSON response
|
||||||
if request.headers.get("X-Requested-With") == "XMLHttpRequest":
|
if request.headers.get("X-Requested-With") == "XMLHttpRequest":
|
||||||
return JsonResponse({'items_html': render(request, 'item_list_partial.html', context).content.decode('utf-8')})
|
return JsonResponse({'urls': render(request, 'urls_partial.html', context).content.decode('utf-8')})
|
||||||
|
|
||||||
return render(request, "item_list.html", context)
|
return render(request, "urls.html", context)
|
||||||
|
|
||||||
|
|
||||||
class OllamaClient():
|
class OllamaClient():
|
||||||
@@ -114,7 +114,8 @@ class OllamaClient():
|
|||||||
return models
|
return models
|
||||||
|
|
||||||
def get_prompt(self):
|
def get_prompt(self):
|
||||||
return "Provide a summary of the content below, avoid mentioning the source of information, and only answer with the summary. The summary needs to be brief and compact, consisting of one paragraph."
|
return "Rewrite the text below into a clear and concise summary, presenting the key points as if they are newly written insights. Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states'. Instead, write in a natural, standalone format that feels like an original explanation. Keep it brief, engaging, informative, in the style of a news article, and no longer than a paragraph:"
|
||||||
|
#return "Provide a summary of the content below, avoid mentioning the source of information, and only answer with the summary. The summary needs to be brief and compact, consisting of one paragraph."
|
||||||
#return "Explain in a single and compact paragraph the what, why, when, where, who, and how of the content below. Also provide a single paragraph summary of the content:"
|
#return "Explain in a single and compact paragraph the what, why, when, where, who, and how of the content below. Also provide a single paragraph summary of the content:"
|
||||||
#return "Provide in one paragraph the what, why, when, where, who, and how of the content below. Also provide a one paragraph summary of the content:"
|
#return "Provide in one paragraph the what, why, when, where, who, and how of the content below. Also provide a one paragraph summary of the content:"
|
||||||
#return "Provide two summaries of the content below, and avoid mentioning the source of information. First, provide a very brief and compact paragraph summary. Second, provide a larger and more detailed summary, which describe the what, why, when, where, who, and how of the content:"
|
#return "Provide two summaries of the content below, and avoid mentioning the source of information. First, provide a very brief and compact paragraph summary. Second, provide a larger and more detailed summary, which describe the what, why, when, where, who, and how of the content:"
|
||||||
@@ -153,6 +154,9 @@ def fetch_details(request, id):
|
|||||||
model = request.GET.get("model", "") # Get LLM model
|
model = request.GET.get("model", "") # Get LLM model
|
||||||
text = request.GET.get("text", "") # Get LLM prompt
|
text = request.GET.get("text", "") # Get LLM prompt
|
||||||
|
|
||||||
|
# print(request)
|
||||||
|
# print(text)
|
||||||
|
|
||||||
# LLM
|
# LLM
|
||||||
ollama = OllamaClient()
|
ollama = OllamaClient()
|
||||||
|
|
||||||
|
|||||||
@@ -38,7 +38,6 @@ INSTALLED_APPS = [
|
|||||||
'django.contrib.sessions',
|
'django.contrib.sessions',
|
||||||
'django.contrib.messages',
|
'django.contrib.messages',
|
||||||
'django.contrib.staticfiles',
|
'django.contrib.staticfiles',
|
||||||
# 'django_rq',
|
|
||||||
'scheduler',
|
'scheduler',
|
||||||
'api',
|
'api',
|
||||||
]
|
]
|
||||||
@@ -105,18 +104,6 @@ CACHES = {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
'''
|
|
||||||
RQ_QUEUES = {
|
|
||||||
'default': {
|
|
||||||
'HOST': os.environ.get("REDIS_HOST", "localhost"),
|
|
||||||
'PORT': os.environ.get("REDIS_PORT", 6379),
|
|
||||||
'DB': os.environ.get("REDIS_DB", 0),
|
|
||||||
'DEFAULT_TIMEOUT': os.environ.get("RQ_DEFAULT_TIMEOUT", 900),
|
|
||||||
# 'DEFAULT_RESULT_TTL': os.environ.get("RQ_DEFAULT_RESULT_TTL", 3600),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
'''
|
|
||||||
|
|
||||||
# https://django-tasks-scheduler.readthedocs.io/en/latest/configuration/
|
# https://django-tasks-scheduler.readthedocs.io/en/latest/configuration/
|
||||||
SCHEDULER_QUEUES = {
|
SCHEDULER_QUEUES = {
|
||||||
'default': {
|
'default': {
|
||||||
@@ -124,7 +111,19 @@ SCHEDULER_QUEUES = {
|
|||||||
'PORT': os.environ.get("REDIS_PORT", 6379),
|
'PORT': os.environ.get("REDIS_PORT", 6379),
|
||||||
'DB': os.environ.get("REDIS_DB", 0),
|
'DB': os.environ.get("REDIS_DB", 0),
|
||||||
'DEFAULT_TIMEOUT': os.environ.get("RQ_DEFAULT_TIMEOUT", 60*15),
|
'DEFAULT_TIMEOUT': os.environ.get("RQ_DEFAULT_TIMEOUT", 60*15),
|
||||||
}
|
},
|
||||||
|
'high': {
|
||||||
|
'HOST': os.environ.get("REDIS_HOST", "localhost"),
|
||||||
|
'PORT': os.environ.get("REDIS_PORT", 6379),
|
||||||
|
'DB': os.environ.get("REDIS_DB", 0),
|
||||||
|
'DEFAULT_TIMEOUT': os.environ.get("RQ_DEFAULT_TIMEOUT", 60*15),
|
||||||
|
},
|
||||||
|
'low': {
|
||||||
|
'HOST': os.environ.get("REDIS_HOST", "localhost"),
|
||||||
|
'PORT': os.environ.get("REDIS_PORT", 6379),
|
||||||
|
'DB': os.environ.get("REDIS_DB", 0),
|
||||||
|
'DEFAULT_TIMEOUT': os.environ.get("RQ_DEFAULT_TIMEOUT", 60*15),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
SCHEDULER_CONFIG = {
|
SCHEDULER_CONFIG = {
|
||||||
'EXECUTIONS_IN_PAGE': 20,
|
'EXECUTIONS_IN_PAGE': 20,
|
||||||
|
|||||||
Reference in New Issue
Block a user