Publisher task

This commit is contained in:
Luciano Gervasoni
2025-04-18 16:36:14 +02:00
parent d7373edba2
commit 8d0697edee
21 changed files with 574 additions and 50 deletions

View File

View File

@@ -1,14 +1,5 @@
from django.utils.deprecation import MiddlewareMixin
'''
class FaviconMiddleware(MiddlewareMixin):
def process_response(self, request, response):
if 'text/html' in response.get('Content-Type', '') and b'</head>' in response.content:
icon_link = b'<link rel="icon" type="image/png" href="/static/img/mate-icon.png">\n'
response.content = response.content.replace(b'</head>', icon_link + b'</head>')
return response
'''
class FaviconMiddleware(MiddlewareMixin):
def process_response(self, request, response):
if 'text/html' in response.get('Content-Type', '') and b'</head>' in response.content:

View File

View File

@@ -4,7 +4,7 @@ from django.core.cache import cache
from django.db import IntegrityError
from django.utils import timezone
from datetime import timedelta
from .url_processor import process_url, get_with_protocol
from .fetch_utils_url_processor import process_url, get_with_protocol
import re
import os
import traceback

View File

@@ -1,6 +1,6 @@
from .db_utils import DB_Handler
from ..models import Search, Source
from .url_processor import get_with_protocol, url_host_slowdown
from .fetch_utils_url_processor import get_with_protocol, url_host_slowdown
import newspaper
import traceback
from .logger import get_logger

View File

@@ -4,7 +4,7 @@ import os
from django.utils import timezone
from datetime import timedelta
from ..models import Search, Source
from .fetch_utils import decode_gnews_urls
from .fetch_utils_gnews import decode_gnews_urls
from .logger import get_logger
logger = get_logger()

View File

@@ -0,0 +1,24 @@
import ollama
import os
class OllamaClient():
    """Small wrapper around an ``ollama.Client`` with project defaults."""

    def __init__(self):
        # The endpoint comes from ENDPOINT_OLLAMA, with a hosted fallback.
        endpoint = os.getenv("ENDPOINT_OLLAMA", "https://ollamamodel.matitos.org")
        self.client = ollama.Client(host=endpoint)

    def _get_default_model(self):
        """Return the model name that get_models() lists first when present."""
        return "llama3.2:3b"

    def get_models(self):
        """Return the server's model names sorted, default model first.

        When the default model is not reported by the server, the plain
        sorted list is returned.
        """
        preferred = self._get_default_model()
        names = sorted([entry.model for entry in self.client.list().models])
        if preferred not in names:
            return names
        others = [name for name in names if name != preferred]
        return [preferred] + others

    def get_prompt(self):
        """Return the summarisation instruction that precedes the text."""
        sentences = (
            "Rewrite the text below into a clear and concise summary of one paragraph maximum, presenting the key points as if they are newly written insights.",
            "Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states'.",
            "Write in a natural, standalone format that feels like an original explanation.",
            "Keep it brief, engaging, informative, in the style of a news article: \n",
        )
        # Sentences are separated by exactly one space.
        return " ".join(sentences)

View File

@@ -0,0 +1,149 @@
import time
import jwt
import os
import requests
import random
from .llm import OllamaClient
from ..models import Urls, UrlContent
from .logger import get_logger
logger = get_logger()
class Publisher():
    """Publish an LLM-written summary of a stored URL as a Ghost post.

    Configuration is read from the environment:
    GHOST_ADMIN_API_KEY ("<id>:<hex secret>"), GHOST_ADMIN_API_URL and
    PEXELS_API_KEY.
    """

    # Seconds to wait on any outbound HTTP request, so that a stalled remote
    # host cannot block the background worker indefinitely.
    _HTTP_TIMEOUT = 30

    def __init__(self):
        pass

    def _create_jwt(self, admin_api_key):
        """Build a short-lived token for the Ghost Admin API.

        Args:
            admin_api_key: Key in the form "<id>:<hex secret>".

        Returns:
            The HS256-signed token produced by ``jwt.encode``.

        Raises:
            ValueError: If the key is not exactly "<id>:<secret>" or the
                secret is not valid hexadecimal.
        """
        id_, secret = admin_api_key.split(':')

        iat = int(time.time())
        exp = iat + 5 * 60  # 5 minutes

        header = {'alg': 'HS256', 'kid': id_}
        payload = {
            'iat': iat,
            'exp': exp,
            'aud': '/v5/admin/'  # Adjust depending on your Ghost version
        }

        return jwt.encode(payload, bytes.fromhex(secret), algorithm='HS256', headers=header)

    def _create_ghost_post(self, post_data):
        """POST a single post to the Ghost Admin API.

        Args:
            post_data: Dict describing the post (title, html, ...).

        Returns:
            The decoded JSON response when Ghost answers 201, otherwise None
            (missing configuration, transport error or any other status).
        """
        admin_api_key = os.getenv("GHOST_ADMIN_API_KEY")
        admin_api_url = os.getenv("GHOST_ADMIN_API_URL")
        if not admin_api_key or not admin_api_url:
            logger.warning("Ghost - Missing GHOST_ADMIN_API_KEY or GHOST_ADMIN_API_URL, cannot publish post")
            return None

        headers = {
            'Authorization': f'Ghost {self._create_jwt(admin_api_key)}',
            'Content-Type': 'application/json'
        }

        try:
            response = requests.post(
                # Plain string joining: os.path.join is meant for filesystem
                # paths and would use backslashes on Windows.
                "{}/posts".format(admin_api_url.rstrip("/")),
                json={"posts": [post_data]},
                headers=headers,
                params={"source": "html"},
                timeout=self._HTTP_TIMEOUT,
            )
        except requests.RequestException as e:
            logger.warning("Ghost - Failed to publish post: {}".format(str(e)))
            return None

        if response.status_code == 201:
            logger.info("Ghost post published successfully")
            return response.json()

        logger.warning("Ghost - Failed to publish post: {} {}".format(response.status_code, response.text))
        return None

    def _get_photo_url(self, query):
        """Return the landscape URL of a random Pexels photo for ``query``.

        Returns None when the search fails or yields no photos.
        """
        # TODO: Remember already used photos in the DB and skip them,
        # following the response's "next_page" link once a page is exhausted.
        try:
            r = requests.get(
                "https://api.pexels.com/v1/search",
                # Let requests encode the query (it may contain spaces).
                params={"query": query},
                headers={"Authorization": os.getenv("PEXELS_API_KEY")},
                timeout=self._HTTP_TIMEOUT,
            )
            r.raise_for_status()

            list_photos = r.json().get("photos", [])
            if not list_photos:
                logger.warning("Pexels returned no photos for query: {}".format(query))
                return None

            return random.choice(list_photos).get("src").get("landscape")
        except Exception as e:
            # Best effort: a post can still be published without an image.
            logger.warning("Something went wrong while fetching image from Pexels: {}".format(str(e)))
            return None

    def _is_image_reachable(self, image_url):
        """Return True when ``image_url`` is set and answers HTTP 200."""
        if not image_url:
            return False
        try:
            return requests.get(image_url, timeout=self._HTTP_TIMEOUT).status_code == 200
        except requests.RequestException:
            return False

    @staticmethod
    def _build_html(summary, source_url):
        """Render the summary as escaped paragraphs plus a source link.

        Blank lines in the summary are dropped instead of producing empty
        paragraphs.
        """
        # Local import keeps this class self-contained.
        from html import escape

        # Both values originate from scraped pages / model output, so they
        # are escaped before being embedded in markup.
        lines = (line.strip() for line in summary.split("\n"))
        paragraphs = "".join("<p>{}</p>".format(escape(line)) for line in lines if line)
        return paragraphs + '<a href="{}">Source</a>'.format(escape(source_url))

    def publish(self, url_id):
        """Summarise the content stored for ``url_id`` and publish it on Ghost.

        Nothing is published when the URL row is missing, or when its content
        is missing or flagged as not valid.
        """
        logger.info("Publishing URL ID {}".format(url_id))

        url = Urls.objects.filter(pk=url_id).first()
        if url is None:
            logger.warning("Ghost - URL not found for URL ID: {}".format(url_id))
            return

        url_content = UrlContent.objects.filter(pk=url_id).first()
        if url_content is None:
            logger.warning("Ghost - URL Content is NULL for URL ID: {} {}".format(url_id, url.url))
            return
        if url_content.valid_content is False:
            logger.warning("Ghost - URL Content is not valid for URL ID: {} {}".format(url_id, url.url))
            return

        # Use the shared model name and prompt so this task cannot drift from
        # the interactive LLM view.
        llm = OllamaClient()
        ollama_msg = {"role": "user", "content": "{}\n{}".format(llm.get_prompt(), url_content.content)}
        response = llm.client.chat(model=llm._get_default_model(), messages=[ollama_msg])
        article_summary = response["message"]["content"]

        photo_url = url_content.image_main_url
        if not self._is_image_reachable(photo_url):
            # Invalid main image -> Search for one
            photo_url = self._get_photo_url("Mountain landscape")

        post_data = {
            "title": url_content.title,
            "html": self._build_html(article_summary, url.url),
            "feature_image": photo_url,
            "status": "published",
        }

        # Publish post
        payload = self._create_ghost_post(post_data)
        logger.debug("Ghost payload: {}".format(str(payload)))

View File

@@ -5,6 +5,7 @@ from .src.fetch_parser import FetchParser
from .src.fetch_search import FetchSearcher
from .src.fetch_missing_kids import FetchMissingKids
from .src.db_utils import DB_Handler
from .src.publisher import Publisher
from .src.logger import get_logger
logger = get_logger()
@@ -118,7 +119,7 @@ def background_task(process_type: str):
elif ("process_missing_kids_urls" in process_type):
DB_Handler().process_missing_kids_urls(batch_size=batch_size)
elif ( "clean_old_url_content" in process_type ):
elif ("clean_old_url_content" in process_type ):
# Older than X days encoded in URL
try:
older_than_days = float(process_type.split("_")[-1])
@@ -126,6 +127,12 @@ def background_task(process_type: str):
older_than_days = None
DB_Handler().clean_old_url_content(older_than_days=older_than_days)
elif ("publish" in process_type):
# Extract URL ID
url_id = process_type.split("_")[-1]
# Publish
Publisher().publish(url_id)
else:
logger.info("Task unknown!: {}".format(process_type))

View File

@@ -115,7 +115,7 @@
}
// Fetch URL
let fetchUrl = `/urls/llm/`;
let fetchUrl = `/llm/`;
let resultContainer = $("#chat-output");
resultContainer.html(""); // Clear previous content before fetching
@@ -189,7 +189,7 @@
<!-- <h2>URL Details</h2> -->
<table class="table table-bordered">
<tr>
<th>URL</th>
<th>URL <a href="/task/publish_{{ url_item.id }}" target="_blank">[✍️ Publish]</a></th>
<td><a href="{{ url_item.url|safe }}" target="_blank">{{ url_item.url }}</a></td>
</tr>
<tr>

View File

@@ -15,8 +15,8 @@ urlpatterns = [
path('urls-per-source/', views.urls_per_source, name='urls_per_source'),
path('urls-per-search/', views.urls_per_search, name='urls_per_search'),
#
path('llm/', views.llm, name='llm'),
#
path('urls/', views.filtered_urls, name='filtered_urls'),
path('urls/<int:id>/', views.url_detail_view, name='url_detail'),
path('urls/llm/', views.llm, name='llm'),
path('urls/content_generation', views.content_generation, name='content_generation'),
]

View File

@@ -2,34 +2,16 @@ from .views_base import link_list, logs, log_db, trigger_task
from django.core.paginator import Paginator
from django.shortcuts import render, get_object_or_404
from django.http import StreamingHttpResponse, JsonResponse
from django.http import StreamingHttpResponse, JsonResponse, HttpResponse
from django.db.models import Q, Count
from django.utils import timezone
from django.utils.timezone import now, timedelta
from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch, UrlsDuplicate
import ollama
import os
from .src.llm import OllamaClient
import json
####################################################################################################
class OllamaClient():
    """Small wrapper around an ``ollama.Client`` with project defaults."""

    def __init__(self):
        # The endpoint comes from ENDPOINT_OLLAMA, with a hosted fallback.
        endpoint = os.getenv("ENDPOINT_OLLAMA", "https://ollamamodel.matitos.org")
        self.client = ollama.Client(host=endpoint)

    def _get_default_model(self):
        """Return the model name that get_models() lists first when present."""
        return "llama3.2:3b"

    def get_models(self):
        """Return the server's model names sorted, default model first.

        When the default model is not reported by the server, the plain
        sorted list is returned.
        """
        preferred = self._get_default_model()
        names = sorted([entry.model for entry in self.client.list().models])
        if preferred not in names:
            return names
        others = [name for name in names if name != preferred]
        return [preferred] + others

    def get_prompt(self):
        """Return the summarisation instruction that precedes the text."""
        return (
            "Rewrite the text below into a clear and concise summary, "
            "presenting the key points as if they are newly written insights. "
            "Do not mention or reference the original text, its source, "
            "or any phrases like 'According to' or 'The text states'. "
            "Instead, write in a natural, standalone format "
            "that feels like an original explanation. "
            "Keep it brief, engaging, informative, in the style of a news article, "
            "and no longer than a paragraph:"
        )
def llm(request):
@@ -72,7 +54,6 @@ def url_detail_view(request, id):
except UrlContent.DoesNotExist:
url_content = {}
# TODO: https://github.com/ollama/ollama-python?tab=readme-ov-file#async-client
ollama = OllamaClient()
context = {
@@ -301,15 +282,5 @@ def filtered_urls(request):
}
return render(request, 'filtered_urls.html', context)
####################################################################################################
def content_generation(request):
'''
# Get list of URLs ID
selected_urls = request.GET.getlist('urls', [])
# Sample URLs
selected_urls = [13460, 13455, 13454, 13452, 13210]
'''
####################################################################################################

View File

@@ -1,5 +1,4 @@
import os
import psycopg
from .tasks import background_task
from django.http import JsonResponse, HttpResponse
from django.db import connection

View File

@@ -0,0 +1,34 @@
{
"SEARCH": {
"rss_feed": [
"https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC",
"https://feeds.feedburner.com/breitbart",
"https://feeds.feedburner.com/zerohedge/feed",
"https://moxie.foxnews.com/google-publisher/latest.xml",
"https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=15837362",
"https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100727362"
],
"url_host": [
"missingkids.org/poster",
"missingkids.org/new-poster",
"breitbart.com",
"zerohedge.com",
"foxnews.com",
"cnbc.com"
],
"keyword_search": [
"child abuse"
]
},
"REGEX_PATTERN_STATUS_PRIORITY": [
[".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50],
[".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75],
[".*foxnews\\.com\\/(video|category)\\/.*", "invalid", 75],
[".*radio.foxnews\\.com\\/.*", "invalid", 75],
[".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75],
[".*zerohedge\\.com\\/(user)\\/.*", "invalid", 75],
[".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50],
[".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50],
[".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50]
]
}

View File

@@ -16,4 +16,5 @@ GoogleNews
duckduckgo_search
git+https://github.com/tasos-py/Search-Engines-Scraper.git
langdetect
ollama
ollama
PyJWT