Publisher task

2025-04-18 16:36:14 +02:00
parent d7373edba2
commit 8d0697edee
21 changed files with 574 additions and 50 deletions
--- a/app_urls/fetcher/middleware/init.py
+++ b/app_urls/fetcher/middleware/init.py
--- a/app_urls/fetcher/middleware/favicon.py
+++ b/app_urls/fetcher/middleware/favicon.py
@@ -1,14 +1,5 @@
 from django.utils.deprecation import MiddlewareMixin

-'''
-class FaviconMiddleware(MiddlewareMixin):
-    def process_response(self, request, response):
-        if 'text/html' in response.get('Content-Type', '') and b'</head>' in response.content:
-            icon_link = b'<link rel="icon" type="image/png" href="/static/img/mate-icon.png">\n'
-            response.content = response.content.replace(b'</head>', icon_link + b'</head>')
-        return response
-'''
-
 class FaviconMiddleware(MiddlewareMixin):
    def process_response(self, request, response):
        if 'text/html' in response.get('Content-Type', '') and b'</head>' in response.content:
--- a/app_urls/fetcher/src/init.py
+++ b/app_urls/fetcher/src/init.py
--- a/app_urls/fetcher/src/db_utils.py
+++ b/app_urls/fetcher/src/db_utils.py
@@ -4,7 +4,7 @@ from django.core.cache import cache
 from django.db import IntegrityError
 from django.utils import timezone
 from datetime import timedelta
-from .url_processor import process_url, get_with_protocol
+from .fetch_utils_url_processor import process_url, get_with_protocol
 import re
 import os
 import traceback
--- a/app_urls/fetcher/src/fetch_parser.py
+++ b/app_urls/fetcher/src/fetch_parser.py
@@ -1,6 +1,6 @@
 from .db_utils import DB_Handler
 from ..models import Search, Source
-from .url_processor import get_with_protocol, url_host_slowdown
+from .fetch_utils_url_processor import get_with_protocol, url_host_slowdown
 import newspaper
 import traceback
 from .logger import get_logger
--- a/app_urls/fetcher/src/fetch_search_instances.py
+++ b/app_urls/fetcher/src/fetch_search_instances.py
@@ -4,7 +4,7 @@ import os
 from django.utils import timezone
 from datetime import timedelta
 from ..models import Search, Source
-from .fetch_utils import decode_gnews_urls
+from .fetch_utils_gnews import decode_gnews_urls
 from .logger import get_logger
 logger = get_logger()

--- a/app_urls/fetcher/src/fetch_utils_gnews.py
+++ b/app_urls/fetcher/src/fetch_utils_gnews.py
--- a/app_urls/fetcher/src/fetch_utils_url_processor.py
+++ b/app_urls/fetcher/src/fetch_utils_url_processor.py
--- a/app_urls/fetcher/src/llm.py
+++ b/app_urls/fetcher/src/llm.py
@@ -0,0 +1,30 @@
+import ollama
+import os
+
+class OllamaClient():
+    """Holds an ollama.Client bound to the ENDPOINT_OLLAMA host (with a built-in default)."""
+
+    def __init__(self):
+        endpoint = os.getenv("ENDPOINT_OLLAMA", "https://ollamamodel.matitos.org")
+        self.client = ollama.Client(host=endpoint)
+
+    def _get_default_model(self):
+        """Return the name of the model that get_models() lists first."""
+        return "llama3.2:3b"
+
+    def get_models(self):
+        """Return the server's model names sorted, with the default model moved to the front."""
+        default = self._get_default_model()
+        names = sorted(m.model for m in self.client.list().models)
+        if default not in names:
+            return names
+        return [default] + [name for name in names if name != default]
+
+    def get_prompt(self):
+        """Return the instruction asking the model to rewrite the text that follows as a one-paragraph summary."""
+        return (
+            "Rewrite the text below into a clear and concise summary of one paragraph maximum, presenting the key points as if they are newly written insights. "
+            "Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states'. "
+            "Write in a natural, standalone format that feels like an original explanation. "
+            "Keep it brief, engaging, informative, in the style of a news article: \n"
+        )
--- a/app_urls/fetcher/src/publisher.py
+++ b/app_urls/fetcher/src/publisher.py
@@ -0,0 +1,160 @@
+import html
+import time
+import jwt
+import os
+import requests
+import random
+from .llm import OllamaClient
+from ..models import Urls, UrlContent
+
+from .logger import get_logger
+logger = get_logger()
+
+
+class Publisher():
+    """Summarize a stored URL's content with an LLM and publish it as a Ghost post.
+
+    Configuration is read from the environment: GHOST_ADMIN_API_KEY,
+    GHOST_ADMIN_API_URL and PEXELS_API_KEY.
+    """
+
+    # Seconds to wait on any outbound HTTP request before giving up
+    REQUEST_TIMEOUT = 30
+    # Model and instruction used to generate the article summary
+    SUMMARY_MODEL = "llama3.2:3b"
+    SUMMARY_PROMPT = "Rewrite the text below into a clear and concise summary, presenting the key points as if they are newly written insights. Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states'. Instead, write in a natural, standalone format that feels like an original explanation. Keep it brief, engaging, informative, in the style of a news article, and no longer than a paragraph:"
+    # Pexels query used when the article has no reachable main image
+    FALLBACK_PHOTO_QUERY = "Mountain landscape"
+
+    def __init__(self):
+        pass
+
+    def _create_jwt(self, admin_api_key):
+        """Build a short-lived HS256 token for the Ghost Admin API.
+
+        admin_api_key must have the form "<key id>:<hex-encoded secret>".
+        """
+        id_, secret = admin_api_key.split(':')
+        iat = int(time.time())
+        header = {'alg': 'HS256', 'kid': id_}
+        payload = {
+            'iat': iat,
+            'exp': iat + 5 * 60,  # 5 minutes
+            'aud': '/v5/admin/'  # Adjust depending on your Ghost version
+        }
+        return jwt.encode(payload, bytes.fromhex(secret), algorithm='HS256', headers=header)
+
+    def _create_ghost_post(self, post_data):
+        """Send post_data to the Ghost Admin API as a single post.
+
+        Returns the decoded JSON response on HTTP 201, None on any failure.
+        """
+        admin_api_key = os.getenv("GHOST_ADMIN_API_KEY")
+        admin_api_url = os.getenv("GHOST_ADMIN_API_URL")
+        if not admin_api_key or not admin_api_url:
+            logger.warning("Ghost - GHOST_ADMIN_API_KEY or GHOST_ADMIN_API_URL is not set")
+            return None
+
+        headers = {
+            'Authorization': 'Ghost {}'.format(self._create_jwt(admin_api_key)),
+            'Content-Type': 'application/json'
+        }
+
+        try:
+            response = requests.post(
+                # Plain string join: os.path.join would insert a backslash on Windows
+                "{}/posts".format(admin_api_url.rstrip("/")),
+                json={"posts": [post_data]},
+                headers=headers,
+                params={"source": "html"},
+                timeout=self.REQUEST_TIMEOUT,
+            )
+        except requests.RequestException as e:
+            logger.warning("Ghost - Failed to publish post: {}".format(str(e)))
+            return None
+
+        if response.status_code == 201:
+            logger.info("Ghost post published successfully")
+            return response.json()
+        logger.warning("Ghost - Failed to publish post: {} {}".format(response.status_code, response.text))
+        return None
+
+    def _get_photo_url(self, query):
+        """Return the landscape URL of a random Pexels photo matching query, or None."""
+        # TODO: Get already used photos to skip. Use DB
+        try:
+            r = requests.get(
+                "https://api.pexels.com/v1/search",
+                params={"query": query},
+                headers={"Authorization": os.getenv("PEXELS_API_KEY")},
+                timeout=self.REQUEST_TIMEOUT,
+            )
+            r.raise_for_status()
+            list_photos = r.json().get("photos", [])
+            if not list_photos:
+                logger.warning("Pexels returned no photos for query: {}".format(query))
+                return None
+            return random.choice(list_photos).get("src", {}).get("landscape")
+        except Exception as e:
+            logger.warning("Something went wrong while fetching image from Pexels: {}".format(str(e)))
+            return None
+
+    def _is_image_reachable(self, image_url):
+        """Return True when image_url answers HTTP 200, False if unset or unreachable."""
+        if not image_url:
+            return False
+        try:
+            # stream=True: only the status code is needed, so skip downloading the body
+            with requests.get(image_url, stream=True, timeout=self.REQUEST_TIMEOUT) as r:
+                return r.status_code == 200
+        except requests.RequestException:
+            return False
+
+    def _build_post_html(self, article_summary, source_url):
+        """Render the summary as escaped <p> paragraphs followed by a link to the source."""
+        lines = (line.strip() for line in article_summary.split("\n"))
+        body = "".join("<p>{}</p>".format(html.escape(line)) for line in lines if line)
+        return body + '<a href="{}">Source</a>'.format(html.escape(source_url))
+
+    def publish(self, url_id):
+        """Summarize the content stored for url_id and publish it as a Ghost post.
+
+        Logs a warning and returns without publishing when the URL or its content
+        is missing, or when the content is flagged as not valid.
+        """
+        logger.info("Publishing URL ID {}".format(url_id))
+
+        # URL Content
+        url_content = UrlContent.objects.filter(pk=url_id).first()
+        url = Urls.objects.filter(pk=url_id).first()
+
+        if (url is None):
+            logger.warning("Ghost - URL not found for URL ID: {}".format(url_id))
+            return
+        if (url_content is None):
+            logger.warning("Ghost - URL Content is NULL for URL ID: {} {}".format(url_id, url.url))
+            return
+        if (url_content.valid_content is False):
+            logger.warning("Ghost - URL Content is not valid for URL ID: {} {}".format(url_id, url.url))
+            return
+
+        ollama_msg = {"role": "user", "content": "{}\n{}".format(self.SUMMARY_PROMPT, url_content.content)}
+        response = OllamaClient().client.chat(model=self.SUMMARY_MODEL, messages=[ollama_msg])
+        article_summary = response["message"]["content"]
+
+        if self._is_image_reachable(url_content.image_main_url):
+            photo_url = url_content.image_main_url
+        else:
+            # Invalid main image -> Search for one
+            photo_url = self._get_photo_url(self.FALLBACK_PHOTO_QUERY)
+
+        post_data = {
+            "title": url_content.title,
+            "html": self._build_post_html(article_summary, url.url),
+            "feature_image": photo_url,
+            "status": "published",
+        }
+
+        # Publish post
+        payload = self._create_ghost_post(post_data)
+        logger.debug("Ghost payload: {}".format(str(payload)))
--- a/app_urls/fetcher/tasks.py
+++ b/app_urls/fetcher/tasks.py
@@ -5,6 +5,7 @@ from .src.fetch_parser import FetchParser
 from .src.fetch_search import FetchSearcher
 from .src.fetch_missing_kids import FetchMissingKids
 from .src.db_utils import DB_Handler
+from .src.publisher import Publisher

 from .src.logger import get_logger
 logger = get_logger()
@@ -118,7 +119,7 @@ def background_task(process_type: str):
            elif ("process_missing_kids_urls" in process_type):
                DB_Handler().process_missing_kids_urls(batch_size=batch_size)

-        elif ( "clean_old_url_content" in process_type ):
+        elif ("clean_old_url_content" in process_type ):
            # Older than X days encoded in URL
            try:
                older_than_days = float(process_type.split("_")[-1])
@@ -126,6 +127,12 @@ def background_task(process_type: str):
                older_than_days = None

            DB_Handler().clean_old_url_content(older_than_days=older_than_days)
+        
+        elif ("publish" in process_type):
+            # Extract URL ID
+            url_id = process_type.split("_")[-1]
+            # Publish
+            Publisher().publish(url_id)
            
        else:
            logger.info("Task unknown!: {}".format(process_type))
--- a/app_urls/fetcher/templates/url_detail.html
+++ b/app_urls/fetcher/templates/url_detail.html
@@ -115,7 +115,7 @@
        }

        // Fetch URL
-        let fetchUrl = `/urls/llm/`;
+        let fetchUrl = `/llm/`;

        let resultContainer = $("#chat-output");
        resultContainer.html(""); // Clear previous content before fetching
@@ -189,7 +189,7 @@
        <!--  <h2>URL Details</h2> -->
        <table class="table table-bordered">
            <tr>
-                <th>URL</th>
+                <th>URL <a href="/task/publish_{{ url_item.id }}" target="_blank">[✍️ Publish]</a></th>
                <td><a href="{{ url_item.url|safe }}" target="_blank">{{ url_item.url }}</a></td>
            </tr>
            <tr>
--- a/app_urls/fetcher/urls.py
+++ b/app_urls/fetcher/urls.py
@@ -15,8 +15,8 @@ urlpatterns = [
    path('urls-per-source/', views.urls_per_source, name='urls_per_source'),
    path('urls-per-search/', views.urls_per_search, name='urls_per_search'),
    #
+    path('llm/', views.llm, name='llm'),
+    #
    path('urls/', views.filtered_urls, name='filtered_urls'),
    path('urls/<int:id>/', views.url_detail_view, name='url_detail'),
-    path('urls/llm/', views.llm, name='llm'),
-    path('urls/content_generation', views.content_generation, name='content_generation'),
 ]
--- a/app_urls/fetcher/views.py
+++ b/app_urls/fetcher/views.py
@@ -2,34 +2,16 @@ from .views_base import link_list, logs, log_db, trigger_task

 from django.core.paginator import Paginator
 from django.shortcuts import render, get_object_or_404
-from django.http import StreamingHttpResponse, JsonResponse
+from django.http import StreamingHttpResponse, JsonResponse, HttpResponse
 from django.db.models import Q, Count
 from django.utils import timezone
 from django.utils.timezone import now, timedelta
 from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch, UrlsDuplicate
-import ollama
-import os
+from .src.llm import OllamaClient
 import json


 ####################################################################################################
-class OllamaClient():
-    def __init__(self):
-        self.client = ollama.Client(host=os.getenv("ENDPOINT_OLLAMA", "https://ollamamodel.matitos.org"))
-    
-    def _get_default_model(self):
-        return "llama3.2:3b"
-
-    def get_models(self):
-        models = sorted([m.model for m in self.client.list().models])
-        if (self._get_default_model() in models):
-            return [self._get_default_model()] + [m for m in models if m != self._get_default_model()]
-        else:
-            return models
-    
-    def get_prompt(self):
-        return "Rewrite the text below into a clear and concise summary, presenting the key points as if they are newly written insights. Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states'. Instead, write in a natural, standalone format that feels like an original explanation. Keep it brief, engaging, informative, in the style of a news article, and no longer than a paragraph:"
-

 def llm(request):

@@ -72,7 +54,6 @@ def url_detail_view(request, id):
    except UrlContent.DoesNotExist:
        url_content = {}
    
-    # TODO: https://github.com/ollama/ollama-python?tab=readme-ov-file#async-client
    ollama = OllamaClient()

    context = {
@@ -301,15 +282,5 @@ def filtered_urls(request):
    }

    return render(request, 'filtered_urls.html', context)
-####################################################################################################
-
-def content_generation(request):
-    '''
-    # Get list of URLs ID
-    selected_urls = request.GET.getlist('urls', [])
-    
-    # Sample URLs
-    selected_urls = [13460, 13455, 13454, 13452, 13210]
-    '''

 ####################################################################################################
--- a/app_urls/fetcher/views_base.py
+++ b/app_urls/fetcher/views_base.py
@@ -1,5 +1,4 @@
 import os
-import psycopg
 from .tasks import background_task
 from django.http import JsonResponse, HttpResponse
 from django.db import connection
--- a/app_urls/init_data_sca.json
+++ b/app_urls/init_data_sca.json
@@ -0,0 +1,34 @@
+{
+    "SEARCH": {
+        "rss_feed": [
+            "https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC",
+            "https://feeds.feedburner.com/breitbart",
+            "https://feeds.feedburner.com/zerohedge/feed",
+            "https://moxie.foxnews.com/google-publisher/latest.xml",
+            "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=15837362",
+            "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100727362"
+        ],
+        "url_host": [
+            "missingkids.org/poster",
+            "missingkids.org/new-poster",
+            "breitbart.com",
+            "zerohedge.com",
+            "foxnews.com",
+            "cnbc.com"
+        ],
+        "keyword_search": [
+            "child abuse"
+        ]
+    },
+    "REGEX_PATTERN_STATUS_PRIORITY": [
+        [".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50],
+        [".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75],
+        [".*foxnews\\.com\\/(video|category)\\/.*", "invalid", 75],
+        [".*radio.foxnews\\.com\\/.*", "invalid", 75],
+        [".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75],
+        [".*zerohedge\\.com\\/(user)\\/.*", "invalid", 75],
+        [".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50],
+        [".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50],
+        [".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50]
+    ]
+}
--- a/app_urls/requirements.txt
+++ b/app_urls/requirements.txt
@@ -16,4 +16,5 @@ GoogleNews
 duckduckgo_search
 git+https://github.com/tasos-py/Search-Engines-Scraper.git
 langdetect
-ollama
+ollama
+PyJWT