From b3f7cb255c20052531b496b17b888c9a54e377fe Mon Sep 17 00:00:00 2001
From: Luciano Gervasoni
Date: Thu, 24 Apr 2025 16:47:14 +0200
Subject: [PATCH] Publish with hidden tag, don't publish if url id already processed

---
 app_urls/fetcher/src/publisher.py | 38 ++++++++++++++++++++++++++-----
 utils/Newspapers.ipynb            | 20 ++++++++++++++++
 2 files changed, 52 insertions(+), 6 deletions(-)
 create mode 100644 utils/Newspapers.ipynb

diff --git a/app_urls/fetcher/src/publisher.py b/app_urls/fetcher/src/publisher.py
index 7934f02..6783d10 100644
--- a/app_urls/fetcher/src/publisher.py
+++ b/app_urls/fetcher/src/publisher.py
@@ -12,7 +12,8 @@ logger = get_logger()
 
 class Publisher():
     def __init__(self):
-        pass
+        self.admin_api_url = os.getenv("GHOST_ADMIN_API_URL")
+        self.admin_api_key = os.getenv("GHOST_ADMIN_API_KEY")
 
     def _create_jwt(self, admin_api_key):
         id_, secret = admin_api_key.split(':')
@@ -29,9 +30,7 @@ class Publisher():
 
     def _create_ghost_post(self, post_data):
         # Get token
-        jwt_token = self._create_jwt(os.getenv("GHOST_ADMIN_API_KEY"))
-        # Get Admin API URL
-        admin_api_url = os.getenv("GHOST_ADMIN_API_URL")
+        jwt_token = self._create_jwt(self.admin_api_key)
 
         headers = {
             'Authorization': f'Ghost {jwt_token}',
@@ -41,7 +40,7 @@ class Publisher():
         post_data = {"posts": [post_data]}
 
         response = requests.post(
-            os.path.join(admin_api_url, "posts"),
+            os.path.join(self.admin_api_url, "posts"),
             json=post_data,
             headers=headers,
             params={"source":"html"}
@@ -53,6 +52,27 @@ class Publisher():
         else:
             logger.warning("Ghost - Failed to publish post: {} {}".format(response.status_code, response.text))
             return None
+
+    def _published_url_id(self, url_id):
+        # Build the Admin API token from the configured key
+        jwt_token = self._create_jwt(self.admin_api_key)
+
+        headers = {
+            'Authorization': f'Ghost {jwt_token}',
+            'Content-Type': 'application/json'
+        }
+
+        # Filter on the hidden tag "#url-id-<id>"; Ghost slugs a leading "#" as "hash-"
+        params = {"filter": "tags:hash-url-id-{}".format(url_id)}
+        # Get posts using filter
+        response = requests.get(os.path.join(self.admin_api_url, "posts"), params=params, headers=headers)
+        # To JSON
+        dict_response = response.json()
+
+        if (len(dict_response.get("posts")) > 0):
+            return True
+        else:
+            return False
 
     def _get_photo_url(self, query):
         # TODO: Get already used photos to skip. Use DB
@@ -101,6 +121,11 @@ class Publisher():
             logger.warning("Ghost - URL Content is not valid for URL ID: {} {}".format(url_id, url.url))
             return
 
+        # URL ID already published?
+        if (self._published_url_id(url_id)):
+            logger.info("Ghost - URL ID {} already published, skipping".format(url_id))
+            return
+
         ###########################################
         client_llm = OllamaClient()
         # Model
@@ -160,7 +185,7 @@ class Publisher():
         if (location_url is not None):
             html_data += '

Estimated location

'.format(location_url) # HTML: Add source - html_data += '

Source

'.format(url.url) + html_data += '

Source: {}

'.format(url.url, url_content.url_host.replace("https://", "")) post_data = { # "slug": "hey-short", @@ -171,6 +196,7 @@ class Publisher(): "feature_image": photo_url, #"feature_image_caption": "", "status": "published", + "tags": ["#url-id-{}".format(url_id)] # Hidden tag with associated URL ID } # Publish post diff --git a/utils/Newspapers.ipynb b/utils/Newspapers.ipynb new file mode 100644 index 0000000..48da9a6 --- /dev/null +++ b/utils/Newspapers.ipynb @@ -0,0 +1,20 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "url = \"https://onlinenewspapers.com/index.shtml\"" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}