Publish with a hidden tag; don't publish if the URL ID has already been processed

This commit is contained in:
Luciano Gervasoni
2025-04-24 16:47:14 +02:00
parent b8fdcae5ec
commit b3f7cb255c
2 changed files with 52 additions and 6 deletions

View File

@@ -12,7 +12,8 @@ logger = get_logger()
class Publisher(): class Publisher():
def __init__(self): def __init__(self):
pass self.admin_api_url = os.getenv("GHOST_ADMIN_API_URL")
self.admin_api_key = os.getenv("GHOST_ADMIN_API_KEY")
def _create_jwt(self, admin_api_key): def _create_jwt(self, admin_api_key):
id_, secret = admin_api_key.split(':') id_, secret = admin_api_key.split(':')
@@ -29,9 +30,7 @@ class Publisher():
def _create_ghost_post(self, post_data): def _create_ghost_post(self, post_data):
# Get token # Get token
jwt_token = self._create_jwt(os.getenv("GHOST_ADMIN_API_KEY")) jwt_token = self._create_jwt(self.admin_api_key)
# Get Admin API URL
admin_api_url = os.getenv("GHOST_ADMIN_API_URL")
headers = { headers = {
'Authorization': f'Ghost {jwt_token}', 'Authorization': f'Ghost {jwt_token}',
@@ -41,7 +40,7 @@ class Publisher():
post_data = {"posts": [post_data]} post_data = {"posts": [post_data]}
response = requests.post( response = requests.post(
os.path.join(admin_api_url, "posts"), os.path.join(self.admin_api_url, "posts"),
json=post_data, json=post_data,
headers=headers, headers=headers,
params={"source":"html"} params={"source":"html"}
@@ -53,6 +52,27 @@ class Publisher():
else: else:
logger.warning("Ghost - Failed to publish post: {} {}".format(response.status_code, response.text)) logger.warning("Ghost - Failed to publish post: {} {}".format(response.status_code, response.text))
return None return None
def _published_url_id(self, url_id):
    """Return whether a Ghost post tagged with this URL ID already exists.

    Sends a GET request to the Ghost Admin API ``posts`` endpoint with a
    ``filter`` query parameter of ``tags:hash-url-id-<url_id>`` and checks
    whether the response lists any post.

    Args:
        url_id: Identifier of the URL; interpolated into the tag filter.

    Returns:
        True if the response contains at least one post, False otherwise.

    Raises:
        requests.HTTPError: If the Admin API answers with an error status.
    """
    # Get token
    jwt_token = self._create_jwt(self.admin_api_key)

    headers = {
        'Authorization': f'Ghost {jwt_token}',
        'Content-Type': 'application/json'
    }

    # Query param filter by URL ID.
    # NOTE(review): presumably "hash-" is the slug form Ghost gives to
    # '#'-prefixed (hidden) tags such as "#url-id-<id>" - confirm against
    # the Ghost documentation.
    params = {"filter": "tags:hash-url-id-{}".format(url_id)}

    # Get posts using filter
    response = requests.get(os.path.join(self.admin_api_url, "posts"), params=params, headers=headers)

    # Fail with an explicit HTTP error instead of an unrelated TypeError
    # further down: error payloads do not carry a "posts" key.
    response.raise_for_status()

    # To JSON; tolerate a missing or null "posts" entry
    posts = response.json().get("posts") or []
    return len(posts) > 0
def _get_photo_url(self, query): def _get_photo_url(self, query):
# TODO: Get already used photos to skip. Use DB # TODO: Get already used photos to skip. Use DB
@@ -101,6 +121,11 @@ class Publisher():
logger.warning("Ghost - URL Content is not valid for URL ID: {} {}".format(url_id, url.url)) logger.warning("Ghost - URL Content is not valid for URL ID: {} {}".format(url_id, url.url))
return return
# URL ID already published?
if (self._published_url_id(url_id)):
logger.info("Ghost - URL ID {} already published, skipping".format(url_id))
return
########################################### ###########################################
client_llm = OllamaClient() client_llm = OllamaClient()
# Model # Model
@@ -160,7 +185,7 @@ class Publisher():
if (location_url is not None): if (location_url is not None):
html_data += '<p><a href="{}">Estimated location</a></p>'.format(location_url) html_data += '<p><a href="{}">Estimated location</a></p>'.format(location_url)
# HTML: Add source # HTML: Add source
html_data += '<p><a href="{}">Source</a></p>'.format(url.url) html_data += '<p><a href="{}">Source: {}</a></p>'.format(url.url, url_content.url_host.replace("https://", ""))
post_data = { post_data = {
# "slug": "hey-short", # "slug": "hey-short",
@@ -171,6 +196,7 @@ class Publisher():
"feature_image": photo_url, "feature_image": photo_url,
#"feature_image_caption": "", #"feature_image_caption": "",
"status": "published", "status": "published",
"tags": ["#url-id-{}".format(url_id)] # Hidden tag with associated URL ID
} }
# Publish post # Publish post

20
utils/Newspapers.ipynb Normal file
View File

@@ -0,0 +1,20 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"url = \"https://onlinenewspapers.com/index.shtml\""
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}