Dockerization, whitenoise serving static files, refactor

Luciano Gervasoni
2025-04-04 10:53:16 +02:00
parent 5addfa5ba9
commit 4dbe2e55ef
39 changed files with 708 additions and 1238 deletions


@@ -0,0 +1,127 @@
import os
import time
from urllib.parse import unquote

import langdetect
import newspaper
from django.core.cache import cache

from .logger import get_logger

logger = get_logger()

# Seed langdetect so repeated runs give deterministic language detection results
langdetect.DetectorFactory.seed = 0

def get_with_protocol(url):
    # http:// -> https://
    url = url.replace("http://", "https://")
    # "" -> https://
    if not url.startswith("https://"):
        url = "https://" + url
    return url

def get_url_host(url):
    # URL without protocol, first substring before '/'
    url_host = url.replace("https://", "").replace("http://", "").split("/")[0]
    return url_host

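# A quick illustrative sketch of the two helpers above (expected values follow
# from the string operations; not an authoritative test):
#
#   get_with_protocol("example.com/a")          -> "https://example.com/a"
#   get_with_protocol("http://example.com/a")   -> "https://example.com/a"
#   get_url_host("https://example.com/a/b?q=1") -> "example.com"
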
def url_host_slowdown(url, url_host_slowdown_seconds):
    ### Avoid (frequent) too many requests to the same URL host
    # Get URL host
    url_host = get_url_host(url)
    cache_key = "process_{}".format(url_host)
    # Recently processed URL host? -> Slow down required
    last_cached_timestamp = cache.get(cache_key, None)
    if last_cached_timestamp:
        # Time since this URL host was last processed (in seconds)
        time_since_last_processed = time.time() - last_cached_timestamp
        # Amount of time required to sleep
        slowdown_required = max(0, url_host_slowdown_seconds - time_since_last_processed)
        if slowdown_required > 0:
            logger.debug("Slowing down (sleeping {:.2f}s) for URL host {}".format(slowdown_required, url_host))
            time.sleep(slowdown_required)
    # About to process URL host, cache the current time
    cache.set(cache_key, time.time(), timeout=60 * 5)  # Expire after 5 minutes

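# Illustrative behaviour of the throttle above, assuming the default
# FETCHER_URL_HOST_SLEEP of 5 seconds, an empty cache, and back-to-back calls:
#
#   url_host_slowdown("https://example.com/a", 5)  # host not cached -> no sleep
#   url_host_slowdown("https://example.com/b", 5)  # same host ~0s ago -> sleeps ~5s
#   url_host_slowdown("https://other.org/c", 5)    # different host -> no sleep
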
def process_url(url):
    try:
        # Slow down if required, to avoid "too many requests" errors
        url_host_slowdown(url, url_host_slowdown_seconds=int(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
        # Process
        article = newspaper.article(url)
    except newspaper.ArticleBinaryDataException:
        logger.warning("ArticleBinaryDataException for input URL {}".format(url))
        return {"override_status": "invalid"}
    except newspaper.ArticleException as e:
        # Too many requests? Cool down...
        if "Status code 429" in str(e.args):
            # TODO: cool down and retry once?, proxy/VPN, ...
            logger.debug("TODO: process_url Implement code 429")
        # Unavailable for legal reasons
        if "Status code 451" in str(e.args):
            # TODO: Bypass with VPN
            logger.debug("TODO: process_url Implement code 451")
        # Cloudflare protection?
        if "Website protected with Cloudflare" in str(e.args):
            logger.debug("TODO: process_url Implement bypass Cloudflare")
        # PerimeterX protection?
        if "Website protected with PerimeterX" in str(e.args):
            logger.debug("TODO: process_url Implement bypass PerimeterX")
        logger.debug("ArticleException for input URL {}\n{}".format(url, str(e.args)))
        return None
    except Exception as e:
        logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
        return None
    try:
        # Merge the text fields; skip missing (None) parts so join() cannot fail
        content_merged = "\n".join(filter(None, [article.title, article.meta_description, article.text]))
        if len(content_merged) > int(os.getenv("FETCHER_LANGUAGE_DETECTION_MIN_CHAR", 100)):
            language = langdetect.detect(content_merged)
        else:
            language = None
    except Exception as e:
        logger.info("Could not detect language: {}\n{}".format(url, str(e)))
        language = None
    dict_data = {
        "url": url,
        "url_canonical": article.canonical_link,
        "url_host": article.source_url,
        "site_name": article.meta_site_name,
        "publish_date": article.publish_date,
        "language": language,  # article.meta_lang -> Not always reliable
        "title": article.title,
        "description": article.meta_description,
        "content": article.text,
        "valid_content": article.is_valid_body(),
        "keywords": [k for k in set(article.keywords + article.meta_keywords) if k != ""],
        "tags": article.tags,
        "authors": article.authors,
        "image_main_url": article.top_image,  # article.meta_img
        "images_url": article.images,
        "videos_url": article.movies,
    }
    '''
    # TODO: If exists, add tags article.meta_data.get("classification-tags", "").split(",")
    if (dict_data["tags"] is None):
        dict_data["tags"] = []
    for k in article.meta_data.keys():
        if ("tags" in k):
            dict_data["tags"] += article.meta_data[k].split(",")
    '''
    # Sanity check
    for k in dict_data.keys():
        # Handle sets as well as lists (e.g. article.tags can arrive as a set)
        if isinstance(dict_data[k], (list, set)):
            # Remove empty strings, unquote special characters, e.g. "%20" -> " "
            dict_data[k] = [unquote(e) for e in dict_data[k] if e != ""]
            # NULL instead of empty list
            if len(dict_data[k]) == 0:
                dict_data[k] = None
        elif isinstance(dict_data[k], str):
            # Unquote special characters
            dict_data[k] = unquote(dict_data[k])
            # NULL instead of empty string
            if dict_data[k] == "":
                dict_data[k] = None
    return dict_data
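
# Minimal usage sketch of the process_url() return contract (illustrative;
# the calling code below is hypothetical, not part of this module):
#
#   result = process_url(get_with_protocol("example.com/some-article"))
#   if result is None:
#       pass  # fetch/parse failed; the caller may retry later
#   elif result.get("override_status") == "invalid":
#       pass  # binary data (e.g. a PDF); mark the URL as invalid
#   else:
#       pass  # article dict (title, content, language, ...) ready to persist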