Dockerization, whitenoise serving static, refactor
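The diff below covers only the fetcher refactor; the "whitenoise serving static" part of the commit is not shown on this page. For context, a typical WhiteNoise setup in a Django settings.py looks roughly like the following sketch. The paths and the exact settings touched by this commit are assumptions, not taken from the diff:

# settings.py sketch (illustrative only, not the settings changed in this commit)
from pathlib import Path

BASE_DIR = Path(__file__).resolve().parent.parent

MIDDLEWARE = [
    "django.middleware.security.SecurityMiddleware",
    "whitenoise.middleware.WhiteNoiseMiddleware",  # WhiteNoise goes right after SecurityMiddleware
    # ... remaining middleware ...
]

STATIC_URL = "/static/"
STATIC_ROOT = BASE_DIR / "staticfiles"  # target of `collectstatic` (assumed path)

# Compressed, hashed static files served by WhiteNoise
# (older Django versions use STATICFILES_STORAGE instead of STORAGES)
STORAGES = {
    "default": {"BACKEND": "django.core.files.storage.FileSystemStorage"},
    "staticfiles": {"BACKEND": "whitenoise.storage.CompressedManifestStaticFilesStorage"},
}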
app_urls/fetcher/src/url_processor.py (normal file, 127 lines added)
@@ -0,0 +1,127 @@
import os
import time
from urllib.parse import unquote

import langdetect
import newspaper
from django.core.cache import cache

from .logger import get_logger

logger = get_logger()
langdetect.DetectorFactory.seed = 0  # Make language detection deterministic


def get_with_protocol(url):
    # http:// -> https://
    url = url.replace("http://", "https://")
    # "" -> https://
    if not url.startswith("https://"):
        url = "https://" + url
    return url


def get_url_host(url):
    # URL without protocol, first substring before '/'
    url_host = url.replace("https://", "").replace("http://", "").split("/")[0]
    return url_host


def url_host_slowdown(url, url_host_slowdown_seconds):
    # Avoid sending requests to the same URL host too frequently
    # Get URL host
    url_host = get_url_host(url)
    # Recently processed URL host? -> Slowdown required
    last_cached_timestamp = cache.get("process_{}".format(url_host).encode("utf-8"), None)
    if last_cached_timestamp:
        # Time since the URL host was last processed (in seconds)
        time_since_last_processed = time.time() - last_cached_timestamp
        # Amount of time required to sleep
        slowdown_required = max(0, url_host_slowdown_seconds - time_since_last_processed)
        logger.debug("Slow down (sleeping {:.2f}s) for URL host {}".format(slowdown_required, url_host))
        # Sleep
        time.sleep(slowdown_required)
    # About to process URL host, cache the current time
    cache.set("process_{}".format(url_host).encode("utf-8"), time.time(), timeout=60 * 5)  # Expire after 5 minutes


def process_url(url):
    try:
        # Slow down if required to avoid "too many requests" errors
        url_host_slowdown(url, url_host_slowdown_seconds=int(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
        # Process
        article = newspaper.article(url)
    except newspaper.ArticleBinaryDataException:
        logger.warning("ArticleBinaryDataException for input URL {}".format(url))
        return {"override_status": "invalid"}
    except newspaper.ArticleException as e:

        # Too many requests? Cool down...
        if "Status code 429" in str(e.args):
            # TODO: cool down and retry once?, proxy/VPN, ...
            logger.debug("TODO: process_url Implement code 429")
        # Unavailable for legal reasons
        if "Status code 451" in str(e.args):
            # TODO: Bypass with VPN
            logger.debug("TODO: process_url Implement code 451")
        # Cloudflare protection?
        if "Website protected with Cloudflare" in str(e.args):
            logger.debug("TODO: process_url Implement bypass Cloudflare")
        # PerimeterX protection?
        if "Website protected with PerimeterX" in str(e.args):
            logger.debug("TODO: process_url Implement bypass PerimeterX")

        logger.debug("ArticleException for input URL {}\n{}".format(url, str(e.args)))
        return None
    except Exception as e:
        logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
        return None

    try:
        content_merged = "\n".join([article.title, article.meta_description, article.text])
        if len(content_merged) > int(os.getenv("FETCHER_LANGUAGE_DETECTION_MIN_CHAR", 100)):
            language = langdetect.detect(content_merged)
        else:
            language = None
    except Exception as e:
        logger.info("Could not detect language: {}\n{}".format(url, str(e)))
        language = None

    dict_data = {
        "url": url,
        "url_canonical": article.canonical_link,
        "url_host": article.source_url,
        "site_name": article.meta_site_name,
        "publish_date": article.publish_date,
        "language": language,  # article.meta_lang -> Not always reliable
        "title": article.title,
        "description": article.meta_description,
        "content": article.text,
        "valid_content": article.is_valid_body(),
        "keywords": [k for k in set(article.keywords + article.meta_keywords) if k != ""],
        "tags": article.tags,
        "authors": article.authors,
        "image_main_url": article.top_image,  # article.meta_img
        "images_url": article.images,
        "videos_url": article.movies,
    }

    '''
    # TODO: If they exist, add tags from article.meta_data.get("classification-tags", "").split(",")
    if dict_data["tags"] is None:
        dict_data["tags"] = []
    for k in article.meta_data.keys():
        if "tags" in k:
            dict_data["tags"] += article.meta_data[k].split(",")
    '''

    # Sanity check
    for k in dict_data.keys():
        if type(dict_data[k]) is list:
            # Remove empty strings, unquote special characters, e.g. "%20" -> " "
            dict_data[k] = [unquote(e) for e in dict_data[k] if e != ""]
            # NULL instead of empty list
            if len(dict_data[k]) == 0:
                dict_data[k] = None
        elif type(dict_data[k]) is str:
            # Unquote special characters
            if dict_data[k] is not None:
                dict_data[k] = unquote(dict_data[k])
            # NULL instead of empty string
            if dict_data[k] == "":
                dict_data[k] = None

    return dict_data
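For reference, a minimal usage sketch of the new module follows. The import path, environment variable values, and example URL are illustrative assumptions (the module lives under app_urls/fetcher/src/ and relies on a configured Django cache backend); this caller is not part of the commit:

import os

# Assumed defaults; the fetcher reads these via os.getenv() as shown above
os.environ.setdefault("FETCHER_URL_HOST_SLEEP", "5")
os.environ.setdefault("FETCHER_LANGUAGE_DETECTION_MIN_CHAR", "100")

# Hypothetical import path; requires Django settings (cache backend) to be configured
from src.url_processor import get_with_protocol, process_url

url = get_with_protocol("example.com/some-article")  # -> "https://example.com/some-article"
result = process_url(url)

if result is None:
    print("Fetch or parse failed; see logs")
elif result.get("override_status") == "invalid":
    print("Binary content, URL marked invalid")
else:
    print(result["title"], result["language"], result["publish_date"])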