From 95451cdb5731cd04fe92dbd85f492ba9ece72971 Mon Sep 17 00:00:00 2001
From: Luciano Gervasoni
Date: Wed, 16 Apr 2025 12:29:50 +0200
Subject: [PATCH] Bypass paywall feature

---
 app_urls/fetcher/src/db_utils.py      | 14 +++--------
 app_urls/fetcher/src/url_processor.py | 35 ++++++++++++++++++++++++---
 2 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/app_urls/fetcher/src/db_utils.py b/app_urls/fetcher/src/db_utils.py
index f67cc84..ff5e791 100644
--- a/app_urls/fetcher/src/db_utils.py
+++ b/app_urls/fetcher/src/db_utils.py
@@ -82,7 +82,7 @@ class DB_Handler():
             logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
 
 
-    def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error, override_url=None):
+    def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error, paywall_bypass=False):
 
         def set_status(obj_url, status):
             # Update status if setting a new value
@@ -100,13 +100,8 @@ class DB_Handler():
             return
 
         try:
-            # Override URL for request?
-            if (override_url is not None):
-                url_of_interest = override_url
-            else:
-                url_of_interest = obj_url.url
             # Extract URL content
-            dict_url_data = process_url(url_of_interest)
+            dict_url_data = process_url(obj_url.url, paywall_bypass)
         except Exception as e:
             if (raise_exception_on_error):
                 # Simply raise exception, handled in a different way
@@ -234,9 +229,8 @@ class DB_Handler():
                 continue
 
             try:
-                # Process URL
-                override_url = "https://marreta.pcdomanual.com/p/{}".format(obj_url.url)
-                self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True, override_url=override_url)
+                # Process URL, try bypassing paywall
+                self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True, paywall_bypass=True)
                 num_urls_processed += 1
             except Exception as e:
                 # Error, cache to avoid re-processing for X time
diff --git a/app_urls/fetcher/src/url_processor.py b/app_urls/fetcher/src/url_processor.py
index b19ea37..4b3110d 100644
--- a/app_urls/fetcher/src/url_processor.py
+++ b/app_urls/fetcher/src/url_processor.py
@@ -38,17 +38,25 @@ def url_host_slowdown(url, url_host_slowdown_seconds):
     # About to process URL host, cache time
     cache.set("process_{}".format(url_host).encode("utf-8"), time.time(), timeout=60*5) # Expire after 5 minutes
 
 
-def process_url(url):
+def process_url(url, paywall_bypass=False):
+
+    if (paywall_bypass):
+        # TODO: Implement self-hosted instance
+        url_paywall_bypass_base = "https://marreta.pcdomanual.com/p/"
+        # Override URL for request
+        url_of_interest = os.path.join(url_paywall_bypass_base, url)
+    else:
+        url_of_interest = url
+
     try:
-        # Slow down if required to avoid too many requests error
+        # Sleep required? To avoid too many requests error (original URL, not paywall bypassing endpoint)
         url_host_slowdown(url, url_host_slowdown_seconds=float(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
         # Process
-        article = newspaper.article(url)
+        article = newspaper.article(url_of_interest)
     except newspaper.ArticleBinaryDataException:
         logger.warning("ArticleException for input URL {}".format(url))
         return {"override_status": "invalid"}
     except newspaper.ArticleException as e:
-        # Too many requests? Cool down...
         if ("Status code 429" in str(e.args)):
             # TODO: cool down and retry once?, proxy/VPN, ...
@@ -70,6 +78,25 @@ def process_url(url):
         logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
         return None
 
+    # Not a valid URL?
+    if (not article.is_valid_url()):
+        logger.debug("Invalid URL found: {}".format(url))
+        return {"override_status": "invalid"}
+
+    if (paywall_bypass):
+        # Canonical link is paywall bypass URL? -> Invalid
+        if (url_paywall_bypass_base in article.canonical_link):
+            logger.debug("Invalid URL found: {}".format(url))
+            return {"override_status": "invalid"}
+
+        # Valid URL? -> Update source URL
+        scheme = newspaper.urls.get_scheme(url)
+        if scheme is None:
+            scheme = "http"
+        source_url = scheme + "://" + str(newspaper.urls.get_domain(url))
+        # Update dictionary
+        article.source_url = source_url
+
     try:
         content_merged = "\n".join([article.title, article.meta_description, article.text])
         if (len(content_merged) > int(os.getenv("FETCHER_LANGUAGE_DETECTION_MIN_CHAR", 100))):