Bypass paywall feature

Luciano Gervasoni
2025-04-16 12:29:50 +02:00
parent 148ec72658
commit 95451cdb57
2 changed files with 35 additions and 14 deletions


@@ -82,7 +82,7 @@ class DB_Handler():
             logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
 
-    def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error, override_url=None):
+    def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error, paywall_bypass=False):
 
         def set_status(obj_url, status):
             # Update status if setting a new value
@@ -100,13 +100,8 @@ class DB_Handler():
             return
         try:
-            # Override URL for request?
-            if (override_url is not None):
-                url_of_interest = override_url
-            else:
-                url_of_interest = obj_url.url
             # Extract URL content
-            dict_url_data = process_url(url_of_interest)
+            dict_url_data = process_url(obj_url.url, paywall_bypass)
         except Exception as e:
             if (raise_exception_on_error):
                 # Simply raise exception, handled in a different way
@@ -234,9 +229,8 @@ class DB_Handler():
                 continue
             try:
-                # Process URL
-                override_url = "https://marreta.pcdomanual.com/p/{}".format(obj_url.url)
-                self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True, override_url=override_url)
+                # Process URL, try bypassing paywall
+                self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True, paywall_bypass=True)
                 num_urls_processed += 1
             except Exception as e:
                 # Error, cache to avoid re-processing for X time
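Taken together, the DB_Handler changes swap the old override_url plumbing for a single paywall_bypass flag that is forwarded straight to process_url. A minimal sketch of the resulting call flow, with hypothetical stand-ins for the ORM object and the fetcher (ObjUrl and the stubbed process_url are illustrative, not the repository's real code):

# Sketch: how paywall_bypass flows from the batch loop into the fetcher.
class ObjUrl:
    def __init__(self, url):
        self.url = url

def process_url(url, paywall_bypass=False):
    # Stub standing in for the real fetcher shown in the second file below.
    return {"url": url, "paywall_bypass": paywall_bypass}

def _process_single_url(obj_url, raise_exception_on_error, paywall_bypass=False):
    try:
        # The URL is passed through unchanged; process_url decides
        # whether to rewrite it against the bypass endpoint.
        return process_url(obj_url.url, paywall_bypass)
    except Exception:
        if raise_exception_on_error:
            raise

print(_process_single_url(ObjUrl("https://example.com/article"),
                          raise_exception_on_error=True, paywall_bypass=True))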


@@ -38,17 +38,25 @@ def url_host_slowdown(url, url_host_slowdown_seconds):
     # About to process URL host, cache time
     cache.set("process_{}".format(url_host).encode("utf-8"), time.time(), timeout=60*5) # Expire after 5 minutes
 
-def process_url(url):
+def process_url(url, paywall_bypass=False):
+    if (paywall_bypass):
+        # TODO: Implement self-hosted instance
+        url_paywall_bypass_base = "https://marreta.pcdomanual.com/p/"
+        # Override URL for request
+        url_of_interest = os.path.join(url_paywall_bypass_base, url)
+    else:
+        url_of_interest = url
     try:
-        # Slow down if required to avoid too many requests error
+        # Sleep required? To avoid too many requests error (original URL, not paywall bypassing endpoint)
         url_host_slowdown(url, url_host_slowdown_seconds=float(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
         # Process
-        article = newspaper.article(url)
+        article = newspaper.article(url_of_interest)
     except newspaper.ArticleBinaryDataException:
         logger.warning("ArticleException for input URL {}".format(url))
         return {"override_status": "invalid"}
     except newspaper.ArticleException as e:
         # Too many requests? Cool down...
         if ("Status code 429" in str(e.args)):
             # TODO: cool down and retry once?, proxy/VPN, ...
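A side note on the URL construction above: os.path.join is used to append the full article URL after the endpoint's /p/ path. This works because an https:// URL does not begin with a slash, so os.path.join concatenates it onto the base rather than discarding the base (which it would do if the second argument started with /). A quick standalone check, assuming the same base:

import os

url_paywall_bypass_base = "https://marreta.pcdomanual.com/p/"
url = "https://example.com/some-article"

# Kept intact: the second argument has no leading slash.
print(os.path.join(url_paywall_bypass_base, url))
# -> https://marreta.pcdomanual.com/p/https://example.com/some-article

# Plain concatenation behaves identically here and avoids the
# leading-slash pitfall entirely.
print(url_paywall_bypass_base + url)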
@@ -70,6 +78,25 @@ def process_url(url):
         logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
         return None
     # Not a valid URL?
     if (not article.is_valid_url()):
         logger.debug("Invalid URL found: {}".format(url))
         return {"override_status": "invalid"}
+    if (paywall_bypass):
+        # Canonical link is paywall bypass URL? -> Invalid
+        if (url_paywall_bypass_base in article.canonical_link):
+            logger.debug("Invalid URL found: {}".format(url))
+            return {"override_status": "invalid"}
+        # Valid URL? -> Update source URL
+        scheme = newspaper.urls.get_scheme(url)
+        if scheme is None:
+            scheme = "http"
+        source_url = scheme + "://" + str(newspaper.urls.get_domain(url))
+        # Update dictionary
+        article.source_url = source_url
     try:
         content_merged = "\n".join([article.title, article.meta_description, article.text])
         if (len(content_merged) > int(os.getenv("FETCHER_LANGUAGE_DETECTION_MIN_CHAR", 100))):
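The paywall_bypass block added above does two things once an article parses: it rejects results whose canonical link still points at the bypass endpoint (i.e. the proxy returned its own page rather than the article), and it rebuilds source_url from the original URL so stored records reference the real publisher instead of the proxy host. A standalone sketch of that reconstruction using urllib.parse in place of the newspaper.urls helpers the diff relies on (the fallback-to-http default mirrors the diff; the helper name is illustrative):

from urllib.parse import urlparse

def rebuild_source_url(url):
    # Mirrors the diff's logic: scheme + "://" + domain,
    # defaulting to "http" when the URL carries no scheme.
    parsed = urlparse(url)
    scheme = parsed.scheme or "http"
    return scheme + "://" + str(parsed.netloc)

print(rebuild_source_url("https://example.com/some-article"))  # https://example.com
print(rebuild_source_url("//example.com/some-article"))        # http://example.com

Note that urlparse only recognizes a host when the input carries a scheme or a // prefix (a bare "example.com/x" is treated as a path), so this is an approximation rather than a drop-in replacement for newspaper's helpers.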