From 95451cdb5731cd04fe92dbd85f492ba9ece72971 Mon Sep 17 00:00:00 2001
From: Luciano Gervasoni
Date: Wed, 16 Apr 2025 12:29:50 +0200
Subject: [PATCH] Bypass paywall feature

---
 app_urls/fetcher/src/db_utils.py      | 14 +++--------
 app_urls/fetcher/src/url_processor.py | 35 ++++++++++++++++++++++++---
 2 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/app_urls/fetcher/src/db_utils.py b/app_urls/fetcher/src/db_utils.py
index f67cc84..ff5e791 100644
--- a/app_urls/fetcher/src/db_utils.py
+++ b/app_urls/fetcher/src/db_utils.py
@@ -82,7 +82,7 @@ class DB_Handler():
             logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
 
 
-    def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error, override_url=None):
+    def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error, paywall_bypass=False):
 
         def set_status(obj_url, status):
             # Update status if setting a new value
@@ -100,13 +100,8 @@ class DB_Handler():
             return
 
         try:
-            # Override URL for request?
-            if (override_url is not None):
-                url_of_interest = override_url
-            else:
-                url_of_interest = obj_url.url
             # Extract URL content
-            dict_url_data = process_url(url_of_interest)
+            dict_url_data = process_url(obj_url.url, paywall_bypass)
         except Exception as e:
             if (raise_exception_on_error):
                 # Simply raise exception, handled in a different way
@@ -234,9 +229,8 @@ class DB_Handler():
                 continue
 
             try:
-                # Process URL
-                override_url = "https://marreta.pcdomanual.com/p/{}".format(obj_url.url)
-                self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True, override_url=override_url)
+                # Process URL, try bypassing paywall
+                self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True, paywall_bypass=True)
                 num_urls_processed += 1
             except Exception as e:
                 # Error, cache to avoid re-processing for X time
diff --git a/app_urls/fetcher/src/url_processor.py b/app_urls/fetcher/src/url_processor.py
index b19ea37..4b3110d 100644
--- a/app_urls/fetcher/src/url_processor.py
+++ b/app_urls/fetcher/src/url_processor.py
@@ -38,17 +38,25 @@ def url_host_slowdown(url, url_host_slowdown_seconds):
     # About to process URL host, cache time
     cache.set("process_{}".format(url_host).encode("utf-8"), time.time(), timeout=60*5) # Expire after 5 minutes
 
 
-def process_url(url):
+def process_url(url, paywall_bypass=False):
+
+    if (paywall_bypass):
+        # TODO: Implement self-hosted instance
+        url_paywall_bypass_base = "https://marreta.pcdomanual.com/p/"
+        # Override URL for request
+        url_of_interest = os.path.join(url_paywall_bypass_base, url)
+    else:
+        url_of_interest = url
+
     try:
-        # Slow down if required to avoid too many requests error
+        # Sleep required? To avoid too many requests error (original URL, not paywall bypassing endpoint)
         url_host_slowdown(url, url_host_slowdown_seconds=float(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
         # Process
-        article = newspaper.article(url)
+        article = newspaper.article(url_of_interest)
     except newspaper.ArticleBinaryDataException:
         logger.warning("ArticleException for input URL {}".format(url))
         return {"override_status": "invalid"}
     except newspaper.ArticleException as e:
-        # Too many requests? Cool down...
         if ("Status code 429" in str(e.args)):
             # TODO: cool down and retry once?, proxy/VPN, ...
@@ -70,6 +78,25 @@ def process_url(url):
         logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
         return None
 
+    # Not a valid URL?
+    if (not article.is_valid_url()):
+        logger.debug("Invalid URL found: {}".format(url))
+        return {"override_status": "invalid"}
+
+    if (paywall_bypass):
+        # Canonical link is paywall bypass URL? -> Invalid
+        if (url_paywall_bypass_base in article.canonical_link):
+            logger.debug("Invalid URL found: {}".format(url))
+            return {"override_status": "invalid"}
+
+        # Valid URL? -> Update source URL
+        scheme = newspaper.urls.get_scheme(url)
+        if scheme is None:
+            scheme = "http"
+        source_url = scheme + "://" + str(newspaper.urls.get_domain(url))
+        # Update dictionary
+        article.source_url = source_url
+
     try:
         content_merged = "\n".join([article.title, article.meta_description, article.text])
         if (len(content_merged) > int(os.getenv("FETCHER_LANGUAGE_DETECTION_MIN_CHAR", 100))):