diff --git a/app_urls/fetcher/src/url_processor.py b/app_urls/fetcher/src/url_processor.py index 4b3110d..3e652ef 100644 --- a/app_urls/fetcher/src/url_processor.py +++ b/app_urls/fetcher/src/url_processor.py @@ -42,9 +42,9 @@ def process_url(url, paywall_bypass=False): if (paywall_bypass): # TODO: Implement self-hosted instance - url_paywall_bypass_base = "https://marreta.pcdomanual.com/p/" + url_paywall_bypass_base = "https://marreta.pcdomanual.com" # Override URL for request - url_of_interest = os.path.join(url_paywall_bypass_base, url) + url_of_interest = os.path.join(url_paywall_bypass_base, "p", url) else: url_of_interest = url @@ -84,10 +84,11 @@ def process_url(url, paywall_bypass=False): return {"override_status": "invalid"} if (paywall_bypass): - # Canonical link is paywall bypass URL? -> Invalid + # Canonical link contains URL of paywall bypass? Unsuccessful bypassing -> Error / Unknown if (url_paywall_bypass_base in article.canonical_link): - logger.debug("Invalid URL found: {}".format(url)) - return {"override_status": "invalid"} + logger.debug("Paywall bypass not successful for URL: {}".format(url)) + # return {"override_status": "unknown"} + return None # Valid URL? -> Update source URL scheme = newspaper.urls.get_scheme(url)