URL bypass: handle error case

This commit is contained in:
Luciano Gervasoni
2025-04-16 18:30:31 +02:00
parent 95451cdb57
commit 4dd351f4ef

View File

@@ -42,9 +42,9 @@ def process_url(url, paywall_bypass=False):
if (paywall_bypass):
# TODO: Implement self-hosted instance
url_paywall_bypass_base = "https://marreta.pcdomanual.com/p/"
url_paywall_bypass_base = "https://marreta.pcdomanual.com"
# Override URL for request
url_of_interest = os.path.join(url_paywall_bypass_base, url)
url_of_interest = os.path.join(url_paywall_bypass_base, "p", url)
else:
url_of_interest = url
@@ -84,10 +84,11 @@ def process_url(url, paywall_bypass=False):
return {"override_status": "invalid"}
if (paywall_bypass):
# Canonical link is paywall bypass URL? -> Invalid
# Canonical link contains URL of paywall bypass? Unsuccessful bypassing -> Error / Unknown
if (url_paywall_bypass_base in article.canonical_link):
logger.debug("Invalid URL found: {}".format(url))
return {"override_status": "invalid"}
logger.debug("Paywall bypass not successful for URL: {}".format(url))
# return {"override_status": "unknown"}
return None
# Valid URL? -> Update source URL
scheme = newspaper.urls.get_scheme(url)