Bypass paywall feature

Luciano Gervasoni
2025-04-16 12:29:50 +02:00
parent 148ec72658
commit 95451cdb57
2 changed files with 35 additions and 14 deletions


@@ -82,7 +82,7 @@ class DB_Handler():
             logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
 
-    def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error, override_url=None):
+    def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error, paywall_bypass=False):
 
         def set_status(obj_url, status):
             # Update status if setting a new value
@@ -100,13 +100,8 @@ class DB_Handler():
             return
         try:
-            # Override URL for request?
-            if (override_url is not None):
-                url_of_interest = override_url
-            else:
-                url_of_interest = obj_url.url
             # Extract URL content
-            dict_url_data = process_url(url_of_interest)
+            dict_url_data = process_url(obj_url.url, paywall_bypass)
         except Exception as e:
             if (raise_exception_on_error):
                 # Simply raise exception, handled in a different way
@@ -234,9 +229,8 @@ class DB_Handler():
                 continue
             try:
-                # Process URL
-                override_url = "https://marreta.pcdomanual.com/p/{}".format(obj_url.url)
-                self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True, override_url=override_url)
+                # Process URL, try bypassing paywall
+                self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True, paywall_bypass=True)
                 num_urls_processed += 1
             except Exception as e:
                 # Error, cache to avoid re-processing for X time
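Taken together, the DB_Handler changes swap the old override_url plumbing for a single paywall_bypass flag that is forwarded straight to process_url. A minimal sketch of the resulting call flow, with hypothetical stand-ins for the ORM object and the fetcher (ObjUrl and the stubbed process_url are illustrative, not the repository's real code):

# Sketch: how paywall_bypass flows from the batch loop into the fetcher.
class ObjUrl:
    def __init__(self, url):
        self.url = url

def process_url(url, paywall_bypass=False):
    # Stub standing in for the real fetcher shown in the second file below.
    return {"url": url, "paywall_bypass": paywall_bypass}

def _process_single_url(obj_url, raise_exception_on_error, paywall_bypass=False):
    try:
        # The URL is passed through unchanged; process_url decides
        # whether to rewrite it against the bypass endpoint.
        return process_url(obj_url.url, paywall_bypass)
    except Exception:
        if raise_exception_on_error:
            raise

print(_process_single_url(ObjUrl("https://example.com/article"),
                          raise_exception_on_error=True, paywall_bypass=True))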


@@ -38,17 +38,25 @@ def url_host_slowdown(url, url_host_slowdown_seconds):
     # About to process URL host, cache time
     cache.set("process_{}".format(url_host).encode("utf-8"), time.time(), timeout=60*5) # Expire after 5 minutes
 
-def process_url(url):
+def process_url(url, paywall_bypass=False):
+    if (paywall_bypass):
+        # TODO: Implement self-hosted instance
+        url_paywall_bypass_base = "https://marreta.pcdomanual.com/p/"
+        # Override URL for request
+        url_of_interest = os.path.join(url_paywall_bypass_base, url)
+    else:
+        url_of_interest = url
     try:
-        # Slow down if required to avoid too many requests error
+        # Sleep required? To avoid too many requests error (original URL, not paywall bypassing endpoint)
         url_host_slowdown(url, url_host_slowdown_seconds=float(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
         # Process
-        article = newspaper.article(url)
+        article = newspaper.article(url_of_interest)
     except newspaper.ArticleBinaryDataException:
         logger.warning("ArticleException for input URL {}".format(url))
         return {"override_status": "invalid"}
     except newspaper.ArticleException as e:
         # Too many requests? Cool down...
         if ("Status code 429" in str(e.args)):
             # TODO: cool down and retry once?, proxy/VPN, ...
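A side note on the URL construction above: os.path.join is used to append the full article URL after the endpoint's /p/ path. This works because an https:// URL does not begin with a slash, so os.path.join concatenates it onto the base rather than discarding the base (which it would do if the second argument started with /). A quick standalone check, assuming the same base:

import os

url_paywall_bypass_base = "https://marreta.pcdomanual.com/p/"
url = "https://example.com/some-article"

# Kept intact: the second argument has no leading slash.
print(os.path.join(url_paywall_bypass_base, url))
# -> https://marreta.pcdomanual.com/p/https://example.com/some-article

# Plain concatenation behaves identically here and avoids the
# leading-slash pitfall entirely.
print(url_paywall_bypass_base + url)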
@@ -70,6 +78,25 @@ def process_url(url):
         logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
         return None
     # Not a valid URL?
     if (not article.is_valid_url()):
         logger.debug("Invalid URL found: {}".format(url))
         return {"override_status": "invalid"}
+    if (paywall_bypass):
+        # Canonical link is paywall bypass URL? -> Invalid
+        if (url_paywall_bypass_base in article.canonical_link):
+            logger.debug("Invalid URL found: {}".format(url))
+            return {"override_status": "invalid"}
+        # Valid URL? -> Update source URL
+        scheme = newspaper.urls.get_scheme(url)
+        if scheme is None:
+            scheme = "http"
+        source_url = scheme + "://" + str(newspaper.urls.get_domain(url))
+        # Update dictionary
+        article.source_url = source_url
     try:
         content_merged = "\n".join([article.title, article.meta_description, article.text])
         if (len(content_merged) > int(os.getenv("FETCHER_LANGUAGE_DETECTION_MIN_CHAR", 100))):
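The paywall_bypass block added above does two things once an article parses: it rejects results whose canonical link still points at the bypass endpoint (i.e. the proxy returned its own page rather than the article), and it rebuilds source_url from the original URL so stored records reference the real publisher instead of the proxy host. A standalone sketch of that reconstruction using urllib.parse in place of the newspaper.urls helpers the diff relies on (the fallback-to-http default mirrors the diff; the helper name is illustrative):

from urllib.parse import urlparse

def rebuild_source_url(url):
    # Mirrors the diff's logic: scheme + "://" + domain,
    # defaulting to "http" when the URL carries no scheme.
    parsed = urlparse(url)
    scheme = parsed.scheme or "http"
    return scheme + "://" + str(parsed.netloc)

print(rebuild_source_url("https://example.com/some-article"))  # https://example.com
print(rebuild_source_url("//example.com/some-article"))        # http://example.com

Note that urlparse only recognizes a host when the input carries a scheme or a // prefix (a bare "example.com/x" is treated as a path), so this is an approximation rather than a drop-in replacement for newspaper's helpers.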