Bypass paywall feature

This commit is contained in:
Luciano Gervasoni
2025-04-16 12:29:50 +02:00
parent 148ec72658
commit 95451cdb57
2 changed files with 35 additions and 14 deletions

View File

@@ -82,7 +82,7 @@ class DB_Handler():
logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error, override_url=None):
def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error, paywall_bypass=False):
def set_status(obj_url, status):
# Update status if setting a new value
@@ -100,13 +100,8 @@ class DB_Handler():
return
try:
# Override URL for request?
if (override_url is not None):
url_of_interest = override_url
else:
url_of_interest = obj_url.url
# Extract URL content
dict_url_data = process_url(url_of_interest)
dict_url_data = process_url(obj_url.url, paywall_bypass)
except Exception as e:
if (raise_exception_on_error):
# Simply raise exception, handled in a different way
@@ -234,9 +229,8 @@ class DB_Handler():
continue
try:
# Process URL
override_url = "https://marreta.pcdomanual.com/p/{}".format(obj_url.url)
self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True, override_url=override_url)
# Process URL, try bypassing paywall
self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True, paywall_bypass=True)
num_urls_processed += 1
except Exception as e:
# Error, cache to avoid re-processing for X time