Bypass paywall feature
This commit is contained in:
@@ -82,7 +82,7 @@ class DB_Handler():
|
||||
logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
|
||||
|
||||
|
||||
def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error, override_url=None):
|
||||
def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error, paywall_bypass=False):
|
||||
|
||||
def set_status(obj_url, status):
|
||||
# Update status if setting a new value
|
||||
@@ -100,13 +100,8 @@ class DB_Handler():
|
||||
return
|
||||
|
||||
try:
|
||||
# Override URL for request?
|
||||
if (override_url is not None):
|
||||
url_of_interest = override_url
|
||||
else:
|
||||
url_of_interest = obj_url.url
|
||||
# Extract URL content
|
||||
dict_url_data = process_url(url_of_interest)
|
||||
dict_url_data = process_url(obj_url.url, paywall_bypass)
|
||||
except Exception as e:
|
||||
if (raise_exception_on_error):
|
||||
# Simply raise exception, handled in a different way
|
||||
@@ -234,9 +229,8 @@ class DB_Handler():
|
||||
continue
|
||||
|
||||
try:
|
||||
# Process URL
|
||||
override_url = "https://marreta.pcdomanual.com/p/{}".format(obj_url.url)
|
||||
self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True, override_url=override_url)
|
||||
# Process URL, try bypassing paywall
|
||||
self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True, paywall_bypass=True)
|
||||
num_urls_processed += 1
|
||||
except Exception as e:
|
||||
# Error, cache to avoid re-processing for X time
|
||||
|
||||
@@ -38,17 +38,25 @@ def url_host_slowdown(url, url_host_slowdown_seconds):
|
||||
# About to process URL host, cache time
|
||||
cache.set("process_{}".format(url_host).encode("utf-8"), time.time(), timeout=60*5) # Expire after 5 minutes
|
||||
|
||||
def process_url(url):
|
||||
def process_url(url, paywall_bypass=False):
|
||||
|
||||
if (paywall_bypass):
|
||||
# TODO: Implement self-hosted instance
|
||||
url_paywall_bypass_base = "https://marreta.pcdomanual.com/p/"
|
||||
# Override URL for request
|
||||
url_of_interest = os.path.join(url_paywall_bypass_base, url)
|
||||
else:
|
||||
url_of_interest = url
|
||||
|
||||
try:
|
||||
# Slow down if required to avoid too many requests error
|
||||
# Sleep required? To avoid too many requests error (original URL, not paywall bypassing endpoint)
|
||||
url_host_slowdown(url, url_host_slowdown_seconds=float(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
|
||||
# Process
|
||||
article = newspaper.article(url)
|
||||
article = newspaper.article(url_of_interest)
|
||||
except newspaper.ArticleBinaryDataException:
|
||||
logger.warning("ArticleException for input URL {}".format(url))
|
||||
return {"override_status": "invalid"}
|
||||
except newspaper.ArticleException as e:
|
||||
|
||||
# Too many requests? Cool down...
|
||||
if ("Status code 429" in str(e.args)):
|
||||
# TODO: cool down and retry once?, proxy/VPN, ...
|
||||
@@ -70,6 +78,25 @@ def process_url(url):
|
||||
logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
|
||||
return None
|
||||
|
||||
# Not a valid URL?
|
||||
if (not article.is_valid_url()):
|
||||
logger.debug("Invalid URL found: {}".format(url))
|
||||
return {"override_status": "invalid"}
|
||||
|
||||
if (paywall_bypass):
|
||||
# Canonical link is paywall bypass URL? -> Invalid
|
||||
if (url_paywall_bypass_base in article.canonical_link):
|
||||
logger.debug("Invalid URL found: {}".format(url))
|
||||
return {"override_status": "invalid"}
|
||||
|
||||
# Valid URL? -> Update source URL
|
||||
scheme = newspaper.urls.get_scheme(url)
|
||||
if scheme is None:
|
||||
scheme = "http"
|
||||
source_url = scheme + "://" + str(newspaper.urls.get_domain(url))
|
||||
# Update dictionary
|
||||
article.source_url = source_url
|
||||
|
||||
try:
|
||||
content_merged = "\n".join([article.title, article.meta_description, article.text])
|
||||
if (len(content_merged) > int(os.getenv("FETCHER_LANGUAGE_DETECTION_MIN_CHAR", 100))):
|
||||
|
||||
Reference in New Issue
Block a user