Bypass paywall feature
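Routes article fetching through the marreta paywall-bypass endpoint (https://marreta.pcdomanual.com/p/): the URL rewriting moves out of DB_Handler and into process_url behind a paywall_bypass flag, and bypassed results whose canonical link still points at the bypass host are marked invalid.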
@@ -82,7 +82,7 @@ class DB_Handler():
             logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))

-    def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error, override_url=None):
+    def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error, paywall_bypass=False):

        def set_status(obj_url, status):
            # Update status if setting a new value
@@ -100,13 +100,8 @@ class DB_Handler():
            return

        try:
-            # Override URL for request?
-            if (override_url is not None):
-                url_of_interest = override_url
-            else:
-                url_of_interest = obj_url.url
            # Extract URL content
-            dict_url_data = process_url(url_of_interest)
+            dict_url_data = process_url(obj_url.url, paywall_bypass)
        except Exception as e:
            if (raise_exception_on_error):
                # Simply raise exception, handled in a different way
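The override_url plumbing is gone: the handler now forwards a boolean and process_url decides which URL to fetch. A minimal runnable sketch of the new call chain, with stand-in names (process_single_url plays the role of DB_Handler._process_single_url; the stub process_url only models the URL choice, not the real fetcher):

# Minimal sketch of the new call chain, with stand-in names; the stub
# process_url below only models the URL choice, not the real fetcher.
def process_url(url, paywall_bypass=False):
    # The fetcher, not the caller, now decides which URL is requested.
    url_paywall_bypass_base = "https://marreta.pcdomanual.com/p/"
    url_of_interest = url_paywall_bypass_base + url if paywall_bypass else url
    return {"requested_url": url_of_interest}

def process_single_url(url, paywall_bypass=False):
    # Stand-in for DB_Handler._process_single_url: it just forwards the flag.
    return process_url(url, paywall_bypass)

print(process_single_url("https://example.com/story", paywall_bypass=True))
# {'requested_url': 'https://marreta.pcdomanual.com/p/https://example.com/story'}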
@@ -234,9 +229,8 @@ class DB_Handler():
                continue

            try:
-                # Process URL
-                override_url = "https://marreta.pcdomanual.com/p/{}".format(obj_url.url)
-                self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True, override_url=override_url)
+                # Process URL, try bypassing paywall
+                self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True, paywall_bypass=True)
                num_urls_processed += 1
            except Exception as e:
                # Error, cache to avoid re-processing for X time
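The batch loop keeps its shape: process one URL with bypass enabled, count it, and on failure cache the URL so it is skipped for a while. A hedged sketch of that pattern with a hypothetical in-memory cache (the commit uses its own cache backend and TTL, which are not shown here):

import time

error_cache = {}                  # url -> time of last failure (hypothetical)
ERROR_TTL_SECONDS = 60 * 60       # assumed TTL; the real value is not shown

def recently_failed(url):
    failed_at = error_cache.get(url)
    return failed_at is not None and (time.time() - failed_at) < ERROR_TTL_SECONDS

def process_batch(urls, process_one):
    num_urls_processed = 0
    for url in urls:
        if recently_failed(url):
            continue
        try:
            process_one(url, paywall_bypass=True)
            num_urls_processed += 1
        except Exception:
            # Error, cache to avoid re-processing for X time
            error_cache[url] = time.time()
    return num_urls_processed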
@@ -38,17 +38,25 @@ def url_host_slowdown(url, url_host_slowdown_seconds):
    # About to process URL host, cache time
    cache.set("process_{}".format(url_host).encode("utf-8"), time.time(), timeout=60*5)  # Expire after 5 minutes

-def process_url(url):
+def process_url(url, paywall_bypass=False):

+    if (paywall_bypass):
+        # TODO: Implement self-hosted instance
+        url_paywall_bypass_base = "https://marreta.pcdomanual.com/p/"
+        # Override URL for request
+        url_of_interest = os.path.join(url_paywall_bypass_base, url)
+    else:
+        url_of_interest = url

    try:
-        # Slow down if required to avoid too many requests error
+        # Sleep required? To avoid too many requests error (original URL, not paywall bypassing endpoint)
        url_host_slowdown(url, url_host_slowdown_seconds=float(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
        # Process
-        article = newspaper.article(url)
+        article = newspaper.article(url_of_interest)
    except newspaper.ArticleBinaryDataException:
        logger.warning("ArticleException for input URL {}".format(url))
        return {"override_status": "invalid"}
    except newspaper.ArticleException as e:

        # Too many requests? Cool down...
        if ("Status code 429" in str(e.args)):
            # TODO: cool down and retry once?, proxy/VPN, ...
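The bypass URL is built by appending the full original URL to the /p/ endpoint, matching the "https://marreta.pcdomanual.com/p/{}".format(obj_url.url) string the old handler code used. Because the base ends with "/", os.path.join inserts no separator here. A small sketch of what that produces (the article URL is made up):

import os

url_paywall_bypass_base = "https://marreta.pcdomanual.com/p/"
url = "https://example.com/news/story.html"   # made-up example URL

# The base ends with "/", so os.path.join inserts no separator and the call
# degenerates to plain concatenation: the full original URL after /p/.
bypass_url = os.path.join(url_paywall_bypass_base, url)
print(bypass_url)
# https://marreta.pcdomanual.com/p/https://example.com/news/story.html

# Plain concatenation is equivalent here and avoids filesystem semantics:
assert bypass_url == url_paywall_bypass_base + url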
@@ -70,6 +78,25 @@ def process_url(url):
        logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
        return None

+    # Not a valid URL?
+    if (not article.is_valid_url()):
+        logger.debug("Invalid URL found: {}".format(url))
+        return {"override_status": "invalid"}
+
+    if (paywall_bypass):
+        # Canonical link is paywall bypass URL? -> Invalid
+        if (url_paywall_bypass_base in article.canonical_link):
+            logger.debug("Invalid URL found: {}".format(url))
+            return {"override_status": "invalid"}
+
+        # Valid URL? -> Update source URL
+        scheme = newspaper.urls.get_scheme(url)
+        if scheme is None:
+            scheme = "http"
+        source_url = scheme + "://" + str(newspaper.urls.get_domain(url))
+        # Update dictionary
+        article.source_url = source_url
+
    try:
        content_merged = "\n".join([article.title, article.meta_description, article.text])
        if (len(content_merged) > int(os.getenv("FETCHER_LANGUAGE_DETECTION_MIN_CHAR", 100))):
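After a bypassed fetch, two sanity checks run: if the article's canonical link still points at the bypass host the fetch is treated as invalid, and otherwise source_url is rebuilt from the original URL so the publisher rather than the bypass host is recorded. A sketch of that logic using the stdlib's urllib.parse.urlsplit in place of newspaper.urls.get_scheme/get_domain (the Article object below is fabricated for illustration):

from urllib.parse import urlsplit

url_paywall_bypass_base = "https://marreta.pcdomanual.com/p/"

def validate_and_fix_source(article, url):
    # Canonical link still on the bypass host -> the service did not
    # resolve a real article, so the result is treated as invalid.
    if url_paywall_bypass_base in article.canonical_link:
        return {"override_status": "invalid"}
    # Rebuild source_url from the original URL so the publisher, not the
    # bypass host, is recorded as the source.
    parts = urlsplit(url)
    article.source_url = (parts.scheme or "http") + "://" + parts.netloc
    return None

class Article:                    # fabricated article object for illustration
    canonical_link = "https://example.com/story"
    source_url = "https://marreta.pcdomanual.com"

article = Article()
validate_and_fix_source(article, "https://example.com/story")
print(article.source_url)         # https://example.com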