URLs site with status filter; refactoring; django-tasks-scheduler low- and high-priority queues
This commit is contained in:
@@ -41,13 +41,29 @@ def url_host_slowdown(url, url_host_slowdown_seconds):
|
||||
def process_url(url):
|
||||
try:
|
||||
# Slow down if required to avoid too many requests error
|
||||
url_host_slowdown(url, url_host_slowdown_seconds=2)
|
||||
url_host_slowdown(url, url_host_slowdown_seconds=5)
|
||||
# Process
|
||||
article = newspaper.article(url)
|
||||
except newspaper.ArticleBinaryDataException:
|
||||
logger.warning("ArticleException for input URL {}\n{}".format(url, str(e)))
|
||||
return {"override_status": "invalid"}
|
||||
except newspaper.ArticleException as e:
|
||||
|
||||
# Too many requests? Cool down...
|
||||
if ("Status code 429" in str(e)):
|
||||
# TODO: cool down and retry once?, proxy/VPN, ...
|
||||
logger.debug("TODO: Implement code 429")
|
||||
# Unavailable for legal reasons
|
||||
if ("Status code 451" in str(e)):
|
||||
# TODO: Bypass with VPN
|
||||
logger.debug("TODO: Implement code 451")
|
||||
# CloudFlare protection?
|
||||
if ("Website protected with Cloudflare" in str(e)):
|
||||
logger.debug("TODO: Implement bypass CloudFlare")
|
||||
# PerimeterX protection?
|
||||
if ("Website protected with PerimeterX" in str(e)):
|
||||
logger.debug("TODO: Implement bypass PerimeterX")
|
||||
|
||||
logger.warning("ArticleException for input URL {}\n{}".format(url, str(e)))
|
||||
return None
|
||||
except Exception as e:
|
||||
|
||||
Reference in New Issue
Block a user