URLs site with status filter, refactoring, django-tasks-scheduler low- and high-priority queues

This commit is contained in:
Luciano Gervasoni
2025-03-25 21:44:26 +01:00
parent 24b4614049
commit 9d2550b374
9 changed files with 111 additions and 55 deletions

View File

@@ -41,13 +41,29 @@ def url_host_slowdown(url, url_host_slowdown_seconds):
def process_url(url):
try:
# Slow down if required to avoid too many requests error
url_host_slowdown(url, url_host_slowdown_seconds=2)
url_host_slowdown(url, url_host_slowdown_seconds=5)
# Process
article = newspaper.article(url)
except newspaper.ArticleBinaryDataException:
logger.warning("ArticleException for input URL {}\n{}".format(url, str(e)))
return {"override_status": "invalid"}
except newspaper.ArticleException as e:
# Too many requests? Cool down...
if ("Status code 429" in str(e)):
# TODO: cool down and retry once?, proxy/VPN, ...
logger.debug("TODO: Implement code 429")
# Unavailable for legal reasons
if ("Status code 451" in str(e)):
# TODO: Bypass with VPN
logger.debug("TODO: Implement code 451")
# CloudFlare protection?
if ("Website protected with Cloudflare" in str(e)):
logger.debug("TODO: Implement bypass CloudFlare")
# PerimeterX protection?
if ("Website protected with PerimeterX" in str(e)):
logger.debug("TODO: Implement bypass PerimeterX")
logger.warning("ArticleException for input URL {}\n{}".format(url, str(e)))
return None
except Exception as e: