Days handling in URLs visualization; exception handling in url_processor

This commit is contained in:
Luciano Gervasoni
2025-03-27 12:32:18 +01:00
parent 8dce5206af
commit a6b25fe915
4 changed files with 33 additions and 26 deletions

View File

@@ -101,20 +101,22 @@ class DB_Handler():
try:
# Get data
dict_url_data = process_url(obj_url.url)
# Not none or handle as exception
assert(dict_url_data is not None)
except Exception as e:
if (raise_exception_on_error):
# Simply raise exception
raise Exception("Error processing URL")
# Simply raise exception, handled in a different way
raise Exception("Error processing URL, raising exception as expected")
else:
logger.debug("Error processing URL: {}\n{}\n{}".format(obj_url.url, str(e), traceback.format_exc()))
# Set status to error
logger.debug("Error processing URL: {}\n{}\n".format(obj_url.url, str(e), traceback.format_exc()))
# Update status
set_status(obj_url, Urls.STATUS_ENUM.ERROR)
# Next URL
return
dict_url_data = None
# (dict_url_data is None) or (Exception while processing URL) ? -> Error status
if (dict_url_data is None):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.ERROR)
# Next URL
return
# Invalid? e.g. binary data
if (dict_url_data.get("override_status") == "invalid"):
# Update status

View File

@@ -52,17 +52,17 @@ def process_url(url):
# Too many requests? Cool down...
if ("Status code 429" in str(e.args)):
# TODO: cool down and retry once?, proxy/VPN, ...
logger.debug("TODO: Implement code 429")
logger.debug("TODO: process_url Implement code 429")
# Unavailable for legal reasons
if ("Status code 451" in str(e.args)):
# TODO: Bypass with VPN
logger.debug("TODO: Implement code 451")
logger.debug("TODO: process_url Implement code 451")
# CloudFlare protection?
if ("Website protected with Cloudflare" in str(e.args)):
logger.debug("TODO: Implement bypass CloudFlare")
logger.debug("TODO: process_url Implement bypass CloudFlare")
# PerimeterX protection?
if ("Website protected with PerimeterX" in str(e.args)):
logger.debug("TODO: Implement bypass PerimeterX")
logger.debug("TODO: process_url Implement bypass PerimeterX")
logger.warning("ArticleException for input URL {}\n{}".format(url, str(e.args)))
return None