Return "unknown" instead of "error" status for fetched URLs

This commit is contained in:
Luciano Gervasoni
2025-06-19 22:43:29 +02:00
parent a2cce62096
commit 490f01d66c
7 changed files with 227 additions and 9076 deletions

View File

@@ -2,6 +2,7 @@ from django.core.cache import cache
from .logger import get_logger
logger = get_logger()
import newspaper
import requests
import time
import os
from urllib.parse import unquote
@@ -39,7 +40,7 @@ def url_host_slowdown(url, url_host_slowdown_seconds):
cache.set("process_{}".format(url_host).encode("utf-8"), time.time(), timeout=60*5) # Expire after 5 minutes
def process_url(url, paywall_bypass=False):
if (paywall_bypass):
# TODO: Implement self-hosted instance
url_paywall_bypass_base = "https://marreta.pcdomanual.com"
@@ -57,22 +58,46 @@ def process_url(url, paywall_bypass=False):
logger.warning("ArticleException for input URL {}".format(url))
return {"override_status": "invalid"}
except newspaper.ArticleException as e:
# Too many requests or blocked for some reason
if ("Status code 403" in str(e.args)):
# TODO: cool down and retry once?, proxy/VPN, ...
logger.debug("TODO: process_url Implement code 403")
# Not found, either it doesn't exist or getting blocked...
if ("Status code 404" in str(e.args)):
# TODO: cool down and retry once?, proxy/VPN, ...
logger.debug("TODO: process_url Implement code 404")
# Too many requests? Cool down...
if ("Status code 429" in str(e.args)):
# TODO: cool down and retry once?, proxy/VPN, ...
logger.debug("TODO: process_url Implement code 429")
# Unavailable for legal reasons
if ("Status code 451" in str(e.args)):
# TODO: Bypass with VPN
logger.debug("TODO: process_url Implement code 451")
# CloudFlare protection?
if ("Website protected with Cloudflare" in str(e.args)):
logger.debug("TODO: process_url Implement bypass CloudFlare")
# PerimeterX protection?
if ("Website protected with PerimeterX" in str(e.args)):
logger.debug("TODO: process_url Implement bypass PerimeterX")
logger.debug("ArticleException for input URL {}\n{}".format(url, str(e.args)))
# Fall back to a plain HTTP request: if the URL responds but the article could not be parsed (e.g. the parser was blocked), report "unknown"
time.sleep(0.25)
r = requests.get(url_of_interest)
if (r.status_code == 200):
return {"override_status": "unknown"}
else:
# Another status code still... "error" or "unknown"
return {"override_status": "unknown"}
return None
except Exception as e:
logger.warning("Exception for input URL {}\n{}".format(url, str(e)))