Return "unknown" instead of "error" status for fetched URLs
This commit is contained in:
@@ -2,6 +2,7 @@ from django.core.cache import cache
|
||||
from .logger import get_logger
|
||||
logger = get_logger()
|
||||
import newspaper
|
||||
import requests
|
||||
import time
|
||||
import os
|
||||
from urllib.parse import unquote
|
||||
@@ -39,7 +40,7 @@ def url_host_slowdown(url, url_host_slowdown_seconds):
|
||||
cache.set("process_{}".format(url_host).encode("utf-8"), time.time(), timeout=60*5) # Expire after 5 minutes
|
||||
|
||||
def process_url(url, paywall_bypass=False):
|
||||
|
||||
|
||||
if (paywall_bypass):
|
||||
# TODO: Implement self-hosted instance
|
||||
url_paywall_bypass_base = "https://marreta.pcdomanual.com"
|
||||
@@ -57,22 +58,46 @@ def process_url(url, paywall_bypass=False):
|
||||
logger.warning("ArticleException for input URL {}".format(url))
|
||||
return {"override_status": "invalid"}
|
||||
except newspaper.ArticleException as e:
|
||||
|
||||
# Forbidden: access denied or blocked for some reason
|
||||
if ("Status code 403" in str(e.args)):
|
||||
# TODO: cool down and retry once?, proxy/VPN, ...
|
||||
logger.debug("TODO: process_url Implement code 403")
|
||||
|
||||
# Not found, either it doesn't exist or getting blocked...
|
||||
if ("Status code 404" in str(e.args)):
|
||||
# TODO: cool down and retry once?, proxy/VPN, ...
|
||||
logger.debug("TODO: process_url Implement code 404")
|
||||
|
||||
# Too many requests? Cool down...
|
||||
if ("Status code 429" in str(e.args)):
|
||||
# TODO: cool down and retry once?, proxy/VPN, ...
|
||||
logger.debug("TODO: process_url Implement code 429")
|
||||
|
||||
# Unavailable for legal reasons
|
||||
if ("Status code 451" in str(e.args)):
|
||||
# TODO: Bypass with VPN
|
||||
logger.debug("TODO: process_url Implement code 451")
|
||||
|
||||
# CloudFlare protection?
|
||||
if ("Website protected with Cloudflare" in str(e.args)):
|
||||
logger.debug("TODO: process_url Implement bypass CloudFlare")
|
||||
|
||||
# PerimeterX protection?
|
||||
if ("Website protected with PerimeterX" in str(e.args)):
|
||||
logger.debug("TODO: process_url Implement bypass PerimeterX")
|
||||
|
||||
logger.debug("ArticleException for input URL {}\n{}".format(url, str(e.args)))
|
||||
|
||||
# Fall back to a plain request: a valid response whose article could not be parsed (e.g. because we are being blocked) is reported as "unknown"
|
||||
time.sleep(0.25)
|
||||
r = requests.get(url_of_interest)
|
||||
if (r.status_code == 200):
|
||||
return {"override_status": "unknown"}
|
||||
else:
|
||||
# Any other status code: could arguably be "error", but it is reported as "unknown" for now
|
||||
return {"override_status": "unknown"}
|
||||
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
|
||||
|
||||
@@ -1,65 +0,0 @@
|
||||
{
|
||||
"SEARCH": {
|
||||
"rss_feed": [
|
||||
],
|
||||
"url_host": [
|
||||
"johnpilger.com",
|
||||
"lapenseeecologique.com",
|
||||
"partage-le.com",
|
||||
"reflets.info",
|
||||
"rezo.net",
|
||||
"consortiumnews.com",
|
||||
"disclose.ngo/fr",
|
||||
"energieetenvironnement.com",
|
||||
"global-climat.com",
|
||||
"slashdot.org",
|
||||
"lesamisdebartleby.wordpress.com",
|
||||
"lundi.am",
|
||||
"lvsl.fr",
|
||||
"moderndiplomacy.eu",
|
||||
"mrmondialisation.org",
|
||||
"ourfiniteworld.com",
|
||||
"southfront.org",
|
||||
"simplicius76.substack.com",
|
||||
"smoothiex12.blogspot.com",
|
||||
"theintercept.com",
|
||||
"wikileaks.org",
|
||||
"contretemps.eu",
|
||||
"indianpunchline.com",
|
||||
"investigaction.net/fr",
|
||||
"notechmagazine.com",
|
||||
"terrestres.org",
|
||||
"truthdig.com",
|
||||
"tass.com",
|
||||
"bastamag.net",
|
||||
"counterpunch.org",
|
||||
"energy-daily.com",
|
||||
"fakirpresse.info",
|
||||
"geopoliticalmonitor.com",
|
||||
"huffingtonpost.fr",
|
||||
"legrandsoir.info",
|
||||
"les-crises.fr",
|
||||
"liberation.fr",
|
||||
"maitre-eolas.fr",
|
||||
"marianne.net",
|
||||
"mediapart.fr",
|
||||
"metaefficient.com",
|
||||
"monde-diplomatique.fr",
|
||||
"paulcraigroberts.org",
|
||||
"politis.fr",
|
||||
"reporterre.net",
|
||||
"rue89.com",
|
||||
"theguardian.com/international",
|
||||
"treehugger.com",
|
||||
"unz.com",
|
||||
"voltairenet.org",
|
||||
"wsws.org"
|
||||
],
|
||||
"keyword_search": [
|
||||
"society collapse"
|
||||
]
|
||||
},
|
||||
"REGEX_PATTERN_STATUS_PRIORITY": [
|
||||
[".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50]
|
||||
]
|
||||
}
|
||||
@@ -1,34 +0,0 @@
|
||||
{
|
||||
"SEARCH": {
|
||||
"rss_feed": [
|
||||
"https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC",
|
||||
"https://feeds.feedburner.com/breitbart",
|
||||
"https://feeds.feedburner.com/zerohedge/feed",
|
||||
"https://moxie.foxnews.com/google-publisher/latest.xml",
|
||||
"https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=15837362",
|
||||
"https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100727362"
|
||||
],
|
||||
"url_host": [
|
||||
"missingkids.org/poster",
|
||||
"missingkids.org/new-poster",
|
||||
"breitbart.com",
|
||||
"zerohedge.com",
|
||||
"foxnews.com",
|
||||
"cnbc.com"
|
||||
],
|
||||
"keyword_search": [
|
||||
"child abuse"
|
||||
]
|
||||
},
|
||||
"REGEX_PATTERN_STATUS_PRIORITY": [
|
||||
[".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50],
|
||||
[".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75],
|
||||
[".*foxnews\\.com\\/(video|category)\\/.*", "invalid", 75],
|
||||
[".*radio.foxnews\\.com\\/.*", "invalid", 75],
|
||||
[".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75],
|
||||
[".*zerohedge\\.com\\/(user)\\/.*", "invalid", 75],
|
||||
[".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50],
|
||||
[".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50],
|
||||
[".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50]
|
||||
]
|
||||
}
|
||||
Reference in New Issue
Block a user