Workers light,default,heavy
This commit is contained in:
@@ -100,7 +100,7 @@ class DB_Handler():
|
||||
# URLs duplicate association
|
||||
UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url)
|
||||
|
||||
def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error, paywall_bypass=False):
|
||||
def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error, paywall_bypass=False, request_timeout=15):
|
||||
##########################################################################
|
||||
# URL pattern: missingkids.org/poster OR missingkids.org/new-poster
|
||||
if ("missingkids.org" in obj_url.url) and ("poster" in obj_url.url):
|
||||
@@ -147,8 +147,7 @@ class DB_Handler():
|
||||
|
||||
try:
|
||||
# Extract URL content
|
||||
dict_url_data = process_url(obj_url.url, paywall_bypass)
|
||||
logger.debug("Processing raw URL EXTRACT URL CONTENT OK: {}".format(obj_url.url))
|
||||
dict_url_data = process_url(obj_url.url, paywall_bypass, request_timeout)
|
||||
except Exception as e:
|
||||
if (raise_exception_on_error):
|
||||
# Simply raise exception, handled in a different way
|
||||
@@ -238,12 +237,10 @@ class DB_Handler():
|
||||
|
||||
# Per URL
|
||||
for obj_url in raw_urls:
|
||||
logger.debug("Processing raw URL: {}".format(obj_url.url))
|
||||
# Override status if pattern matching?
|
||||
status_pattern_match = _get_status_pattern_matching(obj_url.url, list_pattern_status_tuple)
|
||||
# Process URL
|
||||
self._process_single_url(obj_url, status_pattern_match, raise_exception_on_error=False)
|
||||
logger.debug("Processing raw URL OK: {}".format(obj_url.url))
|
||||
|
||||
logger.info("Updated #{} raw URLs".format(len(raw_urls)))
|
||||
except Exception as e:
|
||||
|
||||
@@ -39,7 +39,7 @@ def url_host_slowdown(url, url_host_slowdown_seconds):
|
||||
# About to process URL host, cache time
|
||||
cache.set("process_{}".format(url_host).encode("utf-8"), time.time(), timeout=60*5) # Expire after 5 minutes
|
||||
|
||||
def process_url(url, paywall_bypass=False):
|
||||
def process_url(url, paywall_bypass=False, request_timeout=15):
|
||||
logger.debug("Processing raw URL 1: {}".format(url))
|
||||
|
||||
if (paywall_bypass):
|
||||
@@ -59,7 +59,7 @@ def process_url(url, paywall_bypass=False):
|
||||
# Process
|
||||
if ("foxnews.com" in url_of_interest) or ("zerohedge" in url_of_interest):
|
||||
# Request
|
||||
r = requests.get(url, headers={"User-Agent": user_agent}, timeout=15)
|
||||
r = requests.get(url, headers={"User-Agent": user_agent}, timeout=request_timeout)
|
||||
# Raise for error code
|
||||
r.raise_for_status()
|
||||
# Parse
|
||||
@@ -68,7 +68,7 @@ def process_url(url, paywall_bypass=False):
|
||||
# Config: Fake user agent
|
||||
config = newspaper.configuration.Configuration()
|
||||
config.headers = {'User-Agent': user_agent}
|
||||
config.request_timeout = 15 # timeout in seconds
|
||||
config.request_timeout = request_timeout
|
||||
# Default mode
|
||||
article = newspaper.article(url_of_interest, config=config)
|
||||
|
||||
@@ -110,7 +110,7 @@ def process_url(url, paywall_bypass=False):
|
||||
|
||||
# Try simple request, valid response but couldn't parse article? e.g. getting blocked? -> unknown
|
||||
time.sleep(0.25)
|
||||
r = requests.get(url_of_interest, timeout=15)
|
||||
r = requests.get(url_of_interest, timeout=request_timeout)
|
||||
if (r.status_code == 200):
|
||||
return {"override_status": "unknown"}
|
||||
else:
|
||||
|
||||
Reference in New Issue
Block a user