Use the redirected URL from the GET response before newspaper processing
This commit is contained in:
@@ -56,13 +56,13 @@ def process_url(url, paywall_bypass=False):
|
|||||||
user_agent = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"
|
user_agent = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"
|
||||||
|
|
||||||
# Process
|
# Process
|
||||||
if ("foxnews.com" in url_of_interest):
|
if ("foxnews.com" in url_of_interest) or ("zerohedge" in url_of_interest):
|
||||||
# Request
|
# Request
|
||||||
r = requests.get(url, headers={"User-Agent": user_agent})
|
r = requests.get(url, headers={"User-Agent": user_agent})
|
||||||
# Raise for error code
|
# Raise for error code
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
# Parse
|
# Parse
|
||||||
article = newspaper.Article(url=url).download(input_html=r.text).parse()
|
article = newspaper.Article(url=r.url).download(input_html=r.text).parse()
|
||||||
else:
|
else:
|
||||||
# Config: Fake user agent
|
# Config: Fake user agent
|
||||||
config = newspaper.configuration.Configuration()
|
config = newspaper.configuration.Configuration()
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ def link_list(request):
|
|||||||
links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_valid_all", "process_missing_kids_urls_invalid_all", "process_missing_kids_urls_unknown_all", "process_missing_kids_urls_all", "clean_old_url_content_60"]
|
links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_valid_all", "process_missing_kids_urls_invalid_all", "process_missing_kids_urls_unknown_all", "process_missing_kids_urls_all", "clean_old_url_content_60"]
|
||||||
# List of links
|
# List of links
|
||||||
list_links = \
|
list_links = \
|
||||||
[ os.path.join(app_url, "admin"), os.path.join(app_url, "urls"), os.path.join(app_url, "notify_status") ] + \
|
[ os.path.join(app_url, "admin"), os.path.join(app_url, "urls") + \
|
||||||
[ os.path.join(app_url, "logs", log_type) for log_type in ["database", "debug", "info", "warning", "server", "beat", "worker_default", "worker_low"] ] #+ \
|
[ os.path.join(app_url, "logs", log_type) for log_type in ["database", "debug", "info", "warning", "server", "beat", "worker_default", "worker_low"] ] #+ \
|
||||||
#[ os.path.join(app_url, "task", l) for l in links_fetch + links_process ]
|
#[ os.path.join(app_url, "task", l) for l in links_fetch + links_process ]
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user