From 883dfcd3bd700abfbcc44c44af424194d4c217d7 Mon Sep 17 00:00:00 2001 From: Luciano Gervasoni Date: Fri, 22 Aug 2025 13:11:02 +0200 Subject: [PATCH] URL redirect get before newspaper processing --- app_urls/fetcher/src/fetch_utils_url_processor.py | 4 ++-- app_urls/fetcher/views_base.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/app_urls/fetcher/src/fetch_utils_url_processor.py b/app_urls/fetcher/src/fetch_utils_url_processor.py index 5d29731..0805fd5 100644 --- a/app_urls/fetcher/src/fetch_utils_url_processor.py +++ b/app_urls/fetcher/src/fetch_utils_url_processor.py @@ -56,13 +56,13 @@ def process_url(url, paywall_bypass=False): user_agent = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1" # Process - if ("foxnews.com" in url_of_interest): + if ("foxnews.com" in url_of_interest) or ("zerohedge" in url_of_interest): # Request r = requests.get(url, headers={"User-Agent": user_agent}) # Raise for error code r.raise_for_status() # Parse - article = newspaper.Article(url=url).download(input_html=r.text).parse() + article = newspaper.Article(url=r.url).download(input_html=r.text).parse() else: # Config: Fake user agent config = newspaper.configuration.Configuration() diff --git a/app_urls/fetcher/views_base.py b/app_urls/fetcher/views_base.py index d04d0cd..daf3a89 100644 --- a/app_urls/fetcher/views_base.py +++ b/app_urls/fetcher/views_base.py @@ -21,7 +21,7 @@ def link_list(request): links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_valid_all", "process_missing_kids_urls_invalid_all", "process_missing_kids_urls_unknown_all", "process_missing_kids_urls_all", "clean_old_url_content_60"] # List of links list_links = \ - [ os.path.join(app_url, "admin"), os.path.join(app_url, "urls"), os.path.join(app_url, "notify_status") ] + \ + [ os.path.join(app_url, "admin"), os.path.join(app_url, "urls") ] + \ [ os.path.join(app_url, "logs", log_type) 
for log_type in ["database", "debug", "info", "warning", "server", "beat", "worker_default", "worker_low"] ] #+ \ #[ os.path.join(app_url, "task", l) for l in links_fetch + links_process ]