diff --git a/app_urls/fetcher/src/fetch_utils_url_processor.py b/app_urls/fetcher/src/fetch_utils_url_processor.py
index c72d267..5d29731 100644
--- a/app_urls/fetcher/src/fetch_utils_url_processor.py
+++ b/app_urls/fetcher/src/fetch_utils_url_processor.py
@@ -52,8 +52,23 @@ def process_url(url, paywall_bypass=False):
     try:
         # Sleep required? To avoid too many requests error (original URL, not paywall bypassing endpoint)
         url_host_slowdown(url, url_host_slowdown_seconds=float(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
+        # User agent sent with every fetch below (a desktop Firefox string instead of the library default)
+        user_agent = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"
+
         # Process
-        article = newspaper.article(url_of_interest)
+        if ("foxnews.com" in url_of_interest):
+            # Request the page ourselves; the explicit timeout keeps a stalled host from blocking the fetcher forever
+            r = requests.get(url, headers={"User-Agent": user_agent}, timeout=30)
+            # Raise for error code
+            r.raise_for_status()
+            # Parse the HTML fetched above (NOTE(review): this branch uses `url`, not `url_of_interest` - confirm that is intended)
+            article = newspaper.Article(url=url).download(input_html=r.text).parse()
+        else:
+            # Config: Fake user agent
+            config = newspaper.configuration.Configuration()
+            config.headers = {'User-Agent': user_agent}
+            # Default mode
+            article = newspaper.article(url_of_interest, config=config)
     except newspaper.ArticleBinaryDataException:
         logger.warning("ArticleException for input URL {}".format(url))
         return {"override_status": "invalid"}
diff --git a/app_urls/init_data.json b/app_urls/init_data.json
index 0f2b15e..1f1272e 100644
--- a/app_urls/init_data.json
+++ b/app_urls/init_data.json
@@ -22,13 +22,15 @@
     },
     "REGEX_PATTERN_STATUS_PRIORITY": [
         [".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50],
+        ["https:\\/\\/x\\.com\\/.*", "invalid", 50],
         [".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75],
-        [".*foxnews\\.com\\/(video|category|person)\\/.*", "invalid", 75],
+        [".*foxnews\\.com\\/(video|category|person|html-sitemap)\\/.*", "invalid", 75],
         [".*radio\\.foxnews\\.com\\/.*", "invalid", 75],
         [".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75],
-        [".*zerohedge\\.com\\/(user)\\/.*", "invalid", 75],
+        [".*zerohedge\\.com\\/(user|contributors)\\/.*", "invalid", 75],
         [".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50],
         [".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50],
-        [".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50]
+        [".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50],
+        [".*foxnews\\.com\\/[^\\/]+\\/?$", "invalid", 25]
     ]
 }