Pattern matching, foxnews request with header

This commit is contained in:
Luciano Gervasoni
2025-08-13 14:12:54 +02:00
parent 30c586d49a
commit e3d6cf8000
2 changed files with 21 additions and 4 deletions

View File

@@ -52,8 +52,23 @@ def process_url(url, paywall_bypass=False):
try:
# Throttle per host to avoid "too many requests" errors (keyed on the original URL, not the paywall-bypass endpoint)
url_host_slowdown(url, url_host_slowdown_seconds=float(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
# User agent
user_agent = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"
# Process
article = newspaper.article(url_of_interest)
if ("foxnews.com" in url_of_interest):
# Request
r = requests.get(url, headers={"User-Agent": user_agent})
# Raise for error code
r.raise_for_status()
# Parse
article = newspaper.Article(url=url).download(input_html=r.text).parse()
else:
# Config: Fake user agent
config = newspaper.configuration.Configuration()
config.headers = {'User-Agent': user_agent}
# Default mode
article = newspaper.article(url_of_interest, config=config)
except newspaper.ArticleBinaryDataException:
logger.warning("ArticleException for input URL {}".format(url))
return {"override_status": "invalid"}