Pattern matching updates; fetch foxnews.com articles via a request with a User-Agent header
This commit is contained in:
@@ -52,8 +52,23 @@ def process_url(url, paywall_bypass=False):
|
||||
try:
|
||||
# Sleep required? Slows down requests per host to avoid "too many requests" errors (uses the original URL, not the paywall-bypassing endpoint)
|
||||
url_host_slowdown(url, url_host_slowdown_seconds=float(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
|
||||
# User agent
|
||||
user_agent = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"
|
||||
|
||||
# Process
|
||||
article = newspaper.article(url_of_interest)
|
||||
if ("foxnews.com" in url_of_interest):
|
||||
# Request
|
||||
r = requests.get(url, headers={"User-Agent": user_agent})
|
||||
# Raise for error code
|
||||
r.raise_for_status()
|
||||
# Parse
|
||||
article = newspaper.Article(url=url).download(input_html=r.text).parse()
|
||||
else:
|
||||
# Config: Fake user agent
|
||||
config = newspaper.configuration.Configuration()
|
||||
config.headers = {'User-Agent': user_agent}
|
||||
# Default mode
|
||||
article = newspaper.article(url_of_interest, config=config)
|
||||
except newspaper.ArticleBinaryDataException:
|
||||
logger.warning("ArticleException for input URL {}".format(url))
|
||||
return {"override_status": "invalid"}
|
||||
|
||||
@@ -22,13 +22,15 @@
|
||||
},
|
||||
"REGEX_PATTERN_STATUS_PRIORITY": [
|
||||
[".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50],
|
||||
["https:\\/\\/x.com\\/.*", "invalid", 50],
|
||||
[".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75],
|
||||
[".*foxnews\\.com\\/(video|category|person)\\/.*", "invalid", 75],
|
||||
[".*foxnews\\.com\\/(video|category|person|html-sitemap)\\/.*", "invalid", 75],
|
||||
[".*radio\\.foxnews\\.com\\/.*", "invalid", 75],
|
||||
[".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75],
|
||||
[".*zerohedge\\.com\\/(user)\\/.*", "invalid", 75],
|
||||
[".*zerohedge\\.com\\/(user|contributors)\\/.*", "invalid", 75],
|
||||
[".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50],
|
||||
[".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50],
|
||||
[".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50]
|
||||
[".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50],
|
||||
[".*foxnews\\.com\\/[^\\/]+\\/?$", "invalid", 25]
|
||||
]
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user