Add URL regex status patterns; fetch foxnews.com articles via requests with an explicit User-Agent header

2025-08-13 14:12:54 +02:00
parent 30c586d49a
commit e3d6cf8000
2 changed files with 21 additions and 4 deletions
--- a/app_urls/fetcher/src/fetch_utils_url_processor.py
+++ b/app_urls/fetcher/src/fetch_utils_url_processor.py
@@ -52,8 +52,23 @@ def process_url(url, paywall_bypass=False):
    try:
        # Sleep required? To avoid too many requests error (original URL, not paywall bypassing endpoint)
        url_host_slowdown(url, url_host_slowdown_seconds=float(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
+        # User agent
+        user_agent = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"
+        
        # Process
-        article = newspaper.article(url_of_interest)
+        if ("foxnews.com" in url_of_interest):
+            # Request
+            r = requests.get(url, headers={"User-Agent": user_agent})
+            # Raise for error code
+            r.raise_for_status()
+            # Parse
+            article = newspaper.Article(url=url).download(input_html=r.text).parse()
+        else:
+            # Config: Fake user agent
+            config = newspaper.configuration.Configuration()
+            config.headers = {'User-Agent': user_agent}
+            # Default mode
+            article = newspaper.article(url_of_interest, config=config)
    except newspaper.ArticleBinaryDataException:
        logger.warning("ArticleException for input URL {}".format(url))
        return {"override_status": "invalid"}
--- a/app_urls/init_data.json
+++ b/app_urls/init_data.json
@@ -22,13 +22,15 @@
    },
    "REGEX_PATTERN_STATUS_PRIORITY": [
        [".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50],
+        ["https:\\/\\/x.com\\/.*", "invalid", 50],
        [".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75],
-        [".*foxnews\\.com\\/(video|category|person)\\/.*", "invalid", 75],
+        [".*foxnews\\.com\\/(video|category|person|html-sitemap)\\/.*", "invalid", 75],
        [".*radio\\.foxnews\\.com\\/.*", "invalid", 75],
        [".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75],
-        [".*zerohedge\\.com\\/(user)\\/.*", "invalid", 75],
+        [".*zerohedge\\.com\\/(user|contributors)\\/.*", "invalid", 75],
        [".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50],
        [".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50],
-        [".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50]
+        [".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50],
+        [".*foxnews\\.com\\/[^\\/]+\\/?$", "invalid", 25]
    ]
 }