Return "unknown" instead of "error" status for fetched URLs

2025-06-19 22:43:29 +02:00
parent a2cce62096
commit 490f01d66c
7 changed files with 227 additions and 9076 deletions
--- a/app_urls/fetcher/src/fetch_utils_url_processor.py
+++ b/app_urls/fetcher/src/fetch_utils_url_processor.py
@@ -2,6 +2,7 @@ from django.core.cache import cache
 from .logger import get_logger
 logger = get_logger()
 import newspaper
+import requests
 import time
 import os
 from urllib.parse import unquote
@@ -39,7 +40,7 @@ def url_host_slowdown(url, url_host_slowdown_seconds):
    cache.set("process_{}".format(url_host).encode("utf-8"), time.time(), timeout=60*5) # Expire after 5 minutes

 def process_url(url, paywall_bypass=False):
-    
+
    if (paywall_bypass):
        # TODO: Implement self-hosted instance
        url_paywall_bypass_base = "https://marreta.pcdomanual.com"
@@ -57,22 +58,46 @@ def process_url(url, paywall_bypass=False):
        logger.warning("ArticleException for input URL {}".format(url))
        return {"override_status": "invalid"}
    except newspaper.ArticleException as e:
+
+        # Too many requests or blocked for some reason
+        if ("Status code 403" in str(e.args)):
+            # TODO: cool down and retry once?, proxy/VPN, ...
+            logger.debug("TODO: process_url Implement code 403")
+
+        # Not found, either it doesn't exist or getting blocked...
+        if ("Status code 404" in str(e.args)):
+            # TODO: cool down and retry once?, proxy/VPN, ...
+            logger.debug("TODO: process_url Implement code 404")
+
        # Too many requests? Cool down...
        if ("Status code 429" in str(e.args)):
            # TODO: cool down and retry once?, proxy/VPN, ...
            logger.debug("TODO: process_url Implement code 429")
+
        # Unavailable for legal reasons
        if ("Status code 451" in str(e.args)):
            # TODO: Bypass with VPN
            logger.debug("TODO: process_url Implement code 451")
+
        # CloudFlare protection?
        if ("Website protected with Cloudflare" in str(e.args)):
            logger.debug("TODO: process_url Implement bypass CloudFlare")
+
        # PerimeterX protection?
        if ("Website protected with PerimeterX" in str(e.args)):
            logger.debug("TODO: process_url Implement bypass PerimeterX")

        logger.debug("ArticleException for input URL {}\n{}".format(url, str(e.args)))
+
+        # Fall back to a plain GET: if the page responds but the article could not be parsed (e.g. we are being blocked), report status "unknown"
+        time.sleep(0.25)
+        r = requests.get(url_of_interest)
+        if (r.status_code == 200):
+            return {"override_status": "unknown"}
+        else:
+            # Another status code still... "error" or "unknown"
+            return {"override_status": "unknown"}
+
        return None
    except Exception as e:
        logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
--- a/app_urls/init_data_fr.json
+++ b/app_urls/init_data_fr.json
@@ -1,65 +0,0 @@
-{
-    "SEARCH": {
-        "rss_feed": [
-        ],
-        "url_host": [
-            "johnpilger.com",
-            "lapenseeecologique.com",
-            "partage-le.com",
-            "reflets.info",
-            "rezo.net",
-            "consortiumnews.com",
-            "disclose.ngo/fr",
-            "energieetenvironnement.com",
-            "global-climat.com",
-            "slashdot.org",
-            "lesamisdebartleby.wordpress.com",
-            "lundi.am",
-            "lvsl.fr",
-            "moderndiplomacy.eu",
-            "mrmondialisation.org",
-            "ourfiniteworld.com",
-            "southfront.org",
-            "simplicius76.substack.com",
-            "smoothiex12.blogspot.com",
-            "theintercept.com",
-            "wikileaks.org",
-            "contretemps.eu",
-            "indianpunchline.com",
-            "investigaction.net/fr",
-            "notechmagazine.com",
-            "terrestres.org",
-            "truthdig.com",
-            "tass.com",
-            "bastamag.net",
-            "counterpunch.org",
-            "energy-daily.com",
-            "fakirpresse.info",
-            "geopoliticalmonitor.com",
-            "huffingtonpost.fr",
-            "legrandsoir.info",
-            "les-crises.fr",
-            "liberation.fr",
-            "maitre-eolas.fr",
-            "marianne.net",
-            "mediapart.fr",
-            "metaefficient.com",
-            "monde-diplomatique.fr",
-            "paulcraigroberts.org",
-            "politis.fr",
-            "reporterre.net",
-            "rue89.com",
-            "theguardian.com/international",
-            "treehugger.com",
-            "unz.com",
-            "voltairenet.org",
-            "wsws.org"
-        ],  
-        "keyword_search": [
-            "society collapse"
-        ]
-    },
-    "REGEX_PATTERN_STATUS_PRIORITY": [
-        [".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50]
-    ]
-}
--- a/app_urls/init_data_sca.json
+++ b/app_urls/init_data_sca.json
@@ -1,34 +0,0 @@
-{
-    "SEARCH": {
-        "rss_feed": [
-            "https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC",
-            "https://feeds.feedburner.com/breitbart",
-            "https://feeds.feedburner.com/zerohedge/feed",
-            "https://moxie.foxnews.com/google-publisher/latest.xml",
-            "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=15837362",
-            "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100727362"
-        ],
-        "url_host": [
-            "missingkids.org/poster",
-            "missingkids.org/new-poster",
-            "breitbart.com",
-            "zerohedge.com",
-            "foxnews.com",
-            "cnbc.com"
-        ],
-        "keyword_search": [
-            "child abuse"
-        ]
-    },
-    "REGEX_PATTERN_STATUS_PRIORITY": [
-        [".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50],
-        [".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75],
-        [".*foxnews\\.com\\/(video|category)\\/.*", "invalid", 75],
-        [".*radio.foxnews\\.com\\/.*", "invalid", 75],
-        [".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75],
-        [".*zerohedge\\.com\\/(user)\\/.*", "invalid", 75],
-        [".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50],
-        [".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50],
-        [".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50]
-    ]
-}