Return "unknown" instead of "error" for fetched URLs

Luciano Gervasoni
2025-06-19 22:43:29 +02:00
parent a2cce62096
commit 490f01d66c
7 changed files with 227 additions and 9076 deletions

View File

@@ -6,8 +6,9 @@
"metadata": {},
"outputs": [],
"source": [
"# !pip install flask\n",
"!python app.py"
"!uvicorn app:app --workers 1 --log-level info --port 5001\n",
"#!uvicorn app:app --reload --log-level debug --port 8000\n",
"#!python app.py"
]
}
],
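For reference, uvicorn app:app expects a module app.py that exposes an ASGI application object named app. The repo's app.py is not shown in this commit; the sketch below is only a hypothetical illustration of that shape, with FastAPI assumed purely for illustration:

# app.py -- hypothetical minimal ASGI app matching the `uvicorn app:app` target.
# Not the repo's actual app.py; FastAPI is an assumption.
from fastapi import FastAPI

app = FastAPI()

@app.get("/health")
def health():
    # Liveness check so `uvicorn app:app --port 5001` has something to serve
    return {"status": "ok"}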

View File

@@ -2,6 +2,7 @@ from django.core.cache import cache
 from .logger import get_logger
 logger = get_logger()
 import newspaper
+import requests
 import time
 import os
 from urllib.parse import unquote
@@ -39,7 +40,7 @@ def url_host_slowdown(url, url_host_slowdown_seconds):
cache.set("process_{}".format(url_host).encode("utf-8"), time.time(), timeout=60*5) # Expire after 5 minutes
def process_url(url, paywall_bypass=False):
if (paywall_bypass):
# TODO: Implement self-hosted instance
url_paywall_bypass_base = "https://marreta.pcdomanual.com"
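Only the final cache.set of url_host_slowdown appears in the hunk above. For context, a hypothetical reconstruction of the per-host throttle it implies; the body below is an assumption, not code from this commit, and it must run inside the Django app since django.core.cache needs configured settings:

import time
from urllib.parse import urlparse
from django.core.cache import cache

def url_host_slowdown(url, url_host_slowdown_seconds):
    # Hypothetical sketch: wait until at least url_host_slowdown_seconds have
    # elapsed since the last recorded fetch for this host, then record now.
    url_host = urlparse(url).netloc  # assumption: host extracted from the URL
    key = "process_{}".format(url_host).encode("utf-8")
    last_fetch = cache.get(key)
    if (last_fetch is not None):
        remaining = url_host_slowdown_seconds - (time.time() - last_fetch)
        if (remaining > 0):
            time.sleep(remaining)
    cache.set(key, time.time(), timeout=60*5)  # Expire after 5 minutes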
@@ -57,22 +58,46 @@ def process_url(url, paywall_bypass=False):
logger.warning("ArticleException for input URL {}".format(url))
return {"override_status": "invalid"}
except newspaper.ArticleException as e:
# Too many requests or blocked for some reason
if ("Status code 403" in str(e.args)):
# TODO: cool down and retry once?, proxy/VPN, ...
logger.debug("TODO: process_url Implement code 403")
# Not found, either it doesn't exist or getting blocked...
if ("Status code 404" in str(e.args)):
# TODO: cool down and retry once?, proxy/VPN, ...
logger.debug("TODO: process_url Implement code 404")
# Too many requests? Cool down...
if ("Status code 429" in str(e.args)):
# TODO: cool down and retry once?, proxy/VPN, ...
logger.debug("TODO: process_url Implement code 429")
# Unavailable for legal reasons
if ("Status code 451" in str(e.args)):
# TODO: Bypass with VPN
logger.debug("TODO: process_url Implement code 451")
# CloudFlare protection?
if ("Website protected with Cloudflare" in str(e.args)):
logger.debug("TODO: process_url Implement bypass CloudFlare")
# PerimeterX protection?
if ("Website protected with PerimeterX" in str(e.args)):
logger.debug("TODO: process_url Implement bypass PerimeterX")
logger.debug("ArticleException for input URL {}\n{}".format(url, str(e.args)))
# Try simple request, valid response but couldn't parse article? e.g. getting blocked? -> unknown
time.sleep(0.25)
r = requests.get(url_of_interest)
if (r.status_code == 200):
return {"override_status": "unknown"}
else:
# Another status code still... "error" or "unknown"
return {"override_status": "unknown"}
return None
except Exception as e:
logger.warning("Exception for input URL {}\n{}".format(url, str(e)))

View File

@@ -1,65 +0,0 @@
{
    "SEARCH": {
        "rss_feed": [
        ],
        "url_host": [
            "johnpilger.com",
            "lapenseeecologique.com",
            "partage-le.com",
            "reflets.info",
            "rezo.net",
            "consortiumnews.com",
            "disclose.ngo/fr",
            "energieetenvironnement.com",
            "global-climat.com",
            "slashdot.org",
            "lesamisdebartleby.wordpress.com",
            "lundi.am",
            "lvsl.fr",
            "moderndiplomacy.eu",
            "mrmondialisation.org",
            "ourfiniteworld.com",
            "southfront.org",
            "simplicius76.substack.com",
            "smoothiex12.blogspot.com",
            "theintercept.com",
            "wikileaks.org",
            "contretemps.eu",
            "indianpunchline.com",
            "investigaction.net/fr",
            "notechmagazine.com",
            "terrestres.org",
            "truthdig.com",
            "tass.com",
            "bastamag.net",
            "counterpunch.org",
            "energy-daily.com",
            "fakirpresse.info",
            "geopoliticalmonitor.com",
            "huffingtonpost.fr",
            "legrandsoir.info",
            "les-crises.fr",
            "liberation.fr",
            "maitre-eolas.fr",
            "marianne.net",
            "mediapart.fr",
            "metaefficient.com",
            "monde-diplomatique.fr",
            "paulcraigroberts.org",
            "politis.fr",
            "reporterre.net",
            "rue89.com",
            "theguardian.com/international",
            "treehugger.com",
            "unz.com",
            "voltairenet.org",
            "wsws.org"
        ],
        "keyword_search": [
            "society collapse"
        ]
    },
    "REGEX_PATTERN_STATUS_PRIORITY": [
        [".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50]
    ]
}

View File

@@ -1,34 +0,0 @@
{
    "SEARCH": {
        "rss_feed": [
            "https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC",
            "https://feeds.feedburner.com/breitbart",
            "https://feeds.feedburner.com/zerohedge/feed",
            "https://moxie.foxnews.com/google-publisher/latest.xml",
            "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=15837362",
            "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100727362"
        ],
        "url_host": [
            "missingkids.org/poster",
            "missingkids.org/new-poster",
            "breitbart.com",
            "zerohedge.com",
            "foxnews.com",
            "cnbc.com"
        ],
        "keyword_search": [
            "child abuse"
        ]
    },
    "REGEX_PATTERN_STATUS_PRIORITY": [
        [".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50],
        [".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75],
        [".*foxnews\\.com\\/(video|category)\\/.*", "invalid", 75],
        [".*radio.foxnews\\.com\\/.*", "invalid", 75],
        [".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75],
        [".*zerohedge\\.com\\/(user)\\/.*", "invalid", 75],
        [".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50],
        [".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50],
        [".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50]
    ]
}
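Both deleted configs share the REGEX_PATTERN_STATUS_PRIORITY shape: [pattern, status, priority] triples. The consumer of these rules is not shown in this commit; a plausible reading is that the highest-priority matching pattern decides the status, as in this hypothetical sketch:

import re

def status_for_url(url, rules):
    # Hypothetical consumer: among matching patterns, highest priority wins.
    best_status, best_priority = None, float("-inf")
    for pattern, status, priority in rules:
        if (re.match(pattern, url) and priority > best_priority):
            best_status, best_priority = status, priority
    return best_status

rules = [
    [".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50],
    [".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75],
]
print(status_for_url("https://www.cnbc.com/video/2025/06/19/clip.html", rules))  # -> invalid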

View File

@@ -12,8 +12,8 @@
"import requests\n",
"from datetime import datetime, timedelta, timezone\n",
"\n",
"admin_api_url = \"\"\n",
"admin_api_key = \"\"\n",
"admin_api_url = \"\" # .env -> GHOST_ADMIN_API_URL\n",
"admin_api_key = \"\" # .env -> GHOST_ADMIN_API_KEY\n",
"\n",
"def _create_jwt(admin_api_key):\n",
" id_, secret = admin_api_key.split(':')\n",

View File

@@ -8,11 +8,206 @@
"source": [
"url = \"https://onlinenewspapers.com/index.shtml\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'newspaper/0.9.3.1'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\"\"\"\n",
"import newspaper\n",
"\n",
"newspaper.Config().__dict__\n",
"\n",
" 'requests_params': {'timeout': 7,\n",
" 'proxies': {},\n",
" 'headers': {'User-Agent': 'newspaper/0.9.3.1'}},\n",
"\"\"\"\n",
"import newspaper\n",
"newspaper.Config().browser_user_agent"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
" url (str): The url of the source (news website) to build. For example,\n",
" `https://www.cnn.com`.\n",
" dry (bool): If true, the source object will be constructed but not\n",
" downloaded or parsed.\n",
" only_homepage (bool): If true, the source object will only parse\n",
" the homepage of the source.\n",
" only_in_path (bool): If true, the source object will only\n",
" parse the articles that are in the same path as the source's\n",
" homepage. You can scrape a specific category this way.\n",
" Defaults to False.\n",
" input_html (str): The HTML of the source to parse. Use this to pass cached\n",
" HTML to the source object.\n",
" config (Configuration): A configuration object to use for the source.\n",
" kwargs: Any other keyword arguments to pass to the Source constructor.\n",
" If you omit the config object, you can add any configuration\n",
" options here.\n",
"\"\"\"\n",
"\n",
"url = \"https://www.lanacion.com.ar/deportes/\"\n",
"\n",
"newspaper_built = newspaper.build(url, only_in_path=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"newspaper_built.__dict__"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"newspaper_built.article_urls()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n",
"url = \"https://www.lanacion.com.ar/\"\n",
"#url = \"https://www.lanacion.com.ar/deportes/\"\n",
"newspaper_built = newspaper.build(url)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
" url (str): The url of the source (news website) to build. For example,\n",
" `https://www.cnn.com`.\n",
" dry (bool): If true, the source object will be constructed but not\n",
" downloaded or parsed.\n",
" only_homepage (bool): If true, the source object will only parse\n",
" the homepage of the source.\n",
" only_in_path (bool): If true, the source object will only\n",
" parse the articles that are in the same path as the source's\n",
" homepage. You can scrape a specific category this way.\n",
" Defaults to False.\n",
" input_html (str): The HTML of the source to parse. Use this to pass cached\n",
" HTML to the source object.\n",
" config (Configuration): A configuration object to use for the source.\n",
" kwargs: Any other keyword arguments to pass to the Source constructor.\n",
" If you omit the config object, you can add any configuration\n",
" options here.\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cat = newspaper_built.categories[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"newspaper_built.categories_to_articles()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"newspaper_built.category_urls()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
" 'https://www.lanacion.com.ar/tema/futbol-argentino-tid57505/',\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"categories = newspaper_built.category_urls()\n",
"url_of_interest = \"https://www.lanacion.com.ar/sabado/todo-para-ellos-nid21042025/\"\n",
"\n",
"potential_categories = []\n",
"\n",
"for c in categories:\n",
" if (c in url_of_interest):\n",
" print(c, url_of_interest)\n",
" potential_categories.append(c)\n",
"\n",
"# Get longest length category"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"newspaper_built.article_urls()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "matitos_urls",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python"
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,

File diff suppressed because it is too large