Use "unknown" instead of "error" status for fetched URLs

2025-06-19 22:43:29 +02:00
parent a2cce62096
commit 490f01d66c
7 changed files with 227 additions and 9076 deletions
--- a/app_cv/Server.ipynb
+++ b/app_cv/Server.ipynb
@@ -6,8 +6,9 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "# !pip install flask\n",
+    "!uvicorn app:app --workers 1 --log-level info --port 5001\n",
-    "!python app.py"
+    "#!uvicorn app:app --reload --log-level debug --port 8000\n",
    "#!python app.py"
   ]
  }
 ],
--- a/app_urls/fetcher/src/fetch_utils_url_processor.py
+++ b/app_urls/fetcher/src/fetch_utils_url_processor.py
@@ -2,6 +2,7 @@ from django.core.cache import cache
 from .logger import get_logger
 logger = get_logger()
 import newspaper
 import requests
 import time
 import os
 from urllib.parse import unquote
@@ -39,7 +40,7 @@ def url_host_slowdown(url, url_host_slowdown_seconds):
    cache.set("process_{}".format(url_host).encode("utf-8"), time.time(), timeout=60*5) # Expire after 5 minutes
 def process_url(url, paywall_bypass=False):
-    
+
    if (paywall_bypass):
        # TODO: Implement self-hosted instance
        url_paywall_bypass_base = "https://marreta.pcdomanual.com"
@@ -57,22 +58,46 @@ def process_url(url, paywall_bypass=False):
        logger.warning("ArticleException for input URL {}".format(url))
        return {"override_status": "invalid"}
    except newspaper.ArticleException as e:
        # Too many requests or blocked for some reason
        if ("Status code 403" in str(e.args)):
            # TODO: cool down and retry once?, proxy/VPN, ...
            logger.debug("TODO: process_url Implement code 403")
        # Not found, either it doesn't exist or getting blocked...
        if ("Status code 404" in str(e.args)):
            # TODO: cool down and retry once?, proxy/VPN, ...
            logger.debug("TODO: process_url Implement code 404")
        # Too many requests? Cool down...
        if ("Status code 429" in str(e.args)):
            # TODO: cool down and retry once?, proxy/VPN, ...
            logger.debug("TODO: process_url Implement code 429")
        # Unavailable for legal reasons
        if ("Status code 451" in str(e.args)):
            # TODO: Bypass with VPN
            logger.debug("TODO: process_url Implement code 451")
        # CloudFlare protection?
        if ("Website protected with Cloudflare" in str(e.args)):
            logger.debug("TODO: process_url Implement bypass CloudFlare")
        # PerimeterX protection?
        if ("Website protected with PerimeterX" in str(e.args)):
            logger.debug("TODO: process_url Implement bypass PerimeterX")
        logger.debug("ArticleException for input URL {}\n{}".format(url, str(e.args)))
        # Fall back to a plain GET request: if the page responds but the article could not be parsed (e.g. we are being blocked), report "unknown"
        time.sleep(0.25)
        r = requests.get(url_of_interest)
        if (r.status_code == 200):
            return {"override_status": "unknown"}
        else:
            # Non-200 status code: currently also reported as "unknown" (same as the 200 branch); TODO decide whether this should be "error"
            return {"override_status": "unknown"}
        return None
    except Exception as e:
        logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
--- a/app_urls/init_data_fr.json
+++ b/app_urls/init_data_fr.json
@@ -1,65 +0,0 @@
 {
    "SEARCH": {
        "rss_feed": [
        ],
        "url_host": [
            "johnpilger.com",
            "lapenseeecologique.com",
            "partage-le.com",
            "reflets.info",
            "rezo.net",
            "consortiumnews.com",
            "disclose.ngo/fr",
            "energieetenvironnement.com",
            "global-climat.com",
            "slashdot.org",
            "lesamisdebartleby.wordpress.com",
            "lundi.am",
            "lvsl.fr",
            "moderndiplomacy.eu",
            "mrmondialisation.org",
            "ourfiniteworld.com",
            "southfront.org",
            "simplicius76.substack.com",
            "smoothiex12.blogspot.com",
            "theintercept.com",
            "wikileaks.org",
            "contretemps.eu",
            "indianpunchline.com",
            "investigaction.net/fr",
            "notechmagazine.com",
            "terrestres.org",
            "truthdig.com",
            "tass.com",
            "bastamag.net",
            "counterpunch.org",
            "energy-daily.com",
            "fakirpresse.info",
            "geopoliticalmonitor.com",
            "huffingtonpost.fr",
            "legrandsoir.info",
            "les-crises.fr",
            "liberation.fr",
            "maitre-eolas.fr",
            "marianne.net",
            "mediapart.fr",
            "metaefficient.com",
            "monde-diplomatique.fr",
            "paulcraigroberts.org",
            "politis.fr",
            "reporterre.net",
            "rue89.com",
            "theguardian.com/international",
            "treehugger.com",
            "unz.com",
            "voltairenet.org",
            "wsws.org"
        ],  
        "keyword_search": [
            "society collapse"
        ]
    },
    "REGEX_PATTERN_STATUS_PRIORITY": [
        [".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50]
    ]
 }
--- a/app_urls/init_data_sca.json
+++ b/app_urls/init_data_sca.json
@@ -1,34 +0,0 @@
 {
    "SEARCH": {
        "rss_feed": [
            "https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC",
            "https://feeds.feedburner.com/breitbart",
            "https://feeds.feedburner.com/zerohedge/feed",
            "https://moxie.foxnews.com/google-publisher/latest.xml",
            "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=15837362",
            "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100727362"
        ],
        "url_host": [
            "missingkids.org/poster",
            "missingkids.org/new-poster",
            "breitbart.com",
            "zerohedge.com",
            "foxnews.com",
            "cnbc.com"
        ],
        "keyword_search": [
            "child abuse"
        ]
    },
    "REGEX_PATTERN_STATUS_PRIORITY": [
        [".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50],
        [".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75],
        [".*foxnews\\.com\\/(video|category)\\/.*", "invalid", 75],
        [".*radio.foxnews\\.com\\/.*", "invalid", 75],
        [".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75],
        [".*zerohedge\\.com\\/(user)\\/.*", "invalid", 75],
        [".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50],
        [".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50],
        [".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50]
    ]
 }
--- a/utils/Ghost-Posts.ipynb
+++ b/utils/Ghost-Posts.ipynb
@@ -12,8 +12,8 @@
    "import requests\n",
    "from datetime import datetime, timedelta, timezone\n",
    "\n",
-    "admin_api_url = \"\"\n",
+    "admin_api_url = \"\" # .env -> GHOST_ADMIN_API_URL\n",
-    "admin_api_key = \"\"\n",
+    "admin_api_key = \"\" # .env -> GHOST_ADMIN_API_KEY\n",
    "\n",
    "def _create_jwt(admin_api_key):\n",
    "    id_, secret = admin_api_key.split(':')\n",
--- a/utils/Newspapers.ipynb
+++ b/utils/Newspapers.ipynb
@@ -8,11 +8,206 @@
   "source": [
    "url = \"https://onlinenewspapers.com/index.shtml\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'newspaper/0.9.3.1'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"\"\"\n",
    "import newspaper\n",
    "\n",
    "newspaper.Config().__dict__\n",
    "\n",
    " 'requests_params': {'timeout': 7,\n",
    "  'proxies': {},\n",
    "  'headers': {'User-Agent': 'newspaper/0.9.3.1'}},\n",
    "\"\"\"\n",
    "import newspaper\n",
    "newspaper.Config().browser_user_agent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "        url (str): The url of the source (news website) to build. For example,\n",
    "            `https://www.cnn.com`.\n",
    "        dry (bool): If true, the source object will be constructed but not\n",
    "            downloaded or parsed.\n",
    "        only_homepage (bool): If true, the source object will only parse\n",
    "            the homepage of the source.\n",
    "        only_in_path (bool): If true, the source object will only\n",
    "            parse the articles that are in the same path as the source's\n",
    "            homepage. You can scrape a specific category this way.\n",
    "            Defaults to False.\n",
    "        input_html (str): The HTML of the source to parse. Use this to pass cached\n",
    "            HTML to the source object.\n",
    "        config (Configuration): A configuration object to use for the source.\n",
    "        kwargs: Any other keyword arguments to pass to the Source constructor.\n",
    "            If you omit the config object, you can add any configuration\n",
    "            options here.\n",
    "\"\"\"\n",
    "\n",
    "url = \"https://www.lanacion.com.ar/deportes/\"\n",
    "\n",
    "newspaper_built = newspaper.build(url, only_in_path=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "newspaper_built.__dict__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "newspaper_built.article_urls()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "url = \"https://www.lanacion.com.ar/\"\n",
    "#url = \"https://www.lanacion.com.ar/deportes/\"\n",
    "newspaper_built = newspaper.build(url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "        url (str): The url of the source (news website) to build. For example,\n",
    "            `https://www.cnn.com`.\n",
    "        dry (bool): If true, the source object will be constructed but not\n",
    "            downloaded or parsed.\n",
    "        only_homepage (bool): If true, the source object will only parse\n",
    "            the homepage of the source.\n",
    "        only_in_path (bool): If true, the source object will only\n",
    "            parse the articles that are in the same path as the source's\n",
    "            homepage. You can scrape a specific category this way.\n",
    "            Defaults to False.\n",
    "        input_html (str): The HTML of the source to parse. Use this to pass cached\n",
    "            HTML to the source object.\n",
    "        config (Configuration): A configuration object to use for the source.\n",
    "        kwargs: Any other keyword arguments to pass to the Source constructor.\n",
    "            If you omit the config object, you can add any configuration\n",
    "            options here.\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cat = newspaper_built.categories[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "newspaper_built.categories_to_articles()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "newspaper_built.category_urls()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    " 'https://www.lanacion.com.ar/tema/futbol-argentino-tid57505/',\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "categories = newspaper_built.category_urls()\n",
    "url_of_interest = \"https://www.lanacion.com.ar/sabado/todo-para-ellos-nid21042025/\"\n",
    "\n",
    "potential_categories = []\n",
    "\n",
    "for c in categories:\n",
    "    if (c in url_of_interest):\n",
    "        print(c, url_of_interest)\n",
    "        potential_categories.append(c)\n",
    "\n",
    "# Get longest length category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "newspaper_built.article_urls()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "matitos_urls",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
-   "name": "python"
+   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
--- a/utils/scholenopdekaart.csv
+++ b/utils/scholenopdekaart.csv