Unknown instead of error for fetched urls
@@ -12,8 +12,8 @@
"import requests\n",
"from datetime import datetime, timedelta, timezone\n",
"\n",
"admin_api_url = \"\"\n",
"admin_api_key = \"\"\n",
"admin_api_url = \"\" # .env -> GHOST_ADMIN_API_URL\n",
"admin_api_key = \"\" # .env -> GHOST_ADMIN_API_KEY\n",
"\n",
"def _create_jwt(admin_api_key):\n",
" id_, secret = admin_api_key.split(':')\n",
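The `_create_jwt` helper is cut off by the hunk boundary above. For context, here is a minimal sketch of how a Ghost Admin API token is typically minted from an `id:secret` key with PyJWT; the five-minute expiry, `/admin/` audience, and hex-decoded secret follow Ghost's documented token format and are assumptions here, not code taken from this commit.

```python
# Hypothetical sketch, not the code from this commit: mint a short-lived
# Ghost Admin API JWT from an "id:secret" admin key using PyJWT.
from datetime import datetime, timedelta, timezone

import jwt  # PyJWT


def _create_jwt(admin_api_key: str) -> str:
    id_, secret = admin_api_key.split(":")
    iat = datetime.now(timezone.utc)
    payload = {
        "iat": iat,                         # issued at
        "exp": iat + timedelta(minutes=5),  # Ghost tokens are short-lived
        "aud": "/admin/",                   # Admin API audience
    }
    # The secret is hex-encoded; the key id goes into the token header.
    return jwt.encode(
        payload,
        bytes.fromhex(secret),
        algorithm="HS256",
        headers={"kid": id_},
    )
```

Per Ghost's docs, the resulting token is then sent as an `Authorization: Ghost <token>` header on Admin API requests.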
@@ -8,11 +8,206 @@
"source": [
"url = \"https://onlinenewspapers.com/index.shtml\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'newspaper/0.9.3.1'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\"\"\"\n",
"import newspaper\n",
"\n",
"newspaper.Config().__dict__\n",
"\n",
" 'requests_params': {'timeout': 7,\n",
" 'proxies': {},\n",
" 'headers': {'User-Agent': 'newspaper/0.9.3.1'}},\n",
"\"\"\"\n",
"import newspaper\n",
"newspaper.Config().browser_user_agent"
]
},
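The cell above shows that the default `browser_user_agent` identifies the scraper as `newspaper/0.9.3.1`, which some sites reject. A short sketch of swapping in a browser-like User-Agent through a `Configuration` object (assumed typical usage of the `config` parameter described in the docstring below, not code from this commit):

```python
# Sketch (assumption, not from this diff): replace the default
# "newspaper/x.y.z" User-Agent with a browser-like one before building.
import newspaper

config = newspaper.Config()
config.browser_user_agent = (
    "Mozilla/5.0 (X11; Linux x86_64; rv:124.0) Gecko/20100101 Firefox/124.0"
)

source = newspaper.build("https://www.lanacion.com.ar/", config=config)
```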
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
" url (str): The url of the source (news website) to build. For example,\n",
" `https://www.cnn.com`.\n",
" dry (bool): If true, the source object will be constructed but not\n",
" downloaded or parsed.\n",
" only_homepage (bool): If true, the source object will only parse\n",
" the homepage of the source.\n",
" only_in_path (bool): If true, the source object will only\n",
" parse the articles that are in the same path as the source's\n",
" homepage. You can scrape a specific category this way.\n",
" Defaults to False.\n",
" input_html (str): The HTML of the source to parse. Use this to pass cached\n",
" HTML to the source object.\n",
" config (Configuration): A configuration object to use for the source.\n",
" kwargs: Any other keyword arguments to pass to the Source constructor.\n",
" If you omit the config object, you can add any configuration\n",
" options here.\n",
"\"\"\"\n",
"\n",
"url = \"https://www.lanacion.com.ar/deportes/\"\n",
"\n",
"newspaper_built = newspaper.build(url, only_in_path=True)"
]
},
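The docstring above lists the `newspaper.build` keyword arguments; `only_in_path=True` restricts crawling to the `/deportes/` section. A sketch of the usual follow-up, assumed typical usage rather than code from this diff:

```python
# Sketch: build a category-restricted source, then download and parse a
# handful of the discovered articles.
import newspaper

section_url = "https://www.lanacion.com.ar/deportes/"  # example section
source = newspaper.build(section_url, only_in_path=True)

for article_url in source.article_urls()[:5]:
    article = newspaper.Article(article_url)
    article.download()
    article.parse()
    print(article.publish_date, article.title)
```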
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"newspaper_built.__dict__"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"newspaper_built.article_urls()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n",
"url = \"https://www.lanacion.com.ar/\"\n",
"#url = \"https://www.lanacion.com.ar/deportes/\"\n",
"newspaper_built = newspaper.build(url)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
" url (str): The url of the source (news website) to build. For example,\n",
" `https://www.cnn.com`.\n",
" dry (bool): If true, the source object will be constructed but not\n",
" downloaded or parsed.\n",
" only_homepage (bool): If true, the source object will only parse\n",
" the homepage of the source.\n",
" only_in_path (bool): If true, the source object will only\n",
" parse the articles that are in the same path as the source's\n",
" homepage. You can scrape a specific category this way.\n",
" Defaults to False.\n",
" input_html (str): The HTML of the source to parse. Use this to pass cached\n",
" HTML to the source object.\n",
" config (Configuration): A configuration object to use for the source.\n",
" kwargs: Any other keyword arguments to pass to the Source constructor.\n",
" If you omit the config object, you can add any configuration\n",
" options here.\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cat = newspaper_built.categories[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"newspaper_built.categories_to_articles()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"newspaper_built.category_urls()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
" 'https://www.lanacion.com.ar/tema/futbol-argentino-tid57505/',\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"categories = newspaper_built.category_urls()\n",
"url_of_interest = \"https://www.lanacion.com.ar/sabado/todo-para-ellos-nid21042025/\"\n",
"\n",
"potential_categories = []\n",
"\n",
"for c in categories:\n",
" if (c in url_of_interest):\n",
" print(c, url_of_interest)\n",
" potential_categories.append(c)\n",
"\n",
"# Get longest length category"
]
},
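The cell above collects every category URL that occurs in `url_of_interest` and stops at the comment `# Get longest length category`. A sketch of how that selection might be completed, with an `"Unknown"` fallback instead of an error when nothing matches; the fallback mirrors the commit title and is an assumption, not code taken from this diff:

```python
# Assumed completion of "# Get longest length category": prefer the most
# specific (longest) matching category URL, and fall back to "Unknown"
# rather than raising when no category matches.
if potential_categories:
    best_category = max(potential_categories, key=len)
else:
    best_category = "Unknown"

print(best_category)
```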
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"newspaper_built.article_urls()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "matitos_urls",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python"
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
File diff suppressed because it is too large