Utils

2025-04-23 16:26:08 +02:00
parent 8d0697edee
commit 8ea3ec1bda
3 changed files with 9205 additions and 51 deletions
--- a/utils/Ghost-Posts.ipynb
+++ b/utils/Ghost-Posts.ipynb
@@ -0,0 +1,88 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import time\n",
    "import jwt\n",
    "import requests\n",
    "\n",
    "admin_api_url = \"\"\n",
    "admin_api_key = \"\"\n",
    "\n",
    "def _create_jwt(admin_api_key):\n",
    "    \"\"\"Create a short-lived JWT for the Ghost Admin API.\n",
    "\n",
    "    Args:\n",
    "        admin_api_key: Admin API key in the form '<id>:<hex secret>'.\n",
    "\n",
    "    Returns:\n",
    "        The encoded token as a str; its 'exp' claim is 5 minutes after 'iat'.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If the key is missing/empty, is not of the form\n",
    "            '<id>:<secret>', or the secret is not valid hex.\n",
    "    \"\"\"\n",
    "    # os.getenv() returns None for an unset variable: fail with a clear message\n",
    "    # instead of an AttributeError on .split()\n",
    "    if not admin_api_key or admin_api_key.count(':') != 1:\n",
    "        raise ValueError(\"Ghost Admin API key must have the form '<id>:<secret>'\")\n",
    "\n",
    "    id_, secret = admin_api_key.split(':')\n",
    "    iat = int(time.time())\n",
    "    exp = iat + 5 * 60  # 5 minutes\n",
    "    header = {'alg': 'HS256', 'kid': id_}\n",
    "    payload = {\n",
    "        'iat': iat,\n",
    "        'exp': exp,\n",
    "        'aud': '/v5/admin/'  # Adjust depending on your Ghost version\n",
    "    }\n",
    "    # The secret part of the key is hex-encoded; sign with the raw bytes\n",
    "    token = jwt.encode(payload, bytes.fromhex(secret), algorithm='HS256', headers=header)\n",
    "    # PyJWT < 2.0 returns bytes; normalize to str so the token can be\n",
    "    # interpolated into the Authorization header without a b'...' wrapper\n",
    "    if isinstance(token, bytes):\n",
    "        token = token.decode('utf-8')\n",
    "    return token\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Delete every post exposed by the Ghost Admin API, one listing page per pass.\n",
    "# URLs are built with plain string formatting: os.path.join is for filesystem\n",
    "# paths and would insert backslashes on Windows.\n",
    "posts_url = \"{}/posts\".format(admin_api_url.rstrip(\"/\"))\n",
    "\n",
    "while True:\n",
    "    # The token is only valid for 5 minutes, so mint a fresh one on every pass\n",
    "    jwt_token = _create_jwt(os.getenv(\"GHOST_ADMIN_API_KEY\"))\n",
    "\n",
    "    headers = {\n",
    "        'Authorization': f'Ghost {jwt_token}',\n",
    "        'Content-Type': 'application/json'\n",
    "    }\n",
    "\n",
    "    # GET /admin/posts/\n",
    "    response = requests.get(posts_url, headers=headers)\n",
    "    # Surface auth/URL problems here instead of failing later on a missing \"posts\" key\n",
    "    response.raise_for_status()\n",
    "    posts = response.json().get(\"posts\") or []\n",
    "\n",
    "    if not posts:\n",
    "        break\n",
    "\n",
    "    # Becomes True as soon as one DELETE in this pass succeeds\n",
    "    deleted_post = False\n",
    "\n",
    "    # Iterate posts\n",
    "    for p in posts:\n",
    "        # Post ID\n",
    "        post_id = p.get(\"id\")\n",
    "\n",
    "        # DELETE /admin/posts/{id}/\n",
    "        r = requests.delete(\"{}/{}\".format(posts_url, post_id), headers=headers)\n",
    "        print(\"Post:\", post_id, \"Status:\", r.status_code, r.text)\n",
    "        deleted_post = deleted_post or r.ok\n",
    "\n",
    "    # If nothing could be deleted, the next GET would return the same posts\n",
    "    # and the loop would never end: stop instead\n",
    "    if not deleted_post:\n",
    "        print(\"No post could be deleted in this pass; stopping.\")\n",
    "        break"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "matitos_urls",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
--- a/utils/Schools-NL.ipynb
+++ b/utils/Schools-NL.ipynb
@@ -11,6 +11,7 @@
    "from urllib.parse import urljoin\n",
    "import pandas as pd\n",
    "import os\n",
    "import json\n",
    "\n",
    "headers = {\"User-Agent\": \"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36\"}"
   ]
@@ -166,7 +167,53 @@
    "                        school_data[\"email\"] = extract_emails(soup_school)\n",
    "                    except Exception as e:\n",
    "                        pass\n",
-    "                    \n",
+    "\n",
    "                    if (category.lower() == \"basisscholen\"):\n",
    "                        ############################################################################\n",
    "                        # Step 1: Locate the <aantal-leerlingen-leeftijd-bar-chart> tag\n",
    "                        chart_tag = soup.find('aantal-leerlingen-leeftijd-bar-chart', attrs={'aantal-per-leeftijd': True})\n",
    "                        # Step 2: Extract the 'aantal-per-leeftijd' attribute\n",
    "                        raw_data = chart_tag['aantal-per-leeftijd']\n",
    "\n",
    "                        # Step 3: Parse the JSON data\n",
    "                        try:\n",
    "                            data = json.loads(raw_data)\n",
    "                            # Step 4: Print the extracted data\n",
    "                            print(\"Aantal per Leeftijd:\")\n",
    "                            for entry in data:\n",
    "                                age = entry['key']\n",
    "                                num_students = entry['aantal']\n",
    "                                school_data[\"num_students_age_{}\".format(age)] = num_students\n",
    "                                # print(f\"Age {entry['key']}: {entry['aantal']} leerlingen\")\n",
    "                        except json.JSONDecodeError as e:\n",
    "                            print(f\"Failed to parse JSON data: {e}\")\n",
    "\n",
    "                        # Step 1: Locate the <aantal-leerlingen-leerjaar-bar-chart> tag\n",
    "                        chart_tag = soup.find('aantal-leerlingen-leerjaar-bar-chart', attrs={'aantal-per-leerjaar': True})\n",
    "\n",
    "                        ############################################################################\n",
    "                        # Step 1: Locate the <aantal-leerlingen-leerjaar-bar-chart> tag\n",
    "                        chart_tag = soup.find('aantal-leerlingen-leerjaar-bar-chart', attrs={'aantal-per-leerjaar': True})\n",
    "\n",
    "                        if not chart_tag:\n",
    "                            print(\"Could not find the 'aantal per leerjaar' section.\")\n",
    "                        else:\n",
    "                            # Step 2: Extract the 'aantal-per-leerjaar' attribute\n",
    "                            raw_data = chart_tag['aantal-per-leerjaar']\n",
    "                            \n",
    "                            # Step 3: Parse the JSON data\n",
    "                            try:\n",
    "                                data = json.loads(raw_data)\n",
    "                                # Step 4: Print the extracted data\n",
    "                                print(\"Aantal per Leerjaar:\")\n",
    "                                for entry in data:\n",
    "                                    group = entry['key']\n",
    "                                    num_students = entry['aantal']\n",
    "                                    school_data[\"num_students_group_{}\".format(group)] = num_students\n",
    "                                    print(f\"Groep {entry['key']}: {entry['aantal']} leerlingen\")\n",
    "                            except json.JSONDecodeError as e:\n",
    "                                print(f\"Failed to parse JSON data: {e}\")\n",
    "                        ############################################################################\n",
    "                except Exception as e:\n",
    "                    print(school_url, str(e))\n",
    "                    # assert False\n",
@@ -230,7 +277,16 @@
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"scholenopdekaart.csv\", index_col=0)\n",
-    "df.loc[0, \"category_3\"]"
+    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.tail()"
   ]
  },
  {
@@ -242,72 +298,111 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# https://scholenopdekaart.nl/middelbare-scholen/hardenberg/26614/de-ambelt/\n",
    "# From which zip codes the students come\n",
    "# How many kids passed the exams', ...\n",
    "\n",
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "import os\n",
    "\n",
-    "# Step 1: Fetch the webpage\n",
+    "url = \"https://scholenopdekaart.nl/middelbare-scholen/hardenberg/26614/de-ambelt/\"\n",
-    "url = \"https://scholenopdekaart.nl/basisscholen/aagtekerke/25963/jhr-willem-versluijsschool/\"\n",
+    "response = requests.get( os.path.join(url, \"#inhoud\") )\n",
    "headers = {\n",
    "    \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36\"\n",
    "}\n",
    "response = requests.get(url, headers=headers)\n",
    "\n",
    "# Check if the request was successful\n",
    "if response.status_code != 200:\n",
    "    print(f\"Failed to retrieve the page. Status code: {response.status_code}\")\n",
    "    exit()\n",
    "\n",
-    "# Step 2: Parse the HTML content\n",
+    "url = \"https://scholenopdekaart.nl/basisscholen/aalden/9661/christelijke-basisschool-de-schutse/\"\n",
-    "soup = BeautifulSoup(response.text, 'html.parser')"
+    "response = requests.get(url)\n",
    "\n",
    "soup = BeautifulSoup(response.content, 'html.parser')"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
+   "outputs": [],
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Aantal per Leerjaar:\n",
      "Groep 1: 29 leerlingen\n",
      "Groep 2: 28 leerlingen\n",
      "Groep 3: 30 leerlingen\n",
      "Groep 4: 25 leerlingen\n",
      "Groep 5: 19 leerlingen\n",
      "Groep 6: 26 leerlingen\n",
      "Groep 7: 22 leerlingen\n",
      "Groep 8: 20 leerlingen\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "\n",
-    "# Step 1: Locate the <aantal-leerlingen-leerjaar-bar-chart> tag\n",
+    "# Locate the section containing \"Welke profielen volgen de examendeelnemers?\"\n",
-    "chart_tag = soup.find('aantal-leerlingen-leerjaar-bar-chart', attrs={'aantal-per-leerjaar': True})\n",
+    "section_header = soup.find('h3', string=\"Welke profielen volgen de examendeelnemers?\")\n",
    "if not section_header:\n",
    "    raise ValueError(\"Section 'Welke profielen volgen de examendeelnemers?' not found in the HTML.\")\n",
    "\n",
-    "if not chart_tag:\n",
+    "# Navigate to the parent section or subsection\n",
-    "    print(\"Could not find the 'aantal per leerjaar' section.\")\n",
+    "section = section_header.find_parent('section')\n",
    "if not section:\n",
    "    raise ValueError(\"Parent section for 'Welke profielen volgen de examendeelnemers?' not found.\")\n",
    "\n",
    "# Check if the section contains a message indicating no data is available\n",
    "no_data_message = section.find('p', string=\"Deze informatie is voor deze school niet bekend.\")\n",
    "if no_data_message:\n",
    "    print(\"No data available for 'Welke profielen volgen de examendeelnemers?'.\")\n",
    "else:\n",
-    "    # Step 2: Extract the 'aantal-per-leerjaar' attribute\n",
+    "    # Extract the relevant content (e.g., tables, lists, or paragraphs)\n",
-    "    raw_data = chart_tag['aantal-per-leerjaar']\n",
+    "    content = []\n",
-    "    \n",
+    "    for element in section.find_all(['p', 'table', 'ul', 'ol']):\n",
-    "    # Step 3: Parse the JSON data\n",
+    "        if element.name == 'table':\n",
-    "    try:\n",
+    "            # Extract table rows\n",
-    "        data = json.loads(raw_data)\n",
+    "            rows = element.find_all('tr')\n",
-    "        \n",
+    "            for row in rows:\n",
-    "        # Step 4: Print the extracted data\n",
+    "                cells = row.find_all(['th', 'td'])\n",
-    "        print(\"Aantal per Leerjaar:\")\n",
+    "                row_data = [cell.get_text(strip=True) for cell in cells]\n",
-    "        for entry in data:\n",
+    "                content.append(row_data)\n",
-    "            print(f\"Groep {entry['key']}: {entry['aantal']} leerlingen\")\n",
+    "        else:\n",
-    "    except json.JSONDecodeError as e:\n",
+    "            # Extract text from paragraphs, lists, etc.\n",
-    "        print(f\"Failed to parse JSON data: {e}\")"
+    "            content.append(element.get_text(strip=True))\n",
    "\n",
    "    # Print the extracted content\n",
    "    for item in content:\n",
    "        print(item)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extract the rows of the table shown inside the modal dialog.\n",
    "# NOTE(review): relies on `soup` created by an earlier cell - confirm run order.\n",
    "\n",
    "# Locate the dialog containing the table\n",
    "dialog = soup.find('dialog', class_='modal modal-dialog')\n",
    "if dialog is None:\n",
    "    raise ValueError(\"Dialog element not found in the HTML.\")\n",
    "\n",
    "# Locate the table within the dialog\n",
    "table = dialog.find('table')\n",
    "if table is None:\n",
    "    raise ValueError(\"Table element not found within the dialog.\")\n",
    "\n",
    "# Extract table headers (expected: Postcodegebied, Aantal leerlingen, Percentage).\n",
    "# Stored as `column_names` so the HTTP request `headers` dict is not overwritten.\n",
    "column_names = [header.get_text(strip=True) for header in table.find_all('th')]\n",
    "\n",
    "# Extract table rows\n",
    "data = []\n",
    "for row in table.find_all('tr')[1:]:  # Skip the header row\n",
    "    cells = row.find_all('td')\n",
    "    # Keep only non-empty rows that match the header structure\n",
    "    if cells and len(cells) == len(column_names):\n",
    "        # Pair each header with its cell, whatever the number of columns\n",
    "        row_data = {\n",
    "            name: cell.get_text(strip=True)\n",
    "            for name, cell in zip(column_names, cells)\n",
    "        }\n",
    "        data.append(row_data)\n",
    "\n",
    "# Print the extracted data\n",
    "for entry in data:\n",
    "    print(entry)"
   ]
  }
 ],
--- a/utils/scholenopdekaart.csv
+++ b/utils/scholenopdekaart.csv