Utils
This commit is contained in:
88
utils/Ghost-Posts.ipynb
Normal file
88
utils/Ghost-Posts.ipynb
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import os\n",
|
||||||
|
"import time\n",
|
||||||
|
"import jwt\n",
|
||||||
|
"import requests\n",
|
||||||
|
"\n",
|
||||||
|
"admin_api_url = \"\"\n",
|
||||||
|
"admin_api_key = \"\"\n",
|
||||||
|
"\n",
|
||||||
|
"def _create_jwt(admin_api_key):\n",
|
||||||
|
" id_, secret = admin_api_key.split(':')\n",
|
||||||
|
" iat = int(time.time())\n",
|
||||||
|
" exp = iat + 5 * 60 # 5 minutes\n",
|
||||||
|
" header = {'alg': 'HS256', 'kid': id_}\n",
|
||||||
|
" payload = {\n",
|
||||||
|
" 'iat': iat,\n",
|
||||||
|
" 'exp': exp,\n",
|
||||||
|
" 'aud': '/v5/admin/' # Adjust depending on your Ghost version\n",
|
||||||
|
" }\n",
|
||||||
|
" token = jwt.encode(payload, bytes.fromhex(secret), algorithm='HS256', headers=header)\n",
|
||||||
|
" return token\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Get token\n",
|
||||||
|
"jwt_token = _create_jwt(os.getenv(\"GHOST_ADMIN_API_KEY\"))\n",
|
||||||
|
"\n",
|
||||||
|
"headers = {\n",
|
||||||
|
" 'Authorization': f'Ghost {jwt_token}',\n",
|
||||||
|
" 'Content-Type': 'application/json'\n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"deleted_post = True\n",
|
||||||
|
"\n",
|
||||||
|
"while (deleted_post):\n",
|
||||||
|
" # GET /admin/posts/\n",
|
||||||
|
" response = requests.get(os.path.join(admin_api_url, \"posts\"), headers=headers)\n",
|
||||||
|
" dict_response = response.json()\n",
|
||||||
|
"\n",
|
||||||
|
" if (len(dict_response.get(\"posts\")) == 0):\n",
|
||||||
|
" deleted_post = False\n",
|
||||||
|
" break\n",
|
||||||
|
"\n",
|
||||||
|
" # Iterate posts\n",
|
||||||
|
" for p in dict_response.get(\"posts\"):\n",
|
||||||
|
" # Post ID\n",
|
||||||
|
" post_id = p.get(\"id\")\n",
|
||||||
|
"\n",
|
||||||
|
" # DELETE /admin/posts/{id}/\n",
|
||||||
|
" r = requests.delete(os.path.join(admin_api_url, \"posts\", \"{}\".format(post_id)), headers=headers)\n",
|
||||||
|
" print(\"Post:\", post_id, \"Status:\", r.status_code, r.text)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "matitos_urls",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.12.9"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
@@ -11,6 +11,7 @@
|
|||||||
"from urllib.parse import urljoin\n",
|
"from urllib.parse import urljoin\n",
|
||||||
"import pandas as pd\n",
|
"import pandas as pd\n",
|
||||||
"import os\n",
|
"import os\n",
|
||||||
|
"import json\n",
|
||||||
"\n",
|
"\n",
|
||||||
"headers = {\"User-Agent\": \"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36\"}"
|
"headers = {\"User-Agent\": \"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36\"}"
|
||||||
]
|
]
|
||||||
@@ -166,7 +167,53 @@
|
|||||||
" school_data[\"email\"] = extract_emails(soup_school)\n",
|
" school_data[\"email\"] = extract_emails(soup_school)\n",
|
||||||
" except Exception as e:\n",
|
" except Exception as e:\n",
|
||||||
" pass\n",
|
" pass\n",
|
||||||
" \n",
|
"\n",
|
||||||
|
" if (category.lower() == \"basisscholen\"):\n",
|
||||||
|
" ############################################################################\n",
|
||||||
|
" # Step 1: Locate the <aantal-leerlingen-leeftijd-bar-chart> tag\n",
|
||||||
|
" chart_tag = soup.find('aantal-leerlingen-leeftijd-bar-chart', attrs={'aantal-per-leeftijd': True})\n",
|
||||||
|
" # Step 2: Extract the 'aantal-per-leeftijd' attribute\n",
|
||||||
|
" raw_data = chart_tag['aantal-per-leeftijd']\n",
|
||||||
|
"\n",
|
||||||
|
" # Step 3: Parse the JSON data\n",
|
||||||
|
" try:\n",
|
||||||
|
" data = json.loads(raw_data)\n",
|
||||||
|
" # Step 4: Print the extracted data\n",
|
||||||
|
" print(\"Aantal per Leeftijd:\")\n",
|
||||||
|
" for entry in data:\n",
|
||||||
|
" age = entry['key']\n",
|
||||||
|
" num_students = entry['aantal']\n",
|
||||||
|
" school_data[\"num_students_age_{}\".format(age)] = num_students\n",
|
||||||
|
" # print(f\"Age {entry['key']}: {entry['aantal']} leerlingen\")\n",
|
||||||
|
" except json.JSONDecodeError as e:\n",
|
||||||
|
" print(f\"Failed to parse JSON data: {e}\")\n",
|
||||||
|
"\n",
|
||||||
|
" # Step 1: Locate the <aantal-leerlingen-leerjaar-bar-chart> tag\n",
|
||||||
|
" chart_tag = soup.find('aantal-leerlingen-leerjaar-bar-chart', attrs={'aantal-per-leerjaar': True})\n",
|
||||||
|
"\n",
|
||||||
|
" ############################################################################\n",
|
||||||
|
" # Step 1: Locate the <aantal-leerlingen-leerjaar-bar-chart> tag\n",
|
||||||
|
" chart_tag = soup.find('aantal-leerlingen-leerjaar-bar-chart', attrs={'aantal-per-leerjaar': True})\n",
|
||||||
|
"\n",
|
||||||
|
" if not chart_tag:\n",
|
||||||
|
" print(\"Could not find the 'aantal per leerjaar' section.\")\n",
|
||||||
|
" else:\n",
|
||||||
|
" # Step 2: Extract the 'aantal-per-leerjaar' attribute\n",
|
||||||
|
" raw_data = chart_tag['aantal-per-leerjaar']\n",
|
||||||
|
" \n",
|
||||||
|
" # Step 3: Parse the JSON data\n",
|
||||||
|
" try:\n",
|
||||||
|
" data = json.loads(raw_data)\n",
|
||||||
|
" # Step 4: Print the extracted data\n",
|
||||||
|
" print(\"Aantal per Leerjaar:\")\n",
|
||||||
|
" for entry in data:\n",
|
||||||
|
" group = entry['key']\n",
|
||||||
|
" num_students = entry['aantal']\n",
|
||||||
|
" school_data[\"num_students_group_{}\".format(group)] = num_students\n",
|
||||||
|
" print(f\"Groep {entry['key']}: {entry['aantal']} leerlingen\")\n",
|
||||||
|
" except json.JSONDecodeError as e:\n",
|
||||||
|
" print(f\"Failed to parse JSON data: {e}\")\n",
|
||||||
|
" ############################################################################\n",
|
||||||
" except Exception as e:\n",
|
" except Exception as e:\n",
|
||||||
" print(school_url, str(e))\n",
|
" print(school_url, str(e))\n",
|
||||||
" # assert False\n",
|
" # assert False\n",
|
||||||
@@ -230,7 +277,16 @@
|
|||||||
"import pandas as pd\n",
|
"import pandas as pd\n",
|
||||||
"\n",
|
"\n",
|
||||||
"df = pd.read_csv(\"scholenopdekaart.csv\", index_col=0)\n",
|
"df = pd.read_csv(\"scholenopdekaart.csv\", index_col=0)\n",
|
||||||
"df.loc[0, \"category_3\"]"
|
"df.head()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"df.tail()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -242,72 +298,111 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
"# https://scholenopdekaart.nl/middelbare-scholen/hardenberg/26614/de-ambelt/\n",
|
||||||
|
"# From which zip codes the students come\n",
|
||||||
|
"# How many kids passed the exams', ...\n",
|
||||||
|
"\n",
|
||||||
"import requests\n",
|
"import requests\n",
|
||||||
"from bs4 import BeautifulSoup\n",
|
"from bs4 import BeautifulSoup\n",
|
||||||
|
"import os\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Step 1: Fetch the webpage\n",
|
"url = \"https://scholenopdekaart.nl/middelbare-scholen/hardenberg/26614/de-ambelt/\"\n",
|
||||||
"url = \"https://scholenopdekaart.nl/basisscholen/aagtekerke/25963/jhr-willem-versluijsschool/\"\n",
|
"response = requests.get( os.path.join(url, \"#inhoud\") )\n",
|
||||||
"headers = {\n",
|
|
||||||
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36\"\n",
|
|
||||||
"}\n",
|
|
||||||
"response = requests.get(url, headers=headers)\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"# Check if the request was successful\n",
|
|
||||||
"if response.status_code != 200:\n",
|
|
||||||
" print(f\"Failed to retrieve the page. Status code: {response.status_code}\")\n",
|
|
||||||
" exit()\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"# Step 2: Parse the HTML content\n",
|
"url = \"https://scholenopdekaart.nl/basisscholen/aalden/9661/christelijke-basisschool-de-schutse/\"\n",
|
||||||
"soup = BeautifulSoup(response.text, 'html.parser')"
|
"response = requests.get(url)\n",
|
||||||
|
"\n",
|
||||||
|
"soup = BeautifulSoup(response.content, 'html.parser')"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 4,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Aantal per Leerjaar:\n",
|
|
||||||
"Groep 1: 29 leerlingen\n",
|
|
||||||
"Groep 2: 28 leerlingen\n",
|
|
||||||
"Groep 3: 30 leerlingen\n",
|
|
||||||
"Groep 4: 25 leerlingen\n",
|
|
||||||
"Groep 5: 19 leerlingen\n",
|
|
||||||
"Groep 6: 26 leerlingen\n",
|
|
||||||
"Groep 7: 22 leerlingen\n",
|
|
||||||
"Groep 8: 20 leerlingen\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"import json\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"# Step 1: Locate the <aantal-leerlingen-leerjaar-bar-chart> tag\n",
|
"# Locate the section containing \"Welke profielen volgen de examendeelnemers?\"\n",
|
||||||
"chart_tag = soup.find('aantal-leerlingen-leerjaar-bar-chart', attrs={'aantal-per-leerjaar': True})\n",
|
"section_header = soup.find('h3', string=\"Welke profielen volgen de examendeelnemers?\")\n",
|
||||||
|
"if not section_header:\n",
|
||||||
|
" raise ValueError(\"Section 'Welke profielen volgen de examendeelnemers?' not found in the HTML.\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"if not chart_tag:\n",
|
"# Navigate to the parent section or subsection\n",
|
||||||
" print(\"Could not find the 'aantal per leerjaar' section.\")\n",
|
"section = section_header.find_parent('section')\n",
|
||||||
|
"if not section:\n",
|
||||||
|
" raise ValueError(\"Parent section for 'Welke profielen volgen de examendeelnemers?' not found.\")\n",
|
||||||
|
"\n",
|
||||||
|
"# Check if the section contains a message indicating no data is available\n",
|
||||||
|
"no_data_message = section.find('p', string=\"Deze informatie is voor deze school niet bekend.\")\n",
|
||||||
|
"if no_data_message:\n",
|
||||||
|
" print(\"No data available for 'Welke profielen volgen de examendeelnemers?'.\")\n",
|
||||||
"else:\n",
|
"else:\n",
|
||||||
" # Step 2: Extract the 'aantal-per-leerjaar' attribute\n",
|
" # Extract the relevant content (e.g., tables, lists, or paragraphs)\n",
|
||||||
" raw_data = chart_tag['aantal-per-leerjaar']\n",
|
" content = []\n",
|
||||||
" \n",
|
" for element in section.find_all(['p', 'table', 'ul', 'ol']):\n",
|
||||||
" # Step 3: Parse the JSON data\n",
|
" if element.name == 'table':\n",
|
||||||
" try:\n",
|
" # Extract table rows\n",
|
||||||
" data = json.loads(raw_data)\n",
|
" rows = element.find_all('tr')\n",
|
||||||
" \n",
|
" for row in rows:\n",
|
||||||
" # Step 4: Print the extracted data\n",
|
" cells = row.find_all(['th', 'td'])\n",
|
||||||
" print(\"Aantal per Leerjaar:\")\n",
|
" row_data = [cell.get_text(strip=True) for cell in cells]\n",
|
||||||
" for entry in data:\n",
|
" content.append(row_data)\n",
|
||||||
" print(f\"Groep {entry['key']}: {entry['aantal']} leerlingen\")\n",
|
" else:\n",
|
||||||
" except json.JSONDecodeError as e:\n",
|
" # Extract text from paragraphs, lists, etc.\n",
|
||||||
" print(f\"Failed to parse JSON data: {e}\")"
|
" content.append(element.get_text(strip=True))\n",
|
||||||
|
"\n",
|
||||||
|
" # Print the extracted content\n",
|
||||||
|
" for item in content:\n",
|
||||||
|
" print(item)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"\n",
|
||||||
|
"# Locate the dialog containing the table\n",
|
||||||
|
"dialog = soup.find('dialog', class_='modal modal-dialog')\n",
|
||||||
|
"if not dialog:\n",
|
||||||
|
" raise ValueError(\"Dialog element not found in the HTML.\")\n",
|
||||||
|
"\n",
|
||||||
|
"# Locate the table within the dialog\n",
|
||||||
|
"table = dialog.find('table')\n",
|
||||||
|
"if not table:\n",
|
||||||
|
" raise ValueError(\"Table element not found within the dialog.\")\n",
|
||||||
|
"\n",
|
||||||
|
"# Extract table headers\n",
|
||||||
|
"headers = [header.get_text(strip=True) for header in table.find_all('th')]\n",
|
||||||
|
"\n",
|
||||||
|
"# Extract table rows\n",
|
||||||
|
"data = []\n",
|
||||||
|
"for row in table.find_all('tr')[1:]: # Skip the header row\n",
|
||||||
|
" cells = row.find_all('td')\n",
|
||||||
|
" if len(cells) == len(headers): # Ensure the row matches the expected structure\n",
|
||||||
|
" row_data = {\n",
|
||||||
|
" headers[0]: cells[0].get_text(strip=True), # Postcodegebied\n",
|
||||||
|
" headers[1]: cells[1].get_text(strip=True), # Aantal leerlingen\n",
|
||||||
|
" headers[2]: cells[2].get_text(strip=True) # Percentage\n",
|
||||||
|
" }\n",
|
||||||
|
" data.append(row_data)\n",
|
||||||
|
"\n",
|
||||||
|
"# Print the extracted data\n",
|
||||||
|
"for entry in data:\n",
|
||||||
|
" print(entry)"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|||||||
8971
utils/scholenopdekaart.csv
Normal file
8971
utils/scholenopdekaart.csv
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user