Publisher task
335
utils/Schools-NL.ipynb
Normal file
@@ -0,0 +1,335 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "from urllib.parse import urljoin\n",
    "import pandas as pd\n",
    "\n",
    "headers = {\"User-Agent\": \"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36\"}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function to decode Cloudflare-protected emails\n",
    "def decode_email(encoded_email):\n",
    "    \"\"\"\n",
    "    Decode an email protected by Cloudflare's email protection.\n",
    "    :param encoded_email: The encoded email string from the data-cfemail attribute.\n",
    "    :return: The decoded email address.\n",
    "    \"\"\"\n",
    "    email = \"\"\n",
    "    key = int(encoded_email[:2], 16)  # Extract the key (first two characters)\n",
    "    for i in range(2, len(encoded_email), 2):\n",
    "        # XOR each pair of hex characters with the key\n",
    "        email += chr(int(encoded_email[i:i + 2], 16) ^ key)\n",
    "    return email\n",
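    "# Worked example (hypothetical input, not taken from the site): with key\n",
    "# byte 0x17, decode_email(\"177675577439797b\") returns \"ab@c.nl\".\n",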
"\n",
|
||||
"def extract_emails(soup):\n",
|
||||
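    "    # Returns None if no address is found, a single string for one\n",
    "    # address, and a list for several (note the mixed return type).\n",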
" # Find all visible email links (mailto:)\n",
|
||||
" visible_emails = []\n",
|
||||
" for link in soup.find_all('a', href=lambda href: href and href.startswith('mailto:')):\n",
|
||||
" email = link['href'].replace('mailto:', '')\n",
|
||||
" visible_emails.append(email)\n",
|
||||
"\n",
|
||||
" # Find all Cloudflare-protected emails\n",
|
||||
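    "    # (Cloudflare typically renders these as\n",
    "    # <span class=\"__cf_email__\" data-cfemail=\"...\">[email protected]</span>,\n",
    "    # which is what the selector below targets.)\n",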
" protected_emails = []\n",
|
||||
" for span in soup.find_all('span', class_='__cf_email__', attrs={'data-cfemail': True}):\n",
|
||||
" encoded_email = span['data-cfemail']\n",
|
||||
" decoded_email = decode_email(encoded_email)\n",
|
||||
" protected_emails.append(decoded_email)\n",
|
||||
"\n",
|
||||
" # Combine all emails\n",
|
||||
" all_emails = visible_emails + protected_emails\n",
|
||||
" all_emails = list(set(all_emails))\n",
|
||||
" if (len(all_emails) == 0):\n",
|
||||
" return None\n",
|
||||
" elif (len(all_emails) == 1):\n",
|
||||
" return all_emails[0]\n",
|
||||
" else:\n",
|
||||
" return all_emails\n",
|
||||
"\n",
|
||||
"def find_website(soup_school):\n",
|
||||
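    "    # Returns the first absolute http(s) link in the contact-details block,\n",
    "    # or None (implicitly) when the block has no such link.\n",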
" # Find all <a> tags with href attributes\n",
|
||||
" for link in soup_school.find(class_=\"dl-horizontal dl-icons\").find_all('a', href=True):\n",
|
||||
" href = link['href']\n",
|
||||
" # Filter out only valid URLs (e.g., starting with http or https)\n",
|
||||
" if href.startswith(('http://', 'https://')):\n",
|
||||
" # websites.append(href)\n",
|
||||
" return href\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def main():\n",
|
||||
" list_urls = [\n",
|
||||
" \"https://scholenopdekaart.nl/Basisscholen/\",\n",
|
||||
" \"https://scholenopdekaart.nl/middelbare-scholen/\"\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
" list_school_data_dicts = []\n",
|
||||
"\n",
|
||||
" # For each category\n",
|
||||
" for url in list_urls:\n",
|
||||
" # Fetch the HTML content of the page\n",
|
||||
" response = requests.get(url, headers=headers)\n",
|
||||
" response.raise_for_status() # Raise an exception for HTTP errors\n",
|
||||
" # Parse the HTML content using BeautifulSoup\n",
|
||||
" soup = BeautifulSoup(response.text, 'html.parser')\n",
|
||||
"\n",
|
||||
" # Get category\n",
|
||||
" category = url.strip(\"/\").split(\"/\")[-1].lower()\n",
|
||||
"\n",
|
||||
" # Find all <a> tags with href attributes\n",
|
||||
" links_areas = []\n",
|
||||
" for a_tag in soup.find_all('a', href=True):\n",
|
||||
" href = a_tag['href']\n",
|
||||
" \n",
|
||||
" if (category not in href):\n",
|
||||
" continue\n",
|
||||
" \n",
|
||||
" # Convert relative URLs to absolute URLs\n",
|
||||
" area_full_url = urljoin(url, href)\n",
|
||||
" links_areas.append(area_full_url)\n",
|
||||
"\n",
|
||||
" # Area\n",
|
||||
" area = href.rstrip(\"/\").split(\"/\")[-1]\n",
|
||||
"\n",
|
||||
" ###############################################\n",
|
||||
" # Fetch the HTML content of the page\n",
|
||||
" print(\".\", end=\"\")\n",
|
||||
" response = requests.get(area_full_url, headers=headers)\n",
|
||||
" response.raise_for_status() # Raise an exception for HTTP errors\n",
|
||||
"\n",
|
||||
" # Parse the HTML content using BeautifulSoup\n",
|
||||
" soup_area= BeautifulSoup(response.text, 'html.parser')\n",
|
||||
"\n",
|
||||
" # Get schools in area\n",
|
||||
" for a_tag in soup_area.find_all('a', href=True):\n",
|
||||
" href = a_tag['href']\n",
|
||||
"\n",
|
||||
" school_url = urljoin(url, href)\n",
|
||||
" if (area_full_url not in school_url):\n",
|
||||
" continue\n",
|
||||
" \n",
|
||||
" school_name = a_tag.text.rstrip(\".\")\n",
|
||||
" school_data = {\n",
|
||||
" \"category\": category,\n",
|
||||
" \"area\": area,\n",
|
||||
" \"name\": school_name,\n",
|
||||
" \"url\": school_url,\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" try:\n",
|
||||
" # Process school (request contact details)\n",
|
||||
" response = requests.get(os.path.join(school_url, \"contact/#inhoud\"), headers=headers)\n",
|
||||
" response.raise_for_status() # Raise an exception for HTTP errors\n",
|
||||
"\n",
|
||||
" # Parse the HTML content using BeautifulSoup\n",
|
||||
" soup_school = BeautifulSoup(response.text, 'html.parser')\n",
|
||||
"\n",
|
||||
" # School details\n",
|
||||
" school_details = soup_school.find(class_=\"school-details\")\n",
|
||||
" for category_idx, li_detail in enumerate(school_details.find_all(\"li\")):\n",
|
||||
" data = li_detail.find('span', class_='infotip-term')['data-dfn']\n",
|
||||
" text = li_detail.get_text(strip=True)\n",
|
||||
" # Set data\n",
|
||||
" school_data[\"category_{}\".format(category_idx)] = text\n",
|
||||
" school_data[\"category_{}_description\".format(category_idx)] = data\n",
|
||||
" \n",
|
||||
" school_address = soup_school.find(class_=\"school-adres\").get_text(strip=True)\n",
|
||||
" school_postcode_city = soup_school.find(class_=\"school-postcode-woonplaats\").get_text(strip=True)\n",
|
||||
" school_postcode = \"\".join(school_postcode_city.split(\" \")[:2])\n",
|
||||
" school_city = \" \".join(school_postcode_city.split(\" \")[2:])\n",
|
||||
"\n",
|
||||
" school_data[\"city\"] = school_city\n",
|
||||
" school_data[\"postcode\"] = school_postcode\n",
|
||||
" school_data[\"address\"] = school_address\n",
|
||||
"\n",
|
||||
" try:\n",
|
||||
" school_data[\"website\"] = find_website(soup_school) # soup_school.find(class_=\"button button-primary\").get('href')\n",
|
||||
" except Exception as e:\n",
|
||||
" pass\n",
|
||||
" try:\n",
|
||||
" school_data[\"phone\"] = soup_school.find('a', href=lambda href: href and href.startswith('tel:')).text\n",
|
||||
" except Exception as e:\n",
|
||||
" pass\n",
|
||||
" try:\n",
|
||||
" school_data[\"email\"] = extract_emails(soup_school)\n",
|
||||
" except Exception as e:\n",
|
||||
" pass\n",
|
||||
" \n",
|
||||
" except Exception as e:\n",
|
||||
" print(school_url, str(e))\n",
|
||||
" # assert False\n",
|
||||
"\n",
|
||||
" list_school_data_dicts.append(school_data)\n",
|
||||
"\n",
|
||||
" df = pd.DataFrame(list_school_data_dicts)\n",
|
||||
" df.to_csv(\"scholenopdekaart.csv\")\n",
|
||||
"\n",
|
||||
"\"\"\" # Issues with URL:\n",
|
||||
"https://scholenopdekaart.nl/middelbare-scholen/grave/1900/merletcollege-grave/\n",
|
||||
"https://scholenopdekaart.nl/middelbare-scholen/lent/4386/citadel-college-locatie-griftdijk/\n",
|
||||
"https://scholenopdekaart.nl/middelbare-scholen/nijmegen/24527/montessori-college-k33-nijmegen/\n",
|
||||
"https://scholenopdekaart.nl/middelbare-scholen/nijmegen/26368/aventurijn-park-neerbosch/\n",
|
||||
"https://scholenopdekaart.nl/middelbare-scholen/nijmegen/26187/kandinsky-college-voor-lyceum-havo-mavo-vbo-lwoo/\n",
|
||||
"https://scholenopdekaart.nl/middelbare-scholen/nijmegen/1791/karel-de-grote-college/\n",
|
||||
"https://scholenopdekaart.nl/middelbare-scholen/nijmegen/2040/mondial-college-locatie-leuvensbroek/\n",
|
||||
"https://scholenopdekaart.nl/middelbare-scholen/nijmegen/2041/mondial-college-meeuwse-acker/\n",
|
||||
"https://scholenopdekaart.nl/middelbare-scholen/nijmegen/2036/stedelijk-gymnasium-nijmegen/\n",
|
||||
"https://scholenopdekaart.nl/middelbare-scholen/nijmegen/2038/stedelijke-scholengemeenschap-nijmegen/\n",
|
||||
"https://scholenopdekaart.nl/middelbare-scholen/nijmegen/26184/yuverta-vmbo-het-groene-lyceum-nijmegen/\n",
|
||||
"https://scholenopdekaart.nl/middelbare-scholen/oss/23719/het-hooghuis-locatie-mondriaan-college/\n",
|
||||
"https://scholenopdekaart.nl/middelbare-scholen/oss/943/het-hooghuis-locatie-oss-stadion/\n",
|
||||
"https://scholenopdekaart.nl/middelbare-scholen/oss/947/het-hooghuis-zuidwest-gebouw-west/\n",
|
||||
"https://scholenopdekaart.nl/middelbare-scholen/oss/946/het-hooghuis-zuidwest-gebouw-zuid/\n",
|
||||
"https://scholenopdekaart.nl/middelbare-scholen/oss/1929/het-maaslandcollege-scholengemeenschap-voor-tweetalig-mavo-havo-vwo/\n",
|
||||
"https://scholenopdekaart.nl/middelbare-scholen/oss/25783/sonnewijser-unit-route-arbeid/\n",
|
||||
"https://scholenopdekaart.nl/middelbare-scholen/oss/11432/sonnewijser-unit-vervolgonderwijs-oss/\n",
|
||||
"https://scholenopdekaart.nl/middelbare-scholen/oss/942/titus-brandsmalyceum/\n",
|
||||
"https://scholenopdekaart.nl/middelbare-scholen/velp-noord-brabant/24545/merletcollege-eerste-opvang-anderstaligen-eoa/\n",
|
||||
"https://scholenopdekaart.nl/middelbare-scholen/wijchen/2018/maaswaal-college-havo-atheneum-gymnasium/\n",
|
||||
"https://scholenopdekaart.nl/middelbare-scholen/wijchen/2020/maaswaal-college-vmbo-basis-kader-mavo/\n",
|
||||
"https://scholenopdekaart.nl/middelbare-scholen/wijchen/1781/pro-college-wijchen/\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"if __name__ == \"__main__\":\n",
|
||||
" main()"
|
||||
]
|
||||
},
|
||||
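  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional sketch (not part of the original scraper): a polite fetch helper\n",
    "# with a small delay and retries, reusing the `headers` defined above. The\n",
    "# delay and retry values are assumptions, not requirements of the site.\n",
    "import time\n",
    "\n",
    "def fetch(url, retries=3, delay=1.0):\n",
    "    for attempt in range(retries):\n",
    "        try:\n",
    "            response = requests.get(url, headers=headers, timeout=30)\n",
    "            response.raise_for_status()\n",
    "            time.sleep(delay)  # pause between successful requests\n",
    "            return response\n",
    "        except requests.RequestException:\n",
    "            if attempt == retries - 1:\n",
    "                raise\n",
    "            time.sleep(delay * (attempt + 1))  # back off before retrying\n",
    "\n",
    "# Usage: replace the bare requests.get(...) calls in main() with fetch(...)."
   ]
  },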
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "'''\n",
    "school_url = \"https://scholenopdekaart.nl/basisscholen/aalden/9661/christelijke-basisschool-de-schutse/\"\n",
    "response = requests.get(urljoin(school_url, \"contact/#inhoud\"), headers=headers)\n",
    "# Parse the HTML content using BeautifulSoup\n",
    "soup_school = BeautifulSoup(response.text, 'html.parser')\n",
    "soup_school\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv(\"scholenopdekaart.csv\", index_col=0)\n",
    "df.loc[0, \"category_3\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "\n",
    "# Step 1: Fetch the webpage\n",
    "url = \"https://scholenopdekaart.nl/basisscholen/aagtekerke/25963/jhr-willem-versluijsschool/\"\n",
    "headers = {\n",
    "    \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36\"\n",
    "}\n",
    "response = requests.get(url, headers=headers)\n",
    "\n",
    "# Check if the request was successful\n",
    "if response.status_code != 200:\n",
    "    print(f\"Failed to retrieve the page. Status code: {response.status_code}\")\n",
    "    exit()\n",
    "\n",
    "# Step 2: Parse the HTML content\n",
    "soup = BeautifulSoup(response.text, 'html.parser')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Aantal per Leerjaar:\n",
      "Groep 1: 29 leerlingen\n",
      "Groep 2: 28 leerlingen\n",
      "Groep 3: 30 leerlingen\n",
      "Groep 4: 25 leerlingen\n",
      "Groep 5: 19 leerlingen\n",
      "Groep 6: 26 leerlingen\n",
      "Groep 7: 22 leerlingen\n",
      "Groep 8: 20 leerlingen\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "\n",
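    "# Note (inferred from the parsing below, not from any site documentation):\n",
    "# the 'aantal-per-leerjaar' attribute appears to hold JSON like\n",
    "# [{\"key\": \"1\", \"aantal\": 29}, ...], one entry per school year.\n",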
"# Step 1: Locate the <aantal-leerlingen-leerjaar-bar-chart> tag\n",
|
||||
"chart_tag = soup.find('aantal-leerlingen-leerjaar-bar-chart', attrs={'aantal-per-leerjaar': True})\n",
|
||||
"\n",
|
||||
"if not chart_tag:\n",
|
||||
" print(\"Could not find the 'aantal per leerjaar' section.\")\n",
|
||||
"else:\n",
|
||||
" # Step 2: Extract the 'aantal-per-leerjaar' attribute\n",
|
||||
" raw_data = chart_tag['aantal-per-leerjaar']\n",
|
||||
" \n",
|
||||
" # Step 3: Parse the JSON data\n",
|
||||
" try:\n",
|
||||
" data = json.loads(raw_data)\n",
|
||||
" \n",
|
||||
" # Step 4: Print the extracted data\n",
|
||||
" print(\"Aantal per Leerjaar:\")\n",
|
||||
" for entry in data:\n",
|
||||
" print(f\"Groep {entry['key']}: {entry['aantal']} leerlingen\")\n",
|
||||
" except json.JSONDecodeError as e:\n",
|
||||
" print(f\"Failed to parse JSON data: {e}\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "matitos_urls",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||