{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import requests\n", "from bs4 import BeautifulSoup\n", "from urllib.parse import urljoin\n", "import pandas as pd\n", "import os\n", "\n", "headers = {\"User-Agent\": \"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36\"}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Function to decode Cloudflare-protected emails\n", "def decode_email(encoded_email):\n", " \"\"\"\n", " Decode an email protected by Cloudflare's email protection.\n", " :param encoded_email: The encoded email string from the data-cfemail attribute.\n", " :return: The decoded email address.\n", " \"\"\"\n", " email = \"\"\n", " key = int(encoded_email[:2], 16) # Extract the key (first two characters)\n", " for i in range(2, len(encoded_email), 2):\n", " # XOR each pair of hex characters with the key\n", " email += chr(int(encoded_email[i:i + 2], 16) ^ key)\n", " return email\n", "\n", "def extract_emails(soup):\n", " # Find all visible email links (mailto:)\n", " visible_emails = []\n", " for link in soup.find_all('a', href=lambda href: href and href.startswith('mailto:')):\n", " email = link['href'].replace('mailto:', '')\n", " visible_emails.append(email)\n", "\n", " # Find all Cloudflare-protected emails\n", " protected_emails = []\n", " for span in soup.find_all('span', class_='__cf_email__', attrs={'data-cfemail': True}):\n", " encoded_email = span['data-cfemail']\n", " decoded_email = decode_email(encoded_email)\n", " protected_emails.append(decoded_email)\n", "\n", " # Combine all emails\n", " all_emails = visible_emails + protected_emails\n", " all_emails = list(set(all_emails))\n", " if (len(all_emails) == 0):\n", " return None\n", " elif (len(all_emails) == 1):\n", " return all_emails[0]\n", " else:\n", " return all_emails\n", "\n", "def find_website(soup_school):\n", " # Find all tags with href attributes\n", " for link in soup_school.find(class_=\"dl-horizontal dl-icons\").find_all('a', href=True):\n", " href = link['href']\n", " # Filter out only valid URLs (e.g., starting with http or https)\n", " if href.startswith(('http://', 'https://')):\n", " # websites.append(href)\n", " return href\n", "\n", "\n", "def main():\n", " list_urls = [\n", " \"https://scholenopdekaart.nl/Basisscholen/\",\n", " \"https://scholenopdekaart.nl/middelbare-scholen/\"\n", " ]\n", "\n", " list_school_data_dicts = []\n", "\n", " # For each category\n", " for url in list_urls:\n", " # Fetch the HTML content of the page\n", " response = requests.get(url, headers=headers)\n", " response.raise_for_status() # Raise an exception for HTTP errors\n", " # Parse the HTML content using BeautifulSoup\n", " soup = BeautifulSoup(response.text, 'html.parser')\n", "\n", " # Get category\n", " category = url.strip(\"/\").split(\"/\")[-1].lower()\n", "\n", " # Find all tags with href attributes\n", " links_areas = []\n", " for a_tag in soup.find_all('a', href=True):\n", " href = a_tag['href']\n", " \n", " if (category not in href):\n", " continue\n", " \n", " # Convert relative URLs to absolute URLs\n", " area_full_url = urljoin(url, href)\n", " links_areas.append(area_full_url)\n", "\n", " # Area\n", " area = href.rstrip(\"/\").split(\"/\")[-1]\n", "\n", " ###############################################\n", " # Fetch the HTML content of the page\n", " print(\".\", end=\"\")\n", " response = requests.get(area_full_url, headers=headers)\n", " response.raise_for_status() # Raise an exception for HTTP errors\n", "\n", " # Parse the HTML content using BeautifulSoup\n", " soup_area= BeautifulSoup(response.text, 'html.parser')\n", "\n", " # Get schools in area\n", " for a_tag in soup_area.find_all('a', href=True):\n", " href = a_tag['href']\n", "\n", " school_url = urljoin(url, href)\n", " if (area_full_url not in school_url):\n", " continue\n", " \n", " school_name = a_tag.text.rstrip(\".\")\n", " school_data = {\n", " \"category\": category,\n", " \"area\": area,\n", " \"name\": school_name,\n", " \"url\": school_url,\n", " }\n", "\n", " try:\n", " # Process school (request contact details)\n", " response = requests.get(os.path.join(school_url, \"contact/#inhoud\"), headers=headers)\n", " response.raise_for_status() # Raise an exception for HTTP errors\n", "\n", " # Parse the HTML content using BeautifulSoup\n", " soup_school = BeautifulSoup(response.text, 'html.parser')\n", "\n", " # School details\n", " school_details = soup_school.find(class_=\"school-details\")\n", " for category_idx, li_detail in enumerate(school_details.find_all(\"li\")):\n", " data = li_detail.find('span', class_='infotip-term')['data-dfn']\n", " text = li_detail.get_text(strip=True)\n", " # Set data\n", " school_data[\"category_{}\".format(category_idx)] = text\n", " school_data[\"category_{}_description\".format(category_idx)] = data\n", " \n", " school_address = soup_school.find(class_=\"school-adres\").get_text(strip=True)\n", " school_postcode_city = soup_school.find(class_=\"school-postcode-woonplaats\").get_text(strip=True)\n", " school_postcode = \"\".join(school_postcode_city.split(\" \")[:2])\n", " school_city = \" \".join(school_postcode_city.split(\" \")[2:])\n", "\n", " school_data[\"city\"] = school_city\n", " school_data[\"postcode\"] = school_postcode\n", " school_data[\"address\"] = school_address\n", "\n", " try:\n", " school_data[\"website\"] = find_website(soup_school) # soup_school.find(class_=\"button button-primary\").get('href')\n", " except Exception as e:\n", " pass\n", " try:\n", " school_data[\"phone\"] = soup_school.find('a', href=lambda href: href and href.startswith('tel:')).text\n", " except Exception as e:\n", " pass\n", " try:\n", " school_data[\"email\"] = extract_emails(soup_school)\n", " except Exception as e:\n", " pass\n", " \n", " except Exception as e:\n", " print(school_url, str(e))\n", " # assert False\n", "\n", " list_school_data_dicts.append(school_data)\n", "\n", " df = pd.DataFrame(list_school_data_dicts)\n", " df.to_csv(\"scholenopdekaart.csv\")\n", "\n", "\"\"\" # Issues with URL:\n", "https://scholenopdekaart.nl/middelbare-scholen/grave/1900/merletcollege-grave/\n", "https://scholenopdekaart.nl/middelbare-scholen/lent/4386/citadel-college-locatie-griftdijk/\n", "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/24527/montessori-college-k33-nijmegen/\n", "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/26368/aventurijn-park-neerbosch/\n", "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/26187/kandinsky-college-voor-lyceum-havo-mavo-vbo-lwoo/\n", "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/1791/karel-de-grote-college/\n", "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/2040/mondial-college-locatie-leuvensbroek/\n", "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/2041/mondial-college-meeuwse-acker/\n", "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/2036/stedelijk-gymnasium-nijmegen/\n", "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/2038/stedelijke-scholengemeenschap-nijmegen/\n", "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/26184/yuverta-vmbo-het-groene-lyceum-nijmegen/\n", "https://scholenopdekaart.nl/middelbare-scholen/oss/23719/het-hooghuis-locatie-mondriaan-college/\n", "https://scholenopdekaart.nl/middelbare-scholen/oss/943/het-hooghuis-locatie-oss-stadion/\n", "https://scholenopdekaart.nl/middelbare-scholen/oss/947/het-hooghuis-zuidwest-gebouw-west/\n", "https://scholenopdekaart.nl/middelbare-scholen/oss/946/het-hooghuis-zuidwest-gebouw-zuid/\n", "https://scholenopdekaart.nl/middelbare-scholen/oss/1929/het-maaslandcollege-scholengemeenschap-voor-tweetalig-mavo-havo-vwo/\n", "https://scholenopdekaart.nl/middelbare-scholen/oss/25783/sonnewijser-unit-route-arbeid/\n", "https://scholenopdekaart.nl/middelbare-scholen/oss/11432/sonnewijser-unit-vervolgonderwijs-oss/\n", "https://scholenopdekaart.nl/middelbare-scholen/oss/942/titus-brandsmalyceum/\n", "https://scholenopdekaart.nl/middelbare-scholen/velp-noord-brabant/24545/merletcollege-eerste-opvang-anderstaligen-eoa/\n", "https://scholenopdekaart.nl/middelbare-scholen/wijchen/2018/maaswaal-college-havo-atheneum-gymnasium/\n", "https://scholenopdekaart.nl/middelbare-scholen/wijchen/2020/maaswaal-college-vmbo-basis-kader-mavo/\n", "https://scholenopdekaart.nl/middelbare-scholen/wijchen/1781/pro-college-wijchen/\n", "\"\"\"\n", "\n", "if __name__ == \"__main__\":\n", " main()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "'''\n", "school_url = \"https://scholenopdekaart.nl/basisscholen/aalden/9661/christelijke-basisschool-de-schutse/\"\n", "response = requests.get(os.path.join(school_url, \"contact/#inhoud\"), headers=headers)\n", "# Parse the HTML content using BeautifulSoup\n", "soup_school = BeautifulSoup(response.text, 'html.parser')\n", "soup_school\n", "'''" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "df = pd.read_csv(\"scholenopdekaart.csv\", index_col=0)\n", "df.loc[0, \"category_3\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import requests\n", "from bs4 import BeautifulSoup\n", "\n", "# Step 1: Fetch the webpage\n", "url = \"https://scholenopdekaart.nl/basisscholen/aagtekerke/25963/jhr-willem-versluijsschool/\"\n", "headers = {\n", " \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36\"\n", "}\n", "response = requests.get(url, headers=headers)\n", "\n", "# Check if the request was successful\n", "if response.status_code != 200:\n", " print(f\"Failed to retrieve the page. Status code: {response.status_code}\")\n", " exit()\n", "\n", "# Step 2: Parse the HTML content\n", "soup = BeautifulSoup(response.text, 'html.parser')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Aantal per Leerjaar:\n", "Groep 1: 29 leerlingen\n", "Groep 2: 28 leerlingen\n", "Groep 3: 30 leerlingen\n", "Groep 4: 25 leerlingen\n", "Groep 5: 19 leerlingen\n", "Groep 6: 26 leerlingen\n", "Groep 7: 22 leerlingen\n", "Groep 8: 20 leerlingen\n" ] } ], "source": [ "import json\n", "\n", "# Step 1: Locate the tag\n", "chart_tag = soup.find('aantal-leerlingen-leerjaar-bar-chart', attrs={'aantal-per-leerjaar': True})\n", "\n", "if not chart_tag:\n", " print(\"Could not find the 'aantal per leerjaar' section.\")\n", "else:\n", " # Step 2: Extract the 'aantal-per-leerjaar' attribute\n", " raw_data = chart_tag['aantal-per-leerjaar']\n", " \n", " # Step 3: Parse the JSON data\n", " try:\n", " data = json.loads(raw_data)\n", " \n", " # Step 4: Print the extracted data\n", " print(\"Aantal per Leerjaar:\")\n", " for entry in data:\n", " print(f\"Groep {entry['key']}: {entry['aantal']} leerlingen\")\n", " except json.JSONDecodeError as e:\n", " print(f\"Failed to parse JSON data: {e}\")" ] } ], "metadata": { "kernelspec": { "display_name": "matitos_urls", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.9" } }, "nbformat": 4, "nbformat_minor": 2 }