471 lines
20 KiB
Plaintext
471 lines
20 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import requests\n",
|
|
"from bs4 import BeautifulSoup\n",
|
|
"from urllib.parse import urljoin\n",
|
|
"import pandas as pd\n",
|
|
"import os\n",
|
|
"import json\n",
|
|
"import csv\n",
|
|
"\n",
|
|
"headers = {\"User-Agent\": \"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36\"}"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Function to decode Cloudflare-protected emails\n",
|
|
"def decode_email(encoded_email):\n",
|
|
" \"\"\"\n",
|
|
" Decode an email protected by Cloudflare's email protection.\n",
|
|
" :param encoded_email: The encoded email string from the data-cfemail attribute.\n",
|
|
" :return: The decoded email address.\n",
|
|
" \"\"\"\n",
|
|
" email = \"\"\n",
|
|
" key = int(encoded_email[:2], 16) # Extract the key (first two characters)\n",
|
|
" for i in range(2, len(encoded_email), 2):\n",
|
|
" # XOR each pair of hex characters with the key\n",
|
|
" email += chr(int(encoded_email[i:i + 2], 16) ^ key)\n",
|
|
" return email\n",
|
|
"\n",
|
|
"def extract_emails(soup):\n",
|
|
" # Find all visible email links (mailto:)\n",
|
|
" visible_emails = []\n",
|
|
" for link in soup.find_all('a', href=lambda href: href and href.startswith('mailto:')):\n",
|
|
" email = link['href'].replace('mailto:', '')\n",
|
|
" visible_emails.append(email)\n",
|
|
"\n",
|
|
" # Find all Cloudflare-protected emails\n",
|
|
" protected_emails = []\n",
|
|
" for span in soup.find_all('span', class_='__cf_email__', attrs={'data-cfemail': True}):\n",
|
|
" encoded_email = span['data-cfemail']\n",
|
|
" decoded_email = decode_email(encoded_email)\n",
|
|
" protected_emails.append(decoded_email)\n",
|
|
"\n",
|
|
" # Combine all emails\n",
|
|
" all_emails = visible_emails + protected_emails\n",
|
|
" all_emails = list(set(all_emails))\n",
|
|
" if (len(all_emails) == 0):\n",
|
|
" return None\n",
|
|
" elif (len(all_emails) == 1):\n",
|
|
" return all_emails[0]\n",
|
|
" else:\n",
|
|
" return all_emails\n",
|
|
"\n",
|
|
"def find_website(soup_school):\n",
|
|
" # Find all <a> tags with href attributes\n",
|
|
" for link in soup_school.find(class_=\"dl-horizontal dl-icons\").find_all('a', href=True):\n",
|
|
" href = link['href']\n",
|
|
" # Filter out only valid URLs (e.g., starting with http or https)\n",
|
|
" if href.startswith(('http://', 'https://')):\n",
|
|
" # websites.append(href)\n",
|
|
" return href\n",
|
|
"\n",
|
|
"def get_num_students_per_zipcode(soup):\n",
|
|
" list_zipcode_students_percentage = []\n",
|
|
"\n",
|
|
" h3_tag = soup.find(\"h3\", string=\"In welk postcodegebied wonen de leerlingen van deze school?\")\n",
|
|
" if h3_tag:\n",
|
|
" dialog = h3_tag.find_parent(\"dialog\")\n",
|
|
"\n",
|
|
" if dialog:\n",
|
|
" # print(dialog.prettify())\n",
|
|
" table = dialog.find(\"table\")\n",
|
|
" if table:\n",
|
|
" rows = table.find_all(\"tr\")\n",
|
|
" for row in rows:\n",
|
|
" cells = row.find_all([\"th\", \"td\"])\n",
|
|
" row_data = [cell.get_text(strip=True) for cell in cells]\n",
|
|
" zipcode, num_students, percentage = row_data\n",
|
|
" list_zipcode_students_percentage.append( (zipcode, num_students, percentage) )\n",
|
|
" \n",
|
|
" return list_zipcode_students_percentage\n",
|
|
"\n",
|
|
"def get_num_students_trend(soup):\n",
|
|
" # Step 1: Locate the <aantal-leerlingen-trend-line-chart> tag\n",
|
|
" trend_chart_tag = soup.find(\"aantal-leerlingen-trend-line-chart\")\n",
|
|
"\n",
|
|
" if trend_chart_tag:\n",
|
|
" # Step 2: Extract the 'leerlingen-trend-data' attribute\n",
|
|
" trend_data_attr = trend_chart_tag.get(\"leerlingen-trend-data\")\n",
|
|
" \n",
|
|
" if trend_data_attr:\n",
|
|
" # Step 3: Parse the JSON string into a Python object\n",
|
|
" trend_data = json.loads(trend_data_attr)\n",
|
|
" #print(\"Extracted leerlingen-trend-data:\")\n",
|
|
" #print(json.dumps(trend_data, indent=4)) # Pretty-print the JSON data\n",
|
|
" return [ (e.get(\"key\"), e.get(\"aantal\") ) for e in trend_data]\n",
|
|
"\n",
|
|
"def get_num_students_per_age_and_group(soup):\n",
|
|
" num_students_per_group, num_students_per_age = [], []\n",
|
|
" ############################################################################\n",
|
|
" # Step 1: Locate the <aantal-leerlingen-leeftijd-bar-chart> tag\n",
|
|
" chart_tag = soup.find('aantal-leerlingen-leeftijd-bar-chart', attrs={'aantal-per-leeftijd': True})\n",
|
|
" # Step 2: Extract the 'aantal-per-leeftijd' attribute\n",
|
|
" raw_data = chart_tag['aantal-per-leeftijd']\n",
|
|
"\n",
|
|
" # Step 3: Parse the JSON data\n",
|
|
" try:\n",
|
|
" data = json.loads(raw_data)\n",
|
|
" # Step 4: Print the extracted data\n",
|
|
" # print(\"Aantal per Leeftijd:\")\n",
|
|
" for entry in data:\n",
|
|
" age = entry['key']\n",
|
|
" num_students = entry['aantal']\n",
|
|
" # school_data[\"num_students_age_{}\".format(age)] = num_students\n",
|
|
" num_students_per_age.append( (age, num_students) )\n",
|
|
" # print(f\"Age {entry['key']}: {entry['aantal']} leerlingen\")\n",
|
|
" except json.JSONDecodeError as e:\n",
|
|
" print(f\"Failed to parse JSON data: {e}\")\n",
|
|
"\n",
|
|
" ############################################################################\n",
|
|
" # Step 1: Locate the <aantal-leerlingen-leerjaar-bar-chart> tag\n",
|
|
" chart_tag = soup.find('aantal-leerlingen-leerjaar-bar-chart', attrs={'aantal-per-leerjaar': True})\n",
|
|
"\n",
|
|
" if not chart_tag:\n",
|
|
" print(\"Could not find the 'aantal per leerjaar' section.\")\n",
|
|
" else:\n",
|
|
" # Step 2: Extract the 'aantal-per-leerjaar' attribute\n",
|
|
" raw_data = chart_tag['aantal-per-leerjaar']\n",
|
|
" \n",
|
|
" # Step 3: Parse the JSON data\n",
|
|
" try:\n",
|
|
" data = json.loads(raw_data)\n",
|
|
" # Step 4: Print the extracted data\n",
|
|
" # print(\"Aantal per Leerjaar:\")\n",
|
|
" for entry in data:\n",
|
|
" group = entry['key']\n",
|
|
" num_students = entry['aantal']\n",
|
|
" # school_data[\"num_students_group_{}\".format(group)] = num_students\n",
|
|
" num_students_per_group.append( (group, num_students) )\n",
|
|
" # print(f\"Groep {entry['key']}: {entry['aantal']} leerlingen\")\n",
|
|
" except json.JSONDecodeError as e:\n",
|
|
" print(f\"Failed to parse JSON data: {e}\")\n",
|
|
" ############################################################################\n",
|
|
" return num_students_per_group, num_students_per_age\n",
|
|
"\n",
|
|
"\n",
|
|
"def update_school_data(school_url, school_data):\n",
|
|
" try:\n",
|
|
" # Process school (request contact details)\n",
|
|
" response = requests.get(os.path.join(school_url, \"contact/#inhoud\"), headers=headers)\n",
|
|
" response.raise_for_status() # Raise an exception for HTTP errors\n",
|
|
" # Parse the HTML content using BeautifulSoup\n",
|
|
" soup_school = BeautifulSoup(response.text, 'html.parser')\n",
|
|
"\n",
|
|
" # School details\n",
|
|
" school_details = soup_school.find(class_=\"school-details\")\n",
|
|
" for category_idx, li_detail in enumerate(school_details.find_all(\"li\")):\n",
|
|
" data = li_detail.find('span', class_='infotip-term')['data-dfn']\n",
|
|
" text = li_detail.get_text(strip=True)\n",
|
|
" # Set data\n",
|
|
" school_data[\"category_{}\".format(category_idx)] = text\n",
|
|
" school_data[\"category_{}_description\".format(category_idx)] = data\n",
|
|
" \n",
|
|
" school_address = soup_school.find(class_=\"school-adres\").get_text(strip=True)\n",
|
|
" school_postcode_city = soup_school.find(class_=\"school-postcode-woonplaats\").get_text(strip=True)\n",
|
|
" school_postcode = \"\".join(school_postcode_city.split(\" \")[:2])\n",
|
|
" school_city = \" \".join(school_postcode_city.split(\" \")[2:])\n",
|
|
"\n",
|
|
" school_data[\"city\"] = school_city\n",
|
|
" school_data[\"postcode\"] = school_postcode\n",
|
|
" school_data[\"address\"] = school_address\n",
|
|
"\n",
|
|
" try:\n",
|
|
" school_data[\"website\"] = find_website(soup_school) # soup_school.find(class_=\"button button-primary\").get('href')\n",
|
|
" except Exception as e:\n",
|
|
" pass\n",
|
|
" try:\n",
|
|
" school_data[\"phone\"] = soup_school.find('a', href=lambda href: href and href.startswith('tel:')).text\n",
|
|
" except Exception as e:\n",
|
|
" pass\n",
|
|
" try:\n",
|
|
" school_data[\"email\"] = extract_emails(soup_school)\n",
|
|
" except Exception as e:\n",
|
|
" pass\n",
|
|
"\n",
|
|
" # Process school main site\n",
|
|
" response = requests.get(os.path.join(school_url), headers=headers)\n",
|
|
" response.raise_for_status() # Raise an exception for HTTP errors\n",
|
|
" # Parse the HTML content using BeautifulSoup\n",
|
|
" soup_school = BeautifulSoup(response.text, 'html.parser')\n",
|
|
"\n",
|
|
" try:\n",
|
|
" school_data[\"students_per_zipcode\"] = get_num_students_per_zipcode(soup_school)\n",
|
|
" except Exception as e:\n",
|
|
" pass\n",
|
|
" try:\n",
|
|
" school_data[\"students_per_year_trend\"] = get_num_students_trend(soup_school)\n",
|
|
" except Exception as e:\n",
|
|
" pass\n",
|
|
"\n",
|
|
" if (school_data.get(\"category\").lower() == \"basisscholen\"):\n",
|
|
" try:\n",
|
|
" num_students_per_group, num_students_per_age = get_num_students_per_age_and_group(soup_school)\n",
|
|
" school_data[\"num_students_per_group\"] = num_students_per_group if len(num_students_per_group)>0 else None\n",
|
|
" school_data[\"num_students_per_age\"] = num_students_per_age if len(num_students_per_age)>0 else None\n",
|
|
" except Exception as e:\n",
|
|
" pass\n",
|
|
" \n",
|
|
" except Exception as e:\n",
|
|
" print(school_url, str(e))\n",
|
|
"\n",
|
|
"def main():\n",
|
|
" list_urls = [\n",
|
|
" \"https://scholenopdekaart.nl/Basisscholen/\",\n",
|
|
" \"https://scholenopdekaart.nl/middelbare-scholen/\"\n",
|
|
" ]\n",
|
|
"\n",
|
|
" list_school_data_dicts = []\n",
|
|
"\n",
|
|
" # For each category\n",
|
|
" for url in list_urls:\n",
|
|
" # Fetch the HTML content of the page\n",
|
|
" response = requests.get(url, headers=headers)\n",
|
|
" response.raise_for_status() # Raise an exception for HTTP errors\n",
|
|
" # Parse the HTML content using BeautifulSoup\n",
|
|
" soup = BeautifulSoup(response.text, 'html.parser')\n",
|
|
"\n",
|
|
" # Get category\n",
|
|
" category = url.strip(\"/\").split(\"/\")[-1].lower()\n",
|
|
"\n",
|
|
" # Find all <a> tags with href attributes\n",
|
|
" links_areas = []\n",
|
|
" for a_tag in soup.find_all('a', href=True):\n",
|
|
" href = a_tag['href']\n",
|
|
" \n",
|
|
" if (category not in href):\n",
|
|
" continue\n",
|
|
" \n",
|
|
" # Convert relative URLs to absolute URLs\n",
|
|
" area_full_url = urljoin(url, href)\n",
|
|
" links_areas.append(area_full_url)\n",
|
|
"\n",
|
|
" # Area\n",
|
|
" area = href.rstrip(\"/\").split(\"/\")[-1]\n",
|
|
"\n",
|
|
" ###############################################\n",
|
|
" # Fetch the HTML content of the page\n",
|
|
" print(\".\", end=\"\")\n",
|
|
" response = requests.get(area_full_url, headers=headers)\n",
|
|
" response.raise_for_status() # Raise an exception for HTTP errors\n",
|
|
"\n",
|
|
" # Parse the HTML content using BeautifulSoup\n",
|
|
" soup_area= BeautifulSoup(response.text, 'html.parser')\n",
|
|
"\n",
|
|
" # Get schools in area\n",
|
|
" for a_tag in soup_area.find_all('a', href=True):\n",
|
|
" href = a_tag['href']\n",
|
|
"\n",
|
|
" school_url = urljoin(url, href)\n",
|
|
" if (area_full_url not in school_url):\n",
|
|
" continue\n",
|
|
" \n",
|
|
" school_name = a_tag.text.rstrip(\".\")\n",
|
|
" school_data = {\n",
|
|
" \"category\": category,\n",
|
|
" \"area\": area,\n",
|
|
" \"name\": school_name,\n",
|
|
" \"url\": school_url,\n",
|
|
" }\n",
|
|
"\n",
|
|
" update_school_data(school_url, school_data)\n",
|
|
"\n",
|
|
" list_school_data_dicts.append(school_data)\n",
|
|
"\n",
|
|
" # Save per processed school to track progress\n",
|
|
" df = pd.DataFrame(list_school_data_dicts)\n",
|
|
" df.to_csv(\"scholenopdekaart_tmp.csv\", encoding=\"utf-8\", quoting=csv.QUOTE_ALL)\n",
|
|
"\n",
|
|
" df = pd.DataFrame(list_school_data_dicts)\n",
|
|
" df.to_csv(\"scholenopdekaart.csv\", encoding=\"utf-8\", quoting=csv.QUOTE_ALL)\n",
|
|
" # Without extra columns\n",
|
|
" df.drop(columns=[\"students_per_zipcode\", \"students_per_year_trend\", \"num_students_per_group\", \"num_students_per_age\"]).to_csv(\"scholenopdekaart_.csv\", encoding=\"utf-8\", quoting=csv.QUOTE_ALL)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"\"\"\" # Issues with URL:\n",
|
|
"https://scholenopdekaart.nl/middelbare-scholen/grave/1900/merletcollege-grave/\n",
|
|
"https://scholenopdekaart.nl/middelbare-scholen/lent/4386/citadel-college-locatie-griftdijk/\n",
|
|
"https://scholenopdekaart.nl/middelbare-scholen/nijmegen/24527/montessori-college-k33-nijmegen/\n",
|
|
"https://scholenopdekaart.nl/middelbare-scholen/nijmegen/26368/aventurijn-park-neerbosch/\n",
|
|
"https://scholenopdekaart.nl/middelbare-scholen/nijmegen/26187/kandinsky-college-voor-lyceum-havo-mavo-vbo-lwoo/\n",
|
|
"https://scholenopdekaart.nl/middelbare-scholen/nijmegen/1791/karel-de-grote-college/\n",
|
|
"https://scholenopdekaart.nl/middelbare-scholen/nijmegen/2040/mondial-college-locatie-leuvensbroek/\n",
|
|
"https://scholenopdekaart.nl/middelbare-scholen/nijmegen/2041/mondial-college-meeuwse-acker/\n",
|
|
"https://scholenopdekaart.nl/middelbare-scholen/nijmegen/2036/stedelijk-gymnasium-nijmegen/\n",
|
|
"https://scholenopdekaart.nl/middelbare-scholen/nijmegen/2038/stedelijke-scholengemeenschap-nijmegen/\n",
|
|
"https://scholenopdekaart.nl/middelbare-scholen/nijmegen/26184/yuverta-vmbo-het-groene-lyceum-nijmegen/\n",
|
|
"https://scholenopdekaart.nl/middelbare-scholen/oss/23719/het-hooghuis-locatie-mondriaan-college/\n",
|
|
"https://scholenopdekaart.nl/middelbare-scholen/oss/943/het-hooghuis-locatie-oss-stadion/\n",
|
|
"https://scholenopdekaart.nl/middelbare-scholen/oss/947/het-hooghuis-zuidwest-gebouw-west/\n",
|
|
"https://scholenopdekaart.nl/middelbare-scholen/oss/946/het-hooghuis-zuidwest-gebouw-zuid/\n",
|
|
"https://scholenopdekaart.nl/middelbare-scholen/oss/1929/het-maaslandcollege-scholengemeenschap-voor-tweetalig-mavo-havo-vwo/\n",
|
|
"https://scholenopdekaart.nl/middelbare-scholen/oss/25783/sonnewijser-unit-route-arbeid/\n",
|
|
"https://scholenopdekaart.nl/middelbare-scholen/oss/11432/sonnewijser-unit-vervolgonderwijs-oss/\n",
|
|
"https://scholenopdekaart.nl/middelbare-scholen/oss/942/titus-brandsmalyceum/\n",
|
|
"https://scholenopdekaart.nl/middelbare-scholen/velp-noord-brabant/24545/merletcollege-eerste-opvang-anderstaligen-eoa/\n",
|
|
"https://scholenopdekaart.nl/middelbare-scholen/wijchen/2018/maaswaal-college-havo-atheneum-gymnasium/\n",
|
|
"https://scholenopdekaart.nl/middelbare-scholen/wijchen/2020/maaswaal-college-vmbo-basis-kader-mavo/\n",
|
|
"https://scholenopdekaart.nl/middelbare-scholen/wijchen/1781/pro-college-wijchen/\n",
|
|
"\"\"\"\n",
|
|
"\n",
|
|
"if __name__ == \"__main__\":\n",
|
|
" main()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"\n",
|
|
"df = pd.read_csv(\"~/Downloads/scholenopdekaart.csv\", index_col=0)\n",
|
|
"\n",
|
|
"df.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def to_dict(row):\n",
|
|
" # Empty?\n",
|
|
" if (pd.isna(row)):\n",
|
|
" return {}\n",
|
|
" # Evaluate, to dict\n",
|
|
" dict_data = dict(eval(row))\n",
|
|
" # Remove None values\n",
|
|
" for k in list(dict_data.keys()):\n",
|
|
" if dict_data[k] is None:\n",
|
|
" del dict_data[k]\n",
|
|
" # Prefix\n",
|
|
" return {f\"{column}_{k}\": v for k, v in dict_data.items()}\n",
|
|
"\n",
|
|
"for column in [\"students_per_year_trend\", \"num_students_per_group\", \"num_students_per_age\"]:\n",
|
|
" print(column)\n",
|
|
" # Convert the list of tuples into a dictionary per row\n",
|
|
" df_dicts = df[column].apply(to_dict)\n",
|
|
" # Expand into separate columns\n",
|
|
" df_expanded = pd.json_normalize(df_dicts)\n",
|
|
" # Sort\n",
|
|
" df_expanded = df_expanded[sorted(df_expanded.columns)]\n",
|
|
" # Combine with original columns\n",
|
|
" df = pd.concat([df.drop(columns=[column]), df_expanded], axis=1)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def to_dict(row):\n",
|
|
" # Empty?\n",
|
|
" if (pd.isna(row)):\n",
|
|
" return {}\n",
|
|
" # Evaluate, to dict\n",
|
|
" data = eval(row)\n",
|
|
" # Remove first useless data\n",
|
|
" data = data[1:]\n",
|
|
"\n",
|
|
" # Generate dict\n",
|
|
" dict_data = {}\n",
|
|
" for (zipcode, num, percentage) in data:\n",
|
|
" dict_data[f\"num_students_zipcode_{zipcode}\"] = num\n",
|
|
" dict_data[f\"percentage_students_zipcode_{zipcode}\"] = percentage\n",
|
|
"\n",
|
|
" # Remove None values\n",
|
|
" for k in list(dict_data.keys()):\n",
|
|
" if dict_data[k] is None:\n",
|
|
" del dict_data[k]\n",
|
|
" return dict_data\n",
|
|
"\n",
|
|
"for column in [\"students_per_zipcode\"]:\n",
|
|
" print(column)\n",
|
|
" # Convert the list of tuples into a dictionary per row\n",
|
|
" df_dicts = df[column].apply(to_dict)\n",
|
|
" # Expand into separate columns\n",
|
|
" df_expanded = pd.json_normalize(df_dicts)\n",
|
|
" # Sort\n",
|
|
" df_expanded = df_expanded[sorted(df_expanded.columns)]\n",
|
|
" # Combine with original columns\n",
|
|
" df = pd.concat([df.drop(columns=[column]), df_expanded], axis=1)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df.to_csv(\"schools_nl.csv\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"list(df.columns)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "fetcher",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.12.11"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|