In [None]:
import csv
import json
import logging
import os
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup


# Browser-like User-Agent passed as ``headers=`` to every requests.get call in this file.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    ),
}

In [None]:
# Function to decode Cloudflare-protected emails
def decode_email(encoded_email):
    """
    Reverse Cloudflare's e-mail obfuscation.

    The first two hex digits are the XOR key; every following pair of hex
    digits, XOR-ed with that key, gives the code point of one character.
    :param encoded_email: The hex string taken from the data-cfemail attribute.
    :return: The decoded email address.
    """
    xor_key = int(encoded_email[:2], 16)
    hex_pairs = (
        encoded_email[pos:pos + 2]
        for pos in range(2, len(encoded_email), 2)
    )
    return "".join(chr(int(pair, 16) ^ xor_key) for pair in hex_pairs)

def extract_emails(soup):
    """
    Collect the e-mail addresses found in a parsed page.

    Two sources are scanned, in this order: ``<a href="mailto:...">`` links
    and Cloudflare-protected ``<span class="__cf_email__" data-cfemail="...">``
    elements (decoded with ``decode_email``). Duplicates are removed while
    keeping the order of first appearance, so the result is deterministic.
    :param soup: Parsed page exposing BeautifulSoup's ``find_all``.
    :return: ``None`` when no address is found, a single string when exactly
        one distinct address is found, otherwise a list of strings.
    """
    found = []

    # Visible addresses: drop the scheme and any "?subject=..." header part.
    for link in soup.find_all('a', href=lambda href: href and href.startswith('mailto:')):
        address = link['href'][len('mailto:'):].split('?', 1)[0].strip()
        found.append(address)

    # Cloudflare-protected addresses.
    for span in soup.find_all('span', class_='__cf_email__', attrs={'data-cfemail': True}):
        try:
            found.append(decode_email(span['data-cfemail']))
        except ValueError:
            # A malformed token must not discard the addresses already found.
            continue

    # dict.fromkeys de-duplicates and preserves insertion order; empty
    # strings (e.g. from a bare "mailto:") are not addresses.
    unique_emails = [address for address in dict.fromkeys(found) if address]

    if not unique_emails:
        return None
    if len(unique_emails) == 1:
        return unique_emails[0]
    return unique_emails

def find_website(soup_school):
    """
    Return the first absolute http(s) link inside the contact list.

    Only links located inside the element with class
    ``dl-horizontal dl-icons`` are considered.
    :param soup_school: Parsed school contact page (BeautifulSoup).
    :return: The href of the first link starting with ``http://`` or
        ``https://``; ``None`` when the container or such a link is missing.
    """
    contact_list = soup_school.find(class_="dl-horizontal dl-icons")
    if contact_list is None:
        # find() returns None when nothing matches; report "no website"
        # instead of failing with an AttributeError.
        return None

    for link in contact_list.find_all('a', href=True):
        href = link['href']
        # Skip mailto:, tel: and relative links.
        if href.startswith(('http://', 'https://')):
            return href
    return None

def get_num_students_per_zipcode(soup):
    """
    Read the "students per postcode area" table from a school page.

    The table is looked up inside the ``<dialog>`` that contains the
    ``<h3>`` titled "In welk postcodegebied wonen de leerlingen van deze
    school?".
    :param soup: Parsed school page (BeautifulSoup).
    :return: List of ``(zipcode, num_students, percentage)`` tuples holding
        the stripped cell texts; empty when the heading, dialog or table is
        missing.
    """
    heading = soup.find("h3", string="In welk postcodegebied wonen de leerlingen van deze school?")
    if heading is None:
        return []

    dialog = heading.find_parent("dialog")
    if dialog is None:
        return []

    table = dialog.find("table")
    if table is None:
        return []

    list_zipcode_students_percentage = []
    for row in table.find_all("tr"):
        row_data = [cell.get_text(strip=True) for cell in row.find_all(["th", "td"])]
        # Skip rows that do not have exactly three cells rather than letting
        # one odd row raise and discard the whole table.
        # NOTE(review): a three-cell header row made of <th> is still
        # returned as data, as before - confirm against the page markup.
        if len(row_data) != 3:
            continue
        list_zipcode_students_percentage.append(tuple(row_data))

    return list_zipcode_students_percentage

def get_num_students_trend(soup):
    """
    Extract the student-count trend embedded in a school page.

    The data is the JSON stored in the ``leerlingen-trend-data`` attribute
    of the ``<aantal-leerlingen-trend-line-chart>`` element.
    :param soup: Parsed school page (BeautifulSoup).
    :return: List of ``(key, aantal)`` tuples, one per JSON entry, or
        ``None`` when the element or its attribute is missing or empty.
    """
    chart = soup.find("aantal-leerlingen-trend-line-chart")
    if not chart:
        return None

    raw_json = chart.get("leerlingen-trend-data")
    if not raw_json:
        return None

    trend = []
    for point in json.loads(raw_json):
        trend.append((point.get("key"), point.get("aantal")))
    return trend

def _read_chart_counts(soup, tag_name, attr_name):
    """
    Read ``(key, aantal)`` pairs from the JSON attribute of a chart element.

    :param soup: Parsed school page (BeautifulSoup).
    :param tag_name: Name of the custom chart element to look for.
    :param attr_name: Attribute of that element holding the JSON array.
    :return: ``None`` when no element carrying the attribute exists; an
        empty list when the JSON cannot be parsed (a message is printed);
        otherwise a list of ``(entry['key'], entry['aantal'])`` tuples.
    """
    chart_tag = soup.find(tag_name, attrs={attr_name: True})
    if not chart_tag:
        return None

    try:
        entries = json.loads(chart_tag[attr_name])
    except json.JSONDecodeError as e:
        print(f"Failed to parse JSON data: {e}")
        return []

    return [(entry['key'], entry['aantal']) for entry in entries]


def get_num_students_per_age_and_group(soup):
    """
    Extract student counts per group (leerjaar) and per age (leeftijd).

    Both charts are handled the same way: a missing chart is reported on
    stdout and yields an empty list, so one absent chart no longer prevents
    the other from being returned.
    :param soup: Parsed school page (BeautifulSoup).
    :return: Tuple ``(num_students_per_group, num_students_per_age)``, each
        a list of ``(key, aantal)`` tuples.
    """
    num_students_per_age = _read_chart_counts(
        soup, 'aantal-leerlingen-leeftijd-bar-chart', 'aantal-per-leeftijd'
    )
    if num_students_per_age is None:
        print("Could not find the 'aantal per leeftijd' section.")
        num_students_per_age = []

    num_students_per_group = _read_chart_counts(
        soup, 'aantal-leerlingen-leerjaar-bar-chart', 'aantal-per-leerjaar'
    )
    if num_students_per_group is None:
        print("Could not find the 'aantal per leerjaar' section.")
        num_students_per_group = []

    return num_students_per_group, num_students_per_age


def _fetch_soup(url, timeout=30):
    """
    Download *url* and parse the response body with BeautifulSoup.

    :param url: Address to request; the module-level ``headers`` are sent.
    :param timeout: Seconds to wait for the server before giving up.
    :return: ``BeautifulSoup`` tree built with the ``html.parser`` backend.
    :raises requests.RequestException: on connection problems, time-outs or
        an HTTP error status.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


def _log_skipped_field(school_url, field, error):
    """Record at DEBUG level that an optional field could not be extracted."""
    logging.getLogger(__name__).debug(
        "%s: could not extract %s: %s", school_url, field, error
    )


def _set_best_effort(school_data, school_url, field, extractor):
    """Store ``extractor()`` under *field*; leave the key unset on failure."""
    try:
        school_data[field] = extractor()
    except Exception as e:
        # Optional field: a failure here must not abort the whole school.
        _log_skipped_field(school_url, field, e)


def _fill_contact_details(soup_contact, school_url, school_data):
    """Copy category labels, address, website, phone and e-mail into *school_data*."""
    school_details = soup_contact.find(class_="school-details")
    if school_details is not None:
        for category_idx, li_detail in enumerate(school_details.find_all("li")):
            term = li_detail.find('span', class_='infotip-term')
            school_data["category_{}".format(category_idx)] = li_detail.get_text(strip=True)
            school_data["category_{}_description".format(category_idx)] = (
                term.get('data-dfn') if term is not None else None
            )

    postcode_city_tag = soup_contact.find(class_="school-postcode-woonplaats")
    if postcode_city_tag is not None:
        # The first two space-separated tokens are joined into the postcode;
        # the remaining tokens form the city name.
        parts = postcode_city_tag.get_text(strip=True).split(" ")
        school_data["city"] = " ".join(parts[2:])
        school_data["postcode"] = "".join(parts[:2])

    address_tag = soup_contact.find(class_="school-adres")
    if address_tag is not None:
        school_data["address"] = address_tag.get_text(strip=True)

    _set_best_effort(school_data, school_url, "website",
                     lambda: find_website(soup_contact))

    phone_link = soup_contact.find('a', href=lambda href: href and href.startswith('tel:'))
    if phone_link is not None:
        school_data["phone"] = phone_link.text

    _set_best_effort(school_data, school_url, "email",
                     lambda: extract_emails(soup_contact))


def _fill_student_statistics(soup_main, school_url, school_data):
    """Copy the student statistics of the school's main page into *school_data*."""
    _set_best_effort(school_data, school_url, "students_per_zipcode",
                     lambda: get_num_students_per_zipcode(soup_main))
    _set_best_effort(school_data, school_url, "students_per_year_trend",
                     lambda: get_num_students_trend(soup_main))

    # The per-group / per-age charts are only read for primary schools.
    if (school_data.get("category") or "").lower() != "basisscholen":
        return

    try:
        per_group, per_age = get_num_students_per_age_and_group(soup_main)
    except Exception as e:
        _log_skipped_field(school_url, "num_students_per_group/num_students_per_age", e)
        return
    # Empty lists are stored as None.
    school_data["num_students_per_group"] = per_group or None
    school_data["num_students_per_age"] = per_age or None


def update_school_data(school_url, school_data):
    """
    Enrich *school_data* in place with details scraped for one school.

    Two pages are requested: ``<school_url>/contact/#inhoud`` for contact
    details and ``school_url`` itself for the student statistics. Every
    field is optional: a missing element only leaves that field out, and
    the reason is logged at DEBUG level. Request failures, and any other
    unexpected error, are printed as ``<school_url> <error>`` and end the
    processing of this school; nothing is raised to the caller.
    :param school_url: Absolute URL of the school's main page.
    :param school_data: Dict to update; its ``"category"`` entry decides
        whether the per-group / per-age statistics are read.
    :return: ``None``.
    """
    try:
        # URLs are joined with "/" explicitly; os.path.join is meant for
        # file-system paths and uses the platform's separator.
        contact_url = school_url.rstrip("/") + "/contact/#inhoud"
        soup_contact = _fetch_soup(contact_url)
        _fill_contact_details(soup_contact, school_url, school_data)

        soup_main = _fetch_soup(school_url)
        _fill_student_statistics(soup_main, school_url, school_data)
    except Exception as e:
        print(school_url, str(e))

def main():
    """
    Crawl scholenopdekaart.nl and write the collected school data to CSV.

    For each category page, every link whose href contains the category
    name is treated as an area page, and every link on an area page whose
    absolute URL contains the area URL is treated as a school. Each area
    and school URL is processed once. Area pages that cannot be fetched are
    reported on stdout and skipped; a failing category page still raises.

    Files written (UTF-8, all fields quoted):
    - ``scholenopdekaart_tmp.csv``: rewritten after every school, so
      progress survives an interrupted run;
    - ``scholenopdekaart.csv``: all collected columns;
    - ``scholenopdekaart_.csv``: the same without the nested statistics
      columns.
    :return: ``None``.
    """
    list_urls = [
        "https://scholenopdekaart.nl/Basisscholen/",
        "https://scholenopdekaart.nl/middelbare-scholen/"
    ]
    # Seconds to wait for a response; without a timeout a stalled
    # connection would block the crawl indefinitely.
    request_timeout = 30
    extra_columns = [
        "students_per_zipcode",
        "students_per_year_trend",
        "num_students_per_group",
        "num_students_per_age",
    ]

    list_school_data_dicts = []
    seen_area_urls = set()
    seen_school_urls = set()

    # For each category
    for url in list_urls:
        response = requests.get(url, headers=headers, timeout=request_timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors
        soup = BeautifulSoup(response.text, 'html.parser')

        # The category is the last path segment of the category URL.
        category = url.strip("/").split("/")[-1].lower()

        for area_tag in soup.find_all('a', href=True):
            area_href = area_tag['href']
            if category not in area_href:
                continue

            # Convert relative URLs to absolute URLs and visit each area once.
            area_full_url = urljoin(url, area_href)
            if area_full_url in seen_area_urls:
                continue
            seen_area_urls.add(area_full_url)

            area = area_href.rstrip("/").split("/")[-1]

            print(".", end="")
            try:
                response = requests.get(area_full_url, headers=headers, timeout=request_timeout)
                response.raise_for_status()
            except requests.RequestException as e:
                # One unreachable area must not abort the whole crawl.
                print(area_full_url, str(e))
                continue
            soup_area = BeautifulSoup(response.text, 'html.parser')

            # Get schools in area
            for school_tag in soup_area.find_all('a', href=True):
                # Links on the area page are relative to the area page.
                school_url = urljoin(area_full_url, school_tag['href'])
                if area_full_url not in school_url:
                    continue
                # Skip the area page's link to itself and repeated links.
                if school_url == area_full_url or school_url in seen_school_urls:
                    continue
                seen_school_urls.add(school_url)

                school_data = {
                    "category": category,
                    "area": area,
                    "name": school_tag.text.rstrip("."),
                    "url": school_url,
                }

                update_school_data(school_url, school_data)
                list_school_data_dicts.append(school_data)

                # Save per processed school to track progress.
                # NOTE: this rewrites the whole file each time.
                pd.DataFrame(list_school_data_dicts).to_csv(
                    "scholenopdekaart_tmp.csv", encoding="utf-8", quoting=csv.QUOTE_ALL
                )

    df = pd.DataFrame(list_school_data_dicts)
    df.to_csv("scholenopdekaart.csv", encoding="utf-8", quoting=csv.QUOTE_ALL)
    # Without extra columns; errors="ignore" because an optional column only
    # exists if at least one school provided it.
    df.drop(columns=extra_columns, errors="ignore").to_csv(
        "scholenopdekaart_.csv", encoding="utf-8", quoting=csv.QUOTE_ALL
    )


In [None]:
""" # Issues with URL:
https://scholenopdekaart.nl/middelbare-scholen/grave/1900/merletcollege-grave/
https://scholenopdekaart.nl/middelbare-scholen/lent/4386/citadel-college-locatie-griftdijk/
https://scholenopdekaart.nl/middelbare-scholen/nijmegen/24527/montessori-college-k33-nijmegen/
https://scholenopdekaart.nl/middelbare-scholen/nijmegen/26368/aventurijn-park-neerbosch/
https://scholenopdekaart.nl/middelbare-scholen/nijmegen/26187/kandinsky-college-voor-lyceum-havo-mavo-vbo-lwoo/
https://scholenopdekaart.nl/middelbare-scholen/nijmegen/1791/karel-de-grote-college/
https://scholenopdekaart.nl/middelbare-scholen/nijmegen/2040/mondial-college-locatie-leuvensbroek/
https://scholenopdekaart.nl/middelbare-scholen/nijmegen/2041/mondial-college-meeuwse-acker/
https://scholenopdekaart.nl/middelbare-scholen/nijmegen/2036/stedelijk-gymnasium-nijmegen/
https://scholenopdekaart.nl/middelbare-scholen/nijmegen/2038/stedelijke-scholengemeenschap-nijmegen/
https://scholenopdekaart.nl/middelbare-scholen/nijmegen/26184/yuverta-vmbo-het-groene-lyceum-nijmegen/
https://scholenopdekaart.nl/middelbare-scholen/oss/23719/het-hooghuis-locatie-mondriaan-college/
https://scholenopdekaart.nl/middelbare-scholen/oss/943/het-hooghuis-locatie-oss-stadion/
https://scholenopdekaart.nl/middelbare-scholen/oss/947/het-hooghuis-zuidwest-gebouw-west/
https://scholenopdekaart.nl/middelbare-scholen/oss/946/het-hooghuis-zuidwest-gebouw-zuid/
https://scholenopdekaart.nl/middelbare-scholen/oss/1929/het-maaslandcollege-scholengemeenschap-voor-tweetalig-mavo-havo-vwo/
https://scholenopdekaart.nl/middelbare-scholen/oss/25783/sonnewijser-unit-route-arbeid/
https://scholenopdekaart.nl/middelbare-scholen/oss/11432/sonnewijser-unit-vervolgonderwijs-oss/
https://scholenopdekaart.nl/middelbare-scholen/oss/942/titus-brandsmalyceum/
https://scholenopdekaart.nl/middelbare-scholen/velp-noord-brabant/24545/merletcollege-eerste-opvang-anderstaligen-eoa/
https://scholenopdekaart.nl/middelbare-scholen/wijchen/2018/maaswaal-college-havo-atheneum-gymnasium/
https://scholenopdekaart.nl/middelbare-scholen/wijchen/2020/maaswaal-college-vmbo-basis-kader-mavo/
https://scholenopdekaart.nl/middelbare-scholen/wijchen/1781/pro-college-wijchen/
"""

# Start the crawl only when this file runs as the main module, not on import.
if __name__ == "__main__":
    main()

In [None]:
# Load the full result file that main() writes; the first CSV column is used as the index.
df = pd.read_csv("scholenopdekaart.csv", index_col=0)

# Preview the first rows.
df.head()

In [None]:
# Preview the last rows.
df.tail()