Schools NL, Ghost post utils, nude + age detection

Luciano Gervasoni
2025-04-30 15:50:54 +02:00
parent aa369d0458
commit ccfd0f9188
11 changed files with 841 additions and 246 deletions


@@ -12,6 +12,8 @@
"import pandas as pd\n",
"import os\n",
"import json\n",
"import csv\n",
"\n",
"\n",
"headers = {\"User-Agent\": \"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36\"}"
]
@@ -69,6 +71,154 @@
" # websites.append(href)\n",
" return href\n",
"\n",
"def get_num_students_per_zipcode(soup):\n",
" list_zipcode_students_percentage = []\n",
"\n",
" h3_tag = soup.find(\"h3\", string=\"In welk postcodegebied wonen de leerlingen van deze school?\")\n",
" if h3_tag:\n",
" dialog = h3_tag.find_parent(\"dialog\")\n",
"\n",
" if dialog:\n",
" # print(dialog.prettify())\n",
" table = dialog.find(\"table\")\n",
" if table:\n",
" rows = table.find_all(\"tr\")\n",
" for row in rows:\n",
" cells = row.find_all([\"th\", \"td\"])\n",
" row_data = [cell.get_text(strip=True) for cell in cells]\n",
" zipcode, num_students, percentage = row_data\n",
" list_zipcode_students_percentage.append( (zipcode, num_students, percentage) )\n",
" \n",
" return list_zipcode_students_percentage\n",
"\n",
"def get_num_students_trend(soup):\n",
" # Step 1: Locate the <aantal-leerlingen-trend-line-chart> tag\n",
" trend_chart_tag = soup.find(\"aantal-leerlingen-trend-line-chart\")\n",
"\n",
" if trend_chart_tag:\n",
" # Step 2: Extract the 'leerlingen-trend-data' attribute\n",
" trend_data_attr = trend_chart_tag.get(\"leerlingen-trend-data\")\n",
" \n",
" if trend_data_attr:\n",
" # Step 3: Parse the JSON string into a Python object\n",
" trend_data = json.loads(trend_data_attr)\n",
" #print(\"Extracted leerlingen-trend-data:\")\n",
" #print(json.dumps(trend_data, indent=4)) # Pretty-print the JSON data\n",
" return [ (e.get(\"key\"), e.get(\"aantal\") ) for e in trend_data]\n",
"\n",
"def get_num_students_per_age_and_group(soup):\n",
" num_students_per_group, num_students_per_age = [], []\n",
" ############################################################################\n",
" # Step 1: Locate the <aantal-leerlingen-leeftijd-bar-chart> tag\n",
" chart_tag = soup.find('aantal-leerlingen-leeftijd-bar-chart', attrs={'aantal-per-leeftijd': True})\n",
" # Step 2: Extract the 'aantal-per-leeftijd' attribute\n",
" raw_data = chart_tag['aantal-per-leeftijd']\n",
"\n",
" # Step 3: Parse the JSON data\n",
" try:\n",
" data = json.loads(raw_data)\n",
" # Step 4: Print the extracted data\n",
" # print(\"Aantal per Leeftijd:\")\n",
" for entry in data:\n",
" age = entry['key']\n",
" num_students = entry['aantal']\n",
" # school_data[\"num_students_age_{}\".format(age)] = num_students\n",
" num_students_per_age.append( (age, num_students) )\n",
" # print(f\"Age {entry['key']}: {entry['aantal']} leerlingen\")\n",
" except json.JSONDecodeError as e:\n",
" print(f\"Failed to parse JSON data: {e}\")\n",
"\n",
" ############################################################################\n",
" # Step 1: Locate the <aantal-leerlingen-leerjaar-bar-chart> tag\n",
" chart_tag = soup.find('aantal-leerlingen-leerjaar-bar-chart', attrs={'aantal-per-leerjaar': True})\n",
"\n",
" if not chart_tag:\n",
" print(\"Could not find the 'aantal per leerjaar' section.\")\n",
" else:\n",
" # Step 2: Extract the 'aantal-per-leerjaar' attribute\n",
" raw_data = chart_tag['aantal-per-leerjaar']\n",
" \n",
" # Step 3: Parse the JSON data\n",
" try:\n",
" data = json.loads(raw_data)\n",
" # Step 4: Print the extracted data\n",
" # print(\"Aantal per Leerjaar:\")\n",
" for entry in data:\n",
" group = entry['key']\n",
" num_students = entry['aantal']\n",
" # school_data[\"num_students_group_{}\".format(group)] = num_students\n",
" num_students_per_group.append( (group, num_students) )\n",
" # print(f\"Groep {entry['key']}: {entry['aantal']} leerlingen\")\n",
" except json.JSONDecodeError as e:\n",
" print(f\"Failed to parse JSON data: {e}\")\n",
" ############################################################################\n",
" return num_students_per_group, num_students_per_age\n",
"\n",
"\n",
"def update_school_data(school_url, school_data):\n",
" try:\n",
" # Process school (request contact details)\n",
" response = requests.get(os.path.join(school_url, \"contact/#inhoud\"), headers=headers)\n",
" response.raise_for_status() # Raise an exception for HTTP errors\n",
" # Parse the HTML content using BeautifulSoup\n",
" soup_school = BeautifulSoup(response.text, 'html.parser')\n",
"\n",
" # School details\n",
" school_details = soup_school.find(class_=\"school-details\")\n",
" for category_idx, li_detail in enumerate(school_details.find_all(\"li\")):\n",
" data = li_detail.find('span', class_='infotip-term')['data-dfn']\n",
" text = li_detail.get_text(strip=True)\n",
" # Set data\n",
" school_data[\"category_{}\".format(category_idx)] = text\n",
" school_data[\"category_{}_description\".format(category_idx)] = data\n",
" \n",
" school_address = soup_school.find(class_=\"school-adres\").get_text(strip=True)\n",
" school_postcode_city = soup_school.find(class_=\"school-postcode-woonplaats\").get_text(strip=True)\n",
" school_postcode = \"\".join(school_postcode_city.split(\" \")[:2])\n",
" school_city = \" \".join(school_postcode_city.split(\" \")[2:])\n",
"\n",
" school_data[\"city\"] = school_city\n",
" school_data[\"postcode\"] = school_postcode\n",
" school_data[\"address\"] = school_address\n",
"\n",
" try:\n",
" school_data[\"website\"] = find_website(soup_school) # soup_school.find(class_=\"button button-primary\").get('href')\n",
" except Exception as e:\n",
" pass\n",
" try:\n",
" school_data[\"phone\"] = soup_school.find('a', href=lambda href: href and href.startswith('tel:')).text\n",
" except Exception as e:\n",
" pass\n",
" try:\n",
" school_data[\"email\"] = extract_emails(soup_school)\n",
" except Exception as e:\n",
" pass\n",
"\n",
" # Process school main site\n",
" response = requests.get(os.path.join(school_url), headers=headers)\n",
" response.raise_for_status() # Raise an exception for HTTP errors\n",
" # Parse the HTML content using BeautifulSoup\n",
" soup_school = BeautifulSoup(response.text, 'html.parser')\n",
"\n",
" try:\n",
" school_data[\"students_per_zipcode\"] = get_num_students_per_zipcode(soup_school)\n",
" except Exception as e:\n",
" pass\n",
" try:\n",
" school_data[\"students_per_year_trend\"] = get_num_students_trend(soup_school)\n",
" except Exception as e:\n",
" pass\n",
"\n",
" if (school_data.get(\"category\").lower() == \"basisscholen\"):\n",
" try:\n",
" num_students_per_group, num_students_per_age = get_num_students_per_age_and_group(soup_school)\n",
" school_data[\"num_students_per_group\"] = num_students_per_group if len(num_students_per_group)>0 else None\n",
" school_data[\"num_students_per_age\"] = num_students_per_age if len(num_students_per_age)>0 else None\n",
" except Exception as e:\n",
" pass\n",
" \n",
" except Exception as e:\n",
" print(school_url, str(e))\n",
"\n",
"def main():\n",
" list_urls = [\n",
@@ -129,100 +279,26 @@
" \"url\": school_url,\n",
" }\n",
"\n",
" try:\n",
" # Process school (request contact details)\n",
" response = requests.get(os.path.join(school_url, \"contact/#inhoud\"), headers=headers)\n",
" response.raise_for_status() # Raise an exception for HTTP errors\n",
"\n",
" # Parse the HTML content using BeautifulSoup\n",
" soup_school = BeautifulSoup(response.text, 'html.parser')\n",
"\n",
" # School details\n",
" school_details = soup_school.find(class_=\"school-details\")\n",
" for category_idx, li_detail in enumerate(school_details.find_all(\"li\")):\n",
" data = li_detail.find('span', class_='infotip-term')['data-dfn']\n",
" text = li_detail.get_text(strip=True)\n",
" # Set data\n",
" school_data[\"category_{}\".format(category_idx)] = text\n",
" school_data[\"category_{}_description\".format(category_idx)] = data\n",
" \n",
" school_address = soup_school.find(class_=\"school-adres\").get_text(strip=True)\n",
" school_postcode_city = soup_school.find(class_=\"school-postcode-woonplaats\").get_text(strip=True)\n",
" school_postcode = \"\".join(school_postcode_city.split(\" \")[:2])\n",
" school_city = \" \".join(school_postcode_city.split(\" \")[2:])\n",
"\n",
" school_data[\"city\"] = school_city\n",
" school_data[\"postcode\"] = school_postcode\n",
" school_data[\"address\"] = school_address\n",
"\n",
" try:\n",
" school_data[\"website\"] = find_website(soup_school) # soup_school.find(class_=\"button button-primary\").get('href')\n",
" except Exception as e:\n",
" pass\n",
" try:\n",
" school_data[\"phone\"] = soup_school.find('a', href=lambda href: href and href.startswith('tel:')).text\n",
" except Exception as e:\n",
" pass\n",
" try:\n",
" school_data[\"email\"] = extract_emails(soup_school)\n",
" except Exception as e:\n",
" pass\n",
"\n",
" if (category.lower() == \"basisscholen\"):\n",
" ############################################################################\n",
" # Step 1: Locate the <aantal-leerlingen-leeftijd-bar-chart> tag\n",
" chart_tag = soup.find('aantal-leerlingen-leeftijd-bar-chart', attrs={'aantal-per-leeftijd': True})\n",
" # Step 2: Extract the 'aantal-per-leeftijd' attribute\n",
" raw_data = chart_tag['aantal-per-leeftijd']\n",
"\n",
" # Step 3: Parse the JSON data\n",
" try:\n",
" data = json.loads(raw_data)\n",
" # Step 4: Print the extracted data\n",
" print(\"Aantal per Leeftijd:\")\n",
" for entry in data:\n",
" age = entry['key']\n",
" num_students = entry['aantal']\n",
" school_data[\"num_students_age_{}\".format(age)] = num_students\n",
" # print(f\"Age {entry['key']}: {entry['aantal']} leerlingen\")\n",
" except json.JSONDecodeError as e:\n",
" print(f\"Failed to parse JSON data: {e}\")\n",
"\n",
" # Step 1: Locate the <aantal-leerlingen-leerjaar-bar-chart> tag\n",
" chart_tag = soup.find('aantal-leerlingen-leerjaar-bar-chart', attrs={'aantal-per-leerjaar': True})\n",
"\n",
" ############################################################################\n",
" # Step 1: Locate the <aantal-leerlingen-leerjaar-bar-chart> tag\n",
" chart_tag = soup.find('aantal-leerlingen-leerjaar-bar-chart', attrs={'aantal-per-leerjaar': True})\n",
"\n",
" if not chart_tag:\n",
" print(\"Could not find the 'aantal per leerjaar' section.\")\n",
" else:\n",
" # Step 2: Extract the 'aantal-per-leerjaar' attribute\n",
" raw_data = chart_tag['aantal-per-leerjaar']\n",
" \n",
" # Step 3: Parse the JSON data\n",
" try:\n",
" data = json.loads(raw_data)\n",
" # Step 4: Print the extracted data\n",
" print(\"Aantal per Leerjaar:\")\n",
" for entry in data:\n",
" group = entry['key']\n",
" num_students = entry['aantal']\n",
" school_data[\"num_students_group_{}\".format(group)] = num_students\n",
" print(f\"Groep {entry['key']}: {entry['aantal']} leerlingen\")\n",
" except json.JSONDecodeError as e:\n",
" print(f\"Failed to parse JSON data: {e}\")\n",
" ############################################################################\n",
" except Exception as e:\n",
" print(school_url, str(e))\n",
" # assert False\n",
" update_school_data(school_url, school_data)\n",
"\n",
" list_school_data_dicts.append(school_data)\n",
"\n",
" df = pd.DataFrame(list_school_data_dicts)\n",
" df.to_csv(\"scholenopdekaart.csv\")\n",
" # Save per processed school to track progress\n",
" df = pd.DataFrame(list_school_data_dicts)\n",
" df.to_csv(\"scholenopdekaart_tmp.csv\", encoding=\"utf-8\", quoting=csv.QUOTE_ALL)\n",
"\n",
" df = pd.DataFrame(list_school_data_dicts)\n",
" df.to_csv(\"scholenopdekaart.csv\", encoding=\"utf-8\", quoting=csv.QUOTE_ALL)\n",
" # Without extra columns\n",
" df.drop(columns=[\"students_per_zipcode\", \"students_per_year_trend\", \"num_students_per_group\", \"num_students_per_age\"]).to_csv(\"scholenopdekaart_.csv\", encoding=\"utf-8\", quoting=csv.QUOTE_ALL)\n"
]
},
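{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hedged sketch: resume an interrupted crawl from the incremental dump written by\n",
"# main() above. Assumes scholenopdekaart_tmp.csv exists from a previous run.\n",
"import os\n",
"\n",
"import pandas as pd\n",
"\n",
"if os.path.exists(\"scholenopdekaart_tmp.csv\"):\n",
"    done = pd.read_csv(\"scholenopdekaart_tmp.csv\", index_col=0)\n",
"    processed_urls = set(done[\"url\"])  # URLs to skip on the next pass\n",
"    print(f\"{len(processed_urls)} schools already processed\")"
]
},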
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\"\"\" # Issues with URL:\n",
"https://scholenopdekaart.nl/middelbare-scholen/grave/1900/merletcollege-grave/\n",
"https://scholenopdekaart.nl/middelbare-scholen/lent/4386/citadel-college-locatie-griftdijk/\n",
@@ -259,24 +335,8 @@
"metadata": {},
"outputs": [],
"source": [
"'''\n",
"school_url = \"https://scholenopdekaart.nl/basisscholen/aalden/9661/christelijke-basisschool-de-schutse/\"\n",
"response = requests.get(os.path.join(school_url, \"contact/#inhoud\"), headers=headers)\n",
"# Parse the HTML content using BeautifulSoup\n",
"soup_school = BeautifulSoup(response.text, 'html.parser')\n",
"soup_school\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.read_csv(\"scholenopdekaart.csv\", index_col=0)\n",
"\n",
"df.head()"
]
},
@@ -288,122 +348,6 @@
"source": [
"df.tail()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# https://scholenopdekaart.nl/middelbare-scholen/hardenberg/26614/de-ambelt/\n",
"# From which zip codes the students come\n",
"# How many kids passed the exams', ...\n",
"\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import os\n",
"\n",
"url = \"https://scholenopdekaart.nl/middelbare-scholen/hardenberg/26614/de-ambelt/\"\n",
"response = requests.get( os.path.join(url, \"#inhoud\") )\n",
"\n",
"\n",
"url = \"https://scholenopdekaart.nl/basisscholen/aalden/9661/christelijke-basisschool-de-schutse/\"\n",
"response = requests.get(url)\n",
"\n",
"soup = BeautifulSoup(response.content, 'html.parser')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Locate the section containing \"Welke profielen volgen de examendeelnemers?\"\n",
"section_header = soup.find('h3', string=\"Welke profielen volgen de examendeelnemers?\")\n",
"if not section_header:\n",
" raise ValueError(\"Section 'Welke profielen volgen de examendeelnemers?' not found in the HTML.\")\n",
"\n",
"# Navigate to the parent section or subsection\n",
"section = section_header.find_parent('section')\n",
"if not section:\n",
" raise ValueError(\"Parent section for 'Welke profielen volgen de examendeelnemers?' not found.\")\n",
"\n",
"# Check if the section contains a message indicating no data is available\n",
"no_data_message = section.find('p', string=\"Deze informatie is voor deze school niet bekend.\")\n",
"if no_data_message:\n",
" print(\"No data available for 'Welke profielen volgen de examendeelnemers?'.\")\n",
"else:\n",
" # Extract the relevant content (e.g., tables, lists, or paragraphs)\n",
" content = []\n",
" for element in section.find_all(['p', 'table', 'ul', 'ol']):\n",
" if element.name == 'table':\n",
" # Extract table rows\n",
" rows = element.find_all('tr')\n",
" for row in rows:\n",
" cells = row.find_all(['th', 'td'])\n",
" row_data = [cell.get_text(strip=True) for cell in cells]\n",
" content.append(row_data)\n",
" else:\n",
" # Extract text from paragraphs, lists, etc.\n",
" content.append(element.get_text(strip=True))\n",
"\n",
" # Print the extracted content\n",
" for item in content:\n",
" print(item)"
]
},
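{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hedged sketch: the section extraction above, generalized into a helper.\n",
"# extract_section is a hypothetical name; it assumes the same page layout\n",
"# (an <h3> title inside a <section>) as the cell above. Tables would need\n",
"# row-wise handling as shown there.\n",
"def extract_section(soup, title):\n",
"    header = soup.find('h3', string=title)\n",
"    if not header:\n",
"        return None\n",
"    section = header.find_parent('section')\n",
"    if not section or section.find('p', string=\"Deze informatie is voor deze school niet bekend.\"):\n",
"        return None\n",
"    return [el.get_text(strip=True) for el in section.find_all(['p', 'ul', 'ol'])]\n",
"\n",
"# e.g. extract_section(soup, \"Welke profielen volgen de examendeelnemers?\")"
]
},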
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Locate the dialog containing the table\n",
"dialog = soup.find('dialog', class_='modal modal-dialog')\n",
"if not dialog:\n",
" raise ValueError(\"Dialog element not found in the HTML.\")\n",
"\n",
"# Locate the table within the dialog\n",
"table = dialog.find('table')\n",
"if not table:\n",
" raise ValueError(\"Table element not found within the dialog.\")\n",
"\n",
"# Extract table headers\n",
"headers = [header.get_text(strip=True) for header in table.find_all('th')]\n",
"\n",
"# Extract table rows\n",
"data = []\n",
"for row in table.find_all('tr')[1:]: # Skip the header row\n",
" cells = row.find_all('td')\n",
" if len(cells) == len(headers): # Ensure the row matches the expected structure\n",
" row_data = {\n",
" headers[0]: cells[0].get_text(strip=True), # Postcodegebied\n",
" headers[1]: cells[1].get_text(strip=True), # Aantal leerlingen\n",
" headers[2]: cells[2].get_text(strip=True) # Percentage\n",
" }\n",
" data.append(row_data)\n",
"\n",
"# Print the extracted data\n",
"for entry in data:\n",
" print(entry)"
]
}
],
"metadata": {