diff --git a/.env b/.env index 217212b..c97170f 100644 --- a/.env +++ b/.env @@ -50,3 +50,8 @@ PATH_LOGS_DIRECTORY=/opt/logs # Deploy resources per App DEPLOY_CPUS=2 DEPLOY_RAM=4G + +# Ghost +GHOST_ADMIN_API_URL=https://news.matitos.org/ghost/api/admin/ +GHOST_ADMIN_API_KEY=67fffe1b8a57a80001ecec5b:59f580020c196f92e05e208d288702082f8edad6366e2b2c8940b54e41cc355a +PEXELS_API_KEY=Y6clJkY32eihf34ukX4JsINYu9lzxh3xDdNq2HMAmGwXp0a0tt6vr6S9 diff --git a/app_urls/fetcher/middleware/__init__.py b/app_urls/fetcher/middleware/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app_urls/fetcher/middleware/favicon.py b/app_urls/fetcher/middleware/favicon.py index 3abd60c..601943e 100644 --- a/app_urls/fetcher/middleware/favicon.py +++ b/app_urls/fetcher/middleware/favicon.py @@ -1,14 +1,5 @@ from django.utils.deprecation import MiddlewareMixin -''' -class FaviconMiddleware(MiddlewareMixin): - def process_response(self, request, response): - if 'text/html' in response.get('Content-Type', '') and b'' in response.content: - icon_link = b'\n' - response.content = response.content.replace(b'', icon_link + b'') - return response -''' - class FaviconMiddleware(MiddlewareMixin): def process_response(self, request, response): if 'text/html' in response.get('Content-Type', '') and b'' in response.content: diff --git a/app_urls/fetcher/src/__init__.py b/app_urls/fetcher/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app_urls/fetcher/src/db_utils.py b/app_urls/fetcher/src/db_utils.py index ff5e791..af06bc1 100644 --- a/app_urls/fetcher/src/db_utils.py +++ b/app_urls/fetcher/src/db_utils.py @@ -4,7 +4,7 @@ from django.core.cache import cache from django.db import IntegrityError from django.utils import timezone from datetime import timedelta -from .url_processor import process_url, get_with_protocol +from .fetch_utils_url_processor import process_url, get_with_protocol import re import os import traceback diff --git a/app_urls/fetcher/src/fetch_parser.py b/app_urls/fetcher/src/fetch_parser.py index 321c0f7..c3bd60c 100644 --- a/app_urls/fetcher/src/fetch_parser.py +++ b/app_urls/fetcher/src/fetch_parser.py @@ -1,6 +1,6 @@ from .db_utils import DB_Handler from ..models import Search, Source -from .url_processor import get_with_protocol, url_host_slowdown +from .fetch_utils_url_processor import get_with_protocol, url_host_slowdown import newspaper import traceback from .logger import get_logger diff --git a/app_urls/fetcher/src/fetch_search_instances.py b/app_urls/fetcher/src/fetch_search_instances.py index 023bf2a..d188976 100644 --- a/app_urls/fetcher/src/fetch_search_instances.py +++ b/app_urls/fetcher/src/fetch_search_instances.py @@ -4,7 +4,7 @@ import os from django.utils import timezone from datetime import timedelta from ..models import Search, Source -from .fetch_utils import decode_gnews_urls +from .fetch_utils_gnews import decode_gnews_urls from .logger import get_logger logger = get_logger() diff --git a/app_urls/fetcher/src/fetch_utils.py b/app_urls/fetcher/src/fetch_utils_gnews.py similarity index 100% rename from app_urls/fetcher/src/fetch_utils.py rename to app_urls/fetcher/src/fetch_utils_gnews.py diff --git a/app_urls/fetcher/src/url_processor.py b/app_urls/fetcher/src/fetch_utils_url_processor.py similarity index 100% rename from app_urls/fetcher/src/url_processor.py rename to app_urls/fetcher/src/fetch_utils_url_processor.py diff --git a/app_urls/fetcher/src/llm.py b/app_urls/fetcher/src/llm.py new file mode 100644 index 0000000..2d2cdd4 --- /dev/null +++ 
b/app_urls/fetcher/src/llm.py @@ -0,0 +1,24 @@ +import ollama +import os + +class OllamaClient(): + def __init__(self): + self.client = ollama.Client(host=os.getenv("ENDPOINT_OLLAMA", "https://ollamamodel.matitos.org")) + + def _get_default_model(self): + return "llama3.2:3b" + + def get_models(self): + models = sorted([m.model for m in self.client.list().models]) + if (self._get_default_model() in models): + return [self._get_default_model()] + [m for m in models if m != self._get_default_model()] + else: + return models + + def get_prompt(self): + return ("Rewrite the text below into a clear and concise summary of one paragraph maximum, presenting the key points as if they are newly written insights. " + "Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states'. " + "Write in a natural, standalone format that feels like an original explanation. " + "Keep it brief, engaging, informative, in the style of a news article: \n" + ) + \ No newline at end of file diff --git a/app_urls/fetcher/src/publisher.py b/app_urls/fetcher/src/publisher.py new file mode 100644 index 0000000..92cdb9d --- /dev/null +++ b/app_urls/fetcher/src/publisher.py @@ -0,0 +1,149 @@ +import time +import jwt +import os +import requests +import random +from .llm import OllamaClient +from ..models import Urls, UrlContent + +from .logger import get_logger +logger = get_logger() + + +class Publisher(): + def __init__(self): + pass + + def _create_jwt(self, admin_api_key): + id_, secret = admin_api_key.split(':') + iat = int(time.time()) + exp = iat + 5 * 60 # 5 minutes + header = {'alg': 'HS256', 'kid': id_} + payload = { + 'iat': iat, + 'exp': exp, + 'aud': '/v5/admin/' # Adjust depending on your Ghost version + } + token = jwt.encode(payload, bytes.fromhex(secret), algorithm='HS256', headers=header) + return token + + def _create_ghost_post(self, post_data): + # Get token + jwt_token = self._create_jwt(os.getenv("GHOST_ADMIN_API_KEY")) + # Get Admin API URL + admin_api_url = os.getenv("GHOST_ADMIN_API_URL") + + headers = { + 'Authorization': f'Ghost {jwt_token}', + 'Content-Type': 'application/json' + } + + post_data = {"posts": [post_data]} + + response = requests.post( + os.path.join(admin_api_url, "posts"), + json=post_data, + headers=headers, + params={"source":"html"} + ) + + if response.status_code == 201: + logger.info("Ghost post published successfully") + return response.json() + else: + logger.warning("Ghost - Failed to publish post: {} {}".format(response.status_code, response.text)) + return None + + def _get_photo_url(self, query): + # TODO: Get already used photos to skip. Use DB + try: + endpoint = "https://api.pexels.com/v1/search?query={}".format(query) + header= {"Authorization": os.getenv("PEXELS_API_KEY")} + + while True: + # Request + r = requests.get(endpoint, headers=header) + dict_images = r.json() + + # Get list of photos + list_photos = dict_images.get("photos", []) + + # TODO: IMPROVE... + photo_url = random.choice(list_photos).get("src").get("landscape") + return photo_url + + + for photo in list_photos: + # Already used? 
-> Continue
+                    # photo.get("id") # Compare against DB
+
+                    # Get landscape photo
+                    photo_url = photo.get("src").get("landscape")
+                    return photo_url
+
+                # Iterated page, already used all images
+                endpoint = dict_images.get("next_page")
+        except Exception as e:
+            logger.warning("Something went wrong while fetching image from Pexels: {}".format(str(e)))
+            return None
+
+    def publish(self, url_id):
+        logger.info("Publishing URL ID {}".format(url_id))
+
+        # URL Content
+        url_content = UrlContent.objects.filter(pk=url_id).first()
+        url = Urls.objects.filter(pk=url_id).first()
+
+        if (url_content is None):
+            logger.warning("Ghost - URL Content is NULL for URL ID: {} {}".format(url_id, url.url))
+            return
+        if (url_content.valid_content is False):
+            logger.warning("Ghost - URL Content is not valid for URL ID: {} {}".format(url_id, url.url))
+            return
+
+        model = "llama3.2:3b"
+        prompt = "Rewrite the text below into a clear and concise summary, presenting the key points as if they are newly written insights. Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states'. Instead, write in a natural, standalone format that feels like an original explanation. Keep it brief, engaging, informative, in the style of a news article, and no longer than a paragraph:"
+
+        ollama_msg = {"role": "user", "content": "{}\n{}".format(prompt, url_content.content)}
+        response = OllamaClient().client.chat(model=model, messages=[ollama_msg])
+
+        article_summary = response["message"]["content"]
+
+        ################################################################################################
+        if (url_content.image_main_url is None) or (requests.get(url_content.image_main_url).status_code != 200):
+            # Invalid main image -> Search for one
+            photo_query = "Mountain landscape"
+            photo_url = self._get_photo_url(photo_query)
+        else:
+            photo_url = url_content.image_main_url
+
+        post_data = {
+            # "slug": "hey-short",
+            "title": url_content.title,
+            "html": "".join([ "<p>{}</p>".format(t) for t in article_summary.split("\n") ]) + '<a href="{}">Source</a>'.format(url.url),
+            #"meta_title": "",
+            #"meta_description": "",
+            "feature_image": photo_url,
+            #"feature_image_caption": "",
+            "status": "published",
+        }
+
+        # Publish post
+        payload = self._create_ghost_post(post_data)
+        logger.debug("Ghost payload: {}".format(str(payload)))
+
+        '''
+        # Return a response (you can customize this as needed)
+        return HttpResponse(f"""
+            <h1>Generated Content</h1>
+            <p>URL ID: {id_url}</p>
+            <p>URL: {url.url}</p>
+            <p>Title: {url_content.title}</p>
+            <p>Description: {url_content.description}</p>
+            <p>Content: {url_content.content}</p>
+            <p>Valid content: {url_content.valid_content}</p>
+            <p>Language: {url_content.language}</p>
+            <p>Main image: {url_content.image_main_url}</p>
+            <p>Generated summary: {article_summary}</p>
+ """) + ''' \ No newline at end of file diff --git a/app_urls/fetcher/tasks.py b/app_urls/fetcher/tasks.py index 0042b7a..7fca626 100644 --- a/app_urls/fetcher/tasks.py +++ b/app_urls/fetcher/tasks.py @@ -5,6 +5,7 @@ from .src.fetch_parser import FetchParser from .src.fetch_search import FetchSearcher from .src.fetch_missing_kids import FetchMissingKids from .src.db_utils import DB_Handler +from .src.publisher import Publisher from .src.logger import get_logger logger = get_logger() @@ -118,7 +119,7 @@ def background_task(process_type: str): elif ("process_missing_kids_urls" in process_type): DB_Handler().process_missing_kids_urls(batch_size=batch_size) - elif ( "clean_old_url_content" in process_type ): + elif ("clean_old_url_content" in process_type ): # Older than X days encoded in URL try: older_than_days = float(process_type.split("_")[-1]) @@ -126,6 +127,12 @@ def background_task(process_type: str): older_than_days = None DB_Handler().clean_old_url_content(older_than_days=older_than_days) + + elif ("publish" in process_type): + # Extract URL ID + url_id = process_type.split("_")[-1] + # Publish + Publisher().publish(url_id) else: logger.info("Task unknown!: {}".format(process_type)) diff --git a/app_urls/fetcher/templates/url_detail.html b/app_urls/fetcher/templates/url_detail.html index 92fe114..9d2bc8e 100644 --- a/app_urls/fetcher/templates/url_detail.html +++ b/app_urls/fetcher/templates/url_detail.html @@ -115,7 +115,7 @@ } // Fetch URL - let fetchUrl = `/urls/llm/`; + let fetchUrl = `/llm/`; let resultContainer = $("#chat-output"); resultContainer.html(""); // Clear previous content before fetching @@ -189,7 +189,7 @@ - + diff --git a/app_urls/fetcher/urls.py b/app_urls/fetcher/urls.py index d575d23..9f1903c 100644 --- a/app_urls/fetcher/urls.py +++ b/app_urls/fetcher/urls.py @@ -15,8 +15,8 @@ urlpatterns = [ path('urls-per-source/', views.urls_per_source, name='urls_per_source'), path('urls-per-search/', views.urls_per_search, name='urls_per_search'), # + path('llm/', views.llm, name='llm'), + # path('urls/', views.filtered_urls, name='filtered_urls'), path('urls//', views.url_detail_view, name='url_detail'), - path('urls/llm/', views.llm, name='llm'), - path('urls/content_generation', views.content_generation, name='content_generation'), ] diff --git a/app_urls/fetcher/views.py b/app_urls/fetcher/views.py index 75f015e..b3395b5 100644 --- a/app_urls/fetcher/views.py +++ b/app_urls/fetcher/views.py @@ -2,34 +2,16 @@ from .views_base import link_list, logs, log_db, trigger_task from django.core.paginator import Paginator from django.shortcuts import render, get_object_or_404 -from django.http import StreamingHttpResponse, JsonResponse +from django.http import StreamingHttpResponse, JsonResponse, HttpResponse from django.db.models import Q, Count from django.utils import timezone from django.utils.timezone import now, timedelta from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch, UrlsDuplicate -import ollama -import os +from .src.llm import OllamaClient import json #################################################################################################### -class OllamaClient(): - def __init__(self): - self.client = ollama.Client(host=os.getenv("ENDPOINT_OLLAMA", "https://ollamamodel.matitos.org")) - - def _get_default_model(self): - return "llama3.2:3b" - - def get_models(self): - models = sorted([m.model for m in self.client.list().models]) - if (self._get_default_model() in models): - return [self._get_default_model()] + [m for m in models if m != 
self._get_default_model()] - else: - return models - - def get_prompt(self): - return "Rewrite the text below into a clear and concise summary, presenting the key points as if they are newly written insights. Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states'. Instead, write in a natural, standalone format that feels like an original explanation. Keep it brief, engaging, informative, in the style of a news article, and no longer than a paragraph:" - def llm(request): @@ -72,7 +54,6 @@ def url_detail_view(request, id): except UrlContent.DoesNotExist: url_content = {} - # TODO: https://github.com/ollama/ollama-python?tab=readme-ov-file#async-client ollama = OllamaClient() context = { @@ -301,15 +282,5 @@ def filtered_urls(request): } return render(request, 'filtered_urls.html', context) -#################################################################################################### - -def content_generation(request): - ''' - # Get list of URLs ID - selected_urls = request.GET.getlist('urls', []) - - # Sample URLs - selected_urls = [13460, 13455, 13454, 13452, 13210] - ''' #################################################################################################### \ No newline at end of file diff --git a/app_urls/fetcher/views_base.py b/app_urls/fetcher/views_base.py index b728be0..c6eba73 100644 --- a/app_urls/fetcher/views_base.py +++ b/app_urls/fetcher/views_base.py @@ -1,5 +1,4 @@ import os -import psycopg from .tasks import background_task from django.http import JsonResponse, HttpResponse from django.db import connection diff --git a/app_urls/init_data_sca.json b/app_urls/init_data_sca.json new file mode 100644 index 0000000..0ed7168 --- /dev/null +++ b/app_urls/init_data_sca.json @@ -0,0 +1,34 @@ +{ + "SEARCH": { + "rss_feed": [ + "https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC", + "https://feeds.feedburner.com/breitbart", + "https://feeds.feedburner.com/zerohedge/feed", + "https://moxie.foxnews.com/google-publisher/latest.xml", + "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=15837362", + "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100727362" + ], + "url_host": [ + "missingkids.org/poster", + "missingkids.org/new-poster", + "breitbart.com", + "zerohedge.com", + "foxnews.com", + "cnbc.com" + ], + "keyword_search": [ + "child abuse" + ] + }, + "REGEX_PATTERN_STATUS_PRIORITY": [ + [".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50], + [".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75], + [".*foxnews\\.com\\/(video|category)\\/.*", "invalid", 75], + [".*radio.foxnews\\.com\\/.*", "invalid", 75], + [".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75], + [".*zerohedge\\.com\\/(user)\\/.*", "invalid", 75], + [".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50], + [".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50], + [".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50] + ] +} diff --git a/app_urls/requirements.txt b/app_urls/requirements.txt index ae1e158..d5f2786 100644 --- a/app_urls/requirements.txt +++ b/app_urls/requirements.txt @@ -16,4 +16,5 @@ GoogleNews duckduckgo_search git+https://github.com/tasos-py/Search-Engines-Scraper.git langdetect -ollama \ No newline at end of file +ollama +PyJWT \ No newline at end of file diff --git a/docker-compose-dev.yml b/docker-compose-dev.yml index 56b0fae..fa4f645 
100644 --- a/docker-compose-dev.yml +++ b/docker-compose-dev.yml @@ -64,6 +64,10 @@ services: # Selenium - SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT} - ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA} + # Ghost + - GHOST_ADMIN_API_KEY=${GHOST_ADMIN_API_KEY} + - GHOST_ADMIN_API_URL=${GHOST_ADMIN_API_URL} + - PEXELS_API_KEY=${PEXELS_API_KEY} ######################## volumes: # Development mode - ./app_urls:/opt/app diff --git a/docker-compose.yml b/docker-compose.yml index 54d04d5..3f32d6d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -64,6 +64,10 @@ services: # Selenium - SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT} - ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA} + # Ghost + - GHOST_ADMIN_API_KEY=${GHOST_ADMIN_API_KEY} + - GHOST_ADMIN_API_URL=${GHOST_ADMIN_API_URL} + - PEXELS_API_KEY=${PEXELS_API_KEY} ######################## #volumes: # Development mode # - ./app_urls:/opt/app diff --git a/utils/Schools-NL.ipynb b/utils/Schools-NL.ipynb new file mode 100644 index 0000000..3bee3dd --- /dev/null +++ b/utils/Schools-NL.ipynb @@ -0,0 +1,335 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "from bs4 import BeautifulSoup\n", + "from urllib.parse import urljoin\n", + "import pandas as pd\n", + "import os\n", + "\n", + "headers = {\"User-Agent\": \"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36\"}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Function to decode Cloudflare-protected emails\n", + "def decode_email(encoded_email):\n", + " \"\"\"\n", + " Decode an email protected by Cloudflare's email protection.\n", + " :param encoded_email: The encoded email string from the data-cfemail attribute.\n", + " :return: The decoded email address.\n", + " \"\"\"\n", + " email = \"\"\n", + " key = int(encoded_email[:2], 16) # Extract the key (first two characters)\n", + " for i in range(2, len(encoded_email), 2):\n", + " # XOR each pair of hex characters with the key\n", + " email += chr(int(encoded_email[i:i + 2], 16) ^ key)\n", + " return email\n", + "\n", + "def extract_emails(soup):\n", + " # Find all visible email links (mailto:)\n", + " visible_emails = []\n", + " for link in soup.find_all('a', href=lambda href: href and href.startswith('mailto:')):\n", + " email = link['href'].replace('mailto:', '')\n", + " visible_emails.append(email)\n", + "\n", + " # Find all Cloudflare-protected emails\n", + " protected_emails = []\n", + " for span in soup.find_all('span', class_='__cf_email__', attrs={'data-cfemail': True}):\n", + " encoded_email = span['data-cfemail']\n", + " decoded_email = decode_email(encoded_email)\n", + " protected_emails.append(decoded_email)\n", + "\n", + " # Combine all emails\n", + " all_emails = visible_emails + protected_emails\n", + " all_emails = list(set(all_emails))\n", + " if (len(all_emails) == 0):\n", + " return None\n", + " elif (len(all_emails) == 1):\n", + " return all_emails[0]\n", + " else:\n", + " return all_emails\n", + "\n", + "def find_website(soup_school):\n", + " # Find all tags with href attributes\n", + " for link in soup_school.find(class_=\"dl-horizontal dl-icons\").find_all('a', href=True):\n", + " href = link['href']\n", + " # Filter out only valid URLs (e.g., starting with http or https)\n", + " if href.startswith(('http://', 'https://')):\n", + " # websites.append(href)\n", + " return href\n", + "\n", + "\n", + "def main():\n", + " list_urls = [\n", + " 
\"https://scholenopdekaart.nl/Basisscholen/\",\n", + " \"https://scholenopdekaart.nl/middelbare-scholen/\"\n", + " ]\n", + "\n", + " list_school_data_dicts = []\n", + "\n", + " # For each category\n", + " for url in list_urls:\n", + " # Fetch the HTML content of the page\n", + " response = requests.get(url, headers=headers)\n", + " response.raise_for_status() # Raise an exception for HTTP errors\n", + " # Parse the HTML content using BeautifulSoup\n", + " soup = BeautifulSoup(response.text, 'html.parser')\n", + "\n", + " # Get category\n", + " category = url.strip(\"/\").split(\"/\")[-1].lower()\n", + "\n", + " # Find all tags with href attributes\n", + " links_areas = []\n", + " for a_tag in soup.find_all('a', href=True):\n", + " href = a_tag['href']\n", + " \n", + " if (category not in href):\n", + " continue\n", + " \n", + " # Convert relative URLs to absolute URLs\n", + " area_full_url = urljoin(url, href)\n", + " links_areas.append(area_full_url)\n", + "\n", + " # Area\n", + " area = href.rstrip(\"/\").split(\"/\")[-1]\n", + "\n", + " ###############################################\n", + " # Fetch the HTML content of the page\n", + " print(\".\", end=\"\")\n", + " response = requests.get(area_full_url, headers=headers)\n", + " response.raise_for_status() # Raise an exception for HTTP errors\n", + "\n", + " # Parse the HTML content using BeautifulSoup\n", + " soup_area= BeautifulSoup(response.text, 'html.parser')\n", + "\n", + " # Get schools in area\n", + " for a_tag in soup_area.find_all('a', href=True):\n", + " href = a_tag['href']\n", + "\n", + " school_url = urljoin(url, href)\n", + " if (area_full_url not in school_url):\n", + " continue\n", + " \n", + " school_name = a_tag.text.rstrip(\".\")\n", + " school_data = {\n", + " \"category\": category,\n", + " \"area\": area,\n", + " \"name\": school_name,\n", + " \"url\": school_url,\n", + " }\n", + "\n", + " try:\n", + " # Process school (request contact details)\n", + " response = requests.get(os.path.join(school_url, \"contact/#inhoud\"), headers=headers)\n", + " response.raise_for_status() # Raise an exception for HTTP errors\n", + "\n", + " # Parse the HTML content using BeautifulSoup\n", + " soup_school = BeautifulSoup(response.text, 'html.parser')\n", + "\n", + " # School details\n", + " school_details = soup_school.find(class_=\"school-details\")\n", + " for category_idx, li_detail in enumerate(school_details.find_all(\"li\")):\n", + " data = li_detail.find('span', class_='infotip-term')['data-dfn']\n", + " text = li_detail.get_text(strip=True)\n", + " # Set data\n", + " school_data[\"category_{}\".format(category_idx)] = text\n", + " school_data[\"category_{}_description\".format(category_idx)] = data\n", + " \n", + " school_address = soup_school.find(class_=\"school-adres\").get_text(strip=True)\n", + " school_postcode_city = soup_school.find(class_=\"school-postcode-woonplaats\").get_text(strip=True)\n", + " school_postcode = \"\".join(school_postcode_city.split(\" \")[:2])\n", + " school_city = \" \".join(school_postcode_city.split(\" \")[2:])\n", + "\n", + " school_data[\"city\"] = school_city\n", + " school_data[\"postcode\"] = school_postcode\n", + " school_data[\"address\"] = school_address\n", + "\n", + " try:\n", + " school_data[\"website\"] = find_website(soup_school) # soup_school.find(class_=\"button button-primary\").get('href')\n", + " except Exception as e:\n", + " pass\n", + " try:\n", + " school_data[\"phone\"] = soup_school.find('a', href=lambda href: href and href.startswith('tel:')).text\n", + " except 
Exception as e:\n", + " pass\n", + " try:\n", + " school_data[\"email\"] = extract_emails(soup_school)\n", + " except Exception as e:\n", + " pass\n", + " \n", + " except Exception as e:\n", + " print(school_url, str(e))\n", + " # assert False\n", + "\n", + " list_school_data_dicts.append(school_data)\n", + "\n", + " df = pd.DataFrame(list_school_data_dicts)\n", + " df.to_csv(\"scholenopdekaart.csv\")\n", + "\n", + "\"\"\" # Issues with URL:\n", + "https://scholenopdekaart.nl/middelbare-scholen/grave/1900/merletcollege-grave/\n", + "https://scholenopdekaart.nl/middelbare-scholen/lent/4386/citadel-college-locatie-griftdijk/\n", + "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/24527/montessori-college-k33-nijmegen/\n", + "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/26368/aventurijn-park-neerbosch/\n", + "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/26187/kandinsky-college-voor-lyceum-havo-mavo-vbo-lwoo/\n", + "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/1791/karel-de-grote-college/\n", + "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/2040/mondial-college-locatie-leuvensbroek/\n", + "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/2041/mondial-college-meeuwse-acker/\n", + "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/2036/stedelijk-gymnasium-nijmegen/\n", + "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/2038/stedelijke-scholengemeenschap-nijmegen/\n", + "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/26184/yuverta-vmbo-het-groene-lyceum-nijmegen/\n", + "https://scholenopdekaart.nl/middelbare-scholen/oss/23719/het-hooghuis-locatie-mondriaan-college/\n", + "https://scholenopdekaart.nl/middelbare-scholen/oss/943/het-hooghuis-locatie-oss-stadion/\n", + "https://scholenopdekaart.nl/middelbare-scholen/oss/947/het-hooghuis-zuidwest-gebouw-west/\n", + "https://scholenopdekaart.nl/middelbare-scholen/oss/946/het-hooghuis-zuidwest-gebouw-zuid/\n", + "https://scholenopdekaart.nl/middelbare-scholen/oss/1929/het-maaslandcollege-scholengemeenschap-voor-tweetalig-mavo-havo-vwo/\n", + "https://scholenopdekaart.nl/middelbare-scholen/oss/25783/sonnewijser-unit-route-arbeid/\n", + "https://scholenopdekaart.nl/middelbare-scholen/oss/11432/sonnewijser-unit-vervolgonderwijs-oss/\n", + "https://scholenopdekaart.nl/middelbare-scholen/oss/942/titus-brandsmalyceum/\n", + "https://scholenopdekaart.nl/middelbare-scholen/velp-noord-brabant/24545/merletcollege-eerste-opvang-anderstaligen-eoa/\n", + "https://scholenopdekaart.nl/middelbare-scholen/wijchen/2018/maaswaal-college-havo-atheneum-gymnasium/\n", + "https://scholenopdekaart.nl/middelbare-scholen/wijchen/2020/maaswaal-college-vmbo-basis-kader-mavo/\n", + "https://scholenopdekaart.nl/middelbare-scholen/wijchen/1781/pro-college-wijchen/\n", + "\"\"\"\n", + "\n", + "if __name__ == \"__main__\":\n", + " main()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "school_url = \"https://scholenopdekaart.nl/basisscholen/aalden/9661/christelijke-basisschool-de-schutse/\"\n", + "response = requests.get(os.path.join(school_url, \"contact/#inhoud\"), headers=headers)\n", + "# Parse the HTML content using BeautifulSoup\n", + "soup_school = BeautifulSoup(response.text, 'html.parser')\n", + "soup_school\n", + "'''" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_csv(\"scholenopdekaart.csv\", 
index_col=0)\n", + "df.loc[0, \"category_3\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "from bs4 import BeautifulSoup\n", + "\n", + "# Step 1: Fetch the webpage\n", + "url = \"https://scholenopdekaart.nl/basisscholen/aagtekerke/25963/jhr-willem-versluijsschool/\"\n", + "headers = {\n", + " \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36\"\n", + "}\n", + "response = requests.get(url, headers=headers)\n", + "\n", + "# Check if the request was successful\n", + "if response.status_code != 200:\n", + " print(f\"Failed to retrieve the page. Status code: {response.status_code}\")\n", + " exit()\n", + "\n", + "# Step 2: Parse the HTML content\n", + "soup = BeautifulSoup(response.text, 'html.parser')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Aantal per Leerjaar:\n", + "Groep 1: 29 leerlingen\n", + "Groep 2: 28 leerlingen\n", + "Groep 3: 30 leerlingen\n", + "Groep 4: 25 leerlingen\n", + "Groep 5: 19 leerlingen\n", + "Groep 6: 26 leerlingen\n", + "Groep 7: 22 leerlingen\n", + "Groep 8: 20 leerlingen\n" + ] + } + ], + "source": [ + "import json\n", + "\n", + "# Step 1: Locate the tag\n", + "chart_tag = soup.find('aantal-leerlingen-leerjaar-bar-chart', attrs={'aantal-per-leerjaar': True})\n", + "\n", + "if not chart_tag:\n", + " print(\"Could not find the 'aantal per leerjaar' section.\")\n", + "else:\n", + " # Step 2: Extract the 'aantal-per-leerjaar' attribute\n", + " raw_data = chart_tag['aantal-per-leerjaar']\n", + " \n", + " # Step 3: Parse the JSON data\n", + " try:\n", + " data = json.loads(raw_data)\n", + " \n", + " # Step 4: Print the extracted data\n", + " print(\"Aantal per Leerjaar:\")\n", + " for entry in data:\n", + " print(f\"Groep {entry['key']}: {entry['aantal']} leerlingen\")\n", + " except json.JSONDecodeError as e:\n", + " print(f\"Failed to parse JSON data: {e}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "matitos_urls", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}
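Reviewer note: the publishing flow added in app_urls/fetcher/src/publisher.py depends on the new GHOST_ADMIN_API_URL and GHOST_ADMIN_API_KEY variables wired through .env and both compose files. Below is a minimal, standalone sketch for smoke-testing those credentials outside Django, assuming the same JWT claims publisher.py uses (the 'aud' value may need adjusting per Ghost version, as that code itself notes); the draft title and HTML body are placeholders and not part of this change.

# smoke_test_ghost.py -- hedged sketch, not part of the diff.
# Requires PyJWT and requests (PyJWT is newly added to app_urls/requirements.txt).
import os
import time
import jwt
import requests

admin_api_url = os.environ["GHOST_ADMIN_API_URL"]            # e.g. https://<host>/ghost/api/admin/
key_id, secret = os.environ["GHOST_ADMIN_API_KEY"].split(":")

iat = int(time.time())
token = jwt.encode(
    {"iat": iat, "exp": iat + 5 * 60, "aud": "/v5/admin/"},  # same claims as publisher.py; adjust 'aud' to your Ghost version
    bytes.fromhex(secret),
    algorithm="HS256",
    headers={"kid": key_id},
)

resp = requests.post(
    admin_api_url.rstrip("/") + "/posts/",
    params={"source": "html"},
    headers={"Authorization": "Ghost {}".format(token), "Content-Type": "application/json"},
    json={"posts": [{"title": "Smoke test", "html": "<p>Hello from the smoke test.</p>", "status": "draft"}]},
)
print(resp.status_code, resp.text)

Once the credentials check out, the new branch in app_urls/fetcher/tasks.py can be exercised with a process type such as "publish_123" (a hypothetical URL ID): background_task("publish_123") splits on "_", takes the trailing ID, and hands it to Publisher().publish().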