Schools NL, Ghost post utils, nude + age detection

This commit is contained in:
Luciano Gervasoni
2025-04-30 15:50:54 +02:00
parent aa369d0458
commit ccfd0f9188
11 changed files with 841 additions and 246 deletions

6
.gitignore vendored

@@ -3,4 +3,8 @@ __pycache__/
**/credentials.py
logs/
postgres/
docker_data/
**/*.pt
**/*.pth
**/*.tar
**/*.onnx

157
app_cv/Demo.ipynb Normal file

@@ -0,0 +1,157 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import base64\n",
"import json\n",
"import requests\n",
"import io\n",
"import numpy as np\n",
"import PIL.Image\n",
"import cv2\n",
"from pprint import pprint\n",
"\n",
"def process_image(path_img):\n",
" with open(path_img, \"rb\") as image_file:\n",
" encoded_string = base64.b64encode(image_file.read()).decode('utf-8')\n",
" response = requests.post(\n",
" 'http://localhost:5000/process',\n",
" headers={'Content-Type': 'application/json'},\n",
" data=json.dumps({'image': encoded_string})\n",
" )\n",
" response_dict = response.json()\n",
" pprint(response_dict)\n",
" # Decode\n",
" image_bytes = base64.b64decode(response_dict.get(\"image_b64\"))\n",
" img_array = np.frombuffer(io.BytesIO(image_bytes).getvalue(), dtype=np.uint8)\n",
" img_bgr = cv2.imdecode(img_array, cv2.IMREAD_COLOR)\n",
" img_rgb = img_bgr[:, :, ::-1]\n",
" return img_rgb"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"path_img = \"imgs/img_1p.jpg\"\n",
"PIL.Image.fromarray( process_image(path_img) )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"path_img = \"imgs/img_nude.jpg\"\n",
"PIL.Image.fromarray( process_image(path_img) )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"'''\n",
"# !git clone https://github.com/wildchlamydia/mivolo\n",
"# !pip install ultralytics yt_dlp pandas scipy timm==0.8.13.dev0\n",
"# !pip install ./mivolo\n",
"\n",
"!python mivolo/demo.py \\\n",
" --input \"face_data/sample_image.jpg\" \\\n",
" --output \"output\" \\\n",
" --detector-weights \"mivolo/pretrained/yolov8x_person_face.pt\" \\\n",
" --checkpoint \"mivolo/pretrained/model_imdb_cross_person_4.22_99.46.pth.tar\" \\\n",
" --device \"cpu\" \\\n",
" --draw\n",
"'''\n",
"\n",
"'''\n",
"# !git clone https://github.com/Kartik-3004/facexformer.git\n",
"# !pip install huggingface_hub torch torchvision torchaudio opencv-python facenet_pytorch\n",
"from huggingface_hub import hf_hub_download\n",
"hf_hub_download(repo_id=\"kartiknarayan/facexformer\", filename=\"ckpts/model.pt\", local_dir=\"./facexformer\")\n",
"\n",
"!python facexformer/inference.py \\\n",
" --model_path facexformer/ckpts/model.pt \\\n",
" --image_path face_data/sample_image.jpg \\\n",
" --results_path face_data \\\n",
" --task parsing\n",
" x\n",
"!python facexformer/inference.py \\\n",
" --model_path facexformer/ckpts/model.pt \\\n",
" --image_path face_data/face.png \\\n",
" --results_path face_data \\\n",
" --task landmarks\n",
"\n",
"!python facexformer/inference.py \\\n",
" --model_path facexformer/ckpts/model.pt \\\n",
" --image_path face_data/face.png \\\n",
" --results_path face_data \\\n",
" --task headpose\n",
"\n",
"!python facexformer/inference.py \\\n",
" --model_path facexformer/ckpts/model.pt \\\n",
" --image_path face_data/face.png \\\n",
" --results_path face_data \\\n",
" --task attributes\n",
"\n",
"!python facexformer/inference.py \\\n",
" --model_path facexformer/ckpts/model.pt \\\n",
" --image_path face_data/face.png \\\n",
" --results_path face_data \\\n",
" --task age_gender_race\n",
"'''"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "matitos_cv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

17
app_cv/README.md Normal file

@@ -0,0 +1,17 @@
# Requirements
```
pip install git+https://github.com/wildchlamydia/mivolo.git "nudenet>=3.4.2"
```
- Download checkpoints:
  - From https://github.com/wildchlamydia/mivolo:
    - models/mivolo/model_imdb_cross_person_4.22_99.46.pth.tar
    - models/mivolo/yolov8x_person_face.pt
  - From https://github.com/notAI-tech/NudeNet?tab=readme-ov-file#available-models:
    - models/nude_detector/640m.onnx
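# Usage
A minimal client round-trip against the running server (a sketch assuming `app.py` is serving on `localhost:5000`, matching `Demo.ipynb`):
```
import base64
import json

import requests

# Read and base64-encode the test image
with open("imgs/img_1p.jpg", "rb") as f:
    encoded = base64.b64encode(f.read()).decode("utf-8")

# POST to the /process endpoint and inspect the verdicts
response = requests.post(
    "http://localhost:5000/process",
    headers={"Content-Type": "application/json"},
    data=json.dumps({"image": encoded}),
)
print(response.json().get("any_nude_detection"))
```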
# TODO
- Client side inference: https://github.com/notAI-tech/NudeNet/tree/v3/in_browser

35
app_cv/Server.ipynb Normal file

@@ -0,0 +1,35 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# !pip install flask\n",
"!python app.py"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "matitos_cv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

48
app_cv/app.py Normal file

@@ -0,0 +1,48 @@
from flask import Flask, request, jsonify
import base64
import io
import cv2
import traceback
import os
import logging

logging.basicConfig(level=logging.DEBUG, format='%(asctime)s [%(levelname)s] %(message)s', handlers=[logging.StreamHandler()])

from cv_processor import process

app = Flask(__name__)

@app.route('/process', methods=['POST'])
def process_image():
    logging.info("POST /process")
    # Json
    data = request.get_json()
    # Valid data?
    if not data or 'image' not in data:
        return jsonify({"error": "No image data provided"}), 400
    try:
        image_data = data['image']
        # Decode base64 string
        image_bytes = base64.b64decode(image_data)
        image_stream = io.BytesIO(image_bytes)
        # Process the image
        results = process(image_stream)
        # Encode processed image to base64
        _, buffer = cv2.imencode('.jpg', results.get("image"), [cv2.IMWRITE_JPEG_QUALITY, 100])
        processed_image_base64 = base64.b64encode(buffer).decode('utf-8')
        # Update image with base64 encoded
        results["image_b64"] = processed_image_base64
        # Pop image (not serializable)
        results.pop("image")
        # Jsonify
        return jsonify(results)
    except Exception:
        logging.warning("Exception: {}".format(traceback.format_exc()))
        return jsonify({"error": traceback.format_exc()}), 400

if __name__ == '__main__':
    # os.getenv returns a string, so parse it instead of passing the raw value to debug=
    app.run(debug=os.getenv("DEBUG_MODE", "false").lower() in ("1", "true"))
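For reference, a sketch of the JSON shape this endpoint returns (keys come from `cv_processor.process_image`; the values below are illustrative placeholders, not real output):
```
# Illustrative response shape only; scores, boxes, and ages are made up
example_response = {
    "any_minor_present": False,            # True if any predicted age is < 18
    "any_nude_detection": True,            # True if any class of interest was detected
    "nudity_detections": [                 # NudeNet detections; box is [x, y, w, h]
        {"class": "BUTTOCKS_EXPOSED", "score": 0.81, "box": [120, 60, 40, 55]},
    ],
    "age_predictions": [27.18, 23.77],     # per-face age estimates
    "image_b64": "<base64-encoded censored JPEG>",
}
```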

132
app_cv/cv_processor.py Normal file

@@ -0,0 +1,132 @@
import cv2
import numpy as np
import logging

logging.basicConfig(level=logging.DEBUG, format='%(asctime)s [%(levelname)s] %(message)s', handlers=[logging.StreamHandler()])

# Age
from mivolo.predictor import Predictor
import argparse
# Nudity
from nudenet import NudeDetector


class CV():
    def __init__(self):
        args = argparse.ArgumentParser()
        args.add_argument("--device", type=str, default="cpu")
        args.add_argument("--checkpoint", default="models/mivolo/model_imdb_cross_person_4.22_99.46.pth.tar")
        args.add_argument("--detector_weights", default="models/mivolo/yolov8x_person_face.pt")
        args.add_argument("--with-persons", action="store_true", default=False, help="If set model will run with persons, if available")
        args.add_argument("--disable-faces", action="store_true", default=False, help="If set model will use only persons if available")
        args.add_argument("--draw", action="store_true", default=False, help="If set, resulted images will be drawn")
        args = args.parse_args([])
        # Initialize age predictor
        self.predictor_age = Predictor(args)
        # Initialize nudity detector
        self.nude_detector = NudeDetector(model_path="models/nude_detector/640m.onnx", inference_resolution=640)
        # detector = NudeDetector(model_path="downloaded_640m.onnx path", inference_resolution=640)
        # https://github.com/notAI-tech/NudeNet?tab=readme-ov-file#available-models
        # All labels list
        self.nudity_all_labels = [
            "FEMALE_GENITALIA_COVERED",
            "FACE_FEMALE",
            "BUTTOCKS_EXPOSED",
            "FEMALE_BREAST_EXPOSED",
            "FEMALE_GENITALIA_EXPOSED",
            "MALE_BREAST_EXPOSED",
            "ANUS_EXPOSED",
            "FEET_EXPOSED",
            "BELLY_COVERED",
            "FEET_COVERED",
            "ARMPITS_COVERED",
            "ARMPITS_EXPOSED",
            "FACE_MALE",
            "BELLY_EXPOSED",
            "MALE_GENITALIA_EXPOSED",
            "ANUS_COVERED",
            "FEMALE_BREAST_COVERED",
            "BUTTOCKS_COVERED",
        ]
        # Classes of interest
        self.nudity_classes_of_interest = ["BUTTOCKS_EXPOSED", "FEMALE_BREAST_EXPOSED", "FEMALE_GENITALIA_EXPOSED", "ANUS_EXPOSED", "MALE_GENITALIA_EXPOSED"]

    def _censor(self, image_bgr, detections):
        # Copy original image
        image_bgr_censored = image_bgr.copy()
        for detection in detections:
            box = detection["box"]
            x, y, w, h = box[0], box[1], box[2], box[3]
            # Change these pixels to pure black
            image_bgr_censored[y : y + h, x : x + w] = (0, 0, 0)
        return image_bgr_censored

    def process_image(self, image_bgr):
        ###################################################################
        # Predict persons/faces and ages
        detected_objects, out_img = self.predictor_age.recognize(image_bgr)
        # Num faces and persons detected
        logging.debug("#persons: {}, #faces: {}".format(detected_objects.n_persons, detected_objects.n_faces))
        # Association
        detected_objects.associate_faces_with_persons()
        # detected_objects.face_to_person_map
        # {2: 1, 3: 0}
        # detected_objects.ages
        # [None, None, 27.18, 23.77]
        age_predictions = [e for e in detected_objects.ages if e is not None]
        # Crops of faces & persons
        # crops = detected_objects.collect_crops(img)
        any_minor_present = any([a < 18 for a in detected_objects.ages if a is not None])
        ###################################################################
        ###################################################################
        # Predict nudity
        nude_detections = self.nude_detector.detect(image_bgr)
        logging.debug("Nude detections: {}".format(nude_detections))
        # Filter by classes of interest
        nude_detections = [detection for detection in nude_detections if detection["class"] in self.nudity_classes_of_interest]
        # Nude detections present?
        any_nude_detection = len(nude_detections) > 0
        ###################################################################
        ###################################################################
        # Censor image
        censored_img_bgr = self._censor(image_bgr, nude_detections)
        # Plot age predictions on censored image
        output_image = detected_objects.plot(img=censored_img_bgr)
        ###################################################################
        results = {
            "any_minor_present": any_minor_present,
            "any_nude_detection": any_nude_detection,
            "nudity_detections": nude_detections,
            "age_predictions": age_predictions,
            "image": output_image,
        }
        return results


def process(image_bytes):
    try:
        logging.info("Processing image")
        # Convert bytes to NumPy array
        img_array = np.frombuffer(image_bytes.getvalue(), dtype=np.uint8)
        # Decode image using OpenCV
        img_bgr = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
        if img_bgr is None:
            return {}
        # Process
        results = CV().process_image(img_bgr)
        logging.info("Returning results")
        return results
    except Exception as e:
        logging.warning("Error processing image: {}".format(str(e)))
        return {}
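A quick way to exercise this module without the Flask server — a minimal sketch assuming the checkpoints listed in the README are in place and a test image exists at `imgs/img_1p.jpg` (the path used in `Demo.ipynb`):
```
import io

from cv_processor import process

# Feed encoded image bytes through the same entry point app.py uses
with open("imgs/img_1p.jpg", "rb") as f:
    results = process(io.BytesIO(f.read()))

# results also carries the censored image as a NumPy array under "image"
print(results.get("any_minor_present"), results.get("any_nude_detection"))
```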

BIN
app_cv/imgs/img_1p.jpg Normal file

Binary file not shown.

Size: 35 KiB

BIN
app_cv/imgs/img_nude.jpg Normal file

Binary file not shown.

Size: 29 KiB

View File

@@ -10,6 +10,7 @@
"import time\n",
"import jwt\n",
"import requests\n",
"from datetime import datetime, timedelta, timezone\n",
"\n",
"admin_api_url = \"\"\n",
"admin_api_key = \"\"\n",
@@ -25,7 +26,15 @@
" 'aud': '/v5/admin/' # Adjust depending on your Ghost version\n",
" }\n",
" token = jwt.encode(payload, bytes.fromhex(secret), algorithm='HS256', headers=header)\n",
" return token\n"
" return token\n",
"\n",
"# Get token\n",
"jwt_token = _create_jwt(os.getenv(\"GHOST_ADMIN_API_KEY\"))\n",
"\n",
"headers = {\n",
" 'Authorization': f'Ghost {jwt_token}',\n",
" 'Content-Type': 'application/json'\n",
"}"
]
},
{
@@ -34,33 +43,100 @@
"metadata": {},
"outputs": [],
"source": [
"# Get token\n",
"jwt_token = _create_jwt(os.getenv(\"GHOST_ADMIN_API_KEY\"))\n",
"DELETE_ALL_POSTS = False\n",
"\n",
"headers = {\n",
" 'Authorization': f'Ghost {jwt_token}',\n",
" 'Content-Type': 'application/json'\n",
"}\n",
"if DELETE_ALL_POSTS:\n",
" while (True):\n",
" # GET /admin/posts/\n",
" response = requests.get(os.path.join(admin_api_url, \"posts\"), headers=headers)\n",
" dict_response = response.json()\n",
"\n",
"deleted_post = True\n",
" if (len(dict_response.get(\"posts\")) == 0):\n",
" break\n",
"\n",
"while (deleted_post):\n",
" # GET /admin/posts/\n",
" response = requests.get(os.path.join(admin_api_url, \"posts\"), headers=headers)\n",
" dict_response = response.json()\n",
" # Iterate posts\n",
" for p in dict_response.get(\"posts\"):\n",
" # Post ID\n",
" post_id = p.get(\"id\")\n",
"\n",
" if (len(dict_response.get(\"posts\")) == 0):\n",
" deleted_post = False\n",
" break\n",
" # DELETE /admin/posts/{id}/\n",
" r = requests.delete(os.path.join(admin_api_url, \"posts\", \"{}\".format(post_id)), headers=headers)\n",
" print(\"Post:\", post_id, \"Status:\", r.status_code, r.text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"PUBLISH_SAMPLE = False\n",
"\n",
" # Iterate posts\n",
" for p in dict_response.get(\"posts\"):\n",
" # Post ID\n",
" post_id = p.get(\"id\")\n",
"def _create_ghost_post(jwt_token, admin_api_url, post_data):\n",
" # Get Admin API URL\n",
" admin_api_url = os.getenv(\"GHOST_ADMIN_API_URL\")\n",
"\n",
" # DELETE /admin/posts/{id}/\n",
" r = requests.delete(os.path.join(admin_api_url, \"posts\", \"{}\".format(post_id)), headers=headers)\n",
" print(\"Post:\", post_id, \"Status:\", r.status_code, r.text)"
" headers = {\n",
" 'Authorization': f'Ghost {jwt_token}',\n",
" 'Content-Type': 'application/json'\n",
" }\n",
" \n",
" post_data = {\"posts\": [post_data]}\n",
"\n",
" response = requests.post(\n",
" os.path.join(admin_api_url, \"posts\"),\n",
" json=post_data,\n",
" headers=headers,\n",
" params={\"source\":\"html\"}\n",
" )\n",
"\n",
" if response.status_code == 201:\n",
" print(\"Ghost post published successfully\")\n",
" return response.json()\n",
" else:\n",
" print(\"Ghost - Failed to publish post: {} {}\".format(response.status_code, response.text))\n",
" return None\n",
"\n",
"if (PUBLISH_SAMPLE):\n",
" url_id = 150\n",
"\n",
" post_data = {\n",
" # \"slug\": \"hey-short\",\n",
" \"title\": \"Hey there, sample title\",\n",
" \"html\": \"<p>Hey there!</p>\",\n",
" # \"feature_image\": photo_url,\n",
" # \"feature_image_caption\": \"\",\n",
" \"status\": \"published\",\n",
" \"tags\": [\"#url-id-{}\".format(url_id)]\n",
" }\n",
"\n",
" # Publish post\n",
" payload = _create_ghost_post(jwt_token, admin_api_url, post_data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Filter by post title\n",
"post_title = \"Funds raised for legal action over failure to stop grooming gangs\"\n",
"# Filter by published date\n",
"iso_time = (datetime.now(timezone.utc) - timedelta(hours=48)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z'\n",
"# Parameter for filter\n",
"params = {\"filter\": \"title:'{}'+published_at:>{}\".format(post_title, iso_time)}\n",
"\n",
"# Filter by URL ID\n",
"url_id = 150\n",
"# Parameter for filter\n",
"params = {\"filter\": \"tags:hash-url-id-{}\".format(url_id)}\n",
"\n",
"# Get posts using filter\n",
"response = requests.get(os.path.join(admin_api_url, \"posts\"), params=params, headers=headers)\n",
"dict_response = response.json()\n",
"\n",
"len(dict_response.get(\"posts\"))"
]
}
],

View File

@@ -12,6 +12,8 @@
"import pandas as pd\n",
"import os\n",
"import json\n",
"import csv\n",
"\n",
"\n",
"headers = {\"User-Agent\": \"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36\"}"
]
@@ -69,6 +71,154 @@
" # websites.append(href)\n",
" return href\n",
"\n",
"def get_num_students_per_zipcode(soup):\n",
" list_zipcode_students_percentage = []\n",
"\n",
" h3_tag = soup.find(\"h3\", string=\"In welk postcodegebied wonen de leerlingen van deze school?\")\n",
" if h3_tag:\n",
" dialog = h3_tag.find_parent(\"dialog\")\n",
"\n",
" if dialog:\n",
" # print(dialog.prettify())\n",
" table = dialog.find(\"table\")\n",
" if table:\n",
" rows = table.find_all(\"tr\")\n",
" for row in rows:\n",
" cells = row.find_all([\"th\", \"td\"])\n",
" row_data = [cell.get_text(strip=True) for cell in cells]\n",
" zipcode, num_students, percentage = row_data\n",
" list_zipcode_students_percentage.append( (zipcode, num_students, percentage) )\n",
" \n",
" return list_zipcode_students_percentage\n",
"\n",
"def get_num_students_trend(soup):\n",
" # Step 1: Locate the <aantal-leerlingen-trend-line-chart> tag\n",
" trend_chart_tag = soup.find(\"aantal-leerlingen-trend-line-chart\")\n",
"\n",
" if trend_chart_tag:\n",
" # Step 2: Extract the 'leerlingen-trend-data' attribute\n",
" trend_data_attr = trend_chart_tag.get(\"leerlingen-trend-data\")\n",
" \n",
" if trend_data_attr:\n",
" # Step 3: Parse the JSON string into a Python object\n",
" trend_data = json.loads(trend_data_attr)\n",
" #print(\"Extracted leerlingen-trend-data:\")\n",
" #print(json.dumps(trend_data, indent=4)) # Pretty-print the JSON data\n",
" return [ (e.get(\"key\"), e.get(\"aantal\") ) for e in trend_data]\n",
"\n",
"def get_num_students_per_age_and_group(soup):\n",
" num_students_per_group, num_students_per_age = [], []\n",
" ############################################################################\n",
" # Step 1: Locate the <aantal-leerlingen-leeftijd-bar-chart> tag\n",
" chart_tag = soup.find('aantal-leerlingen-leeftijd-bar-chart', attrs={'aantal-per-leeftijd': True})\n",
" # Step 2: Extract the 'aantal-per-leeftijd' attribute\n",
" raw_data = chart_tag['aantal-per-leeftijd']\n",
"\n",
" # Step 3: Parse the JSON data\n",
" try:\n",
" data = json.loads(raw_data)\n",
" # Step 4: Print the extracted data\n",
" # print(\"Aantal per Leeftijd:\")\n",
" for entry in data:\n",
" age = entry['key']\n",
" num_students = entry['aantal']\n",
" # school_data[\"num_students_age_{}\".format(age)] = num_students\n",
" num_students_per_age.append( (age, num_students) )\n",
" # print(f\"Age {entry['key']}: {entry['aantal']} leerlingen\")\n",
" except json.JSONDecodeError as e:\n",
" print(f\"Failed to parse JSON data: {e}\")\n",
"\n",
" ############################################################################\n",
" # Step 1: Locate the <aantal-leerlingen-leerjaar-bar-chart> tag\n",
" chart_tag = soup.find('aantal-leerlingen-leerjaar-bar-chart', attrs={'aantal-per-leerjaar': True})\n",
"\n",
" if not chart_tag:\n",
" print(\"Could not find the 'aantal per leerjaar' section.\")\n",
" else:\n",
" # Step 2: Extract the 'aantal-per-leerjaar' attribute\n",
" raw_data = chart_tag['aantal-per-leerjaar']\n",
" \n",
" # Step 3: Parse the JSON data\n",
" try:\n",
" data = json.loads(raw_data)\n",
" # Step 4: Print the extracted data\n",
" # print(\"Aantal per Leerjaar:\")\n",
" for entry in data:\n",
" group = entry['key']\n",
" num_students = entry['aantal']\n",
" # school_data[\"num_students_group_{}\".format(group)] = num_students\n",
" num_students_per_group.append( (group, num_students) )\n",
" # print(f\"Groep {entry['key']}: {entry['aantal']} leerlingen\")\n",
" except json.JSONDecodeError as e:\n",
" print(f\"Failed to parse JSON data: {e}\")\n",
" ############################################################################\n",
" return num_students_per_group, num_students_per_age\n",
"\n",
"\n",
"def update_school_data(school_url, school_data):\n",
" try:\n",
" # Process school (request contact details)\n",
" response = requests.get(os.path.join(school_url, \"contact/#inhoud\"), headers=headers)\n",
" response.raise_for_status() # Raise an exception for HTTP errors\n",
" # Parse the HTML content using BeautifulSoup\n",
" soup_school = BeautifulSoup(response.text, 'html.parser')\n",
"\n",
" # School details\n",
" school_details = soup_school.find(class_=\"school-details\")\n",
" for category_idx, li_detail in enumerate(school_details.find_all(\"li\")):\n",
" data = li_detail.find('span', class_='infotip-term')['data-dfn']\n",
" text = li_detail.get_text(strip=True)\n",
" # Set data\n",
" school_data[\"category_{}\".format(category_idx)] = text\n",
" school_data[\"category_{}_description\".format(category_idx)] = data\n",
" \n",
" school_address = soup_school.find(class_=\"school-adres\").get_text(strip=True)\n",
" school_postcode_city = soup_school.find(class_=\"school-postcode-woonplaats\").get_text(strip=True)\n",
" school_postcode = \"\".join(school_postcode_city.split(\" \")[:2])\n",
" school_city = \" \".join(school_postcode_city.split(\" \")[2:])\n",
"\n",
" school_data[\"city\"] = school_city\n",
" school_data[\"postcode\"] = school_postcode\n",
" school_data[\"address\"] = school_address\n",
"\n",
" try:\n",
" school_data[\"website\"] = find_website(soup_school) # soup_school.find(class_=\"button button-primary\").get('href')\n",
" except Exception as e:\n",
" pass\n",
" try:\n",
" school_data[\"phone\"] = soup_school.find('a', href=lambda href: href and href.startswith('tel:')).text\n",
" except Exception as e:\n",
" pass\n",
" try:\n",
" school_data[\"email\"] = extract_emails(soup_school)\n",
" except Exception as e:\n",
" pass\n",
"\n",
" # Process school main site\n",
" response = requests.get(os.path.join(school_url), headers=headers)\n",
" response.raise_for_status() # Raise an exception for HTTP errors\n",
" # Parse the HTML content using BeautifulSoup\n",
" soup_school = BeautifulSoup(response.text, 'html.parser')\n",
"\n",
" try:\n",
" school_data[\"students_per_zipcode\"] = get_num_students_per_zipcode(soup_school)\n",
" except Exception as e:\n",
" pass\n",
" try:\n",
" school_data[\"students_per_year_trend\"] = get_num_students_trend(soup_school)\n",
" except Exception as e:\n",
" pass\n",
"\n",
" if (school_data.get(\"category\").lower() == \"basisscholen\"):\n",
" try:\n",
" num_students_per_group, num_students_per_age = get_num_students_per_age_and_group(soup_school)\n",
" school_data[\"num_students_per_group\"] = num_students_per_group if len(num_students_per_group)>0 else None\n",
" school_data[\"num_students_per_age\"] = num_students_per_age if len(num_students_per_age)>0 else None\n",
" except Exception as e:\n",
" pass\n",
" \n",
" except Exception as e:\n",
" print(school_url, str(e))\n",
"\n",
"def main():\n",
" list_urls = [\n",
@@ -129,100 +279,26 @@
" \"url\": school_url,\n",
" }\n",
"\n",
" try:\n",
" # Process school (request contact details)\n",
" response = requests.get(os.path.join(school_url, \"contact/#inhoud\"), headers=headers)\n",
" response.raise_for_status() # Raise an exception for HTTP errors\n",
"\n",
" # Parse the HTML content using BeautifulSoup\n",
" soup_school = BeautifulSoup(response.text, 'html.parser')\n",
"\n",
" # School details\n",
" school_details = soup_school.find(class_=\"school-details\")\n",
" for category_idx, li_detail in enumerate(school_details.find_all(\"li\")):\n",
" data = li_detail.find('span', class_='infotip-term')['data-dfn']\n",
" text = li_detail.get_text(strip=True)\n",
" # Set data\n",
" school_data[\"category_{}\".format(category_idx)] = text\n",
" school_data[\"category_{}_description\".format(category_idx)] = data\n",
" \n",
" school_address = soup_school.find(class_=\"school-adres\").get_text(strip=True)\n",
" school_postcode_city = soup_school.find(class_=\"school-postcode-woonplaats\").get_text(strip=True)\n",
" school_postcode = \"\".join(school_postcode_city.split(\" \")[:2])\n",
" school_city = \" \".join(school_postcode_city.split(\" \")[2:])\n",
"\n",
" school_data[\"city\"] = school_city\n",
" school_data[\"postcode\"] = school_postcode\n",
" school_data[\"address\"] = school_address\n",
"\n",
" try:\n",
" school_data[\"website\"] = find_website(soup_school) # soup_school.find(class_=\"button button-primary\").get('href')\n",
" except Exception as e:\n",
" pass\n",
" try:\n",
" school_data[\"phone\"] = soup_school.find('a', href=lambda href: href and href.startswith('tel:')).text\n",
" except Exception as e:\n",
" pass\n",
" try:\n",
" school_data[\"email\"] = extract_emails(soup_school)\n",
" except Exception as e:\n",
" pass\n",
"\n",
" if (category.lower() == \"basisscholen\"):\n",
" ############################################################################\n",
" # Step 1: Locate the <aantal-leerlingen-leeftijd-bar-chart> tag\n",
" chart_tag = soup.find('aantal-leerlingen-leeftijd-bar-chart', attrs={'aantal-per-leeftijd': True})\n",
" # Step 2: Extract the 'aantal-per-leeftijd' attribute\n",
" raw_data = chart_tag['aantal-per-leeftijd']\n",
"\n",
" # Step 3: Parse the JSON data\n",
" try:\n",
" data = json.loads(raw_data)\n",
" # Step 4: Print the extracted data\n",
" print(\"Aantal per Leeftijd:\")\n",
" for entry in data:\n",
" age = entry['key']\n",
" num_students = entry['aantal']\n",
" school_data[\"num_students_age_{}\".format(age)] = num_students\n",
" # print(f\"Age {entry['key']}: {entry['aantal']} leerlingen\")\n",
" except json.JSONDecodeError as e:\n",
" print(f\"Failed to parse JSON data: {e}\")\n",
"\n",
" # Step 1: Locate the <aantal-leerlingen-leerjaar-bar-chart> tag\n",
" chart_tag = soup.find('aantal-leerlingen-leerjaar-bar-chart', attrs={'aantal-per-leerjaar': True})\n",
"\n",
" ############################################################################\n",
" # Step 1: Locate the <aantal-leerlingen-leerjaar-bar-chart> tag\n",
" chart_tag = soup.find('aantal-leerlingen-leerjaar-bar-chart', attrs={'aantal-per-leerjaar': True})\n",
"\n",
" if not chart_tag:\n",
" print(\"Could not find the 'aantal per leerjaar' section.\")\n",
" else:\n",
" # Step 2: Extract the 'aantal-per-leerjaar' attribute\n",
" raw_data = chart_tag['aantal-per-leerjaar']\n",
" \n",
" # Step 3: Parse the JSON data\n",
" try:\n",
" data = json.loads(raw_data)\n",
" # Step 4: Print the extracted data\n",
" print(\"Aantal per Leerjaar:\")\n",
" for entry in data:\n",
" group = entry['key']\n",
" num_students = entry['aantal']\n",
" school_data[\"num_students_group_{}\".format(group)] = num_students\n",
" print(f\"Groep {entry['key']}: {entry['aantal']} leerlingen\")\n",
" except json.JSONDecodeError as e:\n",
" print(f\"Failed to parse JSON data: {e}\")\n",
" ############################################################################\n",
" except Exception as e:\n",
" print(school_url, str(e))\n",
" # assert False\n",
" update_school_data(school_url, school_data)\n",
"\n",
" list_school_data_dicts.append(school_data)\n",
"\n",
" df = pd.DataFrame(list_school_data_dicts)\n",
" df.to_csv(\"scholenopdekaart.csv\")\n",
" # Save per processed school to track progress\n",
" df = pd.DataFrame(list_school_data_dicts)\n",
" df.to_csv(\"scholenopdekaart_tmp.csv\", encoding=\"utf-8\", quoting=csv.QUOTE_ALL)\n",
"\n",
" df = pd.DataFrame(list_school_data_dicts)\n",
" df.to_csv(\"scholenopdekaart.csv\", encoding=\"utf-8\", quoting=csv.QUOTE_ALL)\n",
" # Without extra columns\n",
" df.drop(columns=[\"students_per_zipcode\", \"students_per_year_trend\", \"num_students_per_group\", \"num_students_per_age\"]).to_csv(\"scholenopdekaart_.csv\", encoding=\"utf-8\", quoting=csv.QUOTE_ALL)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\"\"\" # Issues with URL:\n",
"https://scholenopdekaart.nl/middelbare-scholen/grave/1900/merletcollege-grave/\n",
"https://scholenopdekaart.nl/middelbare-scholen/lent/4386/citadel-college-locatie-griftdijk/\n",
@@ -259,24 +335,8 @@
"metadata": {},
"outputs": [],
"source": [
"'''\n",
"school_url = \"https://scholenopdekaart.nl/basisscholen/aalden/9661/christelijke-basisschool-de-schutse/\"\n",
"response = requests.get(os.path.join(school_url, \"contact/#inhoud\"), headers=headers)\n",
"# Parse the HTML content using BeautifulSoup\n",
"soup_school = BeautifulSoup(response.text, 'html.parser')\n",
"soup_school\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.read_csv(\"scholenopdekaart.csv\", index_col=0)\n",
"\n",
"df.head()"
]
},
@@ -288,122 +348,6 @@
"source": [
"df.tail()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# https://scholenopdekaart.nl/middelbare-scholen/hardenberg/26614/de-ambelt/\n",
"# From which zip codes the students come\n",
"# How many kids passed the exams', ...\n",
"\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import os\n",
"\n",
"url = \"https://scholenopdekaart.nl/middelbare-scholen/hardenberg/26614/de-ambelt/\"\n",
"response = requests.get( os.path.join(url, \"#inhoud\") )\n",
"\n",
"\n",
"url = \"https://scholenopdekaart.nl/basisscholen/aalden/9661/christelijke-basisschool-de-schutse/\"\n",
"response = requests.get(url)\n",
"\n",
"soup = BeautifulSoup(response.content, 'html.parser')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Locate the section containing \"Welke profielen volgen de examendeelnemers?\"\n",
"section_header = soup.find('h3', string=\"Welke profielen volgen de examendeelnemers?\")\n",
"if not section_header:\n",
" raise ValueError(\"Section 'Welke profielen volgen de examendeelnemers?' not found in the HTML.\")\n",
"\n",
"# Navigate to the parent section or subsection\n",
"section = section_header.find_parent('section')\n",
"if not section:\n",
" raise ValueError(\"Parent section for 'Welke profielen volgen de examendeelnemers?' not found.\")\n",
"\n",
"# Check if the section contains a message indicating no data is available\n",
"no_data_message = section.find('p', string=\"Deze informatie is voor deze school niet bekend.\")\n",
"if no_data_message:\n",
" print(\"No data available for 'Welke profielen volgen de examendeelnemers?'.\")\n",
"else:\n",
" # Extract the relevant content (e.g., tables, lists, or paragraphs)\n",
" content = []\n",
" for element in section.find_all(['p', 'table', 'ul', 'ol']):\n",
" if element.name == 'table':\n",
" # Extract table rows\n",
" rows = element.find_all('tr')\n",
" for row in rows:\n",
" cells = row.find_all(['th', 'td'])\n",
" row_data = [cell.get_text(strip=True) for cell in cells]\n",
" content.append(row_data)\n",
" else:\n",
" # Extract text from paragraphs, lists, etc.\n",
" content.append(element.get_text(strip=True))\n",
"\n",
" # Print the extracted content\n",
" for item in content:\n",
" print(item)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Locate the dialog containing the table\n",
"dialog = soup.find('dialog', class_='modal modal-dialog')\n",
"if not dialog:\n",
" raise ValueError(\"Dialog element not found in the HTML.\")\n",
"\n",
"# Locate the table within the dialog\n",
"table = dialog.find('table')\n",
"if not table:\n",
" raise ValueError(\"Table element not found within the dialog.\")\n",
"\n",
"# Extract table headers\n",
"headers = [header.get_text(strip=True) for header in table.find_all('th')]\n",
"\n",
"# Extract table rows\n",
"data = []\n",
"for row in table.find_all('tr')[1:]: # Skip the header row\n",
" cells = row.find_all('td')\n",
" if len(cells) == len(headers): # Ensure the row matches the expected structure\n",
" row_data = {\n",
" headers[0]: cells[0].get_text(strip=True), # Postcodegebied\n",
" headers[1]: cells[1].get_text(strip=True), # Aantal leerlingen\n",
" headers[2]: cells[2].get_text(strip=True) # Percentage\n",
" }\n",
" data.append(row_data)\n",
"\n",
"# Print the extracted data\n",
"for entry in data:\n",
" print(entry)"
]
}
],
"metadata": {

182
utils/Summary.ipynb Normal file

@@ -0,0 +1,182 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# docker exec -it ollama_npu bash\n",
"# rkllama pull\n",
"#\n",
"# c01zaut/Llama-3.2-3B-Instruct-rk3588-1.1.4\n",
"# Llama-3.2-3B-Instruct-rk3588-w8a8-opt-0-hybrid-ratio-0.0.rkllm\n",
"# Llama-3.2-3B-Instruct-rk3588-w8a8-opt-0-hybrid-ratio-0.5.rkllm\n",
"# Llama-3.2-3B-Instruct-rk3588-w8a8-opt-1-hybrid-ratio-0.0.rkllm\n",
"# Llama-3.2-3B-Instruct-rk3588-w8a8-opt-1-hybrid-ratio-0.5.rkllm\n",
"# Llama-3.2-3B-Instruct-rk3588-w8a8_g512-opt-1-hybrid-ratio-0.5.rkllm\n",
"#\n",
"# c01zaut/Qwen2.5-3B-Instruct-RK3588-1.1.4\n",
"# Qwen2.5-3B-Instruct-rk3588-w8a8-opt-1-hybrid-ratio-0.0.rkllm\n",
"# Qwen2.5-3B-Instruct-rk3588-w8a8-opt-1-hybrid-ratio-1.0.rkllm\n",
"# Qwen2.5-3B-Instruct-rk3588-w8a8_g256-opt-1-hybrid-ratio-1.0.rkllm\n",
"#"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"import ollama\n",
"import os\n",
"import requests\n",
"import json\n",
"from pprint import pprint\n",
"\n",
"# endpoint = \"https://ollamamodelnpu.matitos.org\"\n",
"endpoint = \"https://ollamamodel.matitos.org\"\n",
"model = \"qwen3:0.6b\"\n",
"model = \"qwen3:1.7b\"\n",
"client = ollama.Client(endpoint)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"r = requests.post( os.path.join(endpoint, \"unload_model\") )\n",
"r.status_code, r.json()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"r = requests.get( os.path.join(endpoint, \"models\") )\n",
"r.json().get(\"models\"), [ m.model for m in client.list().get(\"models\") ]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = \"llama3-instruct:3b\"\n",
"model = \"qwen2.5-instruct:3b\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"article_content = \"Kevin Sutherland's message to Rowan Lumsden told of his agony at what he believed were malicious rumours about his life. The best friend of tragic Kevin Sutherland has revealed a heartbreaking message sent in the last hours of his life. Rowan Lumsden, 35, says Kevins death would have been avoided if his request for anonymity in the Scottish Child Abuse Inquiry had been accepted. Mum-of-one Rowan told how her friend sent a 17-minute voice message that culminated as he stood on the Forth Road Bridge, where he is thought to have plunged to his death on December 19. The Daily Record has told how Kevin, 33, had ticked a box to say he approved of his testimony of historic abuse that he suffered to be published online. Kevins family later revealed an email sent to the inquiry, in which he begged for his real name to be redacted, suggesting he may take his own life if he was not given that protection. His appeal was dismissed by SCAI chair Lady Smith. Rowan told how Kevin left a harrowing final message, telling of his agony at what he believed to be malicious rumours that plagued his life. Rowan said: “I was asleep when the messages came in and it was devastating to hear his voice, knowing where he was and what was going to happen. I just wish I could have helped. “Kevin was pushed to the limit and he was so troubled about what people were saying about him. “He lived in fear his testimony would be used by people to make him out to be at fault or misconstrued and he bitterly regretted his decision to allow it to be made public. “I have no doubt that he would be alive today if he was allowed to to retract his on story from the record.” Rowan, 35, said Lady Smiths decision was wrong “in so many ways”. She said: “He begged her to let him be anonymous and he said that he would take his life if she refused. “But she said, No. I cannot see any way that can be explained away. He just needed the time it took to get the right interventions to turn his mental health and his life around. “Lady Smith was the top person in the inquiry. She knew she was dealing with a hugely vulnerable person as all victims are. She knew that he was having suicidal thoughts.” Kevin suffered trauma, including sexual abuse, in his childhood. In his final message to Rowan, in the hours before his suspected death, Kevin didnt refer directly to the SCAI inquiry but stated: “Its just coming from the ­absolute f****** heart and I just cannot cope with this life any more. “Its just been so f****** unbelievably brutal. I kind of feel like, whats the point? People have got their preconceived ideas and malicious gossip has served such a toxic contribution to this final decision that Ive made. “Thats me on the bridge. End of the road, eh? End of the road to all the liars and doubters and gossip mongrels.” Kevins sister Melanie Watson, who recently revealed the text of Kevins final appeal for anonymity, said she was aware of his final messages to friends. She added: “He was very fixated with the fear that people would make false assumptions about him, based on reading his testimony on Google.” The inquirys handling of Kevin is now part of an independent inquiry. An SCAI spokesperson said: “SCAI has commissioned an independent review to consider all aspects of its interactions with Kevin.”\"\n",
"article_content = \"Child services visited a Bronx apartment while a 4-year-old girl was trapped inside with the corpses of her troubled mom and brother but walked away after knocking, neighbors said. Lisa Cotton, 38, and her 8-year-old son, Nazir Millien, 8, had been dead for at least two weeks before relatives found them and the toddler inside the house of horrors Friday, one day after reps for the Administration for Childrens Services dropped the ball, neighbor Sabrina Coleson said. “They didnt do st,” Coleson said Sunday. “They were here ringing peoples bells the day before the wellness check. They were here, but they didnt do st. “One rang my bell and asked if I had any concerns for upstairs. And then a man opened his door and started yelling,” she said. “Lisa was a very cool girl. I never saw her son with her, only the girl. Its terrible.” Concerned relatives finally checked on the family on Friday and found the 4-year-old, Promise, alone, starving and in horrid condition on her mothers bed — as bugs crawled over her dead family. Cottons father, Hubert, 71, had sent his oldest granddaughter to check the apartment at East 231st Street — with the woman grabbing her young sibling and fleeing the putrid home to call police. ACS wasnt the only city agency to leave Promise trapped in hellish conditions — neighbors said cops were also called to the apartment on Tuesday but left after not sensing the stench reported by others. Hubert Cotton said the toddler survived by “feeding herself with chocolate.” Law enforcement sources said Lisa Cotton had a history of erratic behavior, and had a pending ACS case for alleged child neglect before she was found dead. She was arrested in 2021 on child abandonment charges after police said she was caught swinging her then-infant daughter around in a stroller and lighting a wig on fire on White Plains Road, sources said. When cops arrived she was allegedly walking away, leaving Promise behind. The outcome of the case was not available because the file is sealed. One neighbor said the mom had “episodes” in the past. Sources said police believe Lisa Cotton, who suffered from asthma, may have died from cardiac arrest, while her son, who was born prematurely and had a feeding tube, may have starved to death. A spokesperson for ACS declined to comment on the case on Sunday other than to say the agency is “investigating this tragedy with the NYPD.”\"\n",
"\n",
"# prompt = \"Rewrite the content below into a clear and concise summary of one paragraph maximum, presenting the key points as if they are newly written insights. Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states'. Write in a natural, standalone format that feels like an original explanation. Keep it brief, engaging, informative, in the style of a news article:\\n\\n{}\".format(article_content)\n",
"# prompt = \"Provide a summary of the content below, presenting the key points as if they are newly written insights. Write in a natural, standalone format that feels like an original explanation. Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states'. Keep it brief, engaging, informative, in the style of a news article, and in one single paragraph:\\n\\n{}\".format(article_content)\n",
"# prompt = \"Provide a summary of the content below, writing in a natural and standalone format that feels like an original explanation. Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states'. Keep it brief, engaging, informative, in the style of a news article, and in one single paragraph:\\n\\n{}\".format(article_content)\n",
"\n",
"# in one sentence each\n",
"prompt = \"First, provide a summary of the content below in one paragraph. Second, specify the Who, What, When, Where and Why of the story:\\n\\n{}\".format(article_content)\n",
"# prompt = \"Provide the 5W (Who, What, When, Where, Why) and a detailed summary of the content below:\\n\\n{}\".format(article_content)\n",
"# Only answer with the location or address which can be extracted from this description\n",
"\n",
"prompt = \"Provide, in one sentence each, the who, what, when, where, why, and a detailed summary of the content below:\\n\\n{}\".format(article_content)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{}\n"
]
}
],
"source": [
"options = {\"temperature\": 0, \"seed\": 51029}\n",
"resp = client.generate(model=model, prompt=prompt, format=\"json\", options=options)\n",
"r = requests.post( os.path.join(endpoint, \"unload_model\") )\n",
"\n",
"response_dict = json.loads(resp.response)\n",
"pprint(response_dict)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'{\\n\\n\\n}'"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"resp.response"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\"<think>\\nOkay, let's tackle this query. The user wants a one-sentence summary for each element: who, what, when, where, why, and a detailed summary.\\n\\nFirst, the main event is the child services visiting a Bronx apartment with a 4-year-old trapped, but the neighbors say they knocked out the corpses. So for the first sentence, I need to include who (child services), what (visited the apartment), when (Friday), where (the apartment), why (neighbors said they didn't do it), and a summary. \\n\\nThen, for the second part, the user might want more details. Let me check the content. The summary needs to include the specific details like the family members, the days they were found dead, the agencies involved, and the outcomes. Also, mention the sources like ACS and the neighbors' statements. I need to make sure each sentence is concise and covers all the points without being too lengthy. Let me structure each sentence to fit the required format.\\n</think>\\n\\n**Who:** Child services in the Bronx, **What:** Visited an apartment containing a 4-year-old trapped with a dead mom and brother, **When:** Friday, **Where:** East 231st Street, **Why:** Neighbors reported the agencys actions were inadequate, **Summary:** Child services visited a Bronx apartment with a 4-year-old trapped and dead, neighbors say they knocked out the corpses, and the incident is attributed to the agencys failure to address the situation, with the family surviving by feeding themselves and the case being sealed.\""
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#resp = client.generate(model=model, prompt=prompt, format=\"json\")\n",
"resp = client.generate(model=model, prompt=prompt)\n",
"resp.response"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "matitos_urls",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
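The empty `{}` and `'{\n\n\n}'` responses above are a known failure mode of `format="json"` when the prompt itself never asks for JSON output. A sketch of one workaround (the explicit key list is an assumption for illustration, not something tested here):
```
# Hypothetical: name the expected JSON keys explicitly when forcing format="json"
prompt_json = (
    "Answer strictly as a JSON object with keys: who, what, when, where, why, summary.\n\n"
    + article_content
)
resp = client.generate(model=model, prompt=prompt_json, format="json", options=options)
print(resp.response)
```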