diff --git a/.env b/.env
index c97170f..02ac98b 100644
--- a/.env
+++ b/.env
@@ -1,23 +1,31 @@
-# Initialization
-INITIALIZE_DB=true
-DJANGO_SUPERUSER_USERNAME=matitos
-DJANGO_SUPERUSER_PASSWORD=matitos
-DJANGO_SUPERUSER_EMAIL=matitos@matitos.org
+# Reverse proxy
+TRAEFIK_MAIL=yourmail@protonmail.com
+DUCKDNS_TOKEN=
+DUCKDNS_SUBDOMAINS=
# Reverse proxy
-REVERSE_PROXY_URL=sample.url.com
+OLLAMA_WEBUI_REVERSE_PROXY_URL=ollama.steep.duckdns.org
+OLLAMA_REVERSE_PROXY_URL=ollamamodel.steep.duckdns.org
+REVERSE_PROXY_URL=fetcher.steep.duckdns.org
+DJANGO_ALLOWED_ORIGINS=https://fetcher.steep.duckdns.org # Reverse proxy
+
+
+# Initialization
+INITIALIZE_DB=true
+DJANGO_SUPERUSER_USERNAME=steep
+DJANGO_SUPERUSER_PASSWORD=steep
+DJANGO_SUPERUSER_EMAIL=steep@steepnews.org
# Django
-DJANGO_ALLOWED_ORIGINS=https://sample.url.com # Reverse proxy
DJANGO_ALLOWED_HOSTS=* # host1,host2
-DJANGO_SECRET_KEY=EtKpy7t84GvU4gBwX9z3xKPBXMS75IAV0dkzN7dXVUsMSqy6a5rjY6WNCw3CcRH5
+DJANGO_SECRET_KEY=EtKpy7t84GvU4gBwX9z3xKPBXMS75IAV0dkqN7dXVUsMSqy6a5rjY6WNCw3CcRH5
DJANGO_DEBUG=True
PATH_LOGS_DIRECTORY=/opt/logs
# Database
-DB_NAME=matitos
-DB_PASSWORD=supermatitos
-DB_USER=supermatitos
+DB_NAME=steep
+DB_PASSWORD=supersteep
+DB_USER=supersteep
PATH_DB_DATA=.
# Database: Django
@@ -40,7 +48,7 @@ FETCHER_ERROR_URL_CACHE_TIME=172800
# Selenium
SELENIUM_ENDPOINT=http://fetcher_app_selenium:80
-ENDPOINT_OLLAMA=https://ollamamodel.matitos.org
+ENDPOINT_OLLAMA=http://ollama:11434
# APP: Selenium
ARCH=amd64 # arm64, amd64
@@ -52,6 +60,6 @@ DEPLOY_CPUS=2
DEPLOY_RAM=4G
# Ghost
-GHOST_ADMIN_API_URL=https://news.matitos.org/ghost/api/admin/
-GHOST_ADMIN_API_KEY=67fffe1b8a57a80001ecec5b:59f580020c196f92e05e208d288702082f8edad6366e2b2c8940b54e41cc355a
-PEXELS_API_KEY=Y6clJkY32eihf34ukX4JsINYu9lzxh3xDdNq2HMAmGwXp0a0tt6vr6S9
+GHOST_ADMIN_API_URL=
+GHOST_ADMIN_API_KEY=
+PEXELS_API_KEY=
diff --git a/app_urls/init_data.json b/app_urls/init_data.json
index 0ed7168..ec59b44 100644
--- a/app_urls/init_data.json
+++ b/app_urls/init_data.json
@@ -1,34 +1,65 @@
{
"SEARCH": {
"rss_feed": [
- "https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC",
- "https://feeds.feedburner.com/breitbart",
- "https://feeds.feedburner.com/zerohedge/feed",
- "https://moxie.foxnews.com/google-publisher/latest.xml",
- "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=15837362",
- "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100727362"
],
"url_host": [
- "missingkids.org/poster",
- "missingkids.org/new-poster",
- "breitbart.com",
- "zerohedge.com",
- "foxnews.com",
- "cnbc.com"
- ],
+ "johnpilger.com",
+ "lapenseeecologique.com",
+ "partage-le.com",
+ "reflets.info",
+ "rezo.net",
+ "consortiumnews.com",
+ "disclose.ngo/fr",
+ "energieetenvironnement.com",
+ "global-climat.com",
+ "slashdot.org",
+ "lesamisdebartleby.wordpress.com",
+ "lundi.am",
+ "lvsl.fr",
+ "moderndiplomacy.eu",
+ "mrmondialisation.org",
+ "ourfiniteworld.com",
+ "southfront.org",
+ "simplicius76.substack.com",
+ "smoothiex12.blogspot.com",
+ "theintercept.com",
+ "wikileaks.org",
+ "contretemps.eu",
+ "indianpunchline.com",
+ "investigaction.net/fr",
+ "notechmagazine.com",
+ "terrestres.org",
+ "truthdig.com",
+ "tass.com",
+ "bastamag.net",
+ "counterpunch.org",
+ "energy-daily.com",
+ "fakirpresse.info",
+ "geopoliticalmonitor.com",
+ "huffingtonpost.fr",
+ "legrandsoir.info",
+ "les-crises.fr",
+ "liberation.fr",
+ "maitre-eolas.fr",
+ "marianne.net",
+ "mediapart.fr",
+ "metaefficient.com",
+ "monde-diplomatique.fr",
+ "paulcraigroberts.org",
+ "politis.fr",
+ "reporterre.net",
+ "rue89.com",
+ "theguardian.com/international",
+ "treehugger.com",
+ "unz.com",
+ "voltairenet.org",
+ "wsws.org"
+ ],
"keyword_search": [
- "child abuse"
+ "society collapse"
]
},
"REGEX_PATTERN_STATUS_PRIORITY": [
- [".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50],
- [".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75],
- [".*foxnews\\.com\\/(video|category)\\/.*", "invalid", 75],
- [".*radio.foxnews\\.com\\/.*", "invalid", 75],
- [".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75],
- [".*zerohedge\\.com\\/(user)\\/.*", "invalid", 75],
- [".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50],
- [".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50],
- [".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50]
+ [".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50]
]
}
diff --git a/app_urls/init_data_fr.json b/app_urls/init_data_fr.json
deleted file mode 100644
index ec59b44..0000000
--- a/app_urls/init_data_fr.json
+++ /dev/null
@@ -1,65 +0,0 @@
-{
- "SEARCH": {
- "rss_feed": [
- ],
- "url_host": [
- "johnpilger.com",
- "lapenseeecologique.com",
- "partage-le.com",
- "reflets.info",
- "rezo.net",
- "consortiumnews.com",
- "disclose.ngo/fr",
- "energieetenvironnement.com",
- "global-climat.com",
- "slashdot.org",
- "lesamisdebartleby.wordpress.com",
- "lundi.am",
- "lvsl.fr",
- "moderndiplomacy.eu",
- "mrmondialisation.org",
- "ourfiniteworld.com",
- "southfront.org",
- "simplicius76.substack.com",
- "smoothiex12.blogspot.com",
- "theintercept.com",
- "wikileaks.org",
- "contretemps.eu",
- "indianpunchline.com",
- "investigaction.net/fr",
- "notechmagazine.com",
- "terrestres.org",
- "truthdig.com",
- "tass.com",
- "bastamag.net",
- "counterpunch.org",
- "energy-daily.com",
- "fakirpresse.info",
- "geopoliticalmonitor.com",
- "huffingtonpost.fr",
- "legrandsoir.info",
- "les-crises.fr",
- "liberation.fr",
- "maitre-eolas.fr",
- "marianne.net",
- "mediapart.fr",
- "metaefficient.com",
- "monde-diplomatique.fr",
- "paulcraigroberts.org",
- "politis.fr",
- "reporterre.net",
- "rue89.com",
- "theguardian.com/international",
- "treehugger.com",
- "unz.com",
- "voltairenet.org",
- "wsws.org"
- ],
- "keyword_search": [
- "society collapse"
- ]
- },
- "REGEX_PATTERN_STATUS_PRIORITY": [
- [".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50]
- ]
-}
diff --git a/app_urls/init_data_sca.json b/app_urls/init_data_sca.json
deleted file mode 100644
index 0ed7168..0000000
--- a/app_urls/init_data_sca.json
+++ /dev/null
@@ -1,34 +0,0 @@
-{
- "SEARCH": {
- "rss_feed": [
- "https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC",
- "https://feeds.feedburner.com/breitbart",
- "https://feeds.feedburner.com/zerohedge/feed",
- "https://moxie.foxnews.com/google-publisher/latest.xml",
- "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=15837362",
- "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100727362"
- ],
- "url_host": [
- "missingkids.org/poster",
- "missingkids.org/new-poster",
- "breitbart.com",
- "zerohedge.com",
- "foxnews.com",
- "cnbc.com"
- ],
- "keyword_search": [
- "child abuse"
- ]
- },
- "REGEX_PATTERN_STATUS_PRIORITY": [
- [".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50],
- [".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75],
- [".*foxnews\\.com\\/(video|category)\\/.*", "invalid", 75],
- [".*radio.foxnews\\.com\\/.*", "invalid", 75],
- [".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75],
- [".*zerohedge\\.com\\/(user)\\/.*", "invalid", 75],
- [".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50],
- [".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50],
- [".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50]
- ]
-}
diff --git a/docker-compose-prod.yml b/docker-compose-prod.yml
new file mode 100644
index 0000000..3f32d6d
--- /dev/null
+++ b/docker-compose-prod.yml
@@ -0,0 +1,123 @@
+version: '3.9'
+
+services:
+
+ fetcher_app_selenium:
+ image: fetcher_app_selenium
+ build:
+ context: ./app_selenium
+ args:
+ - ARCH=${ARCH} # arm64, amd64
+ container_name: fetcher_app_selenium
+ restart: unless-stopped
+ shm_size: 512mb
+ environment:
+ - SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE}
+ - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
+ ports:
+ - 80
+ dns:
+ - 1.1.1.1
+ - 1.0.0.1
+ deploy:
+ resources:
+ limits:
+ cpus: '${DEPLOY_CPUS}'
+ memory: ${DEPLOY_RAM}
+
+ fetcher_app_urls:
+ image: fetcher_app_urls
+ build:
+ context: ./app_urls
+ container_name: fetcher_app_urls
+ restart: unless-stopped
+ environment:
+ # Initialization
+ - INITIALIZE_DB=${INITIALIZE_DB} # Related to DB persistence
+ - DJANGO_SUPERUSER_USERNAME=${DJANGO_SUPERUSER_USERNAME}
+ - DJANGO_SUPERUSER_PASSWORD=${DJANGO_SUPERUSER_PASSWORD}
+ - DJANGO_SUPERUSER_EMAIL=${DJANGO_SUPERUSER_EMAIL}
+ # Django
+ - DJANGO_ALLOWED_HOSTS=${DJANGO_ALLOWED_HOSTS} # host1,host2
+ - DJANGO_ALLOWED_ORIGINS=${DJANGO_ALLOWED_ORIGINS} # Reverse proxy
+ - DJANGO_SECRET_KEY=${DJANGO_SECRET_KEY}
+ - DJANGO_DEBUG=${DJANGO_DEBUG}
+ - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
+ # Database
+ - DB_NAME=${DB_NAME}
+ - DB_USER=${DB_USER}
+ - DB_PASSWORD=${DB_PASSWORD}
+ - DB_HOST=${DB_HOST}
+ - DB_PORT=${DB_PORT}
+ - REDIS_HOST=${REDIS_HOST}
+ - REDIS_PORT=${REDIS_PORT}
+ # Job timeout: 30 min
+ - JOB_DEFAULT_TIMEOUT=${JOB_DEFAULT_TIMEOUT}
+ # Fetcher
+ - FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP}
+ - FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP}
+ - FETCHER_BETWEEN_SEARCHES_SLEEP=${FETCHER_BETWEEN_SEARCHES_SLEEP} # Sleep time between each search
+ - FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP} # Sleep time between requests to same URL host
+      - FETCHER_LANGUAGE_DETECTION_MIN_CHAR=${FETCHER_LANGUAGE_DETECTION_MIN_CHAR} # Min amount of characters to run language detection
+ - FETCHER_INSERT_URL_CACHE_TIME=${FETCHER_INSERT_URL_CACHE_TIME} # Cache time: Insert raw URL
+ - FETCHER_ERROR_URL_CACHE_TIME=${FETCHER_ERROR_URL_CACHE_TIME} # Cache time: Error on processing URL
+ # Selenium
+ - SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT}
+ - ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA}
+ # Ghost
+ - GHOST_ADMIN_API_KEY=${GHOST_ADMIN_API_KEY}
+ - GHOST_ADMIN_API_URL=${GHOST_ADMIN_API_URL}
+ - PEXELS_API_KEY=${PEXELS_API_KEY}
+ ########################
+ #volumes: # Development mode
+ # - ./app_urls:/opt/app
+ ########################
+ ports:
+ - 8000 # :8000
+ depends_on:
+ - fetcher_db
+ - fetcher_redis
+ dns:
+ - 1.1.1.1
+ - 1.0.0.1
+ deploy:
+ resources:
+ limits:
+ cpus: '${DEPLOY_CPUS}'
+ memory: ${DEPLOY_RAM}
+ labels: # Reverse proxy sample
+ - "traefik.enable=true"
+ - "traefik.http.routers.fetcher.rule=Host(`${REVERSE_PROXY_URL}`)"
+ - "traefik.http.routers.fetcher.entrypoints=websecure"
+ - "traefik.http.routers.fetcher.tls.certresolver=myresolvercd"
+ - "traefik.http.services.fetcher.loadbalancer.server.port=8000"
+ networks:
+ - default # This network
+ - docker_default # Reverse proxy network
+
+ fetcher_db:
+ image: postgres:17
+ container_name: fetcher_db
+ restart: unless-stopped
+ # Set shared memory limit when using docker-compose
+ shm_size: 128mb
+ environment:
+ POSTGRES_DB: ${DB_NAME}
+ POSTGRES_PASSWORD: ${DB_PASSWORD}
+ POSTGRES_USER: ${DB_USER}
+ POSTGRES_INITDB_ARGS: '--data-checksums'
+ volumes: # Persistent DB?
+ - ${PATH_DB_DATA}/postgres:/var/lib/postgresql/data
+ ports:
+ - 5432 #:5432
+
+ fetcher_redis:
+ image: redis:alpine
+ container_name: fetcher_redis
+ restart: unless-stopped
+ ports:
+ - 6379 #:6379
+
+networks:
+ docker_default:
+ external: true
diff --git a/docker-compose.yml b/docker-compose.yml
index 3f32d6d..148a370 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -2,28 +2,126 @@ version: '3.9'
services:
- fetcher_app_selenium:
- image: fetcher_app_selenium
- build:
- context: ./app_selenium
- args:
- - ARCH=${ARCH} # arm64, amd64
- container_name: fetcher_app_selenium
+ duckdns:
+ image: lscr.io/linuxserver/duckdns:latest
+ container_name: duckdns
restart: unless-stopped
- shm_size: 512mb
environment:
- - SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE}
- - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
+ - PUID=1000 #optional
+ - PGID=1000 #optional
+ - TZ=Europe/London
+ - SUBDOMAINS=${DUCKDNS_SUBDOMAINS}
+ - TOKEN=${DUCKDNS_TOKEN}
+ - LOG_FILE=true #optional
+ volumes:
+ - ${PATH_DB_DATA}/duckdns_config:/config
+
+ traefik:
+ image: "traefik:v3.3"
+ container_name: "traefik"
+ restart: unless-stopped
+ command:
+ - "--api.insecure=true"
+ - "--providers.docker=true"
+ - "--providers.docker.exposedbydefault=false"
+ # Logs for fail2ban
+ - "--log.level=INFO"
+ - "--accesslog=true"
+ - "--accesslog.filepath=/var/log/traefik/access.log"
+ # HTTPS
+ - "--entrypoints.websecure.address=:443"
+ # HTTPS -> Timeouts
+ - "--entrypoints.websecure.transport.respondingTimeouts.readTimeout=1200s"
+ - "--entrypoints.websecure.transport.respondingTimeouts.idleTimeout=1200s"
+ - "--entrypoints.websecure.transport.respondingTimeouts.writeTimeout=1200s"
+ # HTTP -> HTTPS
+ - "--entryPoints.web.address=:80"
+ - "--entrypoints.web.http.redirections.entryPoint.to=websecure"
+ # Let's Encrypt
+ - "--certificatesresolvers.myresolver.acme.email=${TRAEFIK_MAIL}"
+ - "--certificatesresolvers.myresolver.acme.storage=/letsencrypt/acme.json"
+ # TLS challenge to request new certificate
+ - "--certificatesresolvers.myresolver.acme.tlschallenge=true"
ports:
- - 80
- dns:
- - 1.1.1.1
- - 1.0.0.1
- deploy:
- resources:
- limits:
- cpus: '${DEPLOY_CPUS}'
- memory: ${DEPLOY_RAM}
+ - "80:80"
+ - "443:443"
+ - "8080:8080"
+ volumes:
+ - "${PATH_DB_DATA}/letsencrypt:/letsencrypt"
+ - "${PATH_DB_DATA}/traefik_logs:/var/log/traefik"
+ - "/var/run/docker.sock:/var/run/docker.sock:ro"
+
+
+ ollama:
+ image: ollama/ollama:latest
+ container_name: ollama
+ restart: unless-stopped
+ ports:
+ - '11434:11434'
+ volumes:
+ - ${PATH_DB_DATA}/ollama:/root/.ollama
+ #deploy:
+ # resources:
+ # limits:
+ # memory: 6G
+ # cpus: 6 # 80% for 8 cores
+ labels:
+ - "traefik.enable=true"
+ - "traefik.http.routers.ollama-secure.rule=Host(`${OLLAMA_REVERSE_PROXY_URL}`)"
+      - "traefik.http.routers.ollama-secure.entrypoints=websecure"
+ - "traefik.http.routers.ollama-secure.tls.certresolver=myresolver"
+ - "traefik.http.services.ollama.loadbalancer.server.port=11434"
+
+ ollama-webui:
+ image: ghcr.io/ollama-webui/ollama-webui:main
+ container_name: ollama-webui
+ restart: unless-stopped
+ ports:
+      - 8080 # container port only: host 8080 is taken by the traefik dashboard; webui is reached via traefik
+ volumes:
+ - ${PATH_DB_DATA}/ollama-webui:/app/backend/data
+ depends_on:
+ - ollama
+ environment:
+ - 'OLLAMA_API_BASE_URL=http://ollama:11434/api'
+ - 'ENABLE_SIGNUP=false'
+ #- 'ENABLE_RAG_WEB_SEARCH=true'
+ #- 'RAG_WEB_SEARCH_ENGINE=brave'
+ #- 'ENABLE_IMAGE_GENERATION=true'
+ #- 'IMAGE_GENERATION_ENGINE=comfyui'
+ #- 'COMFYUI_BASE_URL=comfyui.matitos.org'
+ #- 'COMFYUI_API_KEY='
+ #- 'COMFYUI_WORKFLOW=' # https://docs.openwebui.com/getting-started/env-configuration#comfyui_workflow
+ labels:
+ - "traefik.enable=true"
+ - "traefik.http.routers.ollamawebui-secure.rule=Host(`${OLLAMA_WEBUI_REVERSE_PROXY_URL}`)"
+      - "traefik.http.routers.ollamawebui-secure.entrypoints=websecure"
+ - "traefik.http.routers.ollamawebui-secure.tls.certresolver=myresolver"
+ - "traefik.http.services.ollamawebui.loadbalancer.server.port=8080"
+
+
+ #fetcher_app_selenium:
+ # image: fetcher_app_selenium
+ # build:
+ # context: ./app_selenium
+ # args:
+ # - ARCH=${ARCH} # arm64, amd64
+ # container_name: fetcher_app_selenium
+ # restart: unless-stopped
+ # shm_size: 512mb
+ # environment:
+ # - SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE}
+ # - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
+ # ports:
+ # - 80
+ # dns:
+ # - 1.1.1.1
+ # - 1.0.0.1
+ # deploy:
+ # resources:
+ # limits:
+ # cpus: '${DEPLOY_CPUS}'
+ # memory: ${DEPLOY_RAM}
fetcher_app_urls:
image: fetcher_app_urls
@@ -73,7 +171,7 @@ services:
# - ./app_urls:/opt/app
########################
ports:
- - 8000 # :8000
+ - 8000:8000
depends_on:
- fetcher_db
- fetcher_redis
@@ -89,11 +187,8 @@ services:
- "traefik.enable=true"
- "traefik.http.routers.fetcher.rule=Host(`${REVERSE_PROXY_URL}`)"
- "traefik.http.routers.fetcher.entrypoints=websecure"
- - "traefik.http.routers.fetcher.tls.certresolver=myresolvercd"
+ - "traefik.http.routers.fetcher.tls.certresolver=myresolver"
- "traefik.http.services.fetcher.loadbalancer.server.port=8000"
- networks:
- - default # This network
- - docker_default # Reverse proxy network
fetcher_db:
image: postgres:17
@@ -117,7 +212,3 @@ services:
restart: unless-stopped
ports:
- 6379 #:6379
-
-networks:
- docker_default:
- external: true
diff --git a/utils/Schools-NL.ipynb b/utils/Schools-NL.ipynb
deleted file mode 100644
index 3bee3dd..0000000
--- a/utils/Schools-NL.ipynb
+++ /dev/null
@@ -1,335 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import requests\n",
- "from bs4 import BeautifulSoup\n",
- "from urllib.parse import urljoin\n",
- "import pandas as pd\n",
- "import os\n",
- "\n",
- "headers = {\"User-Agent\": \"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36\"}"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Function to decode Cloudflare-protected emails\n",
- "def decode_email(encoded_email):\n",
- " \"\"\"\n",
- " Decode an email protected by Cloudflare's email protection.\n",
- " :param encoded_email: The encoded email string from the data-cfemail attribute.\n",
- " :return: The decoded email address.\n",
- " \"\"\"\n",
- " email = \"\"\n",
- " key = int(encoded_email[:2], 16) # Extract the key (first two characters)\n",
- " for i in range(2, len(encoded_email), 2):\n",
- " # XOR each pair of hex characters with the key\n",
- " email += chr(int(encoded_email[i:i + 2], 16) ^ key)\n",
- " return email\n",
- "\n",
- "def extract_emails(soup):\n",
- " # Find all visible email links (mailto:)\n",
- " visible_emails = []\n",
- " for link in soup.find_all('a', href=lambda href: href and href.startswith('mailto:')):\n",
- " email = link['href'].replace('mailto:', '')\n",
- " visible_emails.append(email)\n",
- "\n",
- " # Find all Cloudflare-protected emails\n",
- " protected_emails = []\n",
- " for span in soup.find_all('span', class_='__cf_email__', attrs={'data-cfemail': True}):\n",
- " encoded_email = span['data-cfemail']\n",
- " decoded_email = decode_email(encoded_email)\n",
- " protected_emails.append(decoded_email)\n",
- "\n",
- " # Combine all emails\n",
- " all_emails = visible_emails + protected_emails\n",
- " all_emails = list(set(all_emails))\n",
- " if (len(all_emails) == 0):\n",
- " return None\n",
- " elif (len(all_emails) == 1):\n",
- " return all_emails[0]\n",
- " else:\n",
- " return all_emails\n",
- "\n",
- "def find_website(soup_school):\n",
- " # Find all tags with href attributes\n",
- " for link in soup_school.find(class_=\"dl-horizontal dl-icons\").find_all('a', href=True):\n",
- " href = link['href']\n",
- " # Filter out only valid URLs (e.g., starting with http or https)\n",
- " if href.startswith(('http://', 'https://')):\n",
- " # websites.append(href)\n",
- " return href\n",
- "\n",
- "\n",
- "def main():\n",
- " list_urls = [\n",
- " \"https://scholenopdekaart.nl/Basisscholen/\",\n",
- " \"https://scholenopdekaart.nl/middelbare-scholen/\"\n",
- " ]\n",
- "\n",
- " list_school_data_dicts = []\n",
- "\n",
- " # For each category\n",
- " for url in list_urls:\n",
- " # Fetch the HTML content of the page\n",
- " response = requests.get(url, headers=headers)\n",
- " response.raise_for_status() # Raise an exception for HTTP errors\n",
- " # Parse the HTML content using BeautifulSoup\n",
- " soup = BeautifulSoup(response.text, 'html.parser')\n",
- "\n",
- " # Get category\n",
- " category = url.strip(\"/\").split(\"/\")[-1].lower()\n",
- "\n",
- " # Find all tags with href attributes\n",
- " links_areas = []\n",
- " for a_tag in soup.find_all('a', href=True):\n",
- " href = a_tag['href']\n",
- " \n",
- " if (category not in href):\n",
- " continue\n",
- " \n",
- " # Convert relative URLs to absolute URLs\n",
- " area_full_url = urljoin(url, href)\n",
- " links_areas.append(area_full_url)\n",
- "\n",
- " # Area\n",
- " area = href.rstrip(\"/\").split(\"/\")[-1]\n",
- "\n",
- " ###############################################\n",
- " # Fetch the HTML content of the page\n",
- " print(\".\", end=\"\")\n",
- " response = requests.get(area_full_url, headers=headers)\n",
- " response.raise_for_status() # Raise an exception for HTTP errors\n",
- "\n",
- " # Parse the HTML content using BeautifulSoup\n",
- " soup_area= BeautifulSoup(response.text, 'html.parser')\n",
- "\n",
- " # Get schools in area\n",
- " for a_tag in soup_area.find_all('a', href=True):\n",
- " href = a_tag['href']\n",
- "\n",
- " school_url = urljoin(url, href)\n",
- " if (area_full_url not in school_url):\n",
- " continue\n",
- " \n",
- " school_name = a_tag.text.rstrip(\".\")\n",
- " school_data = {\n",
- " \"category\": category,\n",
- " \"area\": area,\n",
- " \"name\": school_name,\n",
- " \"url\": school_url,\n",
- " }\n",
- "\n",
- " try:\n",
- " # Process school (request contact details)\n",
- " response = requests.get(os.path.join(school_url, \"contact/#inhoud\"), headers=headers)\n",
- " response.raise_for_status() # Raise an exception for HTTP errors\n",
- "\n",
- " # Parse the HTML content using BeautifulSoup\n",
- " soup_school = BeautifulSoup(response.text, 'html.parser')\n",
- "\n",
- " # School details\n",
- " school_details = soup_school.find(class_=\"school-details\")\n",
- " for category_idx, li_detail in enumerate(school_details.find_all(\"li\")):\n",
- " data = li_detail.find('span', class_='infotip-term')['data-dfn']\n",
- " text = li_detail.get_text(strip=True)\n",
- " # Set data\n",
- " school_data[\"category_{}\".format(category_idx)] = text\n",
- " school_data[\"category_{}_description\".format(category_idx)] = data\n",
- " \n",
- " school_address = soup_school.find(class_=\"school-adres\").get_text(strip=True)\n",
- " school_postcode_city = soup_school.find(class_=\"school-postcode-woonplaats\").get_text(strip=True)\n",
- " school_postcode = \"\".join(school_postcode_city.split(\" \")[:2])\n",
- " school_city = \" \".join(school_postcode_city.split(\" \")[2:])\n",
- "\n",
- " school_data[\"city\"] = school_city\n",
- " school_data[\"postcode\"] = school_postcode\n",
- " school_data[\"address\"] = school_address\n",
- "\n",
- " try:\n",
- " school_data[\"website\"] = find_website(soup_school) # soup_school.find(class_=\"button button-primary\").get('href')\n",
- " except Exception as e:\n",
- " pass\n",
- " try:\n",
- " school_data[\"phone\"] = soup_school.find('a', href=lambda href: href and href.startswith('tel:')).text\n",
- " except Exception as e:\n",
- " pass\n",
- " try:\n",
- " school_data[\"email\"] = extract_emails(soup_school)\n",
- " except Exception as e:\n",
- " pass\n",
- " \n",
- " except Exception as e:\n",
- " print(school_url, str(e))\n",
- " # assert False\n",
- "\n",
- " list_school_data_dicts.append(school_data)\n",
- "\n",
- " df = pd.DataFrame(list_school_data_dicts)\n",
- " df.to_csv(\"scholenopdekaart.csv\")\n",
- "\n",
- "\"\"\" # Issues with URL:\n",
- "https://scholenopdekaart.nl/middelbare-scholen/grave/1900/merletcollege-grave/\n",
- "https://scholenopdekaart.nl/middelbare-scholen/lent/4386/citadel-college-locatie-griftdijk/\n",
- "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/24527/montessori-college-k33-nijmegen/\n",
- "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/26368/aventurijn-park-neerbosch/\n",
- "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/26187/kandinsky-college-voor-lyceum-havo-mavo-vbo-lwoo/\n",
- "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/1791/karel-de-grote-college/\n",
- "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/2040/mondial-college-locatie-leuvensbroek/\n",
- "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/2041/mondial-college-meeuwse-acker/\n",
- "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/2036/stedelijk-gymnasium-nijmegen/\n",
- "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/2038/stedelijke-scholengemeenschap-nijmegen/\n",
- "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/26184/yuverta-vmbo-het-groene-lyceum-nijmegen/\n",
- "https://scholenopdekaart.nl/middelbare-scholen/oss/23719/het-hooghuis-locatie-mondriaan-college/\n",
- "https://scholenopdekaart.nl/middelbare-scholen/oss/943/het-hooghuis-locatie-oss-stadion/\n",
- "https://scholenopdekaart.nl/middelbare-scholen/oss/947/het-hooghuis-zuidwest-gebouw-west/\n",
- "https://scholenopdekaart.nl/middelbare-scholen/oss/946/het-hooghuis-zuidwest-gebouw-zuid/\n",
- "https://scholenopdekaart.nl/middelbare-scholen/oss/1929/het-maaslandcollege-scholengemeenschap-voor-tweetalig-mavo-havo-vwo/\n",
- "https://scholenopdekaart.nl/middelbare-scholen/oss/25783/sonnewijser-unit-route-arbeid/\n",
- "https://scholenopdekaart.nl/middelbare-scholen/oss/11432/sonnewijser-unit-vervolgonderwijs-oss/\n",
- "https://scholenopdekaart.nl/middelbare-scholen/oss/942/titus-brandsmalyceum/\n",
- "https://scholenopdekaart.nl/middelbare-scholen/velp-noord-brabant/24545/merletcollege-eerste-opvang-anderstaligen-eoa/\n",
- "https://scholenopdekaart.nl/middelbare-scholen/wijchen/2018/maaswaal-college-havo-atheneum-gymnasium/\n",
- "https://scholenopdekaart.nl/middelbare-scholen/wijchen/2020/maaswaal-college-vmbo-basis-kader-mavo/\n",
- "https://scholenopdekaart.nl/middelbare-scholen/wijchen/1781/pro-college-wijchen/\n",
- "\"\"\"\n",
- "\n",
- "if __name__ == \"__main__\":\n",
- " main()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "'''\n",
- "school_url = \"https://scholenopdekaart.nl/basisscholen/aalden/9661/christelijke-basisschool-de-schutse/\"\n",
- "response = requests.get(os.path.join(school_url, \"contact/#inhoud\"), headers=headers)\n",
- "# Parse the HTML content using BeautifulSoup\n",
- "soup_school = BeautifulSoup(response.text, 'html.parser')\n",
- "soup_school\n",
- "'''"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "\n",
- "df = pd.read_csv(\"scholenopdekaart.csv\", index_col=0)\n",
- "df.loc[0, \"category_3\"]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "import requests\n",
- "from bs4 import BeautifulSoup\n",
- "\n",
- "# Step 1: Fetch the webpage\n",
- "url = \"https://scholenopdekaart.nl/basisscholen/aagtekerke/25963/jhr-willem-versluijsschool/\"\n",
- "headers = {\n",
- " \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36\"\n",
- "}\n",
- "response = requests.get(url, headers=headers)\n",
- "\n",
- "# Check if the request was successful\n",
- "if response.status_code != 200:\n",
- " print(f\"Failed to retrieve the page. Status code: {response.status_code}\")\n",
- " exit()\n",
- "\n",
- "# Step 2: Parse the HTML content\n",
- "soup = BeautifulSoup(response.text, 'html.parser')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Aantal per Leerjaar:\n",
- "Groep 1: 29 leerlingen\n",
- "Groep 2: 28 leerlingen\n",
- "Groep 3: 30 leerlingen\n",
- "Groep 4: 25 leerlingen\n",
- "Groep 5: 19 leerlingen\n",
- "Groep 6: 26 leerlingen\n",
- "Groep 7: 22 leerlingen\n",
- "Groep 8: 20 leerlingen\n"
- ]
- }
- ],
- "source": [
- "import json\n",
- "\n",
- "# Step 1: Locate the tag\n",
- "chart_tag = soup.find('aantal-leerlingen-leerjaar-bar-chart', attrs={'aantal-per-leerjaar': True})\n",
- "\n",
- "if not chart_tag:\n",
- " print(\"Could not find the 'aantal per leerjaar' section.\")\n",
- "else:\n",
- " # Step 2: Extract the 'aantal-per-leerjaar' attribute\n",
- " raw_data = chart_tag['aantal-per-leerjaar']\n",
- " \n",
- " # Step 3: Parse the JSON data\n",
- " try:\n",
- " data = json.loads(raw_data)\n",
- " \n",
- " # Step 4: Print the extracted data\n",
- " print(\"Aantal per Leerjaar:\")\n",
- " for entry in data:\n",
- " print(f\"Groep {entry['key']}: {entry['aantal']} leerlingen\")\n",
- " except json.JSONDecodeError as e:\n",
- " print(f\"Failed to parse JSON data: {e}\")"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "matitos_urls",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.12.9"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/website/clickhouse/ipv4-only.xml b/website/clickhouse/ipv4-only.xml
deleted file mode 100644
index 7eb0a5c..0000000
--- a/website/clickhouse/ipv4-only.xml
+++ /dev/null
@@ -1,3 +0,0 @@
-
- 0.0.0.0
-
diff --git a/website/clickhouse/logs.xml b/website/clickhouse/logs.xml
deleted file mode 100644
index 3902fdd..0000000
--- a/website/clickhouse/logs.xml
+++ /dev/null
@@ -1,28 +0,0 @@
-
-
- warning
- true
-
-
-
- system
-
- 7500
-
- ENGINE = MergeTree
- PARTITION BY event_date
- ORDER BY (event_time)
- TTL event_date + interval 30 day
- SETTINGS ttl_only_drop_parts=1
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/website/clickhouse/low-resources.xml b/website/clickhouse/low-resources.xml
deleted file mode 100644
index a75e843..0000000
--- a/website/clickhouse/low-resources.xml
+++ /dev/null
@@ -1,23 +0,0 @@
-
-
-
- 524288000
-
-
-
-
- 1
-
- 8192
-
- 1
-
- 0
-
- 0
-
-
-
diff --git a/website/docker-compose.yml b/website/docker-compose.yml
deleted file mode 100644
index 54b230e..0000000
--- a/website/docker-compose.yml
+++ /dev/null
@@ -1,147 +0,0 @@
-services:
-
- ghost:
- image: ghost:5-alpine
- container_name: ghost
- restart: unless-stopped
- ports:
- - 2368 #- 8080:2368
- environment:
- # see https://ghost.org/docs/config/#configuration-options
- database__client: mysql
- database__connection__host: ghost_db
- database__connection__user: root
- database__connection__password: example
- database__connection__database: ghost
- url: https://news.matitos.org
- # contrary to the default mentioned in the linked documentation, this image defaults to NODE_ENV=production (so development mode needs to be explicitly specified if desired)
- #NODE_ENV: development
- volumes:
- - ./docker_data/ghost:/var/lib/ghost/content
- labels: # Reverse proxy sample
- - "traefik.enable=true"
- - "traefik.http.routers.news.rule=Host(`news.matitos.org`)"
- - "traefik.http.routers.news.entrypoints=websecure"
- - "traefik.http.routers.news.tls.certresolver=myresolvercd"
- - "traefik.http.services.news.loadbalancer.server.port=2368"
- networks:
- - default # This network
- - docker_default # Reverse proxy network
-
- ghost_db:
- image: mysql:8.0
- container_name: ghost_db
- restart: unless-stopped
- environment:
- MYSQL_ROOT_PASSWORD: example
- volumes:
- - ./docker_data/ghost_db:/var/lib/mysql
-
- plausible_db:
- image: postgres:16-alpine
- restart: unless-stopped
- container_name: plausible_db
- volumes:
- - ./docker_data/plausible_db_data:/var/lib/postgresql/data
- environment:
- - POSTGRES_PASSWORD=postgres
- healthcheck:
- test: ["CMD-SHELL", "pg_isready -U postgres"]
- start_period: 1m
-
- plausible_events_db:
- image: clickhouse/clickhouse-server:24.12-alpine
- restart: unless-stopped
- container_name: plausible_events_db
- volumes:
- - ./docker_data/event-data:/var/lib/clickhouse
- - ./docker_data/event-logs:/var/log/clickhouse-server
- - ./clickhouse/logs.xml:/etc/clickhouse-server/config.d/logs.xml:ro
- # This makes ClickHouse bind to IPv4 only, since Docker doesn't enable IPv6 in bridge networks by default.
- # Fixes "Listen [::]:9000 failed: Address family for hostname not supported" warnings.
- - ./clickhouse/ipv4-only.xml:/etc/clickhouse-server/config.d/ipv4-only.xml:ro
- # This makes ClickHouse consume less resources, which is useful for small setups.
- # https://clickhouse.com/docs/en/operations/tips#using-less-than-16gb-of-ram
- - ./clickhouse/low-resources.xml:/etc/clickhouse-server/config.d/low-resources.xml:ro
- ulimits:
- nofile:
- soft: 262144
- hard: 262144
- environment:
- - CLICKHOUSE_SKIP_USER_SETUP=1
- healthcheck:
- test: ["CMD-SHELL", "wget --no-verbose --tries=1 -O - http://127.0.0.1:8123/ping || exit 1"]
- start_period: 1m
-
- plausible:
- image: ghcr.io/plausible/community-edition:v3.0.1
- restart: unless-stopped
- container_name: plausible
- command: sh -c "/entrypoint.sh db createdb && /entrypoint.sh db migrate && /entrypoint.sh run"
- depends_on:
- plausible_db:
- condition: service_healthy
- plausible_events_db:
- condition: service_healthy
- #volumes:
- # - ./docker_data/plausible_data:/var/lib/plausible # https://github.com/plausible/community-edition/issues/163
- ulimits:
- nofile:
- soft: 65535
- hard: 65535
- ports:
- - 8000 # :8000
- environment:
- - TMPDIR=/var/lib/plausible/tmp
- # required: https://github.com/plausible/community-edition/wiki/configuration#required
- #- BASE_URL=${BASE_URL}
- #- SECRET_KEY_BASE=${SECRET_KEY_BASE}
- - BASE_URL=https://plausible.matitos.org
- - SECRET_KEY_BASE=KKfwEjeK3Xp6NdH7eCJ2szWliTueiB0vcCT4XpHvEE8ZHgvRg0Vle90wOrETQZoC
- # optional: https://github.com/plausible/community-edition/wiki/configuration#optional
- # registration: https://github.com/plausible/community-edition/wiki/configuration#registration
- - TOTP_VAULT_KEY
- - DISABLE_REGISTRATION
- - ENABLE_EMAIL_VERIFICATION
- # web: https://github.com/plausible/community-edition/wiki/configuration#web
- - HTTP_PORT=8000
- - HTTPS_PORT
- # databases: https://github.com/plausible/community-edition/wiki/configuration#database
- - DATABASE_URL
- - CLICKHOUSE_DATABASE_URL
- # Google: https://github.com/plausible/community-edition/wiki/configuration#google
- - GOOGLE_CLIENT_ID
- - GOOGLE_CLIENT_SECRET
- # geolocation: https://github.com/plausible/community-edition/wiki/configuration#ip-geolocation
- - IP_GEOLOCATION_DB
- - GEONAMES_SOURCE_FILE
- - MAXMIND_LICENSE_KEY
- - MAXMIND_EDITION
- # email: https://github.com/plausible/community-edition/wiki/configuration#email
- - MAILER_ADAPTER
- - MAILER_EMAIL
- - MAILER_NAME
- - SMTP_HOST_ADDR
- - SMTP_HOST_PORT
- - SMTP_USER_NAME
- - SMTP_USER_PWD
- - SMTP_HOST_SSL_ENABLED
- - POSTMARK_API_KEY
- - MAILGUN_API_KEY
- - MAILGUN_DOMAIN
- - MAILGUN_BASE_URI
- - MANDRILL_API_KEY
- - SENDGRID_API_KEY
- labels: # Reverse proxy sample
- - "traefik.enable=true"
- - "traefik.http.routers.plausible.rule=Host(`plausible.matitos.org`)"
- - "traefik.http.routers.plausible.entrypoints=websecure"
- - "traefik.http.routers.plausible.tls.certresolver=myresolvercd"
- - "traefik.http.services.plausible.loadbalancer.server.port=8000"
- networks:
- - default # This network
- - docker_default # Reverse proxy network
-
-networks:
- docker_default:
- external: true