From 32d01a2cd6971f8c1d80f8e22d4280cf69b7765e Mon Sep 17 00:00:00 2001 From: Luciano Gervasoni Date: Mon, 21 Apr 2025 18:51:12 +0200 Subject: [PATCH] FR code delivery --- .env | 38 +-- app_urls/init_data.json | 77 ++++-- app_urls/init_data_fr.json | 65 ------ app_urls/init_data_sca.json | 34 --- docker-compose-prod.yml | 123 ++++++++++ docker-compose.yml | 147 +++++++++--- utils/Schools-NL.ipynb | 335 --------------------------- website/clickhouse/ipv4-only.xml | 3 - website/clickhouse/logs.xml | 28 --- website/clickhouse/low-resources.xml | 23 -- website/docker-compose.yml | 147 ------------ 11 files changed, 319 insertions(+), 701 deletions(-) delete mode 100644 app_urls/init_data_fr.json delete mode 100644 app_urls/init_data_sca.json create mode 100644 docker-compose-prod.yml delete mode 100644 utils/Schools-NL.ipynb delete mode 100644 website/clickhouse/ipv4-only.xml delete mode 100644 website/clickhouse/logs.xml delete mode 100644 website/clickhouse/low-resources.xml delete mode 100644 website/docker-compose.yml diff --git a/.env b/.env index c97170f..02ac98b 100644 --- a/.env +++ b/.env @@ -1,23 +1,31 @@ -# Initialization -INITIALIZE_DB=true -DJANGO_SUPERUSER_USERNAME=matitos -DJANGO_SUPERUSER_PASSWORD=matitos -DJANGO_SUPERUSER_EMAIL=matitos@matitos.org +# Reverse proxy +TRAEFIK_MAIL=yourmail@protonmail.com +DUCKDNS_TOKEN= +DUCKDNS_SUBDOMAINS= # Reverse proxy -REVERSE_PROXY_URL=sample.url.com +OLLAMA_WEBUI_REVERSE_PROXY_URL=ollama.steep.duckdns.org +OLLAMA_REVERSE_PROXY_URL=ollamamodel.steep.duckdns.org +REVERSE_PROXY_URL=fetcher.steep.duckdns.org +DJANGO_ALLOWED_ORIGINS=https://fetcher.steep.duckdns.org # Reverse proxy + + +# Initialization +INITIALIZE_DB=true +DJANGO_SUPERUSER_USERNAME=steep +DJANGO_SUPERUSER_PASSWORD=steep +DJANGO_SUPERUSER_EMAIL=steep@steepnews.org # Django -DJANGO_ALLOWED_ORIGINS=https://sample.url.com # Reverse proxy DJANGO_ALLOWED_HOSTS=* # host1,host2 -DJANGO_SECRET_KEY=EtKpy7t84GvU4gBwX9z3xKPBXMS75IAV0dkzN7dXVUsMSqy6a5rjY6WNCw3CcRH5 +DJANGO_SECRET_KEY=EtKpy7t84GvU4gBwX9z3xKPBXMS75IAV0dkqN7dXVUsMSqy6a5rjY6WNCw3CcRH5 DJANGO_DEBUG=True PATH_LOGS_DIRECTORY=/opt/logs # Database -DB_NAME=matitos -DB_PASSWORD=supermatitos -DB_USER=supermatitos +DB_NAME=steep +DB_PASSWORD=supersteep +DB_USER=supersteep PATH_DB_DATA=. 
# Database: Django @@ -40,7 +48,7 @@ FETCHER_ERROR_URL_CACHE_TIME=172800 # Selenium SELENIUM_ENDPOINT=http://fetcher_app_selenium:80 -ENDPOINT_OLLAMA=https://ollamamodel.matitos.org +ENDPOINT_OLLAMA=http://ollama:11434 # APP: Selenium ARCH=amd64 # arm64, amd64 @@ -52,6 +60,6 @@ DEPLOY_CPUS=2 DEPLOY_RAM=4G # Ghost -GHOST_ADMIN_API_URL=https://news.matitos.org/ghost/api/admin/ -GHOST_ADMIN_API_KEY=67fffe1b8a57a80001ecec5b:59f580020c196f92e05e208d288702082f8edad6366e2b2c8940b54e41cc355a -PEXELS_API_KEY=Y6clJkY32eihf34ukX4JsINYu9lzxh3xDdNq2HMAmGwXp0a0tt6vr6S9 +GHOST_ADMIN_API_URL= +GHOST_ADMIN_API_KEY= +PEXELS_API_KEY= diff --git a/app_urls/init_data.json b/app_urls/init_data.json index 0ed7168..ec59b44 100644 --- a/app_urls/init_data.json +++ b/app_urls/init_data.json @@ -1,34 +1,65 @@ { "SEARCH": { "rss_feed": [ - "https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC", - "https://feeds.feedburner.com/breitbart", - "https://feeds.feedburner.com/zerohedge/feed", - "https://moxie.foxnews.com/google-publisher/latest.xml", - "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=15837362", - "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100727362" ], "url_host": [ - "missingkids.org/poster", - "missingkids.org/new-poster", - "breitbart.com", - "zerohedge.com", - "foxnews.com", - "cnbc.com" - ], + "johnpilger.com", + "lapenseeecologique.com", + "partage-le.com", + "reflets.info", + "rezo.net", + "consortiumnews.com", + "disclose.ngo/fr", + "energieetenvironnement.com", + "global-climat.com", + "slashdot.org", + "lesamisdebartleby.wordpress.com", + "lundi.am", + "lvsl.fr", + "moderndiplomacy.eu", + "mrmondialisation.org", + "ourfiniteworld.com", + "southfront.org", + "simplicius76.substack.com", + "smoothiex12.blogspot.com", + "theintercept.com", + "wikileaks.org", + "contretemps.eu", + "indianpunchline.com", + "investigaction.net/fr", + "notechmagazine.com", + "terrestres.org", + "truthdig.com", + "tass.com", + "bastamag.net", + "counterpunch.org", + "energy-daily.com", + "fakirpresse.info", + "geopoliticalmonitor.com", + "huffingtonpost.fr", + "legrandsoir.info", + "les-crises.fr", + "liberation.fr", + "maitre-eolas.fr", + "marianne.net", + "mediapart.fr", + "metaefficient.com", + "monde-diplomatique.fr", + "paulcraigroberts.org", + "politis.fr", + "reporterre.net", + "rue89.com", + "theguardian.com/international", + "treehugger.com", + "unz.com", + "voltairenet.org", + "wsws.org" + ], "keyword_search": [ - "child abuse" + "society collapse" ] }, "REGEX_PATTERN_STATUS_PRIORITY": [ - [".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50], - [".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75], - [".*foxnews\\.com\\/(video|category)\\/.*", "invalid", 75], - [".*radio.foxnews\\.com\\/.*", "invalid", 75], - [".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75], - [".*zerohedge\\.com\\/(user)\\/.*", "invalid", 75], - [".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50], - [".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50], - [".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50] + [".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50] ] } diff --git a/app_urls/init_data_fr.json b/app_urls/init_data_fr.json deleted file mode 100644 index ec59b44..0000000 --- a/app_urls/init_data_fr.json +++ /dev/null @@ -1,65 +0,0 @@ -{ - "SEARCH": { - "rss_feed": [ - ], - "url_host": [ - "johnpilger.com", - 
"lapenseeecologique.com", - "partage-le.com", - "reflets.info", - "rezo.net", - "consortiumnews.com", - "disclose.ngo/fr", - "energieetenvironnement.com", - "global-climat.com", - "slashdot.org", - "lesamisdebartleby.wordpress.com", - "lundi.am", - "lvsl.fr", - "moderndiplomacy.eu", - "mrmondialisation.org", - "ourfiniteworld.com", - "southfront.org", - "simplicius76.substack.com", - "smoothiex12.blogspot.com", - "theintercept.com", - "wikileaks.org", - "contretemps.eu", - "indianpunchline.com", - "investigaction.net/fr", - "notechmagazine.com", - "terrestres.org", - "truthdig.com", - "tass.com", - "bastamag.net", - "counterpunch.org", - "energy-daily.com", - "fakirpresse.info", - "geopoliticalmonitor.com", - "huffingtonpost.fr", - "legrandsoir.info", - "les-crises.fr", - "liberation.fr", - "maitre-eolas.fr", - "marianne.net", - "mediapart.fr", - "metaefficient.com", - "monde-diplomatique.fr", - "paulcraigroberts.org", - "politis.fr", - "reporterre.net", - "rue89.com", - "theguardian.com/international", - "treehugger.com", - "unz.com", - "voltairenet.org", - "wsws.org" - ], - "keyword_search": [ - "society collapse" - ] - }, - "REGEX_PATTERN_STATUS_PRIORITY": [ - [".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50] - ] -} diff --git a/app_urls/init_data_sca.json b/app_urls/init_data_sca.json deleted file mode 100644 index 0ed7168..0000000 --- a/app_urls/init_data_sca.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "SEARCH": { - "rss_feed": [ - "https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC", - "https://feeds.feedburner.com/breitbart", - "https://feeds.feedburner.com/zerohedge/feed", - "https://moxie.foxnews.com/google-publisher/latest.xml", - "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=15837362", - "https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100727362" - ], - "url_host": [ - "missingkids.org/poster", - "missingkids.org/new-poster", - "breitbart.com", - "zerohedge.com", - "foxnews.com", - "cnbc.com" - ], - "keyword_search": [ - "child abuse" - ] - }, - "REGEX_PATTERN_STATUS_PRIORITY": [ - [".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50], - [".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75], - [".*foxnews\\.com\\/(video|category)\\/.*", "invalid", 75], - [".*radio.foxnews\\.com\\/.*", "invalid", 75], - [".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75], - [".*zerohedge\\.com\\/(user)\\/.*", "invalid", 75], - [".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50], - [".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50], - [".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50] - ] -} diff --git a/docker-compose-prod.yml b/docker-compose-prod.yml new file mode 100644 index 0000000..3f32d6d --- /dev/null +++ b/docker-compose-prod.yml @@ -0,0 +1,123 @@ +version: '3.9' + +services: + + fetcher_app_selenium: + image: fetcher_app_selenium + build: + context: ./app_selenium + args: + - ARCH=${ARCH} # arm64, amd64 + container_name: fetcher_app_selenium + restart: unless-stopped + shm_size: 512mb + environment: + - SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE} + - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY} + ports: + - 80 + dns: + - 1.1.1.1 + - 1.0.0.1 + deploy: + resources: + limits: + cpus: '${DEPLOY_CPUS}' + memory: ${DEPLOY_RAM} + + fetcher_app_urls: + image: fetcher_app_urls + build: + context: ./app_urls + container_name: fetcher_app_urls + restart: unless-stopped + environment: 
+ # Initialization
+ - INITIALIZE_DB=${INITIALIZE_DB} # Related to DB persistence
+ - DJANGO_SUPERUSER_USERNAME=${DJANGO_SUPERUSER_USERNAME}
+ - DJANGO_SUPERUSER_PASSWORD=${DJANGO_SUPERUSER_PASSWORD}
+ - DJANGO_SUPERUSER_EMAIL=${DJANGO_SUPERUSER_EMAIL}
+ # Django
+ - DJANGO_ALLOWED_HOSTS=${DJANGO_ALLOWED_HOSTS} # host1,host2
+ - DJANGO_ALLOWED_ORIGINS=${DJANGO_ALLOWED_ORIGINS} # Reverse proxy
+ - DJANGO_SECRET_KEY=${DJANGO_SECRET_KEY}
+ - DJANGO_DEBUG=${DJANGO_DEBUG}
+ - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
+ # Database
+ - DB_NAME=${DB_NAME}
+ - DB_USER=${DB_USER}
+ - DB_PASSWORD=${DB_PASSWORD}
+ - DB_HOST=${DB_HOST}
+ - DB_PORT=${DB_PORT}
+ - REDIS_HOST=${REDIS_HOST}
+ - REDIS_PORT=${REDIS_PORT}
+ # Job timeout: 30 min
+ - JOB_DEFAULT_TIMEOUT=${JOB_DEFAULT_TIMEOUT}
+ # Fetcher
+ - FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP}
+ - FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP}
+ - FETCHER_BETWEEN_SEARCHES_SLEEP=${FETCHER_BETWEEN_SEARCHES_SLEEP} # Sleep time between each search
+ - FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP} # Sleep time between requests to same URL host
+ - FETCHER_LANGUAGE_DETECTION_MIN_CHAR=${FETCHER_LANGUAGE_DETECTION_MIN_CHAR} # Min amount of characters to run language detection
+ - FETCHER_INSERT_URL_CACHE_TIME=${FETCHER_INSERT_URL_CACHE_TIME} # Cache time: Insert raw URL
+ - FETCHER_ERROR_URL_CACHE_TIME=${FETCHER_ERROR_URL_CACHE_TIME} # Cache time: Error on processing URL
+ # Selenium
+ - SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT}
+ - ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA}
+ # Ghost
+ - GHOST_ADMIN_API_KEY=${GHOST_ADMIN_API_KEY}
+ - GHOST_ADMIN_API_URL=${GHOST_ADMIN_API_URL}
+ - PEXELS_API_KEY=${PEXELS_API_KEY}
+ ########################
+ #volumes: # Development mode
+ # - ./app_urls:/opt/app
+ ########################
+ ports:
+ - 8000 # :8000
+ depends_on:
+ - fetcher_db
+ - fetcher_redis
+ dns:
+ - 1.1.1.1
+ - 1.0.0.1
+ deploy:
+ resources:
+ limits:
+ cpus: '${DEPLOY_CPUS}'
+ memory: ${DEPLOY_RAM}
+ labels: # Reverse proxy sample
+ - "traefik.enable=true"
+ - "traefik.http.routers.fetcher.rule=Host(`${REVERSE_PROXY_URL}`)"
+ - "traefik.http.routers.fetcher.entrypoints=websecure"
+ - "traefik.http.routers.fetcher.tls.certresolver=myresolvercd"
+ - "traefik.http.services.fetcher.loadbalancer.server.port=8000"
+ networks:
+ - default # This network
+ - docker_default # Reverse proxy network
+
+ fetcher_db:
+ image: postgres:17
+ container_name: fetcher_db
+ restart: unless-stopped
+ # Set shared memory limit when using docker-compose
+ shm_size: 128mb
+ environment:
+ POSTGRES_DB: ${DB_NAME}
+ POSTGRES_PASSWORD: ${DB_PASSWORD}
+ POSTGRES_USER: ${DB_USER}
+ POSTGRES_INITDB_ARGS: '--data-checksums'
+ volumes: # Persistent DB?
+ - ${PATH_DB_DATA}/postgres:/var/lib/postgresql/data
+ ports:
+ - 5432 #:5432
+
+ fetcher_redis:
+ image: redis:alpine
+ container_name: fetcher_redis
+ restart: unless-stopped
+ ports:
+ - 6379 #:6379
+
+networks:
+ docker_default:
+ external: true
diff --git a/docker-compose.yml b/docker-compose.yml
index 3f32d6d..148a370 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -2,28 +2,126 @@ version: '3.9'

 services:

-  fetcher_app_selenium:
-    image: fetcher_app_selenium
-    build:
-      context: ./app_selenium
-      args:
-        - ARCH=${ARCH} # arm64, amd64
-    container_name: fetcher_app_selenium
+  duckdns:
+    image: lscr.io/linuxserver/duckdns:latest
+    container_name: duckdns
     restart: unless-stopped
-    shm_size: 512mb
     environment:
-      - SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE}
-      - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
+      - PUID=1000 #optional
+      - PGID=1000 #optional
+      - TZ=Europe/London
+      - SUBDOMAINS=${DUCKDNS_SUBDOMAINS}
+      - TOKEN=${DUCKDNS_TOKEN}
+      - LOG_FILE=true #optional
+    volumes:
+      - ${PATH_DB_DATA}/duckdns_config:/config
+
+  traefik:
+    image: "traefik:v3.3"
+    container_name: "traefik"
+    restart: unless-stopped
+    command:
+      - "--api.insecure=true"
+      - "--providers.docker=true"
+      - "--providers.docker.exposedbydefault=false"
+      # Logs for fail2ban
+      - "--log.level=INFO"
+      - "--accesslog=true"
+      - "--accesslog.filepath=/var/log/traefik/access.log"
+      # HTTPS
+      - "--entrypoints.websecure.address=:443"
+      # HTTPS -> Timeouts
+      - "--entrypoints.websecure.transport.respondingTimeouts.readTimeout=1200s"
+      - "--entrypoints.websecure.transport.respondingTimeouts.idleTimeout=1200s"
+      - "--entrypoints.websecure.transport.respondingTimeouts.writeTimeout=1200s"
+      # HTTP -> HTTPS
+      - "--entryPoints.web.address=:80"
+      - "--entrypoints.web.http.redirections.entryPoint.to=websecure"
+      # Let's Encrypt
+      - "--certificatesresolvers.myresolver.acme.email=${TRAEFIK_MAIL}"
+      - "--certificatesresolvers.myresolver.acme.storage=/letsencrypt/acme.json"
+      # TLS challenge to request new certificate
+      - "--certificatesresolvers.myresolver.acme.tlschallenge=true"
     ports:
-      - 80
-    dns:
-      - 1.1.1.1
-      - 1.0.0.1
-    deploy:
-      resources:
-        limits:
-          cpus: '${DEPLOY_CPUS}'
-          memory: ${DEPLOY_RAM}
+      - "80:80"
+      - "443:443"
+      - "8080:8080"
+    volumes:
+      - "${PATH_DB_DATA}/letsencrypt:/letsencrypt"
+      - "${PATH_DB_DATA}/traefik_logs:/var/log/traefik"
+      - "/var/run/docker.sock:/var/run/docker.sock:ro"
+
+
+  ollama:
+    image: ollama/ollama:latest
+    container_name: ollama
+    restart: unless-stopped
+    ports:
+      - '11434:11434'
+    volumes:
+      - ${PATH_DB_DATA}/ollama:/root/.ollama
+    #deploy:
+    #  resources:
+    #    limits:
+    #      memory: 6G
+    #      cpus: 6 # 75% of 8 cores
+    labels:
+      - "traefik.enable=true"
+      - "traefik.http.routers.ollama-secure.rule=Host(`${OLLAMA_REVERSE_PROXY_URL}`)"
+      - "traefik.http.routers.ollama-secure.entrypoints=websecure"
+      - "traefik.http.routers.ollama-secure.tls.certresolver=myresolver"
+      - "traefik.http.services.ollama.loadbalancer.server.port=11434"
+
+  ollama-webui:
+    image: ghcr.io/ollama-webui/ollama-webui:main
+    container_name: ollama-webui
+    restart: unless-stopped
+    ports:
+      - 8080 #:8080 (host port 8080 is already taken by the Traefik dashboard; reached via Traefik instead)
+    volumes:
+      - ${PATH_DB_DATA}/ollama-webui:/app/backend/data
+    depends_on:
+      - ollama
+    environment:
+      - 'OLLAMA_API_BASE_URL=http://ollama:11434/api'
+      - 'ENABLE_SIGNUP=false'
+      #- 'ENABLE_RAG_WEB_SEARCH=true'
+      #- 'RAG_WEB_SEARCH_ENGINE=brave'
+      #- 'ENABLE_IMAGE_GENERATION=true'
+      #- 'IMAGE_GENERATION_ENGINE=comfyui'
+      #- 'COMFYUI_BASE_URL=comfyui.matitos.org'
+      #- 'COMFYUI_API_KEY='
+      #- 'COMFYUI_WORKFLOW=' # 
https://docs.openwebui.com/getting-started/env-configuration#comfyui_workflow
+    labels:
+      - "traefik.enable=true"
+      - "traefik.http.routers.ollamawebui-secure.rule=Host(`${OLLAMA_WEBUI_REVERSE_PROXY_URL}`)"
+      - "traefik.http.routers.ollamawebui-secure.entrypoints=websecure"
+      - "traefik.http.routers.ollamawebui-secure.tls.certresolver=myresolver"
+      - "traefik.http.services.ollamawebui.loadbalancer.server.port=8080"
+
+
+  #fetcher_app_selenium:
+  #  image: fetcher_app_selenium
+  #  build:
+  #    context: ./app_selenium
+  #    args:
+  #      - ARCH=${ARCH} # arm64, amd64
+  #  container_name: fetcher_app_selenium
+  #  restart: unless-stopped
+  #  shm_size: 512mb
+  #  environment:
+  #    - SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE}
+  #    - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
+  #  ports:
+  #    - 80
+  #  dns:
+  #    - 1.1.1.1
+  #    - 1.0.0.1
+  #  deploy:
+  #    resources:
+  #      limits:
+  #        cpus: '${DEPLOY_CPUS}'
+  #        memory: ${DEPLOY_RAM}

   fetcher_app_urls:
     image: fetcher_app_urls
@@ -73,7 +171,7 @@ services:
     #  - ./app_urls:/opt/app
     ########################
     ports:
-      - 8000 # :8000
+      - 8000:8000
     depends_on:
       - fetcher_db
       - fetcher_redis
@@ -89,11 +187,8 @@ services:
       - "traefik.enable=true"
       - "traefik.http.routers.fetcher.rule=Host(`${REVERSE_PROXY_URL}`)"
       - "traefik.http.routers.fetcher.entrypoints=websecure"
-      - "traefik.http.routers.fetcher.tls.certresolver=myresolvercd"
+      - "traefik.http.routers.fetcher.tls.certresolver=myresolver"
       - "traefik.http.services.fetcher.loadbalancer.server.port=8000"
-    networks:
-      - default # This network
-      - docker_default # Reverse proxy network

   fetcher_db:
     image: postgres:17
@@ -117,7 +212,3 @@ services:
     restart: unless-stopped
     ports:
       - 6379 #:6379
-
-networks:
-  docker_default:
-    external: true
diff --git a/utils/Schools-NL.ipynb b/utils/Schools-NL.ipynb
deleted file mode 100644
index 3bee3dd..0000000
--- a/utils/Schools-NL.ipynb
+++ /dev/null
@@ -1,335 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import requests\n",
-    "from bs4 import BeautifulSoup\n",
-    "from urllib.parse import urljoin\n",
-    "import pandas as pd\n",
-    "import os\n",
-    "\n",
-    "headers = {\"User-Agent\": \"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36\"}"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Function to decode Cloudflare-protected emails\n",
-    "def decode_email(encoded_email):\n",
-    "    \"\"\"\n",
-    "    Decode an email protected by Cloudflare's email protection.\n",
-    "    :param encoded_email: The encoded email string from the data-cfemail attribute.\n",
-    "    :return: The decoded email address.\n",
-    "    \"\"\"\n",
-    "    email = \"\"\n",
-    "    key = int(encoded_email[:2], 16) # Extract the key (first two characters)\n",
-    "    for i in range(2, len(encoded_email), 2):\n",
-    "        # XOR each pair of hex characters with the key\n",
-    "        email += chr(int(encoded_email[i:i + 2], 16) ^ key)\n",
-    "    return email\n",
-    "\n",
-    "def extract_emails(soup):\n",
-    "    # Find all visible email links (mailto:)\n",
-    "    visible_emails = []\n",
-    "    for link in soup.find_all('a', href=lambda href: href and href.startswith('mailto:')):\n",
-    "        email = link['href'].replace('mailto:', '')\n",
-    "        visible_emails.append(email)\n",
-    "\n",
-    "    # Find all Cloudflare-protected emails\n",
-    "    protected_emails = []\n",
-    "    for span in soup.find_all('span', class_='__cf_email__', attrs={'data-cfemail': True}):\n",
-    "        encoded_email = 
span['data-cfemail']\n", - " decoded_email = decode_email(encoded_email)\n", - " protected_emails.append(decoded_email)\n", - "\n", - " # Combine all emails\n", - " all_emails = visible_emails + protected_emails\n", - " all_emails = list(set(all_emails))\n", - " if (len(all_emails) == 0):\n", - " return None\n", - " elif (len(all_emails) == 1):\n", - " return all_emails[0]\n", - " else:\n", - " return all_emails\n", - "\n", - "def find_website(soup_school):\n", - " # Find all tags with href attributes\n", - " for link in soup_school.find(class_=\"dl-horizontal dl-icons\").find_all('a', href=True):\n", - " href = link['href']\n", - " # Filter out only valid URLs (e.g., starting with http or https)\n", - " if href.startswith(('http://', 'https://')):\n", - " # websites.append(href)\n", - " return href\n", - "\n", - "\n", - "def main():\n", - " list_urls = [\n", - " \"https://scholenopdekaart.nl/Basisscholen/\",\n", - " \"https://scholenopdekaart.nl/middelbare-scholen/\"\n", - " ]\n", - "\n", - " list_school_data_dicts = []\n", - "\n", - " # For each category\n", - " for url in list_urls:\n", - " # Fetch the HTML content of the page\n", - " response = requests.get(url, headers=headers)\n", - " response.raise_for_status() # Raise an exception for HTTP errors\n", - " # Parse the HTML content using BeautifulSoup\n", - " soup = BeautifulSoup(response.text, 'html.parser')\n", - "\n", - " # Get category\n", - " category = url.strip(\"/\").split(\"/\")[-1].lower()\n", - "\n", - " # Find all tags with href attributes\n", - " links_areas = []\n", - " for a_tag in soup.find_all('a', href=True):\n", - " href = a_tag['href']\n", - " \n", - " if (category not in href):\n", - " continue\n", - " \n", - " # Convert relative URLs to absolute URLs\n", - " area_full_url = urljoin(url, href)\n", - " links_areas.append(area_full_url)\n", - "\n", - " # Area\n", - " area = href.rstrip(\"/\").split(\"/\")[-1]\n", - "\n", - " ###############################################\n", - " # Fetch the HTML content of the page\n", - " print(\".\", end=\"\")\n", - " response = requests.get(area_full_url, headers=headers)\n", - " response.raise_for_status() # Raise an exception for HTTP errors\n", - "\n", - " # Parse the HTML content using BeautifulSoup\n", - " soup_area= BeautifulSoup(response.text, 'html.parser')\n", - "\n", - " # Get schools in area\n", - " for a_tag in soup_area.find_all('a', href=True):\n", - " href = a_tag['href']\n", - "\n", - " school_url = urljoin(url, href)\n", - " if (area_full_url not in school_url):\n", - " continue\n", - " \n", - " school_name = a_tag.text.rstrip(\".\")\n", - " school_data = {\n", - " \"category\": category,\n", - " \"area\": area,\n", - " \"name\": school_name,\n", - " \"url\": school_url,\n", - " }\n", - "\n", - " try:\n", - " # Process school (request contact details)\n", - " response = requests.get(os.path.join(school_url, \"contact/#inhoud\"), headers=headers)\n", - " response.raise_for_status() # Raise an exception for HTTP errors\n", - "\n", - " # Parse the HTML content using BeautifulSoup\n", - " soup_school = BeautifulSoup(response.text, 'html.parser')\n", - "\n", - " # School details\n", - " school_details = soup_school.find(class_=\"school-details\")\n", - " for category_idx, li_detail in enumerate(school_details.find_all(\"li\")):\n", - " data = li_detail.find('span', class_='infotip-term')['data-dfn']\n", - " text = li_detail.get_text(strip=True)\n", - " # Set data\n", - " school_data[\"category_{}\".format(category_idx)] = text\n", - " 
school_data[\"category_{}_description\".format(category_idx)] = data\n", - " \n", - " school_address = soup_school.find(class_=\"school-adres\").get_text(strip=True)\n", - " school_postcode_city = soup_school.find(class_=\"school-postcode-woonplaats\").get_text(strip=True)\n", - " school_postcode = \"\".join(school_postcode_city.split(\" \")[:2])\n", - " school_city = \" \".join(school_postcode_city.split(\" \")[2:])\n", - "\n", - " school_data[\"city\"] = school_city\n", - " school_data[\"postcode\"] = school_postcode\n", - " school_data[\"address\"] = school_address\n", - "\n", - " try:\n", - " school_data[\"website\"] = find_website(soup_school) # soup_school.find(class_=\"button button-primary\").get('href')\n", - " except Exception as e:\n", - " pass\n", - " try:\n", - " school_data[\"phone\"] = soup_school.find('a', href=lambda href: href and href.startswith('tel:')).text\n", - " except Exception as e:\n", - " pass\n", - " try:\n", - " school_data[\"email\"] = extract_emails(soup_school)\n", - " except Exception as e:\n", - " pass\n", - " \n", - " except Exception as e:\n", - " print(school_url, str(e))\n", - " # assert False\n", - "\n", - " list_school_data_dicts.append(school_data)\n", - "\n", - " df = pd.DataFrame(list_school_data_dicts)\n", - " df.to_csv(\"scholenopdekaart.csv\")\n", - "\n", - "\"\"\" # Issues with URL:\n", - "https://scholenopdekaart.nl/middelbare-scholen/grave/1900/merletcollege-grave/\n", - "https://scholenopdekaart.nl/middelbare-scholen/lent/4386/citadel-college-locatie-griftdijk/\n", - "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/24527/montessori-college-k33-nijmegen/\n", - "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/26368/aventurijn-park-neerbosch/\n", - "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/26187/kandinsky-college-voor-lyceum-havo-mavo-vbo-lwoo/\n", - "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/1791/karel-de-grote-college/\n", - "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/2040/mondial-college-locatie-leuvensbroek/\n", - "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/2041/mondial-college-meeuwse-acker/\n", - "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/2036/stedelijk-gymnasium-nijmegen/\n", - "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/2038/stedelijke-scholengemeenschap-nijmegen/\n", - "https://scholenopdekaart.nl/middelbare-scholen/nijmegen/26184/yuverta-vmbo-het-groene-lyceum-nijmegen/\n", - "https://scholenopdekaart.nl/middelbare-scholen/oss/23719/het-hooghuis-locatie-mondriaan-college/\n", - "https://scholenopdekaart.nl/middelbare-scholen/oss/943/het-hooghuis-locatie-oss-stadion/\n", - "https://scholenopdekaart.nl/middelbare-scholen/oss/947/het-hooghuis-zuidwest-gebouw-west/\n", - "https://scholenopdekaart.nl/middelbare-scholen/oss/946/het-hooghuis-zuidwest-gebouw-zuid/\n", - "https://scholenopdekaart.nl/middelbare-scholen/oss/1929/het-maaslandcollege-scholengemeenschap-voor-tweetalig-mavo-havo-vwo/\n", - "https://scholenopdekaart.nl/middelbare-scholen/oss/25783/sonnewijser-unit-route-arbeid/\n", - "https://scholenopdekaart.nl/middelbare-scholen/oss/11432/sonnewijser-unit-vervolgonderwijs-oss/\n", - "https://scholenopdekaart.nl/middelbare-scholen/oss/942/titus-brandsmalyceum/\n", - "https://scholenopdekaart.nl/middelbare-scholen/velp-noord-brabant/24545/merletcollege-eerste-opvang-anderstaligen-eoa/\n", - "https://scholenopdekaart.nl/middelbare-scholen/wijchen/2018/maaswaal-college-havo-atheneum-gymnasium/\n", - 
"https://scholenopdekaart.nl/middelbare-scholen/wijchen/2020/maaswaal-college-vmbo-basis-kader-mavo/\n", - "https://scholenopdekaart.nl/middelbare-scholen/wijchen/1781/pro-college-wijchen/\n", - "\"\"\"\n", - "\n", - "if __name__ == \"__main__\":\n", - " main()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "'''\n", - "school_url = \"https://scholenopdekaart.nl/basisscholen/aalden/9661/christelijke-basisschool-de-schutse/\"\n", - "response = requests.get(os.path.join(school_url, \"contact/#inhoud\"), headers=headers)\n", - "# Parse the HTML content using BeautifulSoup\n", - "soup_school = BeautifulSoup(response.text, 'html.parser')\n", - "soup_school\n", - "'''" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "df = pd.read_csv(\"scholenopdekaart.csv\", index_col=0)\n", - "df.loc[0, \"category_3\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import requests\n", - "from bs4 import BeautifulSoup\n", - "\n", - "# Step 1: Fetch the webpage\n", - "url = \"https://scholenopdekaart.nl/basisscholen/aagtekerke/25963/jhr-willem-versluijsschool/\"\n", - "headers = {\n", - " \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36\"\n", - "}\n", - "response = requests.get(url, headers=headers)\n", - "\n", - "# Check if the request was successful\n", - "if response.status_code != 200:\n", - " print(f\"Failed to retrieve the page. Status code: {response.status_code}\")\n", - " exit()\n", - "\n", - "# Step 2: Parse the HTML content\n", - "soup = BeautifulSoup(response.text, 'html.parser')" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Aantal per Leerjaar:\n", - "Groep 1: 29 leerlingen\n", - "Groep 2: 28 leerlingen\n", - "Groep 3: 30 leerlingen\n", - "Groep 4: 25 leerlingen\n", - "Groep 5: 19 leerlingen\n", - "Groep 6: 26 leerlingen\n", - "Groep 7: 22 leerlingen\n", - "Groep 8: 20 leerlingen\n" - ] - } - ], - "source": [ - "import json\n", - "\n", - "# Step 1: Locate the tag\n", - "chart_tag = soup.find('aantal-leerlingen-leerjaar-bar-chart', attrs={'aantal-per-leerjaar': True})\n", - "\n", - "if not chart_tag:\n", - " print(\"Could not find the 'aantal per leerjaar' section.\")\n", - "else:\n", - " # Step 2: Extract the 'aantal-per-leerjaar' attribute\n", - " raw_data = chart_tag['aantal-per-leerjaar']\n", - " \n", - " # Step 3: Parse the JSON data\n", - " try:\n", - " data = json.loads(raw_data)\n", - " \n", - " # Step 4: Print the extracted data\n", - " print(\"Aantal per Leerjaar:\")\n", - " for entry in data:\n", - " print(f\"Groep {entry['key']}: {entry['aantal']} leerlingen\")\n", - " except json.JSONDecodeError as e:\n", - " print(f\"Failed to parse JSON data: {e}\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "matitos_urls", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.9" - } - }, - "nbformat": 4, - 
"nbformat_minor": 2 -} diff --git a/website/clickhouse/ipv4-only.xml b/website/clickhouse/ipv4-only.xml deleted file mode 100644 index 7eb0a5c..0000000 --- a/website/clickhouse/ipv4-only.xml +++ /dev/null @@ -1,3 +0,0 @@ - - 0.0.0.0 - diff --git a/website/clickhouse/logs.xml b/website/clickhouse/logs.xml deleted file mode 100644 index 3902fdd..0000000 --- a/website/clickhouse/logs.xml +++ /dev/null @@ -1,28 +0,0 @@ - - - warning - true - - - - system - query_log
- 7500 - - ENGINE = MergeTree - PARTITION BY event_date - ORDER BY (event_time) - TTL event_date + interval 30 day - SETTINGS ttl_only_drop_parts=1 - -
- - - - - - - - - -
diff --git a/website/clickhouse/low-resources.xml b/website/clickhouse/low-resources.xml
deleted file mode 100644
index a75e843..0000000
--- a/website/clickhouse/low-resources.xml
+++ /dev/null
@@ -1,23 +0,0 @@
-<clickhouse>
-    <!-- https://clickhouse.com/docs/en/operations/tips#using-less-than-16gb-of-ram -->
-    <max_server_memory_usage>524288000</max_server_memory_usage>
-
-    <profiles>
-        <default>
-            <max_threads>1</max_threads>
-
-            <max_block_size>8192</max_block_size>
-
-            <max_download_threads>1</max_download_threads>
-
-            <input_format_parallel_parsing>0</input_format_parallel_parsing>
-
-            <output_format_parallel_formatting>0</output_format_parallel_formatting>
-        </default>
-    </profiles>
-</clickhouse>
diff --git a/website/docker-compose.yml b/website/docker-compose.yml
deleted file mode 100644
index 54b230e..0000000
--- a/website/docker-compose.yml
+++ /dev/null
@@ -1,147 +0,0 @@
-services:
-
-  ghost:
-    image: ghost:5-alpine
-    container_name: ghost
-    restart: unless-stopped
-    ports:
-      - 2368 #- 8080:2368
-    environment:
-      # see https://ghost.org/docs/config/#configuration-options
-      database__client: mysql
-      database__connection__host: ghost_db
-      database__connection__user: root
-      database__connection__password: example
-      database__connection__database: ghost
-      url: https://news.matitos.org
-      # contrary to the default mentioned in the linked documentation, this image defaults to NODE_ENV=production (so development mode needs to be explicitly specified if desired)
-      #NODE_ENV: development
-    volumes:
-      - ./docker_data/ghost:/var/lib/ghost/content
-    labels: # Reverse proxy sample
-      - "traefik.enable=true"
-      - "traefik.http.routers.news.rule=Host(`news.matitos.org`)"
-      - "traefik.http.routers.news.entrypoints=websecure"
-      - "traefik.http.routers.news.tls.certresolver=myresolvercd"
-      - "traefik.http.services.news.loadbalancer.server.port=2368"
-    networks:
-      - default # This network
-      - docker_default # Reverse proxy network
-
-  ghost_db:
-    image: mysql:8.0
-    container_name: ghost_db
-    restart: unless-stopped
-    environment:
-      MYSQL_ROOT_PASSWORD: example
-    volumes:
-      - ./docker_data/ghost_db:/var/lib/mysql
-
-  plausible_db:
-    image: postgres:16-alpine
-    restart: unless-stopped
-    container_name: plausible_db
-    volumes:
-      - ./docker_data/plausible_db_data:/var/lib/postgresql/data
-    environment:
-      - POSTGRES_PASSWORD=postgres
-    healthcheck:
-      test: ["CMD-SHELL", "pg_isready -U postgres"]
-      start_period: 1m
-
-  plausible_events_db:
-    image: clickhouse/clickhouse-server:24.12-alpine
-    restart: unless-stopped
-    container_name: plausible_events_db
-    volumes:
-      - ./docker_data/event-data:/var/lib/clickhouse
-      - ./docker_data/event-logs:/var/log/clickhouse-server
-      - ./clickhouse/logs.xml:/etc/clickhouse-server/config.d/logs.xml:ro
-      # This makes ClickHouse bind to IPv4 only, since Docker doesn't enable IPv6 in bridge networks by default.
-      # Fixes "Listen [::]:9000 failed: Address family for hostname not supported" warnings.
-      - ./clickhouse/ipv4-only.xml:/etc/clickhouse-server/config.d/ipv4-only.xml:ro
-      # This makes ClickHouse consume less resources, which is useful for small setups.
- # https://clickhouse.com/docs/en/operations/tips#using-less-than-16gb-of-ram - - ./clickhouse/low-resources.xml:/etc/clickhouse-server/config.d/low-resources.xml:ro - ulimits: - nofile: - soft: 262144 - hard: 262144 - environment: - - CLICKHOUSE_SKIP_USER_SETUP=1 - healthcheck: - test: ["CMD-SHELL", "wget --no-verbose --tries=1 -O - http://127.0.0.1:8123/ping || exit 1"] - start_period: 1m - - plausible: - image: ghcr.io/plausible/community-edition:v3.0.1 - restart: unless-stopped - container_name: plausible - command: sh -c "/entrypoint.sh db createdb && /entrypoint.sh db migrate && /entrypoint.sh run" - depends_on: - plausible_db: - condition: service_healthy - plausible_events_db: - condition: service_healthy - #volumes: - # - ./docker_data/plausible_data:/var/lib/plausible # https://github.com/plausible/community-edition/issues/163 - ulimits: - nofile: - soft: 65535 - hard: 65535 - ports: - - 8000 # :8000 - environment: - - TMPDIR=/var/lib/plausible/tmp - # required: https://github.com/plausible/community-edition/wiki/configuration#required - #- BASE_URL=${BASE_URL} - #- SECRET_KEY_BASE=${SECRET_KEY_BASE} - - BASE_URL=https://plausible.matitos.org - - SECRET_KEY_BASE=KKfwEjeK3Xp6NdH7eCJ2szWliTueiB0vcCT4XpHvEE8ZHgvRg0Vle90wOrETQZoC - # optional: https://github.com/plausible/community-edition/wiki/configuration#optional - # registration: https://github.com/plausible/community-edition/wiki/configuration#registration - - TOTP_VAULT_KEY - - DISABLE_REGISTRATION - - ENABLE_EMAIL_VERIFICATION - # web: https://github.com/plausible/community-edition/wiki/configuration#web - - HTTP_PORT=8000 - - HTTPS_PORT - # databases: https://github.com/plausible/community-edition/wiki/configuration#database - - DATABASE_URL - - CLICKHOUSE_DATABASE_URL - # Google: https://github.com/plausible/community-edition/wiki/configuration#google - - GOOGLE_CLIENT_ID - - GOOGLE_CLIENT_SECRET - # geolocation: https://github.com/plausible/community-edition/wiki/configuration#ip-geolocation - - IP_GEOLOCATION_DB - - GEONAMES_SOURCE_FILE - - MAXMIND_LICENSE_KEY - - MAXMIND_EDITION - # email: https://github.com/plausible/community-edition/wiki/configuration#email - - MAILER_ADAPTER - - MAILER_EMAIL - - MAILER_NAME - - SMTP_HOST_ADDR - - SMTP_HOST_PORT - - SMTP_USER_NAME - - SMTP_USER_PWD - - SMTP_HOST_SSL_ENABLED - - POSTMARK_API_KEY - - MAILGUN_API_KEY - - MAILGUN_DOMAIN - - MAILGUN_BASE_URI - - MANDRILL_API_KEY - - SENDGRID_API_KEY - labels: # Reverse proxy sample - - "traefik.enable=true" - - "traefik.http.routers.plausible.rule=Host(`plausible.matitos.org`)" - - "traefik.http.routers.plausible.entrypoints=websecure" - - "traefik.http.routers.plausible.tls.certresolver=myresolvercd" - - "traefik.http.services.plausible.loadbalancer.server.port=8000" - networks: - - default # This network - - docker_default # Reverse proxy network - -networks: - docker_default: - external: true
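
Notes on the delivered configuration follow, with short usage sketches.

The fetcher app now talks to the bundled Ollama container (ENDPOINT_OLLAMA=http://ollama:11434) instead of the old public ollamamodel endpoint. A minimal sketch of a call against Ollama's /api/generate REST endpoint; the model name and the prompt are placeholders, nothing in this patch pins them down:

    import os
    import requests

    # ENDPOINT_OLLAMA is injected via .env / docker-compose.
    OLLAMA = os.environ.get("ENDPOINT_OLLAMA", "http://ollama:11434")

    def generate(prompt: str, model: str = "llama3") -> str:
        # "llama3" is illustrative; use whatever model was pulled into
        # the ${PATH_DB_DATA}/ollama volume beforehand.
        resp = requests.post(
            f"{OLLAMA}/api/generate",
            json={"model": model, "prompt": prompt, "stream": False},
            timeout=1200,  # in line with the 1200s responding timeouts set on Traefik
        )
        resp.raise_for_status()
        return resp.json()["response"]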
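The duckdns service keeps ${DUCKDNS_SUBDOMAINS} pointed at the host's public IP. For reference, the same update the linuxserver/duckdns container performs on a timer can be triggered by hand against the DuckDNS HTTP API:

    import os
    import requests

    params = {
        "domains": os.environ["DUCKDNS_SUBDOMAINS"],  # comma-separated, without ".duckdns.org"
        "token": os.environ["DUCKDNS_TOKEN"],
        "ip": "",  # empty string lets DuckDNS use the caller's public IP
    }
    r = requests.get("https://www.duckdns.org/update", params=params, timeout=30)
    print(r.text)  # "OK" on success, "KO" otherwise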
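INITIALIZE_DB and the DJANGO_SUPERUSER_* variables drive first-run seeding inside fetcher_app_urls. The actual entrypoint is not part of this diff; below is a sketch of the kind of idempotent bootstrap these variables imply (the settings module name is a placeholder):

    import os
    import django

    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "project.settings")  # placeholder module
    django.setup()

    from django.contrib.auth import get_user_model

    # INITIALIZE_DB guards the seeding so a persisted database
    # (${PATH_DB_DATA}/postgres) is not re-seeded on every start.
    if os.environ.get("INITIALIZE_DB", "false").lower() == "true":
        User = get_user_model()
        username = os.environ["DJANGO_SUPERUSER_USERNAME"]
        if not User.objects.filter(username=username).exists():
            User.objects.create_superuser(
                username=username,
                email=os.environ["DJANGO_SUPERUSER_EMAIL"],
                password=os.environ["DJANGO_SUPERUSER_PASSWORD"],
            )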
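app_urls/init_data.json now seeds a single catch-all rule in REGEX_PATTERN_STATUS_PRIORITY; each row is [pattern, status, priority]. How app_urls consumes these rows is outside this diff, but the shape suggests a highest-priority-match-wins classification, roughly:

    import json
    import re

    with open("app_urls/init_data.json") as f:
        rules = json.load(f)["REGEX_PATTERN_STATUS_PRIORITY"]

    def classify(url: str) -> str:
        # Illustrative only: pick the matching rule with the highest priority.
        best = None
        for pattern, status, priority in rules:
            if re.match(pattern, url) and (best is None or priority > best[1]):
                best = (status, priority)
        return best[0] if best else "unknown"

    print(classify("https://www.youtube.com/watch?v=abc"))  # -> invalid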
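REDIS_HOST/REDIS_PORT and JOB_DEFAULT_TIMEOUT (30 min) pair up on the job-queue side. Assuming the app enqueues work with rq (the wiring itself is not visible in this patch), the timeout would apply along these lines:

    import os
    from redis import Redis
    from rq import Queue

    redis_conn = Redis(
        host=os.environ.get("REDIS_HOST", "fetcher_redis"),
        port=int(os.environ.get("REDIS_PORT", 6379)),
    )
    # default_timeout caps job runtime, in seconds.
    queue = Queue(
        connection=redis_conn,
        default_timeout=int(os.environ.get("JOB_DEFAULT_TIMEOUT", 1800)),
    )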