82 Commits

Author SHA1 Message Date
Matitos
dc56cc3365 Read only fetcher urls app 2025-08-27 12:55:00 +02:00
Luciano Gervasoni
4883b097db Disable missing kids all urls check 2025-08-26 10:16:18 +02:00
Luciano Gervasoni
a0ced90d7c Views base fix 2025-08-22 13:26:14 +02:00
Luciano Gervasoni
883dfcd3bd URL redirect get before newspaper processing 2025-08-22 13:11:02 +02:00
Luciano Gervasoni
9b0a84c16a Books category 2025-08-18 16:39:26 +02:00
Luciano Gervasoni
b08ea574d9 Notify status task 2025-08-14 15:13:18 +02:00
Luciano Gervasoni
da078a6f0f Notify status task 2025-08-14 15:09:20 +02:00
Luciano Gervasoni
4ccff2bc02 Notify status task 2025-08-14 15:06:53 +02:00
Luciano Gervasoni
ffb0f85475 View fix (10) 2025-08-14 14:58:16 +02:00
Luciano Gervasoni
c939624687 View fix (9) 2025-08-14 14:56:00 +02:00
Luciano Gervasoni
6fb14d5e72 View fix (8) 2025-08-14 14:53:56 +02:00
Luciano Gervasoni
260a505766 View fix (7) 2025-08-14 14:50:03 +02:00
Luciano Gervasoni
bae5329b1e View fix (6) 2025-08-14 14:48:03 +02:00
Luciano Gervasoni
b3d63f820e View fix (5) 2025-08-14 14:28:52 +02:00
Luciano Gervasoni
1fbf4cf3d4 View fix (4) 2025-08-14 14:25:56 +02:00
Luciano Gervasoni
856a9e7562 View fix (3) 2025-08-14 14:24:03 +02:00
Luciano Gervasoni
4080154f2b View fix (2) 2025-08-14 14:12:57 +02:00
Luciano Gervasoni
015f92a06b View fix 2025-08-14 14:04:33 +02:00
Luciano Gervasoni
3d09c1acff Notify status 2025-08-14 13:56:06 +02:00
Luciano Gervasoni
02f756d3c2 Ride missing kids exception 2025-08-14 10:59:19 +02:00
Luciano Gervasoni
6b5073d1b6 Pattern matching, foxnews request with header 2025-08-13 14:29:44 +02:00
Luciano Gervasoni
e3d6cf8000 Pattern matching, foxnews request with header 2025-08-13 14:12:54 +02:00
Luciano Gervasoni
30c586d49a Selenium docker psutil 2025-08-01 20:49:28 +02:00
Luciano Gervasoni
1502f09e22 Selenium kill process to release mem, supervisor conf rotate log file 2025-07-28 11:16:15 +02:00
Luciano Gervasoni
54e41139bb Duckduckgo search update 2025-07-22 22:53:53 +02:00
Luciano Gervasoni
b112da8bd0 Supervisor based run 2025-07-22 00:51:09 +02:00
Luciano Gervasoni
cb621c9d6b Switching to django celery for workers 2025-07-17 22:29:06 +02:00
Luciano Gervasoni
50e8666162 Django tasks workers logger 2025-07-17 00:46:48 +02:00
Luciano Gervasoni
202e58776d Django tasks workers 2025-07-17 00:21:26 +02:00
Luciano Gervasoni
7a91fc1a87 Django tasks workers 2025-07-17 00:11:02 +02:00
Luciano Gervasoni
b2b853b32f Django tasks workers 2025-07-17 00:06:23 +02:00
Luciano Gervasoni
d5d80ade55 Django tasks workers 2025-07-17 00:05:04 +02:00
Luciano Gervasoni
60f021fc2d Logger for worker 2025-07-15 17:05:06 +02:00
Luciano Gervasoni
1dcf69ab08 Logger for worker 2025-07-15 16:58:48 +02:00
Luciano Gervasoni
b9ba0d8f3d Logger for worker 2025-07-15 16:51:22 +02:00
Luciano Gervasoni
06ded0b37d Worker params 2025-07-15 10:28:51 +02:00
Luciano Gervasoni
a38e2bc5d1 Worker logs 2025-07-15 10:04:13 +02:00
Luciano Gervasoni
5a33012a64 Workers fix 2 2025-07-15 01:07:10 +02:00
Luciano Gervasoni
9d79a4e5c4 Workers fix 2025-07-15 01:04:57 +02:00
Luciano Gervasoni
6612a50d13 Logger fix, env sample ram 2025-07-14 23:48:34 +02:00
Luciano Gervasoni
6c88759e7b Workers ttl 2025-07-14 23:37:48 +02:00
Luciano Gervasoni
623dfbf95a Tasks priorities 2025-07-10 14:36:19 +02:00
Luciano Gervasoni
0cb68a876b Logger parent pid 2025-07-10 14:19:57 +02:00
Luciano Gervasoni
fdc3263785 Django multi worker, logging pid 2025-07-10 13:08:37 +02:00
Luciano Gervasoni
da5dfe5314 Timeout adjust 2025-07-08 21:33:28 +02:00
Luciano Gervasoni
0fa4482711 Missing kids trigger types 2025-07-08 21:30:20 +02:00
Luciano Gervasoni
4985f09e56 Filters fix 2025-07-08 18:28:43 +02:00
Luciano Gervasoni
0cf61026e8 Selenium based fetch of different sources 2025-07-08 18:18:26 +02:00
Luciano Gervasoni
f729bd1cb2 Selenium control loop 2025-07-08 10:19:37 +02:00
Luciano Gervasoni
9083021674 Scheduled tasks priorities 2025-07-08 10:15:21 +02:00
Luciano Gervasoni
8d72d3af0c debugging 2025-07-08 10:11:07 +02:00
Luciano Gervasoni
75de046dd9 selenium app wip 2025-07-08 10:01:35 +02:00
Luciano Gervasoni
7fdd93d35d docker compose base prod 2025-07-08 09:48:17 +02:00
Luciano Gervasoni
522c1cb8b3 Missing kids selenium fixes 2025-07-08 09:43:40 +02:00
Luciano Gervasoni
e81a96f4bd typo missing kid verif 2025-07-07 17:07:04 +02:00
Luciano Gervasoni
dd8e71aaa3 Missing kid verify timeout handle 2025-07-07 16:51:55 +02:00
Luciano Gervasoni
8cf2b52325 Selenium based missing kid verify url fix (2) 2025-07-07 16:34:21 +02:00
Luciano Gervasoni
a8b236bac0 Selenium based missing kid verify url 2025-07-07 16:02:11 +02:00
Luciano Gervasoni
15035c108d Missing kids processing fix 2025-07-07 13:22:18 +02:00
Luciano Gervasoni
4c0dd70bc3 missing kids status code handling 2025-07-07 12:57:57 +02:00
Luciano Gervasoni
b559f8cd8c Django tasks scheduler 2025-07-04 18:52:56 +02:00
Luciano Gervasoni
737483db9f Tasks timeout 2025-07-04 18:51:09 +02:00
Luciano Gervasoni
d0ae91bf35 quot parser issue fx 2025-07-03 14:13:47 +02:00
Luciano Gervasoni
80f40e1a74 unquote google general search 2025-07-03 13:52:18 +02:00
Luciano Gervasoni
969e08e84a Status pattern match fox news person 2025-07-03 13:43:30 +02:00
Luciano Gervasoni
68b56eafea furl remove parameters on search results 2025-07-03 13:35:40 +02:00
Luciano Gervasoni
e657c3bee1 Zombie processes, quot parser issue 2025-07-03 10:56:48 +02:00
Luciano Gervasoni
8b689729bf Docker and deployment to fetcher server 2025-06-27 09:14:44 +02:00
Luciano Gervasoni
f659d4adb3 Scheduled tasks interval, env vars, view fix 2025-06-20 09:59:27 +02:00
Luciano Gervasoni
03a2949b2b django tasks scheduler update, .env and docker compose towards fetcher sca 2025-06-20 00:35:48 +02:00
Luciano Gervasoni
490f01d66c Unknown instead of error for fetched urls 2025-06-19 22:43:29 +02:00
Luciano Gervasoni
a2cce62096 CV app docker fix compose 2025-04-30 22:32:52 +02:00
Luciano Gervasoni
aa7aca3e66 CV app docker fix 2025-04-30 22:31:24 +02:00
Luciano Gervasoni
d7df5b4ea4 CV app with fastapi, web nicegui based 2025-04-30 18:41:35 +02:00
Luciano Gervasoni
ccfd0f9188 Schools NL, Ghost post utils, nude + age detection 2025-04-30 15:50:54 +02:00
Luciano Gervasoni
aa369d0458 Publish from filtered URLs option 2025-04-25 16:39:13 +02:00
Luciano Gervasoni
f59d16b3fc Publish with hidden tag, don't publish if url id already processed, typo 2025-04-24 16:53:52 +02:00
Luciano Gervasoni
b3f7cb255c Publish with hidden tag, don't publish if url id already processed 2025-04-24 16:47:14 +02:00
Luciano Gervasoni
b8fdcae5ec Temperature and seed LLM 2025-04-23 17:46:47 +02:00
Luciano Gervasoni
cf55c586f7 Publisher fix 2025-04-23 17:34:10 +02:00
Luciano Gervasoni
e5c574ba33 LLM refactor, NPU ollama based, publisher update json query to llm 2025-04-23 16:35:50 +02:00
Luciano Gervasoni
8ea3ec1bda Utils 2025-04-23 16:26:08 +02:00
58 changed files with 2822 additions and 892 deletions

View File

@@ -1,3 +1,7 @@
+# AutoSSH DB
+REMOTE_HOST=''
+REMOTE_USERNAME=''
 # Initialization
 INITIALIZE_DB=true
 DJANGO_SUPERUSER_USERNAME=matitos
@@ -18,9 +22,6 @@ PATH_LOGS_DIRECTORY=/opt/logs
 DB_NAME=matitos
 DB_PASSWORD=supermatitos
 DB_USER=supermatitos
-PATH_DB_DATA=.
-# Database: Django
 DB_HOST=fetcher_db
 DB_PORT=5432
 REDIS_HOST=fetcher_redis
@@ -40,18 +41,17 @@ FETCHER_ERROR_URL_CACHE_TIME=172800
 # Selenium
 SELENIUM_ENDPOINT=http://fetcher_app_selenium:80
-ENDPOINT_OLLAMA=https://ollamamodel.matitos.org
+# APP: Selenium
 ARCH=amd64 # arm64, amd64
 SELENIUM_SLEEP_PER_PAGE=4
+PATH_LOGS_DIRECTORY=/opt/logs
 # Deploy resources per App
 DEPLOY_CPUS=2
-DEPLOY_RAM=4G
+DEPLOY_RAM=3G
 # Ghost
 GHOST_ADMIN_API_URL=https://news.matitos.org/ghost/api/admin/
-GHOST_ADMIN_API_KEY=67fffe1b8a57a80001ecec5b:59f580020c196f92e05e208d288702082f8edad6366e2b2c8940b54e41cc355a
+GHOST_ADMIN_API_KEY=
 PEXELS_API_KEY=Y6clJkY32eihf34ukX4JsINYu9lzxh3xDdNq2HMAmGwXp0a0tt6vr6S9
+# Ollama
+ENDPOINT_OLLAMA=https://ollamamodelnpu.matitos.org
+OLLAMA_MODEL_DEFAULT=qwen2.5-instruct:3b

7
.gitignore vendored
View File

@@ -1,6 +1,11 @@
+.env
 __pycache__/
 *.pyc
 **/credentials.py
 logs/
 postgres/
 docker_data/
+**/*.pt
+**/*.pth
+**/*.tar
+**/*.onnx

View File

@@ -15,6 +15,8 @@
 - TODO: Proxy / VPN?
   - TooManyRequests, ...
 - TODO: Search per locale (nl-NL, fr-FR, en-GB)
+  - Fetch keyword search for selenium sources
 - URLs Processing -> Updates raw URLs
   - Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date
@@ -52,6 +54,10 @@
 * Dev mode
 ```
 docker compose -f docker-compose-dev.yml down -v
-docker compose -f docker-compose-dev.yml build --progress=plain
-docker compose -f docker-compose-dev.yml up
+docker compose -f docker-compose-dev.yml up --no-deps --build
 ```
+* Prod mode
+```
+docker compose down -v
+docker compose up -d --no-deps --build
+```

157
app_cv/Demo.ipynb Normal file
View File

@@ -0,0 +1,157 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import base64\n",
"import json\n",
"import requests\n",
"import io\n",
"import numpy as np\n",
"import PIL.Image\n",
"import cv2\n",
"from pprint import pprint\n",
"\n",
"def process_image(path_img):\n",
" with open(path_img, \"rb\") as image_file:\n",
" encoded_string = base64.b64encode(image_file.read()).decode('utf-8')\n",
" response = requests.post(\n",
" 'http://localhost:5000/process',\n",
" headers={'Content-Type': 'application/json'},\n",
" data=json.dumps({'image': encoded_string})\n",
" )\n",
" response_dict = response.json()\n",
" pprint(response_dict)\n",
" # Decode\n",
" image_bytes = base64.b64decode(response_dict.get(\"image_b64\"))\n",
" img_array = np.frombuffer(io.BytesIO(image_bytes).getvalue(), dtype=np.uint8)\n",
" img_bgr = cv2.imdecode(img_array, cv2.IMREAD_COLOR)\n",
" img_rgb = img_bgr[:, :, ::-1]\n",
" return img_rgb"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"path_img = \"imgs/img_1p.jpg\"\n",
"PIL.Image.fromarray( process_image(path_img) )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"path_img = \"imgs/img_nude.jpg\"\n",
"PIL.Image.fromarray( process_image(path_img) )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"'''\n",
"# !git clone https://github.com/wildchlamydia/mivolo\n",
"# !pip install ultralytics yt_dlp pandas scipy timm==0.8.13.dev0\n",
"# !pip install ./mivolo\n",
"\n",
"!python mivolo/demo.py \\\n",
" --input \"face_data/sample_image.jpg\" \\\n",
" --output \"output\" \\\n",
" --detector-weights \"mivolo/pretrained/yolov8x_person_face.pt\" \\\n",
" --checkpoint \"mivolo/pretrained/model_imdb_cross_person_4.22_99.46.pth.tar\" \\\n",
" --device \"cpu\" \\\n",
" --draw\n",
"'''\n",
"\n",
"'''\n",
"# !git clone https://github.com/Kartik-3004/facexformer.git\n",
"# !pip install huggingface_hub torch torchvision torchaudio opencv-python facenet_pytorch\n",
"from huggingface_hub import hf_hub_download\n",
"hf_hub_download(repo_id=\"kartiknarayan/facexformer\", filename=\"ckpts/model.pt\", local_dir=\"./facexformer\")\n",
"\n",
"!python facexformer/inference.py \\\n",
" --model_path facexformer/ckpts/model.pt \\\n",
" --image_path face_data/sample_image.jpg \\\n",
" --results_path face_data \\\n",
" --task parsing\n",
" x\n",
"!python facexformer/inference.py \\\n",
" --model_path facexformer/ckpts/model.pt \\\n",
" --image_path face_data/face.png \\\n",
" --results_path face_data \\\n",
" --task landmarks\n",
"\n",
"!python facexformer/inference.py \\\n",
" --model_path facexformer/ckpts/model.pt \\\n",
" --image_path face_data/face.png \\\n",
" --results_path face_data \\\n",
" --task headpose\n",
"\n",
"!python facexformer/inference.py \\\n",
" --model_path facexformer/ckpts/model.pt \\\n",
" --image_path face_data/face.png \\\n",
" --results_path face_data \\\n",
" --task attributes\n",
"\n",
"!python facexformer/inference.py \\\n",
" --model_path facexformer/ckpts/model.pt \\\n",
" --image_path face_data/face.png \\\n",
" --results_path face_data \\\n",
" --task age_gender_race\n",
"'''"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "matitos_cv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

27
app_cv/Dockerfile Normal file
View File

@@ -0,0 +1,27 @@
FROM python:3.12
WORKDIR /app
# LibGL for OpenCV
RUN apt-get update && apt-get install libgl1 -y
# Download models
RUN mkdir models
# https://github.com/wildchlamydia/mivolo
RUN curl "https://drive.usercontent.google.com/download?id=11i8pKctxz3wVkDBlWKvhYIh7kpVFXSZ4&confirm=xxx" -o models/model_imdb_cross_person_4.22_99.46.pth.tar
RUN curl "https://drive.usercontent.google.com/download?id=1CGNCkZQNj5WkP3rLpENWAOgrBQkUWRdw&confirm=xxx" -o models/yolov8x_person_face.pt
# https://github.com/notAI-tech/NudeNet
# Upload to an accessible link: https://github.com/notAI-tech/NudeNet/releases/download/v3.4-weights/640m.onnx
RUN curl "https://drive.usercontent.google.com/download?id=1lHTrW1rmYoYnMSUlhLwqFCW61-w2hvKX&confirm=xxx" -o models/640m.onnx
COPY . .
RUN pip install --no-cache-dir -r requirements.txt
RUN pip freeze
# CMD ["gunicorn", "--bind", "0.0.0.0:5000", "--workers", "2", "app:app"]
CMD ["uvicorn", "--host", "0.0.0.0", "--port", "5000", "--workers", "1", "--log-level", "info", "app:app"]
# docker build -t fetcher_cv .
# docker run --rm -p 5000:5000 fetcher_cv

36
app_cv/Server.ipynb Normal file
View File

@@ -0,0 +1,36 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!uvicorn app:app --workers 1 --log-level info --port 5001\n",
"#!uvicorn app:app --reload --log-level debug --port 8000\n",
"#!python app.py"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "matitos_cv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

76
app_cv/app.py Normal file
View File

@@ -0,0 +1,76 @@
from fastapi import FastAPI
from nicegui import ui, events, run
import base64
import io
import numpy as np
import cv2
import traceback
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s', handlers=[logging.StreamHandler()])
from cv_processor import process
from pydantic import BaseModel
class Item(BaseModel):
image: str # Base64
app = FastAPI()
# Define the NiceGUI UI components
@ui.page("/")
def main_page():
async def handle_upload(e: events.UploadEventArguments) -> None:
ui.notify('Processing...')
# Read content -> image
nparr = np.frombuffer(e.content.read(), np.uint8)
img_np_bgr = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
# Async process
results = await run.io_bound(process, img_np_bgr)
# Display
with ui.dialog() as dialog:
# Encode
retval, buffer = cv2.imencode('.png', results.get("image"))
img_buffer_encoded = base64.b64encode(buffer).decode('utf-8')
img_encoded = "data:image/png;base64,{}".format(img_buffer_encoded)
content = ui.image(img_encoded).props('fit=scale-down')
dialog.open()
ui.upload(on_upload=handle_upload, auto_upload=True, on_rejected=lambda: ui.notify('Rejected!')).props('accept=image').classes('max-w-full')
ui.run_with(app, title="CV")
@app.post('/process')
def process_image(item: Item):
logging.info("POST /process")
try:
image_data = item.image
if (image_data is None):
return {"error": "No image data provided"}
# Decode base64 string
image_bytes = base64.b64decode(image_data)
image_stream = io.BytesIO(image_bytes)
# Convert bytes to NumPy array
img_array = np.frombuffer(image_stream.getvalue(), dtype=np.uint8)
# Decode image using OpenCV
img_bgr = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
# Valid image
assert(img_bgr is not None)
# Process the image
results = process(img_bgr)
# Encode processed image to base64
_, buffer = cv2.imencode('.jpg', results.get("image"), [cv2.IMWRITE_JPEG_QUALITY, 100])
processed_image_base64 = base64.b64encode(buffer).decode('utf-8')
# Update image with base64 encoded
results["image_b64"] = processed_image_base64
# Pop image (not serializable)
results.pop("image")
return results
except Exception as e:
logging.warning("Exception: {}".format(traceback.format_exc()))
return {"error": traceback.format_exc()}

125
app_cv/cv_processor.py Normal file
View File

@@ -0,0 +1,125 @@
import cv2
import numpy as np
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s', handlers=[logging.StreamHandler()])
# Age
from mivolo.predictor import Predictor
import argparse
# Nudity
from nudenet import NudeDetector
class CV():
def __init__(self):
args = argparse.ArgumentParser()
args.add_argument("--device", type=str, default="cpu")
args.add_argument("--checkpoint", default="models/model_imdb_cross_person_4.22_99.46.pth.tar")
args.add_argument("--detector_weights", default="models/yolov8x_person_face.pt")
args.add_argument("--with-persons", action="store_true", default=False, help="If set model will run with persons, if available")
args.add_argument("--disable-faces", action="store_true", default=False, help="If set model will use only persons if available")
args.add_argument("--draw", action="store_true", default=False, help="If set, resulted images will be drawn")
args = args.parse_args([])
# Initialize
self.predictor_age = Predictor(args)
# Initialize
self.nude_detector = NudeDetector(model_path="models/640m.onnx", inference_resolution=640)
# detector = NudeDetector(model_path="downloaded_640m.onnx path", inference_resolution=640)
# https://github.com/notAI-tech/NudeNet?tab=readme-ov-file#available-models
# All labels list
self.nudity_all_labels = [
"FEMALE_GENITALIA_COVERED",
"FACE_FEMALE",
"BUTTOCKS_EXPOSED",
"FEMALE_BREAST_EXPOSED",
"FEMALE_GENITALIA_EXPOSED",
"MALE_BREAST_EXPOSED",
"ANUS_EXPOSED",
"FEET_EXPOSED",
"BELLY_COVERED",
"FEET_COVERED",
"ARMPITS_COVERED",
"ARMPITS_EXPOSED",
"FACE_MALE",
"BELLY_EXPOSED",
"MALE_GENITALIA_EXPOSED",
"ANUS_COVERED",
"FEMALE_BREAST_COVERED",
"BUTTOCKS_COVERED",
]
# Classes of interest
self.nudity_classes_of_interest = ["BUTTOCKS_EXPOSED", "FEMALE_BREAST_EXPOSED", "FEMALE_GENITALIA_EXPOSED", "ANUS_EXPOSED", "MALE_GENITALIA_EXPOSED"]
def _censor(self, image_bgr, detections):
# Copy original image
image_bgr_censored = image_bgr.copy()
for detection in detections:
box = detection["box"]
x, y, w, h = box[0], box[1], box[2], box[3]
# Change these pixels to pure black
image_bgr_censored[y : y + h, x : x + w] = (0, 0, 0)
return image_bgr_censored
def process_image(self, image_bgr):
###################################################################
# Predict
detected_objects, out_img = self.predictor_age.recognize(image_bgr)
logging.debug("#persons: {}, #faces: {}".format(detected_objects.n_persons, detected_objects.n_faces))
# Num faces and persons detected
detected_objects.n_faces, detected_objects.n_persons
# Association
detected_objects.associate_faces_with_persons()
# detected_objects.face_to_person_map
# {2: 1, 3: 0}
# detected_objects.ages
# [None, None, 27.18, 23.77]
age_predictions = [e for e in detected_objects.ages if e is not None]
# Crops of faces & persons
# crops = detected_objects.collect_crops(img)
any_minor_present = any([ a < 18 for a in detected_objects.ages if a is not None ])
###################################################################
###################################################################
# Predict
nude_detections = self.nude_detector.detect(image_bgr)
logging.debug("Nude detections: {}".format(nude_detections))
# Filter by classes of interest
nude_detections = [ detection for detection in nude_detections if detection["class"] in self.nudity_classes_of_interest ]
# Nude detections present?
any_nude_detection = len(nude_detections) > 0
###################################################################
###################################################################
# Censor image
censored_img_bgr = self._censor(image_bgr, nude_detections)
# Plot age predictions on censored image
output_image = detected_objects.plot(img=censored_img_bgr)
###################################################################
results = {
"any_minor_present": any_minor_present,
"any_nude_detection": any_nude_detection,
"nudity_detections": nude_detections,
"age_predictions": age_predictions,
"image": output_image,
}
return results
def process(img_bgr):
try:
logging.info("Processing image")
# Process
results = CV().process_image(img_bgr)
logging.info("Returning results")
return results
except Exception as e:
logging.warning("Error processing image: {}".format(str(e)))
return {}
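
Note (illustrative only, not part of this changeset): cv_processor is meant to be driven through its process() helper. A minimal local sketch, assuming the model weights from the Dockerfile are present under models/ and a test image exists at imgs/img_1p.jpg:

```
import cv2
from cv_processor import process

# Read a test image as BGR (OpenCV's default), as expected by process()
img_bgr = cv2.imread("imgs/img_1p.jpg")

# Returns a dict with "any_minor_present", "any_nude_detection",
# "nudity_detections", "age_predictions" and the annotated "image"
# (or an empty dict if processing fails)
results = process(img_bgr)
print(results.get("any_minor_present"), results.get("age_predictions"))
```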

23
app_cv/docker-compose.yml Normal file
View File

@@ -0,0 +1,23 @@
services:
matitos_cv:
build:
context: .
image: fetcher_app_cv
container_name: fetcher_app_cv
restart: unless-stopped
ports:
- 5000
environment:
- DEBUG_MODE=0
labels: # Reverse proxy sample
- "traefik.enable=true"
- "traefik.http.routers.cv.rule=Host(`cv.matitos.org`)"
- "traefik.http.routers.cv.entrypoints=websecure"
- "traefik.http.routers.cv.tls.certresolver=myresolvercd"
- "traefik.http.services.cv.loadbalancer.server.port=5000"
networks:
- docker_default # Reverse proxy network
networks:
docker_default:
external: true

BIN
app_cv/imgs/img_1p.jpg Normal file (binary file not shown; 35 KiB)

BIN
app_cv/imgs/img_nude.jpg Normal file (binary file not shown; 29 KiB)
7
app_cv/requirements.txt Normal file
View File

@@ -0,0 +1,7 @@
opencv-python
git+https://github.com/wildchlamydia/mivolo.git
nudenet>=3.4.2
torch==2.5
nicegui
fastapi
gunicorn

55
app_cv_face/ABC.ipynb Normal file
View File

@@ -0,0 +1,55 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Warning: Binary output can mess up your terminal. Use \"--output -\" to tell \n",
"Warning: curl to output it to your terminal anyway, or consider \"--output \n",
"Warning: <FILE>\" to save to a file.\n"
]
}
],
"source": [
"!curl https://api.missingkids.org/photographs/NCMC2049364c1.jpg"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# !pip install deepface\n",
"# !pip install tf-keras\n",
"from deepface import DeepFace"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "matitos_cv_face",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -49,7 +49,7 @@ RUN if [ "${ARCH}" = "amd64" ] ; then \
     && apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false $toolDeps \
     && rm -rf /var/lib/apt/lists/* /tmp/*
-RUN pip install --no-cache-dir selenium fastapi "uvicorn[standard]"
+RUN pip install --no-cache-dir selenium fastapi "uvicorn[standard]" psutil
 WORKDIR /opt/app
 COPY . /opt/app/

View File

@@ -1,5 +1,7 @@
 from fastapi import FastAPI
+from pydantic import BaseModel
 from missing_kids import MissingKidsFetcher
+from search import SearchFetcher
 from logger import get_logger
 logger = get_logger()
@@ -8,7 +10,44 @@ app = FastAPI()
 @app.get("/get_missing_kids/")
 def get_missing_kids(pages: int = -1):
     try:
+        logger.info("Get missing kids, #pages={}".format(pages))
         res = {"list_urls": MissingKidsFetcher().get_missing_kids_urls(first_n_pages=pages)}
     except Exception as e:
+        logger.warning("Exception: {}".format(str(e)), exc_info=True)
         res = {}
     return res
+class BodyVerifyMissingKid(BaseModel):
+    url: str
+@app.post("/verify_missing_kid/")
+def get_missing_kids(data: BodyVerifyMissingKid):
+    try:
+        logger.info("Verify missing kid, URL={}".format(data.url))
+        res = MissingKidsFetcher().verify_missing_kid_url(data.url)
+    except Exception as e:
+        logger.warning("Exception: {}".format(str(e)), exc_info=True)
+        res = {}
+    return res
+class BodyFetchSearch(BaseModel):
+    search: str
+@app.post("/fetch_search/")
+def fetch_search(data: BodyFetchSearch):
+    try:
+        # Initialize
+        search_fetcher, results = SearchFetcher(), {}
+        # Iterate
+        for source in search_fetcher.get_available_sources():
+            logger.info("Fetch based search source={} search={}".format(source, data.search))
+            # Fetch
+            results[source] = SearchFetcher().search(source, data.search)
+            # Empty?
+            if (len(results[source]) == 0):
+                results.pop(source)
+    except Exception as e:
+        logger.warning("Exception: {}".format(str(e)), exc_info=True)
+        results = {}
+    return results

View File

@@ -8,10 +8,10 @@ logs_directory = os.getenv("PATH_LOGS_DIRECTORY", "logs")
 os.makedirs(logs_directory, exist_ok=True)
 logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
-logger = logging.getLogger("app_selenium")
+logger = logging.getLogger("selenium")
-logger.setLevel(logging.DEBUG)
+logger.setLevel(logging.INFO)
-# To file log: INFO / WARNING / ERROR / CRITICAL
+# To file log: DEBUG / INFO / WARNING / ERROR / CRITICAL
 fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "debug.log"), mode="a", maxBytes=10000000, backupCount=1)
 fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
 fh.setLevel(logging.DEBUG)

View File

@@ -1,27 +1,85 @@
-from selenium import webdriver
+from utils import get_webdriver, kill_process_tree
 from selenium.webdriver.common.by import By
-from selenium.webdriver.firefox.options import Options
+from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.firefox.service import Service
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.common.exceptions import TimeoutException
 import time
 import os
 from logger import get_logger
 logger = get_logger()
-def get_webdriver():
-    options = Options()
-    options.add_argument('--headless') # Optional
-    options.binary_location = '/opt/firefox/firefox'
-    service = Service('/usr/local/bin/geckodriver')
-    driver = webdriver.Firefox(options=options, service=service)
-    return driver
 class MissingKidsFetcher():
     def __init__(self) -> None:
         pass
+    def verify_missing_kid_url(self, url):
+        def load_finished(driver):
+            # Find all <img> tags with src attributes. Extract src URLs
+            image_urls = [img.get_attribute("src") for img in driver.find_elements(By.XPATH, "//img[@src]")]
+            # If base64 image exists, loading finished
+            finished = any(["data:image/png;base64" in i for i in image_urls])
+            # logger.debug("Finished loading URL")
+            return finished
+        try:
+            # Initialize
+            logger.debug("Initializing driver")
+            driver, service = get_webdriver()
+            # Load URL
+            logger.debug("Get URL: {}".format(url))
+            driver.get(url)
+            # Wait for 404?
+            try:
+                WebDriverWait(driver, 2).until(EC.title_contains("404"))
+                logger.debug("WebDriverWait -> title contains 404")
+            except TimeoutException:
+                logger.debug("WebDriverWait timeout, no 404 appeared")
+            if ("404" in driver.title):
+                # Status invalid
+                results = {"status": "invalid"}
+            else:
+                # Check until finished loading
+                num_checks = 10
+                while (not load_finished(driver)) and (num_checks>=0):
+                    time.sleep(1)
+                    num_checks -= 1
+                # Find all <img> tags with src attributes. Extract src URLs
+                image_urls = [img.get_attribute("src") for img in driver.find_elements(By.XPATH, "//img[@src]")]
+                # Redirects to 404?
+                if ("missingkids.org/404" in driver.current_url) or (any(["thumb-404.png" in i for i in image_urls])):
+                    # Status invalid
+                    results = {"status": "invalid"}
+                # Redirection to valid URL? -> Duplicate
+                elif (driver.current_url != url):
+                    # Redirection (duplicate)
+                    results = {"status": "duplicate", "redirection": driver.current_url}
+                # Valid
+                elif ("Have you seen this child?" in driver.title):
+                    # Status valid
+                    results = {"status": "valid"}
+                else:
+                    results = {"status": "unknown"}
+        except Exception as e:
+            logger.warning("Exception while verifying MissingKid URL {}\n{}".format(url, str(e)), exc_info=True)
+            results = {}
+        # Release memory
+        try:
+            driver.quit() #driver.close()
+            time.sleep(1)
+            # import atexit
+            # atexit.register(driver.quit) # Will always be called on exit
+        except Exception as e:
+            logger.warning("Exception while closing/quitting driver: {}".format(str(e)), exc_info=True)
+        kill_process_tree(service.process.pid)
+        logger.info("Results: {} for URL: {}".format(str(results), url))
+        return results
     def get_missing_kids_urls(self, first_n_pages=-1):
         logger.info("Get MissingKids, #pages: {}".format(first_n_pages))
         # Poster URL
@@ -30,7 +88,9 @@ class MissingKidsFetcher():
         set_urls = set()
         try:
-            driver = get_webdriver()
+            logger.debug("Initializing driver")
+            driver, service = get_webdriver()
+            logger.debug("Get URL: {}".format(url))
             # Go to URL
             driver.get(url)
             # Iterate
@@ -88,8 +148,12 @@ class MissingKidsFetcher():
         # Release memory
         try:
-            driver.close()
+            driver.quit() #driver.close()
+            time.sleep(1)
+            # import atexit
+            # atexit.register(driver.quit) # Will always be called on exit
         except Exception as e:
-            logger.warning("Exception while closing driver: {}".format(str(e)), exc_info=True)
+            logger.warning("Exception while closing/quitting driver: {}".format(str(e)), exc_info=True)
+        kill_process_tree(service.process.pid)
         return set_urls

115
app_selenium/search.py Normal file
View File

@@ -0,0 +1,115 @@
from utils import get_webdriver, kill_process_tree
from selenium.webdriver.common.by import By
from urllib.parse import quote
import time
from logger import get_logger
logger = get_logger()
class SearchFetcher():
def __init__(self):
pass
def get_available_sources(self, ):
return ["foxnews", "breitbart", "zerohedge"]
def search(self, source, search="child abuse"):
try:
if (source == "foxnews"):
return self._search_foxnews(search)
elif (source == "breitbart"):
return self._search_breitbart(search)
elif (source == "zerohedge"):
return self._search_zerohedge(search)
else:
logger.warning("Search not implemented for source={} search={}".format(source, search))
return []
except Exception as e:
logger.warning("Error searching for source={} search={}".format(source, search))
return []
def _search_foxnews(self, search):
url_host = "foxnews.com"
# URL search
url_unquoted = "https://www.foxnews.com/search-results/search#q={}".format(search)
url = quote(url_unquoted, safe=":/?=&#")
# Initialize
driver, service = get_webdriver()
# Load URL
driver.get(url)
time.sleep(2)
# Find the element with class "page"
page_element = driver.find_element(By.CLASS_NAME, "page")
# Find the articles
articles = page_element.find_elements(By.CLASS_NAME, "article")
# Extract URLs
urls = [ art.find_element(By.CLASS_NAME, "m").find_element(By.TAG_NAME, "a").get_attribute("href") for art in articles ]
# Remove duplicates, remove None
urls = [u for u in set(urls) if u is not None]
# Filter by URL host
urls = [u for u in urls if url_host in u]
driver.quit()
kill_process_tree(service.process.pid)
return urls
def _search_breitbart(self, search):
url_host = "breitbart.com"
# URL search
url_unquoted = "https://www.breitbart.com/search/?s={}".format(search.replace(" ", "+"))
url = quote(url_unquoted, safe=":/?=&#")
# Initialize
driver, service = get_webdriver()
# Load URL
driver.get(url)
time.sleep(4)
# Find the element with class "page"
page_element = driver.find_element(By.CLASS_NAME, "gsc-expansionArea")
# Find the articles
articles = page_element.find_elements(By.CLASS_NAME, "gs-title")
# Extract URLs
urls = [ art.get_attribute("href") for art in articles ]
# Remove duplicates, remove None
urls = [u for u in set(urls) if u is not None]
# Filter by URL host
urls = [u for u in urls if url_host in u]
driver.quit()
kill_process_tree(service.process.pid)
return urls
def _search_zerohedge(self, search):
url_host = "zerohedge.com"
# URL search
url_unquoted = "https://www.zerohedge.com/search-content?qTitleBody={}".format(search.replace(" ", "+"))
url = quote(url_unquoted, safe=":/?=&#")
# Initialize
driver, service = get_webdriver()
# Load URL
driver.get(url)
time.sleep(2)
# Find the element with class "page"
page_element = driver.find_element(By.CLASS_NAME, "main-content")
# Find the articles
articles = page_element.find_elements(By.TAG_NAME, "a")
# Extract URLs
urls = [ art.get_attribute("href") for art in articles]
# Remove duplicates, remove None
urls = [u for u in set(urls) if u is not None]
# Filter by URL host
urls = [u for u in urls if url_host in u]
driver.quit()
kill_process_tree(service.process.pid)
return urls

23
app_selenium/utils.py Normal file
View File

@@ -0,0 +1,23 @@
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
import psutil
def get_webdriver():
options = Options()
options.add_argument('--headless') # Optional
options.binary_location = '/opt/firefox/firefox'
service = Service('/usr/local/bin/geckodriver')
driver = webdriver.Firefox(options=options, service=service)
return driver, service
def kill_process_tree(pid):
try:
parent = psutil.Process(pid)
for child in parent.children(recursive=True):
child.kill()
parent.kill()
except psutil.NoSuchProcess:
pass
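
Note (illustrative only, not part of the diff): the intended lifecycle of these helpers, as used by missing_kids.py and search.py above, is roughly the following sketch (the URL is hypothetical):

```
from utils import get_webdriver, kill_process_tree

driver, service = get_webdriver()
try:
    driver.get("https://example.org")  # hypothetical URL
    title = driver.title
finally:
    # Quit the browser, then reap any leftover geckodriver/Firefox children to release memory
    try:
        driver.quit()
    except Exception:
        pass
    kill_process_tree(service.process.pid)
```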

View File

@@ -5,6 +5,9 @@ ENV PYTHONDONTWRITEBYTECODE=1
 #Prevents Python from buffering stdout and stderr
 ENV PYTHONUNBUFFERED=1
+# supervisor
+RUN apt-get update && apt-get install -y supervisor
 # User
 RUN useradd -m -r appuser && \
     mkdir /opt/app && \
@@ -14,10 +17,11 @@ WORKDIR /opt/app
 # Copy the Django project and install dependencies
 COPY requirements.txt /opt/app/
-# run this command to install all dependencies
+# Install dependencies
 RUN pip install --no-cache-dir -r requirements.txt
 COPY --chown=appuser:appuser . /opt/app/
+COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
 RUN chmod -R 755 /opt
 RUN chown -R appuser:appuser /opt
@@ -25,4 +29,4 @@ RUN chown -R appuser:appuser /opt
 USER appuser
 # Run Djangos server & workers
-CMD ["sh", "-c", "/opt/app/initialize.sh && /opt/app/run.sh"]
+CMD ["sh", "-c", "/opt/app/initialize.sh && /usr/bin/supervisord"]

View File

@@ -73,6 +73,17 @@ class Meta:
 * Environment variables
 * In docker-compose.yml
+* Tasks
+```
+python manage.py dumpdata \
+    django_celery_beat.PeriodicTask \
+    django_celery_beat.IntervalSchedule \
+    django_celery_beat.CrontabSchedule \
+    django_celery_beat.SolarSchedule \
+    django_celery_beat.ClockedSchedule \
+    --indent 2 > scheduled_tasks.json
+```
 * Deploy
 ```
 # Check environments variables on .env file
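
Note (not part of the diff): the fixture produced by the dumpdata command above can later be restored on a fresh database with Django's counterpart command, `python manage.py loaddata scheduled_tasks.json`.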

View File

@@ -0,0 +1,3 @@
from .celery import app as celery_app
__all__ = ('celery_app',)

14
app_urls/core/celery.py Normal file
View File

@@ -0,0 +1,14 @@
# core/celery.py
import os
from celery import Celery
# Set default Django settings module
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
app = Celery('core')
# Load config from Django settings, namespace CELERY
app.config_from_object('django.conf:settings', namespace='CELERY')
# Auto-discover tasks from all registered Django app configs
app.autodiscover_tasks()
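
Note (illustrative only, not part of the diff): with app.autodiscover_tasks(), a task defined in any installed app's tasks.py is picked up automatically. The module, task name, and import path below are hypothetical; routing to the 'low' queue uses call-time apply_async, matching the queues declared in settings.

```
# fetcher/tasks.py (hypothetical module and task name)
from celery import shared_task

@shared_task
def process_missing_kids_urls(batch_size=10):
    # Body would call into the existing DB_Handler helpers (hypothetical wiring)
    ...

# Enqueue on the 'low' queue declared in CELERY_TASK_QUEUES (call-time routing)
# process_missing_kids_urls.apply_async(kwargs={"batch_size": 10}, queue="low")
```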

View File

@@ -12,15 +12,16 @@ https://docs.djangoproject.com/en/5.1/ref/settings/
 from pathlib import Path
 import os
+# Queues and routing
+from kombu import Queue
 # Build paths inside the project like this: BASE_DIR / 'subdir'.
 BASE_DIR = Path(__file__).resolve().parent.parent
 # Quick-start development settings - unsuitable for production
 # SECURITY WARNING: keep the secret key used in production secret!
-SECRET_KEY = os.getenv("DJANGO_SECRET_KEY", 'django-insecure-EtKpy7t84GvU4gBwX9z3xKPBXMS75IAV0dkzN7dXVUsMSqy6a5rjY6WNCw3CcRH5')
+SECRET_KEY = os.environ.get("DJANGO_SECRET_KEY", 'django-insecure-EtKpy7t84GvU4gBwX9z3xKPBXMS75IAV0dkzN7dXVUsMSqy6a5rjY6WNCw3CcRH5')
 # SECURITY WARNING: don't run with debug turned on in production!
 DEBUG = (os.environ.get('DJANGO_DEBUG') == "True")
@@ -37,7 +38,7 @@ INSTALLED_APPS = [
     'django.contrib.sessions',
     'django.contrib.messages',
     'django.contrib.staticfiles',
-    'scheduler',
+    'django_celery_beat',
     'fetcher',
 ]
@@ -107,59 +108,22 @@ CACHES = {
     }
 }
-'''
-from scheduler.types import SchedulerConfiguration, QueueConfiguration, Broker
-from typing import Dict
-# https://django-tasks-scheduler.readthedocs.io/en/latest/configuration/
-SCHEDULER_CONFIG = SchedulerConfiguration(
-    DEFAULT_JOB_TIMEOUT = os.environ.get("JOB_DEFAULT_TIMEOUT", 60*30), # 30 minutes
-    BROKER=Broker.REDIS,
-)
-SCHEDULER_QUEUES: Dict[str, QueueConfiguration] = {
-    'default': QueueConfiguration(
-        HOST = os.environ.get("REDIS_HOST", "localhost"),
-        PORT = os.environ.get("REDIS_PORT", 6379),
-        DB = os.environ.get("REDIS_DB", 0),
-    ),
-    'high': QueueConfiguration(
-        HOST = os.environ.get("REDIS_HOST", "localhost"),
-        PORT = os.environ.get("REDIS_PORT", 6379),
-        DB = os.environ.get("REDIS_DB", 0),
-    ),
-    'low': QueueConfiguration(
-        HOST = os.environ.get("REDIS_HOST", "localhost"),
-        PORT = os.environ.get("REDIS_PORT", 6379),
-        DB = os.environ.get("REDIS_DB", 0),
-    ),
-}
-'''
-SCHEDULER_QUEUES = {
-    'default': {
-        'HOST': os.environ.get("REDIS_HOST", "localhost"),
-        'PORT': os.environ.get("REDIS_PORT", 6379),
-        'DB': os.environ.get("REDIS_DB", 0),
-    },
-    'high': {
-        'HOST': os.environ.get("REDIS_HOST", "localhost"),
-        'PORT': os.environ.get("REDIS_PORT", 6379),
-        'DB': os.environ.get("REDIS_DB", 0),
-    },
-    'low': {
-        'HOST': os.environ.get("REDIS_HOST", "localhost"),
-        'PORT': os.environ.get("REDIS_PORT", 6379),
-        'DB': os.environ.get("REDIS_DB", 0),
-    }
-}
-SCHEDULER_CONFIG = {
-    'DEFAULT_TIMEOUT': os.environ.get("JOB_DEFAULT_TIMEOUT", 60*30), # 30 minutes
-    'DEFAULT_RESULT_TTL': 60*60*12, # 12 hours
-    'EXECUTIONS_IN_PAGE': 20,
-    'SCHEDULER_INTERVAL': 10, # 10 seconds
-}
+# Celery configuration
+CELERY_BROKER_URL = 'redis://{}:{}/{}'.format(os.environ.get("REDIS_HOST", "localhost"), os.environ.get("REDIS_PORT", 6379), os.environ.get("REDIS_DB", 0))
+CELERY_RESULT_BACKEND = 'redis://{}:{}/{}'.format(os.environ.get("REDIS_HOST", "localhost"), os.environ.get("REDIS_PORT", 6379), os.environ.get("REDIS_DB_RESULTS", 1))
+CELERY_ACCEPT_CONTENT = ['json']
+CELERY_TASK_SERIALIZER = 'json'
+# Celery Beat scheduler (required for django-celery-beat to work)
+CELERY_BEAT_SCHEDULER = 'django_celery_beat.schedulers.DatabaseScheduler'
+CELERY_TASK_QUEUES = (
+    Queue('default'),
+    Queue('low'),
+)
 # Password validation
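
Note (illustrative only, not part of the diff): with django_celery_beat's DatabaseScheduler, periodic tasks such as those exported via dumpdata above can also be registered from the ORM; the name, interval, and task path below are hypothetical.

```
from django_celery_beat.models import IntervalSchedule, PeriodicTask

# Hypothetical 30-minute schedule
schedule, _ = IntervalSchedule.objects.get_or_create(
    every=30, period=IntervalSchedule.MINUTES
)
PeriodicTask.objects.get_or_create(
    name="Process MissingKids URLs (hypothetical)",
    defaults={
        "interval": schedule,
        "task": "fetcher.tasks.process_missing_kids_urls",  # hypothetical task path
        "queue": "low",
    },
)
```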

View File

@@ -19,6 +19,5 @@ from django.urls import path, include
 urlpatterns = [
     path('admin/', admin.site.urls),
-    path('scheduler/', include('scheduler.urls')),
     path('', include('fetcher.urls')),
 ]

View File

@@ -4,8 +4,9 @@ from django.core.cache import cache
 from django.db import IntegrityError
 from django.utils import timezone
 from datetime import timedelta
-from .fetch_utils_url_processor import process_url, get_with_protocol
+from .fetch_utils_url_processor import process_url, get_with_protocol, url_host_slowdown
 import re
+import requests
 import os
 import traceback
 from .logger import get_logger
@@ -43,7 +44,6 @@ class DB_Handler():
                 UrlsSourceSearch.objects.get_or_create(id_url=obj_url, id_source=obj_source, id_search=obj_search)
             else:
                 # Add object to insert
-                # url_object_to_insert.append(Urls(url=url))
                 urls_to_insert.append(url)
         ### Insert URLs & (URL_id, source_id)
@@ -81,24 +81,70 @@ class DB_Handler():
         except Exception as e:
             logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
+    def _set_status(self, obj_url, status):
+        # Update status if setting a new value
+        if (obj_url.status != status):
+            obj_url.status = status
+            obj_url.save()
+    def _set_duplicate_and_insert_canonical(self, obj_url, url_canonical):
+        # Update status
+        self._set_status(obj_url, Urls.STATUS_ENUM.DUPLICATE)
+        # Get or create URL with canonical form
+        obj_url_canonical, created = Urls.objects.get_or_create(url=url_canonical)
+        # Get the source-search IDs associated to obj_url.id
+        list_url_source_search = UrlsSourceSearch.objects.filter(id_url=obj_url)
+        for obj_url_source_search in list_url_source_search:
+            # Associate same sources to url_canonical (it might already exist)
+            UrlsSourceSearch.objects.get_or_create(id_url=obj_url_canonical, id_source=obj_url_source_search.id_source, id_search=obj_url_source_search.id_search)
+        # URLs duplciate association
+        UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url)
     def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error, paywall_bypass=False):
-        def set_status(obj_url, status):
-            # Update status if setting a new value
-            if (obj_url.status != status):
-                obj_url.status = status
-                obj_url.save()
+        ##########################################################################
+        # URL pattern: missingkids.org/poster OR missingkids.org/new-poster
+        if ("missingkids.org" in obj_url.url) and ("poster" in obj_url.url):
+            # Sleep required? To avoid too many requests error (original URL, not paywall bypassing endpoint)
+            url_host_slowdown(obj_url.url, url_host_slowdown_seconds=float(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
+            try:
+                # Request
+                r = requests.get(obj_url.url, allow_redirects=True)
+            except Exception as e:
+                if (raise_exception_on_error):
+                    # Simply raise exception, handled in a different way
+                    raise Exception("Error processing URL, raising exception as expected")
+                else:
+                    logger.debug("Error processing URL: {}\n{}\n{}".format(obj_url.url, str(e), traceback.format_exc()))
+                    # Set status to error
+                    self._set_status(obj_url, Urls.STATUS_ENUM.ERROR)
+                    return
+            if (r.url != obj_url.url):
+                # Canonical
+                url_canonical = r.url
+                # Set duplicate, and insert new canonical form
+                self._set_duplicate_and_insert_canonical(obj_url, url_canonical)
+            elif (r.status_code == 200):
+                # Not enough to determine if it is valid. Need to wait to finish javascript, it might redirect to 404
+                # self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
+                self._set_status(obj_url, Urls.STATUS_ENUM.UNKNOWN)
+            elif (r.status_code == 404):
+                self._set_status(obj_url, Urls.STATUS_ENUM.INVALID)
+            else:
+                logger.debug("Unknown request status: {} for missing kids request: {}".format(r.status_code, obj_url.url))
+                self._set_status(obj_url, Urls.STATUS_ENUM.UNKNOWN)
+            return
+        ##########################################################################
         # Found a pattern match -> Override status
         if (status_pattern_match is not None):
             logger.debug("Pattern match, status '{}' for input URL: {}".format(status_pattern_match, obj_url.url))
             # Update status
-            set_status(obj_url, status_pattern_match)
+            self._set_status(obj_url, status_pattern_match)
             ##### Filter URL? -> Invalid (don't extract content)
             if (status_pattern_match == "invalid"):
                 return
         try:
             # Extract URL content
             dict_url_data = process_url(obj_url.url, paywall_bypass)
@@ -112,19 +158,9 @@ class DB_Handler():
             dict_url_data = None
         ##### Canonical URL different? -> Duplicate
         if (dict_url_data is not None) and (dict_url_data.get("url_canonical") is not None) and (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
-            # Update status
-            set_status(obj_url, Urls.STATUS_ENUM.DUPLICATE)
-            # Get or create URL with canonical form
-            obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
-            # Get the source-search IDs associated to obj_url.id
-            list_url_source_search = UrlsSourceSearch.objects.filter(id_url=obj_url)
-            for obj_url_source_search in list_url_source_search:
-                # Associate same sources to url_canonical (it might already exist)
-                UrlsSourceSearch.objects.get_or_create(id_url=obj_url_canonical, id_source=obj_url_source_search.id_source, id_search=obj_url_source_search.id_search)
-            # URLs duplciate association
-            UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url)
+            # URL as duplicate, insert canonical URL
+            self._set_duplicate_and_insert_canonical(obj_url, dict_url_data.get("url_canonical"))
             # Next URL
             return
@@ -133,20 +169,20 @@ class DB_Handler():
         # (dict_url_data is None) or (Exception while processing URL) ? -> Error status
         if (dict_url_data is None):
             # Update status
-            set_status(obj_url, Urls.STATUS_ENUM.ERROR)
+            self._set_status(obj_url, Urls.STATUS_ENUM.ERROR)
             # Next URL
             return
         # Invalid? e.g. binary data
         if (dict_url_data.get("override_status") == "invalid"):
             # Update status
-            set_status(obj_url, Urls.STATUS_ENUM.INVALID)
+            self._set_status(obj_url, Urls.STATUS_ENUM.INVALID)
             # Next URL
             return
         ##### Valid URL
         # Update status
-        set_status(obj_url, Urls.STATUS_ENUM.VALID)
+        self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
         try:
             if (dict_url_data is not None):
@@ -244,14 +280,31 @@ class DB_Handler():
         except Exception as e:
             logger.warning("Exception processing error URLs: {}\n{}".format(e, traceback.format_exc()))
-    def process_missing_kids_urls(self, batch_size=None):
+    def process_missing_kids_urls(self, batch_size=None, process_status_only=None):
         try:
-            logger.debug("Processing MissingKids URLs - batch_size={}".format(batch_size))
+            logger.info("Processing MissingKids URLs - batch_size={} process_status_only={}".format(batch_size, process_status_only))
+            if (process_status_only is None):
+                filter = (Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.UNKNOWN) | Q(status=Urls.STATUS_ENUM.ERROR))
+            else:
+                if (process_status_only == "valid"):
+                    filter = Q(status=Urls.STATUS_ENUM.VALID)
+                elif (process_status_only == "invalid"):
+                    filter = Q(status=Urls.STATUS_ENUM.INVALID)
+                elif (process_status_only == "error"):
+                    filter = Q(status=Urls.STATUS_ENUM.ERROR)
+                elif (process_status_only == "unknown"):
+                    filter = Q(status=Urls.STATUS_ENUM.UNKNOWN)
+                elif (process_status_only == "raw"):
+                    filter = Q(status=Urls.STATUS_ENUM.RAW)
+                elif (process_status_only == "duplicate"):
+                    filter = Q(status=Urls.STATUS_ENUM.DUPLICATE)
+                else:
+                    logger.info("Unknown status to filter: {}".format(process_status_only))
             # Get batch of URLs, %missingkids.org/poster% AND (status='valid' OR status='invalid')
             missingkids_urls = Urls.objects.order_by("-ts_fetch").filter(
-                (Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster"))
-                &
-                (Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.ERROR))
+                filter & (Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster"))
             )
             # Get batch size
@@ -261,14 +314,30 @@ class DB_Handler():
             # Per URL
             for obj_url in missingkids_urls:
                 try:
-                    # Process URL. If no exception -> Valid
-                    self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True)
+                    # Missing kids fetching endpoint, verify URL
+                    missingkids_fetch_endpoint = os.path.join(os.getenv("SELENIUM_ENDPOINT", "http://localhost:80"), "verify_missing_kid/")
+                    data = {"url": obj_url.url}
+                    # POST
+                    r = requests.post(missingkids_fetch_endpoint, json=data, timeout=120)
+                    # Jsonify
+                    results = r.json()
+                    logger.debug("Selenium results for URL {}: {}".format(obj_url.url, str(results)))
+                    if (results.get("status") == "valid"):
+                        self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
+                    elif (results.get("status") == "invalid"):
+                        self._set_status(obj_url, Urls.STATUS_ENUM.INVALID)
+                    elif (results.get("status") == "duplicate"):
+                        self._set_duplicate_and_insert_canonical(obj_url, results.get("redirection"))
+                    elif (results.get("status") == "unknown"):
+                        # Nothing to do, not sure about it...
+                        logger.info("Missing kid verification returned unknown for URL: {}".format(obj_url.url))
+                        pass
                 except Exception as e:
-                    # Raised exception -> Invalid (404 error)
-                    obj_url.status = Urls.STATUS_ENUM.INVALID
-                    obj_url.save()
+                    logger.warning("Unknown error processing missing kids poster for URL: {}\n{}".format(obj_url.url, str(e)))
-            logger.info("Verified status of #{} missingkids.org/poster URLs".format(len(missingkids_urls)))
+            logger.info("Verified status of #{} missingkids.org/poster / missingkids.org/new-poster URLs".format(len(missingkids_urls)))
         except Exception as e:
             logger.warning("Exception processing MissingKids URLs: {}\n{}".format(e, traceback.format_exc()))

View File

@@ -17,6 +17,9 @@ class FetchParser():
         url_host_clean = obj_search.search.replace("www.", "").replace("http://", "").replace("https://", "")
         # Ensure URL host in URL
         raw_urls = [u for u in raw_urls if url_host_clean in u]
+        # Clean URL part after "&quot"
+        raw_urls = [u.split("&quot")[0] for u in raw_urls]
         return raw_urls

View File

@@ -54,6 +54,7 @@ class FetchSearcher():
         for SearchInstance in ListSearchInstances:
             # Sleep between requests, avoid too many requests...
             time.sleep(float(os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", 5)))
+            # TODO: Random proxy / VPN
             SearchInstance(args).fetch_articles(db_writer, obj_search)
     # TODO: https://github.com/tasos-py/Search-Engines-Scraper/tree/master

View File

@@ -1,15 +1,15 @@
import time import time
import feedparser import feedparser
import os import os
from django.utils import timezone from urllib.parse import unquote
from datetime import timedelta
from ..models import Search, Source from ..models import Search, Source
from .fetch_utils_gnews import decode_gnews_urls from .fetch_utils_gnews import decode_gnews_urls
from .logger import get_logger from .logger import get_logger
logger = get_logger() logger = get_logger()
from furl import furl
from gnews import GNews from gnews import GNews
from duckduckgo_search import DDGS from ddgs import DDGS
from GoogleNews import GoogleNews from GoogleNews import GoogleNews
from search_engines import Yahoo, Aol from search_engines import Yahoo, Aol
@@ -42,6 +42,9 @@ class FetcherAbstract(ABC):
# Ensure URL host in URL # Ensure URL host in URL
raw_urls = [u for u in raw_urls if url_host_clean in u] raw_urls = [u for u in raw_urls if url_host_clean in u]
# Remove URL parameters, e.g. "?param=1234&h=yes"
raw_urls = [ furl(u).remove(furl(u).args).url for u in raw_urls ]
return raw_urls return raw_urls
def fetch_articles(self, db_writer, obj_search): def fetch_articles(self, db_writer, obj_search):
@@ -110,7 +113,7 @@ class SearchDuckDuckGoGeneral(FetcherAbstract):
return "ddg-general {} results={}".format(self.region, self.max_results).replace("results=None", "").strip() return "ddg-general {} results={}".format(self.region, self.max_results).replace("results=None", "").strip()
def _fetch_raw_urls(self, keyword_search): def _fetch_raw_urls(self, keyword_search):
try: try:
news = DDGS().text(keyword_search, region=self.region, timelimit=self.period, max_results=self.max_results) news = DDGS().text(keyword_search, region=self.region, timelimit=self.period, max_results=self.max_results)
urls = [e.get("href") for e in news] urls = [e.get("href") for e in news]
except Exception as e: except Exception as e:
@@ -206,7 +209,10 @@ class SearchGoogleGeneral(FetcherAbstract):
# Links # Links
for l in links: for l in links:
# 'link': 'https://uk.news.yahoo.com/leaving-neverland-2-michael-jackson-lawyer-channel-4-102017088.html&ved=2ahUKEwjl38eJm5aMAxVvqJUCHXgnGzwQxfQBegQICRAC&usg=AOvVaw1osa6b3o_xXfcNinMDpLoK' # 'link': 'https://uk.news.yahoo.com/leaving-neverland-2-michael-jackson-lawyer-channel-4-102017088.html&ved=2ahUKEwjl38eJm5aMAxVvqJUCHXgnGzwQxfQBegQICRAC&usg=AOvVaw1osa6b3o_xXfcNinMDpLoK'
set_links.add( l.get("link").split("&ved=")[0] ) url = l.get("link").split("&ved=")[0]
# https://www.foxnews.com/politics%3Fparam%3D446dd5e1 -> https://www.foxnews.com/politics?param=446dd5e1
url = unquote(url)
set_links.add(url)
# Finished? # Finished?
if (num_before == len(set_links)): if (num_before == len(set_links)):
break break
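A hedged sketch of the two normalization steps added in this file, furl-based parameter stripping and percent-decoding (example URLs are illustrative):

from urllib.parse import unquote
from furl import furl

# Strip query parameters, e.g. tracking arguments appended by the search engine
u = "https://uk.news.yahoo.com/some-story.html?param=1234&h=yes"
clean = furl(u).remove(furl(u).args).url
print(clean)  # https://uk.news.yahoo.com/some-story.html

# Percent-decode results that come back URL-escaped
encoded = "https://www.foxnews.com/politics%3Fparam%3D446dd5e1"
print(unquote(encoded))  # https://www.foxnews.com/politics?param=446dd5e1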

View File

@@ -0,0 +1,42 @@
from .db_utils import DB_Handler
from ..models import Search, Source
import traceback
import requests
import os
from .logger import get_logger
logger = get_logger()
class FetchSeleniumSourceSearch():
def __init__(self) -> None:
logger.debug("Initializing Selenium Source Search")
def run(self):
try:
logger.debug("Starting FetchSeleniumSourceSearch.run()")
# Get keyword searches
list_keyword_search = Search.objects.filter(type=Search.TYPE_ENUM.KEYWORD_SEARCH)
logger.debug("Fetching news Selenium based for keyword searches: {}".format([e.search for e in list_keyword_search]))
# Run selenium search for each keyword search
for obj_search in list_keyword_search:
try:
# Selenium fetching endpoint
selenium_fetch_endpoint = os.path.join(os.getenv("SELENIUM_ENDPOINT", "http://localhost:80"), "fetch_search/")
data = {"search": obj_search.search}
# POST
r = requests.post(selenium_fetch_endpoint, json=data, timeout=900)
# Jsonify
results = r.json()
logger.debug("Selenium results for URL {}: {}".format(obj_search.search, str(results)))
for source, urls_fetched in results.items():
# Get source object
obj_source, created = Source.objects.get_or_create(source="selenium {}".format(source))
# Write to DB
DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
except Exception as e:
logger.warning("Exception while fetching selenium search: {}\n{}".format(obj_search.search, str(e)))
except Exception as e:
logger.warning("Exception in FetchSeleniumSourceSearch.run(): {}\n{}".format(e, traceback.format_exc()))

View File

@@ -2,6 +2,7 @@ from django.core.cache import cache
from .logger import get_logger from .logger import get_logger
logger = get_logger() logger = get_logger()
import newspaper import newspaper
import requests
import time import time
import os import os
from urllib.parse import unquote from urllib.parse import unquote
@@ -39,7 +40,7 @@ def url_host_slowdown(url, url_host_slowdown_seconds):
cache.set("process_{}".format(url_host).encode("utf-8"), time.time(), timeout=60*5) # Expire after 5 minutes cache.set("process_{}".format(url_host).encode("utf-8"), time.time(), timeout=60*5) # Expire after 5 minutes
def process_url(url, paywall_bypass=False): def process_url(url, paywall_bypass=False):
if (paywall_bypass): if (paywall_bypass):
# TODO: Implement self-hosted instance # TODO: Implement self-hosted instance
url_paywall_bypass_base = "https://marreta.pcdomanual.com" url_paywall_bypass_base = "https://marreta.pcdomanual.com"
@@ -51,28 +52,67 @@ def process_url(url, paywall_bypass=False):
try: try:
# Sleep required? To avoid too many requests error (original URL, not paywall bypassing endpoint) # Sleep required? To avoid too many requests error (original URL, not paywall bypassing endpoint)
url_host_slowdown(url, url_host_slowdown_seconds=float(os.getenv("FETCHER_URL_HOST_SLEEP", 5))) url_host_slowdown(url, url_host_slowdown_seconds=float(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
# User agent
user_agent = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"
# Process # Process
article = newspaper.article(url_of_interest) if ("foxnews.com" in url_of_interest) or ("zerohedge" in url_of_interest):
# Request
r = requests.get(url, headers={"User-Agent": user_agent})
# Raise for error code
r.raise_for_status()
# Parse
article = newspaper.Article(url=r.url).download(input_html=r.text).parse()
else:
# Config: Fake user agent
config = newspaper.configuration.Configuration()
config.headers = {'User-Agent': user_agent}
# Default mode
article = newspaper.article(url_of_interest, config=config)
except newspaper.ArticleBinaryDataException: except newspaper.ArticleBinaryDataException:
logger.warning("ArticleException for input URL {}".format(url)) logger.warning("ArticleException for input URL {}".format(url))
return {"override_status": "invalid"} return {"override_status": "invalid"}
except newspaper.ArticleException as e: except newspaper.ArticleException as e:
# Too many requests or blocked for some reason
if ("Status code 403" in str(e.args)):
# TODO: cool down and retry once?, proxy/VPN, ...
logger.debug("TODO: process_url Implement code 403")
# Not found, either it doesn't exist or getting blocked...
if ("Status code 404" in str(e.args)):
# TODO: cool down and retry once?, proxy/VPN, ...
logger.debug("TODO: process_url Implement code 404")
# Too many requests? Cool down... # Too many requests? Cool down...
if ("Status code 429" in str(e.args)): if ("Status code 429" in str(e.args)):
# TODO: cool down and retry once?, proxy/VPN, ... # TODO: cool down and retry once?, proxy/VPN, ...
logger.debug("TODO: process_url Implement code 429") logger.debug("TODO: process_url Implement code 429")
# Unavailable for legal reasons # Unavailable for legal reasons
if ("Status code 451" in str(e.args)): if ("Status code 451" in str(e.args)):
# TODO: Bypass with VPN # TODO: Bypass with VPN
logger.debug("TODO: process_url Implement code 451") logger.debug("TODO: process_url Implement code 451")
# CloudFlare protection? # CloudFlare protection?
if ("Website protected with Cloudflare" in str(e.args)): if ("Website protected with Cloudflare" in str(e.args)):
logger.debug("TODO: process_url Implement bypass CloudFlare") logger.debug("TODO: process_url Implement bypass CloudFlare")
# PerimeterX protection? # PerimeterX protection?
if ("Website protected with PerimeterX" in str(e.args)): if ("Website protected with PerimeterX" in str(e.args)):
logger.debug("TODO: process_url Implement bypass PerimeterX") logger.debug("TODO: process_url Implement bypass PerimeterX")
logger.debug("ArticleException for input URL {}\n{}".format(url, str(e.args))) logger.debug("ArticleException for input URL {}\n{}".format(url, str(e.args)))
# Try simple request, valid response but couldn't parse article? e.g. getting blocked? -> unknown
time.sleep(0.25)
r = requests.get(url_of_interest)
if (r.status_code == 200):
return {"override_status": "unknown"}
else:
# Another status code still... "error" or "unknown"
return {"override_status": "unknown"}
return None return None
except Exception as e: except Exception as e:
logger.warning("Exception for input URL {}\n{}".format(url, str(e))) logger.warning("Exception for input URL {}\n{}".format(url, str(e)))

View File

@@ -1,24 +1,76 @@
import ollama import ollama
import os import os
import requests
import json
from .logger import get_logger
logger = get_logger()
class OllamaClient(): class OllamaClient():
def __init__(self): def __init__(self):
self.client = ollama.Client(host=os.getenv("ENDPOINT_OLLAMA", "https://ollamamodel.matitos.org")) self.host = os.getenv("ENDPOINT_OLLAMA", "https://ollamamodel.matitos.org")
self.client = ollama.Client(host=self.host)
self.options = {"temperature": 0, "seed": 13579}
def _get_default_model(self): def _get_default_model(self):
return "llama3.2:3b" return os.getenv("OLLAMA_MODEL_DEFAULT", "llama3.2:3b")
def get_models(self): def get_models(self):
models = sorted([m.model for m in self.client.list().models]) try:
if (self._get_default_model() in models): # Get models
return [self._get_default_model()] + [m for m in models if m != self._get_default_model()] models = sorted([m.model for m in self.client.list().models])
else: # r = requests.get( os.path.join(endpoint, "models") )
return models # r.json().get("models")
# Default within it?
if (self._get_default_model() in models):
return [self._get_default_model()] + [m for m in models if m != self._get_default_model()]
else:
return models
except Exception as e:
return []
def get_prompt(self): def get_prompt(self, content):
return ("Rewrite the text below into a clear and concise summary of one paragraph maximum, presenting the key points as if they are newly written insights. " return "Provide, in one sentence each, the what, why, who, when, where, and a detailed summary of the content below:\n\n{}".format(content)
return "First, provide a detailed summary of the content below in one paragraph. Second, specify in one sentence each the who, what, when, where and why of the story. Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states':\n\n{}".format(content)
return "First, provide a summary of the content below in one paragraph. Second, specify the Who, What, When, Where and Why of the story:\n\n{}".format(content)
# First, provide a summary of the content below in one paragraph. Second, specify the who, what, when, where and why of the story in one sentence each. Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states':
'''
return ("Rewrite the content below into a clear and concise summary of one paragraph maximum, presenting the key points as if they are newly written insights. "
"Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states'. " "Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states'. "
"Write in a natural, standalone format that feels like an original explanation. " "Write in a natural, standalone format that feels like an original explanation. "
"Keep it brief, engaging, informative, in the style of a news article: \n" "Keep it brief, engaging, informative, in the style of a news article:\n\n{}".format(content)
) )
'''
def generate(self, model, prompt, format=None):
try:
# Generate response
response = self.client.generate(model=model, prompt=prompt, format=format, options=self.options)
# Extract response
response = response.response
# Json? -> Dict
if (format == "json"):
# Dict
response = json.loads(response)
# Force unload
r = requests.post( os.path.join(self.host, "unload_model") )
except Exception as e:
logger.warning("Exception while generating LLM response: {}".format(str(e)))
if (format == "json"):
response = {}
else:
response = None
# Text
return response
def generate_stream(self, model, prompt):
try:
# Generate response
response = self.client.generate(model=model, prompt=prompt, format="json", stream=True, options=self.options)
# Streamed chunks
for chunk in response:
yield chunk.response
# Force unload
r = requests.post( os.path.join(self.host, "unload_model") )
except Exception as e:
logger.warning("Exception while generating LLM response: {}".format(str(e)))

View File

@@ -1,6 +1,10 @@
import logging import logging
import os import os
# Set to warning
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("newspaper").setLevel(logging.WARNING)
# Get env var # Get env var
logs_directory = os.getenv("PATH_LOGS_DIRECTORY", "logs") logs_directory = os.getenv("PATH_LOGS_DIRECTORY", "logs")
@@ -11,7 +15,7 @@ logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(messa
logger = logging.getLogger("fetcher") logger = logging.getLogger("fetcher")
logger.setLevel(logging.DEBUG) logger.setLevel(logging.DEBUG)
# To file log: INFO / WARNING / ERROR / CRITICAL # To file log: DEBUG / INFO / WARNING / ERROR / CRITICAL
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "debug.log"), mode="a", maxBytes=10000000, backupCount=1) fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "debug.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s')) fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.DEBUG) fh.setLevel(logging.DEBUG)

View File

@@ -0,0 +1,57 @@
from django.utils import timezone
from django.utils.timezone import now, timedelta
from ..models import Urls, Source, Search, UrlContent, UrlsSourceSearch, UrlsDuplicate
from django.db.models import Count
import requests
import os
def notify_telegram(last_hours=24):
start_date = timezone.now() - timedelta(hours=last_hours)
# Count the number of URLs grouped by status within the date range
urls_data_status = Urls.objects.filter(ts_fetch__gte=start_date) \
.values('status') \
.annotate(count=Count('id')) \
.order_by('status')
# Count the number of URLs grouped by source
urls_data_source = UrlsSourceSearch.objects \
.filter(id_url__ts_fetch__gte=start_date) \
.values('id_source__source') \
.annotate(count=Count('id_url')) \
.order_by('id_source__source')
# Count the number of URLs grouped by search
urls_data_search = UrlsSourceSearch.objects \
.filter(id_url__ts_fetch__gte=start_date) \
.values('id_search__search') \
.annotate(count=Count('id_url')) \
.order_by('id_search__search')
bot_token = os.environ.get("TELEGRAM_BOT_TOKEN", "")
chat_id = os.environ.get("TELEGRAM_CHAT_ID", "")
message = "During the last {} hours:\n".format(last_hours)
message += "\nURLs per status:\n"
for o in urls_data_status:
message += " {}: {}\n".format(o.get("status"), o.get("count"))
message += "\nURLs per source:\n"
for o in urls_data_source:
message += " {}: {}\n".format(o.get("id_source__source"), o.get("count"))
message += "\nURLs per search:\n"
for o in urls_data_search:
message += " {}: {}\n".format(o.get("id_search__search"), o.get("count"))
url = f"https://api.telegram.org/bot{bot_token}/sendMessage"
params = {
"chat_id": chat_id,
"text": message
}
# POST
response = requests.post(url, params=params)

View File

@@ -12,7 +12,8 @@ logger = get_logger()
class Publisher(): class Publisher():
def __init__(self): def __init__(self):
pass self.admin_api_url = os.getenv("GHOST_ADMIN_API_URL")
self.admin_api_key = os.getenv("GHOST_ADMIN_API_KEY")
def _create_jwt(self, admin_api_key): def _create_jwt(self, admin_api_key):
id_, secret = admin_api_key.split(':') id_, secret = admin_api_key.split(':')
@@ -29,9 +30,7 @@ class Publisher():
def _create_ghost_post(self, post_data): def _create_ghost_post(self, post_data):
# Get token # Get token
jwt_token = self._create_jwt(os.getenv("GHOST_ADMIN_API_KEY")) jwt_token = self._create_jwt(self.admin_api_key)
# Get Admin API URL
admin_api_url = os.getenv("GHOST_ADMIN_API_URL")
headers = { headers = {
'Authorization': f'Ghost {jwt_token}', 'Authorization': f'Ghost {jwt_token}',
@@ -41,7 +40,7 @@ class Publisher():
post_data = {"posts": [post_data]} post_data = {"posts": [post_data]}
response = requests.post( response = requests.post(
os.path.join(admin_api_url, "posts"), os.path.join(self.admin_api_url, "posts"),
json=post_data, json=post_data,
headers=headers, headers=headers,
params={"source":"html"} params={"source":"html"}
@@ -53,6 +52,27 @@ class Publisher():
else: else:
logger.warning("Ghost - Failed to publish post: {} {}".format(response.status_code, response.text)) logger.warning("Ghost - Failed to publish post: {} {}".format(response.status_code, response.text))
return None return None
def _published_url_id(self, url_id):
# Get token
jwt_token = self._create_jwt(self.admin_api_key)
headers = {
'Authorization': f'Ghost {jwt_token}',
'Content-Type': 'application/json'
}
# Query param filter by URL ID
params = {"filter": "tags:hash-url-id-{}".format(url_id)}
# Get posts using filter
response = requests.get(os.path.join(self.admin_api_url, "posts"), params=params, headers=headers)
# To JSON
dict_response = response.json()
if (len(dict_response.get("posts")) > 0):
return True
else:
return False
def _get_photo_url(self, query): def _get_photo_url(self, query):
# TODO: Get already used photos to skip. Use DB # TODO: Get already used photos to skip. Use DB
@@ -100,14 +120,56 @@ class Publisher():
if (url_content.valid_content is False): if (url_content.valid_content is False):
logger.warning("Ghost - URL Content is not valid for URL ID: {} {}".format(url_id, url.url)) logger.warning("Ghost - URL Content is not valid for URL ID: {} {}".format(url_id, url.url))
return return
# URL ID already published?
if (self._published_url_id(url_id)):
logger.info("Ghost - URL ID {} already published, skipping".format(url_id))
return
model = "llama3.2:3b" ###########################################
prompt = "Rewrite the text below into a clear and concise summary, presenting the key points as if they are newly written insights. Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states'. Instead, write in a natural, standalone format that feels like an original explanation. Keep it brief, engaging, informative, in the style of a news article, and no longer than a paragraph:" client_llm = OllamaClient()
# Model
model = client_llm.get_models()[0]
# Prompt
prompt = client_llm.get_prompt(url_content.content)
# Generate content
generated_content_dict = client_llm.generate(model, prompt, format="json")
logger.debug("Generated content: {}".format(generated_content_dict))
ollama_msg = {"role": "user", "content": "{}\n{}".format(prompt, url_content.content)} ###########################################
response = OllamaClient().client.chat(model=model, messages=[ollama_msg]) # Get where description
generated_content_where = generated_content_dict.get("where")
# Prompt to extract address / location
prompt = 'Only answer with the location or address which can be extracted from this description: "{}"'.format(generated_content_where)
# LLM
extracted_location = client_llm.generate(model, prompt, format=None)
logger.debug("Estimated location: {}".format(extracted_location))
# OSM API
params = {
'q': extracted_location,
'format': 'json',
'addressdetails': 1,
'limit': 1
}
article_summary = response["message"]["content"] response = requests.get('https://nominatim.openstreetmap.org/search', params=params, headers={'User-Agent': 'App'})
list_data = response.json()
if (len(list_data) > 0):
data = list_data[0]
location_url = "https://openstreetmap.org/{}/{}".format(data.get("osm_type"), data.get("osm_id"))
else:
location_url = None
###########################################
# Parse generated content
summary, five_w = "", ""
for k, v in generated_content_dict.items():
if ("summary" in k.lower()):
summary = v if type(v) is str else "\n".join(summary)
else:
five_w += "{}: {}\n".format(k.capitalize(), v if type(v) is str else ". ".join(v) )
# Aggregate generated content
generated_content = "{}\n\n{}".format(summary, five_w)
################################################################################################ ################################################################################################
if (url_content.image_main_url is None) or (requests.get(url_content.image_main_url).status_code != 200): if (url_content.image_main_url is None) or (requests.get(url_content.image_main_url).status_code != 200):
@@ -117,15 +179,24 @@ class Publisher():
else: else:
photo_url = url_content.image_main_url photo_url = url_content.image_main_url
# HTML: Generate content
html_data = "".join([ "<p>{}</p>".format(t) for t in generated_content.split("\n") ])
# HTML: Add location if available
if (location_url is not None):
html_data += '<p><a href="{}">Estimated location</a></p>'.format(location_url)
# HTML: Add source
html_data += '<p><a href="{}">Source: {}</a></p>'.format(url.url, url_content.url_host.replace("https://", ""))
post_data = { post_data = {
# "slug": "hey-short", # "slug": "hey-short",
"title": url_content.title, "title": url_content.title,
"html": "".join([ "<p>{}</p>".format(t) for t in article_summary.split("\n") ]) + '<a href="{}">Source</a>'.format(url.url), "html": html_data,
#"meta_title": "", #"meta_title": "",
#"meta_description": "", #"meta_description": "",
"feature_image": photo_url, "feature_image": photo_url,
#"feature_image_caption": "", #"feature_image_caption": "",
"status": "published", "status": "published",
"tags": ["#url-id-{}".format(url_id)] # Hidden tag with associated URL ID
} }
# Publish post # Publish post
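The Publisher relies on _create_jwt, whose body is not shown in this diff. For context, a hedged sketch of a Ghost Admin API token helper following Ghost's documented format (assumes Ghost v5 and aud='/admin/'; the repo's actual implementation may differ):

import time
import jwt  # PyJWT, already listed in requirements

def create_ghost_jwt(admin_api_key):
    # Admin API key is "<id>:<hex-encoded secret>"
    id_, secret = admin_api_key.split(":")
    iat = int(time.time())
    payload = {"iat": iat, "exp": iat + 5 * 60, "aud": "/admin/"}
    return jwt.encode(
        payload,
        bytes.fromhex(secret),
        algorithm="HS256",
        headers={"alg": "HS256", "typ": "JWT", "kid": id_},
    )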

View File

@@ -1,142 +1,84 @@
from scheduler import job from celery import shared_task
from .src.fetch_feed import FetchFeeds from .src.fetch_feed import FetchFeeds
from .src.fetch_parser import FetchParser from .src.fetch_parser import FetchParser
from .src.fetch_search import FetchSearcher from .src.fetch_search import FetchSearcher
from .src.fetch_missing_kids import FetchMissingKids from .src.fetch_missing_kids import FetchMissingKids
from .src.fetch_selenium import FetchSeleniumSourceSearch
from .src.db_utils import DB_Handler from .src.db_utils import DB_Handler
from .src.publisher import Publisher from .src.publisher import Publisher
from .src.notifier import notify_telegram
from .src.logger import get_logger from .src.logger import get_logger
logger = get_logger() logger = get_logger()
@job('default')
@shared_task(queue='default')
def fetch_feeds(): def fetch_feeds():
task = "Fetch Feeds" task = "Fetch Feeds"
logger.info("Task triggered: {}".format(task)) logger.info("Task triggered: {}".format(task))
FetchFeeds().run() FetchFeeds().run()
logger.info("Task completed: {}".format(task)) logger.info("Task completed: {}".format(task))
@job('default') @shared_task(queue='default')
def fetch_parser(): def fetch_parser():
task = "Fetch Parser" task = "Fetch Parser"
logger.info("Task triggered: {}".format(task)) logger.info("Task triggered: {}".format(task))
FetchParser().run() FetchParser().run()
logger.info("Task completed: {}".format(task)) logger.info("Task completed: {}".format(task))
@job('default') @shared_task(queue='default')
def fetch_search(): def fetch_search():
task = "Fetch Search" task = "Fetch Search"
logger.info("Task triggered: {}".format(task)) logger.info("Task triggered: {}".format(task))
FetchSearcher().run() FetchSearcher().run()
logger.info("Task completed: {}".format(task)) logger.info("Task completed: {}".format(task))
@job('default') @shared_task(queue='low')
def fetch_selenium_search():
task = "Fetch Selenium search"
logger.info("Task triggered: {}".format(task))
FetchSeleniumSourceSearch().run()
logger.info("Task completed: {}".format(task))
@shared_task(queue='low')
def fetch_missing_kids(number_pages=5): def fetch_missing_kids(number_pages=5):
task = "Fetch MissingKids" task = "Fetch MissingKids"
logger.info("Task triggered: {}".format(task)) logger.info("Task triggered: {}".format(task))
FetchMissingKids().run(number_pages) FetchMissingKids().run(number_pages)
logger.info("Task completed: {}".format(task)) logger.info("Task completed: {}".format(task))
@job('default') @shared_task(queue='default')
def fetch_missing_kids_all(number_pages=-1):
task = "Fetch MissingKids"
logger.info("Task triggered: {}".format(task))
FetchMissingKids().run(number_pages)
logger.info("Task completed: {}".format(task))
@job('default')
def process_raw_urls(batch_size=100): def process_raw_urls(batch_size=100):
task = "Process raw URLs" task = "Process raw URLs"
logger.info("Task triggered: {}".format(task)) logger.info("Task triggered: {}".format(task))
DB_Handler().process_raw_urls(batch_size=batch_size) DB_Handler().process_raw_urls(batch_size=batch_size)
logger.info("Task completed: {}".format(task)) logger.info("Task completed: {}".format(task))
@job('default') @shared_task(queue='default')
def process_error_urls(batch_size=50): def process_error_urls(batch_size=50):
task = "Process error URLs" task = "Process error URLs"
logger.info("Task triggered: {}".format(task)) logger.info("Task triggered: {}".format(task))
DB_Handler().process_error_urls(batch_size=batch_size) DB_Handler().process_error_urls(batch_size=batch_size)
logger.info("Task completed: {}".format(task)) logger.info("Task completed: {}".format(task))
@job('default') @shared_task(queue='low')
def process_missing_kids_urls(batch_size=50): def process_missing_kids_urls(batch_size=None, process_status_only=None):
task = "Process Missing Kids URLs" task = "Process Missing Kids URLs - batch_size={} process_status_only={}".format(batch_size, process_status_only)
logger.info("Task triggered: {}".format(task)) logger.info("Task triggered: {}".format(task))
DB_Handler().process_missing_kids_urls(batch_size=batch_size) DB_Handler().process_missing_kids_urls(batch_size=batch_size, process_status_only=process_status_only)
logger.info("Task completed: {}".format(task)) logger.info("Task completed: {}".format(task))
@job('default') @shared_task(queue='default')
def process_missing_kids_urls_all(batch_size=None): def clean_old_url_content(older_than_days=14):
task = "Process Missing Kids URLs ALL"
logger.info("Task triggered: {}".format(task))
DB_Handler().process_missing_kids_urls(batch_size=batch_size)
logger.info("Task completed: {}".format(task))
@job('default')
def clean_old_url_content(older_than_days=60):
task = "Clean old URL content" task = "Clean old URL content"
logger.info("Task triggered: {}".format(task)) logger.info("Task triggered: {}".format(task))
DB_Handler().clean_old_url_content(older_than_days=older_than_days) DB_Handler().clean_old_url_content(older_than_days=older_than_days)
logger.info("Task completed: {}".format(task)) logger.info("Task completed: {}".format(task))
@shared_task(queue='default')
@job('default') def notify_status():
def background_task(process_type: str): task = "Notify status"
logger.info("Task triggered: {}".format(process_type)) logger.info("Task triggered: {}".format(task))
notify_telegram()
try: logger.info("Task completed: {}".format(task))
if (process_type == "fetch_feeds"):
FetchFeeds().run()
elif (process_type == "fetch_parser"):
FetchParser().run()
elif (process_type == "fetch_search"):
FetchSearcher().run()
elif (process_type == "fetch_missingkids_all"):
FetchMissingKids().run(number_pages=-1)
elif ("fetch_missingkids" in process_type):
# number_pages encoded in URL
try:
number_pages = int(process_type.split("_")[-1])
except Exception as e:
number_pages = -1
FetchMissingKids().run(number_pages=number_pages)
elif ("process_" in process_type):
# Batch size encoded in URL
try:
batch_size = int(process_type.split("_")[-1])
except Exception as e:
batch_size = None
# Task type
if ("process_raw_urls" in process_type):
DB_Handler().process_raw_urls(batch_size=batch_size)
elif ("process_error_urls" in process_type):
DB_Handler().process_error_urls(batch_size=batch_size)
elif ("process_missing_kids_urls" in process_type):
DB_Handler().process_missing_kids_urls(batch_size=batch_size)
elif ("clean_old_url_content" in process_type ):
# Older than X days encoded in URL
try:
older_than_days = float(process_type.split("_")[-1])
except Exception as e:
older_than_days = None
DB_Handler().clean_old_url_content(older_than_days=older_than_days)
elif ("publish" in process_type):
# Extract URL ID
url_id = process_type.split("_")[-1]
# Publish
Publisher().publish(url_id)
else:
logger.info("Task unknown!: {}".format(process_type))
logger.info("Task completed: {}".format(process_type))
except Exception as e:
logger.error(e)
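With the switch to @shared_task, the *_all helper tasks were dropped in favour of keyword arguments. A hedged sketch of enqueuing the equivalent work from a Django shell (assumes the usual Celery app wiring for this project):

from fetcher.tasks import fetch_missing_kids, process_missing_kids_urls

# Full crawl (was fetch_missing_kids_all)
fetch_missing_kids.delay(number_pages=-1)

# Re-process only a given status (was process_missing_kids_urls_all)
process_missing_kids_urls.delay(process_status_only="unknown")

# Route explicitly to the low-priority queue if needed
process_missing_kids_urls.apply_async(kwargs={"batch_size": 50}, queue="low")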

View File

@@ -369,7 +369,7 @@ input[type="checkbox"] {
<tbody> <tbody>
{% for url in urls %} {% for url in urls %}
<tr> <tr>
<td><a href="./{{ url.id }}" class="btn btn-primary btn-sm" target="_blank">{{ url.id }}</a></td> <td><a href="./{{ url.id }}" class="btn btn-primary btn-sm" target="_blank">{{ url.id }}</a> <a href="/task/publish_{{ url.id }}" target="_blank">[✍️]</a> </td>
<td><a href="{{ url.url }}" target="_blank">{{ url.url }}</a></td> <td><a href="{{ url.url }}" target="_blank">{{ url.url }}</a></td>
<td> <td>
{% if url.status == 'raw' %} {% if url.status == 'raw' %}

View File

@@ -278,8 +278,7 @@
<!-- Input field with a default value --> <!-- Input field with a default value -->
<label for="custom-input-{{ url_item.id }}">Prompt:</label> <label for="custom-input-{{ url_item.id }}">Prompt:</label>
<textarea id="custom-input-{{ url_item.id }}" class="form-control mb-2" rows="5">{{ prompt }} <textarea id="custom-input-{{ url_item.id }}" class="form-control mb-2" rows="5">{{ prompt }}</textarea>
{{ url_item.url }}</textarea>
<div class="d-flex align-items-center"> <div class="d-flex align-items-center">
<!-- Fetch details button --> <!-- Fetch details button -->

View File

@@ -7,8 +7,6 @@ urlpatterns = [
path('logs/database', views.log_db, name='log_db'), path('logs/database', views.log_db, name='log_db'),
path('logs/<str:log_type>', views.logs, name='logs'), path('logs/<str:log_type>', views.logs, name='logs'),
# #
path('task/<str:task>', views.trigger_task, name='trigger_task'),
#
path('urls/charts/', views.charts, name='charts'), path('urls/charts/', views.charts, name='charts'),
path('urls-by-fetch-date/', views.urls_by_fetch_date, name='urls_by_fetch_date'), path('urls-by-fetch-date/', views.urls_by_fetch_date, name='urls_by_fetch_date'),
path('urls-per-status/', views.urls_per_status, name='urls_per_status'), path('urls-per-status/', views.urls_per_status, name='urls_per_status'),

View File

@@ -1,8 +1,8 @@
from .views_base import link_list, logs, log_db, trigger_task from .views_base import link_list, logs, log_db #, trigger_task,
from django.core.paginator import Paginator from django.core.paginator import Paginator
from django.shortcuts import render, get_object_or_404 from django.shortcuts import render, get_object_or_404
from django.http import StreamingHttpResponse, JsonResponse, HttpResponse from django.http import StreamingHttpResponse, JsonResponse
from django.db.models import Q, Count from django.db.models import Q, Count
from django.utils import timezone from django.utils import timezone
from django.utils.timezone import now, timedelta from django.utils.timezone import now, timedelta
@@ -14,16 +14,6 @@ import json
#################################################################################################### ####################################################################################################
def llm(request): def llm(request):
def stream_response(model, text):
msg_content = {
"role": "user",
"content": text,
}
response = OllamaClient().client.chat(model=model, messages=[msg_content], stream=True)
for chunk in response:
yield chunk["message"]["content"] # Stream each chunk of text
if request.method == 'POST': if request.method == 'POST':
try: try:
body_data = json.loads(request.body) body_data = json.loads(request.body)
@@ -33,7 +23,7 @@ def llm(request):
if message is None: if message is None:
return JsonResponse({'error': 'No message found in request'}, status=400) return JsonResponse({'error': 'No message found in request'}, status=400)
return StreamingHttpResponse(stream_response(model, message), content_type="text/plain") return StreamingHttpResponse(OllamaClient().generate_stream(model, message), content_type="text/plain")
except json.JSONDecodeError: except json.JSONDecodeError:
return JsonResponse({'error': 'Invalid JSON'}, status=400) return JsonResponse({'error': 'Invalid JSON'}, status=400)
@@ -55,13 +45,18 @@ def url_detail_view(request, id):
url_content = {} url_content = {}
ollama = OllamaClient() ollama = OllamaClient()
try:
# prompt_content = "{}\n{}\n{}".format(url_content.title, url_content.description, url_content.content)
prompt_content = "{}".format(url_content.content)
except Exception as e:
prompt_content = ""
context = { context = {
'url_item': url_item, 'url_item': url_item,
'sources': url_sources, 'sources': url_sources,
'searches': url_searches, 'searches': url_searches,
'models': ollama.get_models(), 'models': ollama.get_models(),
'prompt': ollama.get_prompt(), 'prompt': ollama.get_prompt(prompt_content),
'url_content': url_content, 'url_content': url_content,
'url_canonical': url_canonical, 'url_canonical': url_canonical,
} }
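The llm view now streams directly from OllamaClient.generate_stream. A hedged sketch of consuming that endpoint from Python (the /llm path and JSON field names are assumptions based on the view code; the host is an example):

import requests

resp = requests.post(
    "http://localhost:8000/llm",
    json={"model": "llama3.2:3b", "message": "Summarize: ..."},
    stream=True,
)
for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
    print(chunk, end="", flush=True)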

View File

@@ -1,26 +1,29 @@
import os import os
from .tasks import background_task
from django.http import JsonResponse, HttpResponse from django.http import JsonResponse, HttpResponse
from django.db import connection from django.db import connection
import os
#################################################################################################### ####################################################################################################
"""
### from .tasks import background_task
def trigger_task(request, task): def trigger_task(request, task):
# Enqueue function in "default" queue # Enqueue function in "default" queue
background_task.delay(task) background_task.delay(task)
return JsonResponse({"message": "Task has been enqueued!", "task": task}) return JsonResponse({"message": "Task has been enqueued!", "task": task})
"""
####################################################################################################
def link_list(request): def link_list(request):
# Base URL path # Base URL path
app_url = request.build_absolute_uri() app_url = request.build_absolute_uri()
# Tasks # Tasks
links_fetch = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all"] links_fetch = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all", "fetch_selenium_search"]
links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all", "clean_old_url_content_60"] links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_valid_all", "process_missing_kids_urls_invalid_all", "process_missing_kids_urls_unknown_all", "process_missing_kids_urls_all", "clean_old_url_content_60"]
# List of links # List of links
list_links = \ list_links = \
[ os.path.join(app_url, "admin"), os.path.join(app_url, "urls") ] + \ [ os.path.join(app_url, "admin"), os.path.join(app_url, "urls") ] + \
[ os.path.join(app_url, "logs", log_type) for log_type in ["database", "debug", "info", "warning"] ] + \ [ os.path.join(app_url, "logs", log_type) for log_type in ["database", "debug", "info", "warning", "server", "beat", "worker_default", "worker_low"] ] #+ \
[ os.path.join(app_url, "task", l) for l in links_fetch + links_process ] #[ os.path.join(app_url, "task", l) for l in links_fetch + links_process ]
# Links tuple # Links tuple
links = [(l, l) for l in list_links] links = [(l, l) for l in list_links]
@@ -32,6 +35,7 @@ def link_list(request):
return HttpResponse(html) return HttpResponse(html)
#################################################################################################### ####################################################################################################
def logs(request, log_type): def logs(request, log_type):
# Capture output: python manage.py rqstats # Capture output: python manage.py rqstats
@@ -68,4 +72,4 @@ def log_db(request):
""").fetchall() """).fetchall()
return HttpResponse( "\n".join([str(e) for e in r]) ) return HttpResponse( "\n".join([str(e) for e in r]) )
#################################################################################################### ####################################################################################################

View File

@@ -22,13 +22,15 @@
}, },
"REGEX_PATTERN_STATUS_PRIORITY": [ "REGEX_PATTERN_STATUS_PRIORITY": [
[".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50], [".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50],
["https:\\/\\/x.com\\/.*", "invalid", 50],
[".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75], [".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75],
[".*foxnews\\.com\\/(video|category)\\/.*", "invalid", 75], [".*foxnews\\.com\\/(video|category|person|books|html-sitemap)\\/.*", "invalid", 75],
[".*radio.foxnews\\.com\\/.*", "invalid", 75], [".*radio\\.foxnews\\.com\\/.*", "invalid", 75],
[".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75], [".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75],
[".*zerohedge\\.com\\/(user)\\/.*", "invalid", 75], [".*zerohedge\\.com\\/(user|contributors)\\/.*", "invalid", 75],
[".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50], [".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50],
[".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50], [".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50],
[".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50] [".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50],
[".*foxnews\\.com\\/[^\\/]+\\/?$", "invalid", 25]
] ]
} }
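For context, a hedged sketch of how the [pattern, status, priority] triplets above could be applied, assuming the highest-priority matching pattern wins (the repo's actual matcher is not part of this diff):

import re

RULES = [
    [r".*(youtube|tiktok|twitter|reddit)\.com\/.*", "invalid", 50],
    [r".*foxnews\.com\/(video|category|person|books|html-sitemap)\/.*", "invalid", 75],
    [r".*foxnews\.com\/(lifestyle|opinion|sports|world)\/.*", "valid", 50],
    [r".*foxnews\.com\/[^\/]+\/?$", "invalid", 25],
]

def classify(url):
    matches = [(priority, status) for pattern, status, priority in RULES if re.match(pattern, url)]
    return max(matches)[1] if matches else None

print(classify("https://www.foxnews.com/video/6361234567112"))  # invalid (priority 75)
print(classify("https://www.foxnews.com/opinion/some-op-ed"))   # valid  (priority 50)
print(classify("https://www.foxnews.com/politics"))             # invalid (priority 25, bare section page)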

View File

@@ -1,65 +0,0 @@
{
"SEARCH": {
"rss_feed": [
],
"url_host": [
"johnpilger.com",
"lapenseeecologique.com",
"partage-le.com",
"reflets.info",
"rezo.net",
"consortiumnews.com",
"disclose.ngo/fr",
"energieetenvironnement.com",
"global-climat.com",
"slashdot.org",
"lesamisdebartleby.wordpress.com",
"lundi.am",
"lvsl.fr",
"moderndiplomacy.eu",
"mrmondialisation.org",
"ourfiniteworld.com",
"southfront.org",
"simplicius76.substack.com",
"smoothiex12.blogspot.com",
"theintercept.com",
"wikileaks.org",
"contretemps.eu",
"indianpunchline.com",
"investigaction.net/fr",
"notechmagazine.com",
"terrestres.org",
"truthdig.com",
"tass.com",
"bastamag.net",
"counterpunch.org",
"energy-daily.com",
"fakirpresse.info",
"geopoliticalmonitor.com",
"huffingtonpost.fr",
"legrandsoir.info",
"les-crises.fr",
"liberation.fr",
"maitre-eolas.fr",
"marianne.net",
"mediapart.fr",
"metaefficient.com",
"monde-diplomatique.fr",
"paulcraigroberts.org",
"politis.fr",
"reporterre.net",
"rue89.com",
"theguardian.com/international",
"treehugger.com",
"unz.com",
"voltairenet.org",
"wsws.org"
],
"keyword_search": [
"society collapse"
]
},
"REGEX_PATTERN_STATUS_PRIORITY": [
[".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50]
]
}

View File

@@ -1,34 +0,0 @@
{
"SEARCH": {
"rss_feed": [
"https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC",
"https://feeds.feedburner.com/breitbart",
"https://feeds.feedburner.com/zerohedge/feed",
"https://moxie.foxnews.com/google-publisher/latest.xml",
"https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=15837362",
"https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100727362"
],
"url_host": [
"missingkids.org/poster",
"missingkids.org/new-poster",
"breitbart.com",
"zerohedge.com",
"foxnews.com",
"cnbc.com"
],
"keyword_search": [
"child abuse"
]
},
"REGEX_PATTERN_STATUS_PRIORITY": [
[".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50],
[".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75],
[".*foxnews\\.com\\/(video|category)\\/.*", "invalid", 75],
[".*radio.foxnews\\.com\\/.*", "invalid", 75],
[".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75],
[".*zerohedge\\.com\\/(user)\\/.*", "invalid", 75],
[".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50],
[".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50],
[".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50]
]
}

View File

@@ -29,13 +29,15 @@ def wait_connection():
connected = True connected = True
except psycopg.OperationalError as e: except psycopg.OperationalError as e:
print(str(e))
# Connection not ready... # Connection not ready...
# print(".", end="") # print(".", end="")
time.sleep(2) time.sleep(15)
except Exception as e: except Exception as e:
print(str(e))
# Connection not ready... # Connection not ready...
# print("e", end="") # print("e", end="")
time.sleep(2) time.sleep(15)
print("DB connection ready") print("DB connection ready")
@@ -57,7 +59,8 @@ def initialize_tables():
ts_fetch TIMESTAMPTZ NOT NULL DEFAULT NOW(), ts_fetch TIMESTAMPTZ NOT NULL DEFAULT NOW(),
status URL_STATUS NOT NULL DEFAULT 'raw' -- , status URL_STATUS NOT NULL DEFAULT 'raw' -- ,
-- status_wendy WENDY_STATUS DEFAULT NULL, -- status_wendy WENDY_STATUS DEFAULT NULL,
-- ts_wendy TIMESTAMPTZ DEFAULT NULL -- ts_wendy TIMESTAMPTZ DEFAULT NULL,
-- child_abuse BOOLEAN DEFAULT NULL,
); );
CREATE INDEX idx_urls_status ON urls(status); CREATE INDEX idx_urls_status ON urls(status);
CREATE INDEX idx_urls_ts_fetch ON urls(ts_fetch); CREATE INDEX idx_urls_ts_fetch ON urls(ts_fetch);
@@ -212,6 +215,10 @@ def initialize_data():
print(query) print(query)
cur.execute(query) cur.execute(query)
# Connect to an existing database
with psycopg.connect(connection_info) as conn:
# Open a cursor to perform database operations
with conn.cursor() as cur:
# Feeds, URL host, keyword search # Feeds, URL host, keyword search
for search_type, list_searches in data_json.get("SEARCH", {}).items(): for search_type, list_searches in data_json.get("SEARCH", {}).items():
for search in list_searches: for search in list_searches:

View File

@@ -4,9 +4,12 @@ if [ "${INITIALIZE_DB}" = false ]; then
echo "Initialization not required" echo "Initialization not required"
else else
echo "Initializating database" echo "Initializating database"
python init_db.py --initialize_tables --initialize_data # python init_db.py --initialize_tables --initialize_data
python manage.py makemigrations fetcher; python manage.py migrate --fake-initial python manage.py makemigrations fetcher; python manage.py migrate --fake-initial
# python manage.py migrate django_celery_beat
python manage.py createsuperuser --noinput python manage.py createsuperuser --noinput
python manage.py collectstatic --no-input python manage.py collectstatic --no-input
python manage.py import --filename scheduled_tasks.json # python manage.py loaddata scheduled_tasks.json
#
# python manage.py inspectdb # Debugging model
fi fi


View File

@@ -1,5 +1,5 @@
django==5.1 django==5.1
django-tasks-scheduler==3.0.1 django-celery-beat
django-redis django-redis
psycopg[binary] psycopg[binary]
gunicorn gunicorn
@@ -13,8 +13,9 @@ lxml[html_clean]
googlenewsdecoder googlenewsdecoder
gnews gnews
GoogleNews GoogleNews
duckduckgo_search ddgs
git+https://github.com/tasos-py/Search-Engines-Scraper.git git+https://github.com/tasos-py/Search-Engines-Scraper.git
furl
langdetect langdetect
ollama ollama
PyJWT PyJWT

View File

@@ -1,8 +0,0 @@
#!/bin/bash
if [ "${DJANGO_DEBUG}" = true ] | [ "${DJANGO_DEBUG}" == "True" ]; then
echo "Running in DEBUG mode"
gunicorn core.wsgi:application --reload --log-level debug --bind 0.0.0.0:8000 --timeout 600 & python manage.py rqworker high default low
else
gunicorn core.wsgi:application --bind 0.0.0.0:8000 --timeout 600 & python manage.py rqworker high default low
fi

View File

@@ -1,212 +1,507 @@
[ [
{ {
"model": "RepeatableTaskType", "model": "django_celery_beat.periodictask",
"name": "Process error URLs", "pk": 1,
"callable": "fetcher.tasks.process_error_urls", "fields": {
"callable_args": [], "name": "celery.backend_cleanup",
"callable_kwargs": [], "task": "celery.backend_cleanup",
"enabled": false, "interval": null,
"queue": "low", "crontab": 1,
"repeat": null, "solar": null,
"at_front": false, "clocked": null,
"timeout": 1800, "args": "[]",
"result_ttl": 86400, "kwargs": "{}",
"cron_string": null, "queue": null,
"scheduled_time": "2025-01-01T00:00:00+00:00", "exchange": null,
"interval": 4, "routing_key": null,
"interval_unit": "hours", "headers": "{}",
"successful_runs": 0, "priority": null,
"failed_runs": 0, "expires": null,
"last_successful_run": null, "expire_seconds": 43200,
"last_failed_run": null "one_off": false,
"start_time": null,
"enabled": true,
"last_run_at": null,
"total_run_count": 0,
"date_changed": "2025-07-17T16:07:34.609Z",
"description": ""
}
}, },
{ {
"model": "RepeatableTaskType", "model": "django_celery_beat.periodictask",
"name": "Process raw URLs", "pk": 2,
"callable": "fetcher.tasks.process_raw_urls", "fields": {
"callable_args": [], "name": "Process error URLs",
"callable_kwargs": [], "task": "fetcher.tasks.process_error_urls",
"enabled": false, "interval": 1,
"queue": "low", "crontab": null,
"repeat": null, "solar": null,
"at_front": false, "clocked": null,
"timeout": 1800, "args": "[]",
"result_ttl": 86400, "kwargs": "{}",
"cron_string": null, "queue": null,
"scheduled_time": "2025-01-01T00:00:00+00:00", "exchange": null,
"interval": 10, "routing_key": null,
"interval_unit": "minutes", "headers": "{}",
"successful_runs": 0, "priority": null,
"failed_runs": 0, "expires": null,
"last_successful_run": null, "expire_seconds": null,
"last_failed_run": null "one_off": false,
"start_time": null,
"enabled": true,
"last_run_at": null,
"total_run_count": 0,
"date_changed": "2025-07-17T16:10:08.861Z",
"description": ""
}
}, },
{ {
"model": "RepeatableTaskType", "model": "django_celery_beat.periodictask",
"name": "Process MissingKids URLs", "pk": 3,
"callable": "fetcher.tasks.process_missing_kids_urls", "fields": {
"callable_args": [], "name": "Process raw URLs",
"callable_kwargs": [], "task": "fetcher.tasks.process_raw_urls",
"enabled": false, "interval": 2,
"queue": "default", "crontab": null,
"repeat": null, "solar": null,
"at_front": false, "clocked": null,
"timeout": 1800, "args": "[]",
"result_ttl": 86400, "kwargs": "{}",
"cron_string": null, "queue": null,
"scheduled_time": "2025-01-01T00:00:00+00:00", "exchange": null,
"interval": 4, "routing_key": null,
"interval_unit": "hours", "headers": "{}",
"successful_runs": 0, "priority": null,
"failed_runs": 0, "expires": null,
"last_successful_run": null, "expire_seconds": null,
"last_failed_run": null "one_off": false,
"start_time": null,
"enabled": true,
"last_run_at": "2025-07-17T16:20:36.751Z",
"total_run_count": 1,
"date_changed": "2025-07-17T16:21:17.099Z",
"description": ""
}
}, },
{ {
"model": "RepeatableTaskType", "model": "django_celery_beat.periodictask",
"name": "Process MissingKids URLs ALL", "pk": 4,
"callable": "fetcher.tasks.process_missing_kids_urls_all", "fields": {
"callable_args": [], "name": "Process MissingKids URLs - batch=50",
"callable_kwargs": [], "task": "fetcher.tasks.process_missing_kids_urls",
"enabled": false, "interval": 3,
"queue": "default", "crontab": null,
"repeat": null, "solar": null,
"at_front": false, "clocked": null,
"timeout": 7200, "args": "[]",
"result_ttl": 86400, "kwargs": "{\"batch_size\": 50}",
"cron_string": null, "queue": null,
"scheduled_time": "2025-01-01T00:00:00+00:00", "exchange": null,
"interval": 1, "routing_key": null,
"interval_unit": "weeks", "headers": "{}",
"successful_runs": 0, "priority": null,
"failed_runs": 0, "expires": null,
"last_successful_run": null, "expire_seconds": null,
"last_failed_run": null "one_off": false,
"start_time": null,
"enabled": true,
"last_run_at": null,
"total_run_count": 0,
"date_changed": "2025-07-17T16:12:44.533Z",
"description": ""
}
}, },
{ {
"model": "RepeatableTaskType", "model": "django_celery_beat.periodictask",
"name": "Fetch Feeds", "pk": 5,
"callable": "fetcher.tasks.fetch_feeds", "fields": {
"callable_args": [], "name": "Process MissingKids URLs ALL - unknown",
"callable_kwargs": [], "task": "fetcher.tasks.process_missing_kids_urls",
"enabled": false, "interval": 4,
"queue": "default", "crontab": null,
"repeat": null, "solar": null,
"at_front": false, "clocked": null,
"timeout": 1800, "args": "[]",
"result_ttl": 86400, "kwargs": "{\"process_status_only\": \"unknown\"}",
"cron_string": null, "queue": null,
"scheduled_time": "2025-01-01T00:00:00+00:00", "exchange": null,
"interval": 10, "routing_key": null,
"interval_unit": "minutes", "headers": "{}",
"successful_runs": 0, "priority": null,
"failed_runs": 0, "expires": null,
"last_successful_run": null, "expire_seconds": null,
"last_failed_run": null "one_off": false,
"start_time": null,
"enabled": true,
"last_run_at": null,
"total_run_count": 0,
"date_changed": "2025-07-17T16:16:38.258Z",
"description": ""
}
}, },
{ {
"model": "RepeatableTaskType", "model": "django_celery_beat.periodictask",
"name": "Fetch Parser", "pk": 6,
"callable": "fetcher.tasks.fetch_parser", "fields": {
"callable_args": [], "name": "Process MissingKids URLs ALL - valid",
"callable_kwargs": [], "task": "fetcher.tasks.process_missing_kids_urls",
"enabled": false, "interval": 5,
"queue": "default", "crontab": null,
"repeat": null, "solar": null,
"at_front": false, "clocked": null,
"timeout": 3600, "args": "[]",
"result_ttl": 86400, "kwargs": "{\"process_status_only\": \"valid\"}",
"cron_string": null, "queue": null,
"scheduled_time": "2025-01-01T00:00:00+00:00", "exchange": null,
"interval": 1, "routing_key": null,
"interval_unit": "hours", "headers": "{}",
"successful_runs": 0, "priority": null,
"failed_runs": 0, "expires": null,
"last_successful_run": null, "expire_seconds": null,
"last_failed_run": null "one_off": false,
"start_time": null,
"enabled": false,
"last_run_at": null,
"total_run_count": 0,
"date_changed": "2025-07-17T16:20:19.969Z",
"description": ""
}
}, },
{ {
"model": "RepeatableTaskType", "model": "django_celery_beat.periodictask",
"name": "Fetch Search", "pk": 7,
"callable": "fetcher.tasks.fetch_search", "fields": {
"callable_args": [], "name": "Process MissingKids URLs ALL - invalid",
"callable_kwargs": [], "task": "fetcher.tasks.process_missing_kids_urls",
"enabled": false, "interval": 6,
"queue": "default", "crontab": null,
"repeat": null, "solar": null,
"at_front": false, "clocked": null,
"timeout": 3600, "args": "[]",
"result_ttl": 86400, "kwargs": "{\"process_status_only\": \"invalid\"}",
"cron_string": null, "queue": null,
"scheduled_time": "2025-01-01T00:00:00+00:00", "exchange": null,
"interval": 1, "routing_key": null,
"interval_unit": "hours", "headers": "{}",
"successful_runs": 0, "priority": null,
"failed_runs": 0, "expires": null,
"last_successful_run": null, "expire_seconds": null,
"last_failed_run": null "one_off": false,
"start_time": null,
"enabled": false,
"last_run_at": null,
"total_run_count": 0,
"date_changed": "2025-07-17T16:21:30.809Z",
"description": ""
}
}, },
{ {
"model": "RepeatableTaskType", "model": "django_celery_beat.periodictask",
"name": "Fetch MissingKids", "pk": 8,
"callable": "fetcher.tasks.fetch_missing_kids", "fields": {
"callable_args": [], "name": "Fetch Feeds",
"callable_kwargs": [], "task": "fetcher.tasks.fetch_feeds",
"enabled": false, "interval": 2,
"queue": "default", "crontab": null,
"repeat": null, "solar": null,
"at_front": false, "clocked": null,
"timeout": 1800, "args": "[]",
"result_ttl": 86400, "kwargs": "{}",
"cron_string": null, "queue": null,
"scheduled_time": "2025-01-01T00:00:00+00:00", "exchange": null,
"interval": 4, "routing_key": null,
"interval_unit": "hours", "headers": "{}",
"successful_runs": 0, "priority": null,
"failed_runs": 0, "expires": null,
"last_successful_run": null, "expire_seconds": null,
"last_failed_run": null "one_off": false,
"start_time": null,
"enabled": true,
"last_run_at": null,
"total_run_count": 0,
"date_changed": "2025-07-17T16:22:15.615Z",
"description": ""
}
}, },
{ {
"model": "RepeatableTaskType", "model": "django_celery_beat.periodictask",
"name": "Fetch MissingKids ALL", "pk": 9,
"callable": "fetcher.tasks.fetch_missing_kids_all", "fields": {
"callable_args": [], "name": "Fetch Parser",
"callable_kwargs": [], "task": "fetcher.tasks.fetch_parser",
"enabled": false, "interval": 7,
"queue": "default", "crontab": null,
"repeat": null, "solar": null,
"at_front": false, "clocked": null,
"timeout": 7200, "args": "[]",
"result_ttl": 86400, "kwargs": "{}",
"cron_string": null, "queue": null,
"scheduled_time": "2025-01-01T00:00:00+00:00", "exchange": null,
"interval": 1, "routing_key": null,
"interval_unit": "weeks", "headers": "{}",
"successful_runs": 0, "priority": null,
"failed_runs": 0, "expires": null,
"last_successful_run": null, "expire_seconds": null,
"last_failed_run": null "one_off": false,
"start_time": null,
"enabled": true,
"last_run_at": null,
"total_run_count": 0,
"date_changed": "2025-07-17T16:22:40.215Z",
"description": ""
}
}, },
{ {
"model": "RepeatableTaskType", "model": "django_celery_beat.periodictask",
"name": "Clean old URL content", "pk": 10,
"callable": "fetcher.tasks.clean_old_url_content", "fields": {
"callable_args": [], "name": "Fetch Search",
"callable_kwargs": [], "task": "fetcher.tasks.fetch_search",
"enabled": false, "interval": 8,
"queue": "default", "crontab": null,
"repeat": null, "solar": null,
"at_front": false, "clocked": null,
"timeout": null, "args": "[]",
"result_ttl": 86400, "kwargs": "{}",
"cron_string": null, "queue": null,
"scheduled_time": "2025-01-01T00:00:00+00:00", "exchange": null,
"interval": 1, "routing_key": null,
"interval_unit": "weeks", "headers": "{}",
"successful_runs": 0, "priority": null,
"failed_runs": 0, "expires": null,
"last_successful_run": null, "expire_seconds": null,
"last_failed_run": null "one_off": false,
"start_time": null,
"enabled": true,
"last_run_at": null,
"total_run_count": 0,
"date_changed": "2025-07-17T16:23:00.329Z",
"description": ""
}
},
{
"model": "django_celery_beat.periodictask",
"pk": 11,
"fields": {
"name": "Fetch Selenium Search",
"task": "fetcher.tasks.fetch_selenium_search",
"interval": 3,
"crontab": null,
"solar": null,
"clocked": null,
"args": "[]",
"kwargs": "{}",
"queue": null,
"exchange": null,
"routing_key": null,
"headers": "{}",
"priority": null,
"expires": null,
"expire_seconds": null,
"one_off": false,
"start_time": null,
"enabled": true,
"last_run_at": null,
"total_run_count": 0,
"date_changed": "2025-07-17T16:24:08.315Z",
"description": ""
}
},
{
"model": "django_celery_beat.periodictask",
"pk": 12,
"fields": {
"name": "Fetch MissingKids - pages=5",
"task": "fetcher.tasks.fetch_missing_kids",
"interval": 4,
"crontab": null,
"solar": null,
"clocked": null,
"args": "[]",
"kwargs": "{\"number_pages\": 5}",
"queue": null,
"exchange": null,
"routing_key": null,
"headers": "{}",
"priority": null,
"expires": null,
"expire_seconds": null,
"one_off": false,
"start_time": null,
"enabled": true,
"last_run_at": null,
"total_run_count": 0,
"date_changed": "2025-07-17T16:25:02.494Z",
"description": ""
}
},
{
"model": "django_celery_beat.periodictask",
"pk": 13,
"fields": {
"name": "Fetch MissingKids - ALL",
"task": "fetcher.tasks.fetch_missing_kids",
"interval": 9,
"crontab": null,
"solar": null,
"clocked": null,
"args": "[]",
"kwargs": "{\"number_pages\": -1}",
"queue": null,
"exchange": null,
"routing_key": null,
"headers": "{}",
"priority": null,
"expires": null,
"expire_seconds": null,
"one_off": false,
"start_time": null,
"enabled": true,
"last_run_at": null,
"total_run_count": 0,
"date_changed": "2025-07-17T16:25:50.597Z",
"description": ""
}
},
{
"model": "django_celery_beat.periodictask",
"pk": 14,
"fields": {
"name": "Clean old URL content",
"task": "fetcher.tasks.clean_old_url_content",
"interval": 9,
"crontab": null,
"solar": null,
"clocked": null,
"args": "[]",
"kwargs": "{}",
"queue": null,
"exchange": null,
"routing_key": null,
"headers": "{}",
"priority": null,
"expires": null,
"expire_seconds": null,
"one_off": false,
"start_time": null,
"enabled": true,
"last_run_at": null,
"total_run_count": 0,
"date_changed": "2025-07-17T16:26:16.272Z",
"description": ""
}
},
{
"model": "django_celery_beat.periodictask",
"pk": 4,
"fields": {
"name": "Notify status",
"task": "fetcher.tasks.notify_status",
"interval": 3,
"crontab": null,
"solar": null,
"clocked": null,
"args": "[]",
"kwargs": "{}",
"queue": null,
"exchange": null,
"routing_key": null,
"headers": "{}",
"priority": null,
"expires": null,
"expire_seconds": null,
"one_off": false,
"start_time": null,
"enabled": true,
"last_run_at": null,
"total_run_count": 0,
"date_changed": "2025-07-17T16:12:44.533Z",
"description": ""
}
},
{
"model": "django_celery_beat.intervalschedule",
"pk": 1,
"fields": {
"every": 6,
"period": "hours"
}
},
{
"model": "django_celery_beat.intervalschedule",
"pk": 2,
"fields": {
"every": 10,
"period": "minutes"
}
},
{
"model": "django_celery_beat.intervalschedule",
"pk": 3,
"fields": {
"every": 1,
"period": "days"
}
},
{
"model": "django_celery_beat.intervalschedule",
"pk": 4,
"fields": {
"every": 12,
"period": "hours"
}
},
{
"model": "django_celery_beat.intervalschedule",
"pk": 5,
"fields": {
"every": 2,
"period": "days"
}
},
{
"model": "django_celery_beat.intervalschedule",
"pk": 6,
"fields": {
"every": 28,
"period": "days"
}
},
{
"model": "django_celery_beat.intervalschedule",
"pk": 7,
"fields": {
"every": 8,
"period": "hours"
}
},
{
"model": "django_celery_beat.intervalschedule",
"pk": 8,
"fields": {
"every": 4,
"period": "hours"
}
},
{
"model": "django_celery_beat.intervalschedule",
"pk": 9,
"fields": {
"every": 7,
"period": "days"
}
},
{
"model": "django_celery_beat.crontabschedule",
"pk": 1,
"fields": {
"minute": "0",
"hour": "4",
"day_of_month": "*",
"month_of_year": "*",
"day_of_week": "*",
"timezone": "UTC"
}
} }
] ]
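The numbers in each task's "interval" field are primary keys into the intervalschedule entries at the end of the fixture. A hedged sketch of loading and inspecting it (assumes the file is reachable as a Django fixture and that django_celery_beat is migrated, as the commented-out entrypoint lines suggest):

# Typically loaded with:
#   python manage.py migrate django_celery_beat
#   python manage.py loaddata scheduled_tasks.json
from django_celery_beat.models import PeriodicTask

task = PeriodicTask.objects.get(name="Process raw URLs")
print(task.task)      # fetcher.tasks.process_raw_urls
print(task.interval)  # resolves intervalschedule pk=2, i.e. every 10 minutes
print(task.kwargs)    # "{}"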

app_urls/supervisord.conf Normal file
View File

@@ -0,0 +1,15 @@
[supervisord]
nodaemon=true
[program:server]
command=gunicorn core.wsgi:application --bind 0.0.0.0:8000
directory=/opt/app
autostart=true
autorestart=true
; Unified log file
stdout_logfile=/opt/logs/server.log
stderr_logfile=/opt/logs/server.log
redirect_stderr=true
; Rotate when file reaches max size
stdout_logfile_maxbytes=20MB
stdout_logfile_backups=1

docker-compose-base.yml Normal file
View File

@@ -0,0 +1,99 @@
services:
fetcher_app_selenium:
image: fetcher_app_selenium
build:
context: ./app_selenium
args:
- ARCH=${ARCH} # arm64, amd64
container_name: fetcher_app_selenium
restart: unless-stopped
shm_size: 512mb
init: true # Reap zombie processes
environment:
- SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE}
- PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
ports:
- 80
dns:
- 1.1.1.1
- 1.0.0.1
fetcher_app_urls:
image: fetcher_app_urls
build:
context: ./app_urls
container_name: fetcher_app_urls
restart: unless-stopped
environment:
# Initialization
- INITIALIZE_DB=${INITIALIZE_DB} # Related to DB persistence
- DJANGO_SUPERUSER_USERNAME=${DJANGO_SUPERUSER_USERNAME}
- DJANGO_SUPERUSER_PASSWORD=${DJANGO_SUPERUSER_PASSWORD}
- DJANGO_SUPERUSER_EMAIL=${DJANGO_SUPERUSER_EMAIL}
# Django
- DJANGO_ALLOWED_HOSTS=${DJANGO_ALLOWED_HOSTS} # host1,host2
- DJANGO_ALLOWED_ORIGINS=${DJANGO_ALLOWED_ORIGINS} # Reverse proxy
- DJANGO_SECRET_KEY=${DJANGO_SECRET_KEY}
- DJANGO_DEBUG=${DJANGO_DEBUG}
- PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
# Database
- DB_NAME=${DB_NAME}
- DB_USER=${DB_USER}
- DB_PASSWORD=${DB_PASSWORD}
- DB_HOST=${DB_HOST}
- DB_PORT=${DB_PORT}
- REDIS_HOST=${REDIS_HOST}
- REDIS_PORT=${REDIS_PORT}
# Job timeout: 30 min
- JOB_DEFAULT_TIMEOUT=${JOB_DEFAULT_TIMEOUT}
# Fetcher
- FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP}
- FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP}
- FETCHER_BETWEEN_SEARCHES_SLEEP=${FETCHER_BETWEEN_SEARCHES_SLEEP} # Sleep time between each search
- FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP} # Sleep time between requests to same URL host
- FETCHER_LANGUAGE_DETECTION_MIN_CHAR=${FETCHER_LANGUAGE_DETECTION_MIN_CHAR} # Minimum number of characters to run language detection
- FETCHER_INSERT_URL_CACHE_TIME=${FETCHER_INSERT_URL_CACHE_TIME} # Cache time: Insert raw URL
- FETCHER_ERROR_URL_CACHE_TIME=${FETCHER_ERROR_URL_CACHE_TIME} # Cache time: Error on processing URL
# Selenium
- SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT}
- ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA}
# Ghost
- GHOST_ADMIN_API_KEY=${GHOST_ADMIN_API_KEY}
- GHOST_ADMIN_API_URL=${GHOST_ADMIN_API_URL}
- PEXELS_API_KEY=${PEXELS_API_KEY}
- OLLAMA_MODEL_DEFAULT=${OLLAMA_MODEL_DEFAULT}
# Telegram
- TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN}
- TELEGRAM_CHAT_ID=${TELEGRAM_CHAT_ID}
########################
ports:
- 8000
depends_on:
- fetcher_db
- fetcher_redis
# - fetcher_app_selenium
dns:
- 1.1.1.1
- 1.0.0.1
fetcher_redis:
image: redis:alpine
container_name: fetcher_redis
restart: unless-stopped
ports:
- 6379
fetcher_db:
container_name: fetcher_db
restart: unless-stopped
fetcher_flower:
image: mher/flower
container_name: fetcher_flower
ports:
- 5555
environment:
- CELERY_BROKER_URL=redis://fetcher_redis:6379/0
depends_on:
- fetcher_redis
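All runtime configuration for fetcher_app_urls is injected through environment variables in the base compose file. A minimal sketch of how the application side might read them, kept purely illustrative: the variable names come from the compose file above, while the fallback defaults are assumptions, not values taken from the repo:

    import os

    DB_HOST = os.environ.get("DB_HOST", "fetcher_db")   # assumed default
    DB_PORT = int(os.environ.get("DB_PORT", "5432"))     # assumed default
    REDIS_URL = "redis://{}:{}/0".format(
        os.environ.get("REDIS_HOST", "fetcher_redis"),
        os.environ.get("REDIS_PORT", "6379"),
    )
    # "Job timeout: 30 min" per the comment in the compose file; 1800 s is an assumed default
    JOB_DEFAULT_TIMEOUT = int(os.environ.get("JOB_DEFAULT_TIMEOUT", "1800"))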

View File

@@ -1,24 +1,9 @@
version: '3.9'
services: services:
fetcher_app_selenium: fetcher_app_selenium:
image: fetcher_app_selenium extends:
build: file: docker-compose-base.yml
context: ./app_selenium service: fetcher_app_selenium
args:
- ARCH=${ARCH} # arm64, amd64
container_name: fetcher_app_selenium
restart: unless-stopped
shm_size: 512mb
environment:
- SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE}
- PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
ports:
- 80:80
dns:
- 1.1.1.1
- 1.0.0.1
deploy: deploy:
resources: resources:
limits: limits:
@@ -26,65 +11,11 @@ services:
memory: ${DEPLOY_RAM} memory: ${DEPLOY_RAM}
fetcher_app_urls: fetcher_app_urls:
image: fetcher_app_urls extends:
build: file: docker-compose-base.yml
context: ./app_urls service: fetcher_app_urls
container_name: fetcher_app_urls #env_files:
restart: unless-stopped # - .env.dev
environment:
# Initialization
- INITIALIZE_DB=${INITIALIZE_DB} # Related to DB persistence
- DJANGO_SUPERUSER_USERNAME=${DJANGO_SUPERUSER_USERNAME}
- DJANGO_SUPERUSER_PASSWORD=${DJANGO_SUPERUSER_PASSWORD}
- DJANGO_SUPERUSER_EMAIL=${DJANGO_SUPERUSER_EMAIL}
# Django
- DJANGO_ALLOWED_HOSTS=${DJANGO_ALLOWED_HOSTS} # host1,host2
- DJANGO_ALLOWED_ORIGINS=${DJANGO_ALLOWED_ORIGINS} # Reverse proxy
- DJANGO_SECRET_KEY=${DJANGO_SECRET_KEY}
- DJANGO_DEBUG=${DJANGO_DEBUG}
- PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
# Database
- DB_NAME=${DB_NAME}
- DB_USER=${DB_USER}
- DB_PASSWORD=${DB_PASSWORD}
- DB_HOST=${DB_HOST}
- DB_PORT=${DB_PORT}
- REDIS_HOST=${REDIS_HOST}
- REDIS_PORT=${REDIS_PORT}
# Job timeout: 30 min
- JOB_DEFAULT_TIMEOUT=${JOB_DEFAULT_TIMEOUT}
# Fetcher
- FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP}
- FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP}
- FETCHER_BETWEEN_SEARCHES_SLEEP=${FETCHER_BETWEEN_SEARCHES_SLEEP} # Sleep time between each search
- FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP} # Sleep time between requests to same URL host
- FETCHER_LANGUAGE_DETECTION_MIN_CHAR=${FETCHER_LANGUAGE_DETECTION_MIN_CHAR} # Minimum number of characters to run language detection
- FETCHER_INSERT_URL_CACHE_TIME=${FETCHER_INSERT_URL_CACHE_TIME} # Cache time: Insert raw URL
- FETCHER_ERROR_URL_CACHE_TIME=${FETCHER_ERROR_URL_CACHE_TIME} # Cache time: Error on processing URL
# Selenium
- SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT}
- ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA}
# Ghost
- GHOST_ADMIN_API_KEY=${GHOST_ADMIN_API_KEY}
- GHOST_ADMIN_API_URL=${GHOST_ADMIN_API_URL}
- PEXELS_API_KEY=${PEXELS_API_KEY}
########################
volumes: # Development mode
- ./app_urls:/opt/app
########################
ports:
- 8000:8000
depends_on:
- fetcher_db
- fetcher_redis
dns:
- 1.1.1.1
- 1.0.0.1
deploy:
resources:
limits:
cpus: '${DEPLOY_CPUS}'
memory: ${DEPLOY_RAM}
#labels: # Reverse proxy sample #labels: # Reverse proxy sample
# - "traefik.enable=true" # - "traefik.enable=true"
# - "traefik.http.routers.fetcher.rule=Host(`urls.yourdomain.com`)" # - "traefik.http.routers.fetcher.rule=Host(`urls.yourdomain.com`)"
@@ -94,11 +25,21 @@ services:
#networks: #networks:
# - default # This network # - default # This network
# - docker_default # Reverse proxy network # - docker_default # Reverse proxy network
ports:
- 8005:8000
## volumes: # Development mode
## - ./app_urls:/opt/app
deploy:
resources:
limits:
cpus: '${DEPLOY_CPUS}'
memory: ${DEPLOY_RAM}
fetcher_db: fetcher_db:
extends:
file: docker-compose-base.yml
service: fetcher_db
image: postgres:17 image: postgres:17
container_name: fetcher_db
restart: unless-stopped
# Set shared memory limit when using docker-compose # Set shared memory limit when using docker-compose
shm_size: 128mb shm_size: 128mb
environment: environment:
@@ -106,18 +47,21 @@ services:
POSTGRES_PASSWORD: ${DB_PASSWORD} POSTGRES_PASSWORD: ${DB_PASSWORD}
POSTGRES_USER: ${DB_USER} POSTGRES_USER: ${DB_USER}
POSTGRES_INITDB_ARGS: '--data-checksums' POSTGRES_INITDB_ARGS: '--data-checksums'
#volumes: # Persistent DB?
# - ${PATH_DB_DATA}/postgres:/var/lib/postgresql/data
ports: ports:
- 5432 #:5432 - 5432 #:5432
#volumes: # Persistent DB?
# - ./postgres:/var/lib/postgresql/data
fetcher_redis: fetcher_redis:
image: redis:alpine extends:
container_name: fetcher_redis file: docker-compose-base.yml
restart: unless-stopped service: fetcher_redis
ports: ports:
- 6379 #:6379 - 6379:6379
#networks: fetcher_flower:
# docker_default: extends:
# external: true file: docker-compose-base.yml
service: fetcher_flower
ports:
- 5555:5555

View File

@@ -1,85 +1,21 @@
version: '3.9'
services: services:
fetcher_app_selenium: #fetcher_app_selenium:
image: fetcher_app_selenium # extends:
build: # file: docker-compose-base.yml
context: ./app_selenium # service: fetcher_app_selenium
args: # deploy:
- ARCH=${ARCH} # arm64, amd64 # resources:
container_name: fetcher_app_selenium # limits:
restart: unless-stopped # cpus: '${DEPLOY_CPUS}'
shm_size: 512mb # memory: ${DEPLOY_RAM}
environment:
- SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE}
- PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
ports:
- 80
dns:
- 1.1.1.1
- 1.0.0.1
deploy:
resources:
limits:
cpus: '${DEPLOY_CPUS}'
memory: ${DEPLOY_RAM}
fetcher_app_urls: fetcher_app_urls:
image: fetcher_app_urls extends:
build: file: docker-compose-base.yml
context: ./app_urls service: fetcher_app_urls
container_name: fetcher_app_urls
restart: unless-stopped
environment:
# Initialization
- INITIALIZE_DB=${INITIALIZE_DB} # Related to DB persistence
- DJANGO_SUPERUSER_USERNAME=${DJANGO_SUPERUSER_USERNAME}
- DJANGO_SUPERUSER_PASSWORD=${DJANGO_SUPERUSER_PASSWORD}
- DJANGO_SUPERUSER_EMAIL=${DJANGO_SUPERUSER_EMAIL}
# Django
- DJANGO_ALLOWED_HOSTS=${DJANGO_ALLOWED_HOSTS} # host1,host2
- DJANGO_ALLOWED_ORIGINS=${DJANGO_ALLOWED_ORIGINS} # Reverse proxy
- DJANGO_SECRET_KEY=${DJANGO_SECRET_KEY}
- DJANGO_DEBUG=${DJANGO_DEBUG}
- PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
# Database
- DB_NAME=${DB_NAME}
- DB_USER=${DB_USER}
- DB_PASSWORD=${DB_PASSWORD}
- DB_HOST=${DB_HOST}
- DB_PORT=${DB_PORT}
- REDIS_HOST=${REDIS_HOST}
- REDIS_PORT=${REDIS_PORT}
# Job timeout: 30 min
- JOB_DEFAULT_TIMEOUT=${JOB_DEFAULT_TIMEOUT}
# Fetcher
- FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP}
- FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP}
- FETCHER_BETWEEN_SEARCHES_SLEEP=${FETCHER_BETWEEN_SEARCHES_SLEEP} # Sleep time between each search
- FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP} # Sleep time between requests to same URL host
- FETCHER_LANGUAGE_DETECTION_MIN_CHAR=${FETCHER_LANGUAGE_DETECTION_MIN_CHAR} # Minimum number of characters to run language detection
- FETCHER_INSERT_URL_CACHE_TIME=${FETCHER_INSERT_URL_CACHE_TIME} # Cache time: Insert raw URL
- FETCHER_ERROR_URL_CACHE_TIME=${FETCHER_ERROR_URL_CACHE_TIME} # Cache time: Error on processing URL
# Selenium
- SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT}
- ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA}
# Ghost
- GHOST_ADMIN_API_KEY=${GHOST_ADMIN_API_KEY}
- GHOST_ADMIN_API_URL=${GHOST_ADMIN_API_URL}
- PEXELS_API_KEY=${PEXELS_API_KEY}
########################
#volumes: # Development mode
# - ./app_urls:/opt/app
########################
ports: ports:
- 8000 # :8000 - 8067:8000
depends_on:
- fetcher_db
- fetcher_redis
dns:
- 1.1.1.1
- 1.0.0.1
deploy: deploy:
resources: resources:
limits: limits:
@@ -87,36 +23,56 @@ services:
memory: ${DEPLOY_RAM} memory: ${DEPLOY_RAM}
labels: # Reverse proxy sample labels: # Reverse proxy sample
- "traefik.enable=true" - "traefik.enable=true"
- "traefik.http.routers.fetcher.rule=Host(`${REVERSE_PROXY_URL}`)" - "traefik.http.routers.fetcher.rule=Host(`fetcher.matitos.org`)"
- "traefik.http.routers.fetcher.entrypoints=websecure" - "traefik.http.routers.fetcher.entrypoints=websecure"
- "traefik.http.routers.fetcher.tls.certresolver=myresolvercd" - "traefik.http.routers.fetcher.tls.certresolver=myresolvercd"
- "traefik.http.services.fetcher.loadbalancer.server.port=8000" - "traefik.http.services.fetcher.loadbalancer.server.port=8000"
networks: networks:
- default # This network
- docker_default # Reverse proxy network - docker_default # Reverse proxy network
fetcher_db: fetcher_db:
image: postgres:17 extends:
container_name: fetcher_db file: docker-compose-base.yml
service: fetcher_db
image: alpine:latest
restart: unless-stopped restart: unless-stopped
# Set shared memory limit when using docker-compose deploy:
shm_size: 128mb resources:
environment: limits:
POSTGRES_DB: ${DB_NAME} memory: 256M
POSTGRES_PASSWORD: ${DB_PASSWORD} volumes:
POSTGRES_USER: ${DB_USER} # REQUIREMENT: Add the fetcher's SSH public key to the DB machine's .ssh/authorized_keys
POSTGRES_INITDB_ARGS: '--data-checksums' - ~/.ssh:/root/.ssh:ro
volumes: # Persistent DB?
- ${PATH_DB_DATA}/postgres:/var/lib/postgresql/data
ports: ports:
- 5432 #:5432 - 15889:15889
- 5432:5432
command:
- sh
- -c
- |
apk add --update openssh autossh
# Monitor status on port 15889
autossh -M 15889 -N -L 0.0.0.0:5432:127.0.0.1:5432 ${REMOTE_USERNAME}@${REMOTE_HOST}
# autossh -M 15889 -N -o 'GatewayPorts yes' -L 0.0.0.0:5432:127.0.0.1:5432 ${REMOTE_USERNAME}@${REMOTE_HOST}
networks:
- docker_default # Reverse proxy network
fetcher_redis: fetcher_redis:
image: redis:alpine extends:
container_name: fetcher_redis file: docker-compose-base.yml
restart: unless-stopped service: fetcher_redis
ports: ports:
- 6379 #:6379 - 6379:6379
networks:
- docker_default # Reverse proxy network
#fetcher_flower:
# extends:
# file: docker-compose-base.yml
# service: fetcher_flower
# ports:
# - 5555:5555
networks: networks:
docker_default: docker_default:

79
utils/DB-Dev.ipynb Normal file
View File

@@ -0,0 +1,79 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#!pip install python-dotenv\n",
"from dotenv import load_dotenv\n",
"\n",
"# Specify the path to your .env file (optional if in the current dir)\n",
"load_dotenv(dotenv_path=\".env\", override=True)\n",
"\n",
"import os\n",
"import psycopg\n",
"from sshtunnel import SSHTunnelForwarder\n",
"\n",
"if (os.environ.get(\"SSH_TUNNEL_BASED\") == \"true\"):\n",
" print(\"SSH tunnel: True\")\n",
"else:\n",
" print(\"SSH tunnel: False\")\n",
"\n",
"connect_info = \"host={} port={} user={} password={} dbname={}\".format(os.environ.get(\"DB_HOST\"), os.environ.get(\"DB_PORT\"), os.environ.get(\"DB_USER\"), os.environ.get(\"DB_PASSWORD\"), os.environ.get(\"DB_NAME\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"if (os.environ.get(\"SSH_TUNNEL_BASED\") == \"true\"):\n",
" ssh_tunnel = SSHTunnelForwarder(\n",
" (os.environ.get(\"REMOTE_HOST\"), int(os.environ.get(\"REMOTE_SSH_PORT\"))), \n",
" ssh_username=os.environ.get(\"REMOTE_USERNAME\"), ssh_password=os.environ.get(\"REMOTE_PASSWORD\"), \n",
" remote_bind_address=('localhost', int(os.environ.get(\"REMOTE_PORT\"))), local_bind_address=('localhost', int(os.environ.get(\"DB_PORT\"))) \n",
" )\n",
" ssh_tunnel.start()\n",
"\n",
"try:\n",
" with psycopg.connect(connect_info) as conn:\n",
" if True:\n",
" for t in conn.execute(\"\"\"\n",
" SELECT * from URLS WHERE id IN (SELECT id_url FROM URLS_SOURCE_SEARCH INNER JOIN SEARCH ON URLS_SOURCE_SEARCH.id_search = SEARCH.id WHERE SEARCH.search LIKE '%child abuse%') LIMIT 5;\n",
" \"\"\").fetchall():\n",
" print(t)\n",
" \n",
"except Exception as e:\n",
" print(\"Err:\", str(e))\n",
"\n",
"if (os.environ.get(\"SSH_TUNNEL_BASED\") == \"true\"):\n",
" ssh_tunnel.stop()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "matitos_urls",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

164
utils/Ghost-Posts.ipynb Normal file
View File

@@ -0,0 +1,164 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import time\n",
"import jwt\n",
"import requests\n",
"from datetime import datetime, timedelta, timezone\n",
"\n",
"admin_api_url = \"\" # .env -> GHOST_ADMIN_API_URL\n",
"admin_api_key = \"\" # .env -> GHOST_ADMIN_API_KEY\n",
"\n",
"def _create_jwt(admin_api_key):\n",
" id_, secret = admin_api_key.split(':')\n",
" iat = int(time.time())\n",
" exp = iat + 5 * 60 # 5 minutes\n",
" header = {'alg': 'HS256', 'kid': id_}\n",
" payload = {\n",
" 'iat': iat,\n",
" 'exp': exp,\n",
" 'aud': '/v5/admin/' # Adjust depending on your Ghost version\n",
" }\n",
" token = jwt.encode(payload, bytes.fromhex(secret), algorithm='HS256', headers=header)\n",
" return token\n",
"\n",
"# Get token\n",
"jwt_token = _create_jwt(os.getenv(\"GHOST_ADMIN_API_KEY\"))\n",
"\n",
"headers = {\n",
" 'Authorization': f'Ghost {jwt_token}',\n",
" 'Content-Type': 'application/json'\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"DELETE_ALL_POSTS = False\n",
"\n",
"if DELETE_ALL_POSTS:\n",
" while (True):\n",
" # GET /admin/posts/\n",
" response = requests.get(os.path.join(admin_api_url, \"posts\"), headers=headers)\n",
" dict_response = response.json()\n",
"\n",
" if (len(dict_response.get(\"posts\")) == 0):\n",
" break\n",
"\n",
" # Iterate posts\n",
" for p in dict_response.get(\"posts\"):\n",
" # Post ID\n",
" post_id = p.get(\"id\")\n",
"\n",
" # DELETE /admin/posts/{id}/\n",
" r = requests.delete(os.path.join(admin_api_url, \"posts\", \"{}\".format(post_id)), headers=headers)\n",
" print(\"Post:\", post_id, \"Status:\", r.status_code, r.text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"PUBLISH_SAMPLE = False\n",
"\n",
"def _create_ghost_post(jwt_token, admin_api_url, post_data):\n",
" # Get Admin API URL\n",
" admin_api_url = os.getenv(\"GHOST_ADMIN_API_URL\")\n",
"\n",
" headers = {\n",
" 'Authorization': f'Ghost {jwt_token}',\n",
" 'Content-Type': 'application/json'\n",
" }\n",
" \n",
" post_data = {\"posts\": [post_data]}\n",
"\n",
" response = requests.post(\n",
" os.path.join(admin_api_url, \"posts\"),\n",
" json=post_data,\n",
" headers=headers,\n",
" params={\"source\":\"html\"}\n",
" )\n",
"\n",
" if response.status_code == 201:\n",
" print(\"Ghost post published successfully\")\n",
" return response.json()\n",
" else:\n",
" print(\"Ghost - Failed to publish post: {} {}\".format(response.status_code, response.text))\n",
" return None\n",
"\n",
"if (PUBLISH_SAMPLE):\n",
" url_id = 150\n",
"\n",
" post_data = {\n",
" # \"slug\": \"hey-short\",\n",
" \"title\": \"Hey there, sample title\",\n",
" \"html\": \"<p>Hey there!</p>\",\n",
" # \"feature_image\": photo_url,\n",
" # \"feature_image_caption\": \"\",\n",
" \"status\": \"published\",\n",
" \"tags\": [\"#url-id-{}\".format(url_id)]\n",
" }\n",
"\n",
" # Publish post\n",
" payload = _create_ghost_post(jwt_token, admin_api_url, post_data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Filter by post title\n",
"post_title = \"Funds raised for legal action over failure to stop grooming gangs\"\n",
"# Filter by published date\n",
"iso_time = (datetime.now(timezone.utc) - timedelta(hours=48)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z'\n",
"# Parameter for filter\n",
"params = {\"filter\": \"title:'{}'+published_at:>{}\".format(post_title, iso_time)}\n",
"\n",
"# Filter by URL ID\n",
"url_id = 150\n",
"# Parameter for filter\n",
"params = {\"filter\": \"tags:hash-url-id-{}\".format(url_id)}\n",
"\n",
"# Get posts using filter\n",
"response = requests.get(os.path.join(admin_api_url, \"posts\"), params=params, headers=headers)\n",
"dict_response = response.json()\n",
"\n",
"len(dict_response.get(\"posts\"))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "matitos_urls",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
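Note the tag round trip in the notebook: posts are published with the internal tag "#url-id-<id>", and the filter cell queries it as "tags:hash-url-id-<id>", since Ghost exposes internal tags (names starting with "#") through a "hash-" slug. A small helper along the same lines, assuming the admin_api_url and headers built in the first cell:

    import os
    import requests

    def get_posts_for_url_id(url_id):
        # Filter by the internal tag slug, as the last cell does
        params = {"filter": "tags:hash-url-id-{}".format(url_id)}
        r = requests.get(os.path.join(admin_api_url, "posts"), params=params, headers=headers)
        r.raise_for_status()
        return r.json().get("posts", [])

    print(len(get_posts_for_url_id(150)))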

215
utils/Newspapers.ipynb Normal file
View File

@@ -0,0 +1,215 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"url = \"https://onlinenewspapers.com/index.shtml\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'newspaper/0.9.3.1'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\"\"\"\n",
"import newspaper\n",
"\n",
"newspaper.Config().__dict__\n",
"\n",
" 'requests_params': {'timeout': 7,\n",
" 'proxies': {},\n",
" 'headers': {'User-Agent': 'newspaper/0.9.3.1'}},\n",
"\"\"\"\n",
"import newspaper\n",
"newspaper.Config().browser_user_agent"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
" url (str): The url of the source (news website) to build. For example,\n",
" `https://www.cnn.com`.\n",
" dry (bool): If true, the source object will be constructed but not\n",
" downloaded or parsed.\n",
" only_homepage (bool): If true, the source object will only parse\n",
" the homepage of the source.\n",
" only_in_path (bool): If true, the source object will only\n",
" parse the articles that are in the same path as the source's\n",
" homepage. You can scrape a specific category this way.\n",
" Defaults to False.\n",
" input_html (str): The HTML of the source to parse. Use this to pass cached\n",
" HTML to the source object.\n",
" config (Configuration): A configuration object to use for the source.\n",
" kwargs: Any other keyword arguments to pass to the Source constructor.\n",
" If you omit the config object, you can add any configuration\n",
" options here.\n",
"\"\"\"\n",
"\n",
"url = \"https://www.lanacion.com.ar/deportes/\"\n",
"\n",
"newspaper_built = newspaper.build(url, only_in_path=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"newspaper_built.__dict__"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"newspaper_built.article_urls()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n",
"url = \"https://www.lanacion.com.ar/\"\n",
"#url = \"https://www.lanacion.com.ar/deportes/\"\n",
"newspaper_built = newspaper.build(url)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
" url (str): The url of the source (news website) to build. For example,\n",
" `https://www.cnn.com`.\n",
" dry (bool): If true, the source object will be constructed but not\n",
" downloaded or parsed.\n",
" only_homepage (bool): If true, the source object will only parse\n",
" the homepage of the source.\n",
" only_in_path (bool): If true, the source object will only\n",
" parse the articles that are in the same path as the source's\n",
" homepage. You can scrape a specific category this way.\n",
" Defaults to False.\n",
" input_html (str): The HTML of the source to parse. Use this to pass cached\n",
" HTML to the source object.\n",
" config (Configuration): A configuration object to use for the source.\n",
" kwargs: Any other keyword arguments to pass to the Source constructor.\n",
" If you omit the config object, you can add any configuration\n",
" options here.\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cat = newspaper_built.categories[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"newspaper_built.categories_to_articles()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"newspaper_built.category_urls()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
" 'https://www.lanacion.com.ar/tema/futbol-argentino-tid57505/',\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"categories = newspaper_built.category_urls()\n",
"url_of_interest = \"https://www.lanacion.com.ar/sabado/todo-para-ellos-nid21042025/\"\n",
"\n",
"potential_categories = []\n",
"\n",
"for c in categories:\n",
" if (c in url_of_interest):\n",
" print(c, url_of_interest)\n",
" potential_categories.append(c)\n",
"\n",
"# Get longest length category"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"newspaper_built.article_urls()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "matitos_urls",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
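The last scraping cell collects every category URL contained in the article URL into potential_categories but stops at the "Get longest length category" comment. A minimal sketch of that final step, assuming the potential_categories list built in that cell:

    # The most specific category is the longest matching category URL
    best_category = max(potential_categories, key=len) if potential_categories else None
    print(best_category)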

View File

@@ -11,6 +11,9 @@
"from urllib.parse import urljoin\n", "from urllib.parse import urljoin\n",
"import pandas as pd\n", "import pandas as pd\n",
"import os\n", "import os\n",
"import json\n",
"import csv\n",
"\n",
"\n", "\n",
"headers = {\"User-Agent\": \"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36\"}" "headers = {\"User-Agent\": \"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36\"}"
] ]
@@ -68,6 +71,154 @@
" # websites.append(href)\n", " # websites.append(href)\n",
" return href\n", " return href\n",
"\n", "\n",
"def get_num_students_per_zipcode(soup):\n",
" list_zipcode_students_percentage = []\n",
"\n",
" h3_tag = soup.find(\"h3\", string=\"In welk postcodegebied wonen de leerlingen van deze school?\")\n",
" if h3_tag:\n",
" dialog = h3_tag.find_parent(\"dialog\")\n",
"\n",
" if dialog:\n",
" # print(dialog.prettify())\n",
" table = dialog.find(\"table\")\n",
" if table:\n",
" rows = table.find_all(\"tr\")\n",
" for row in rows:\n",
" cells = row.find_all([\"th\", \"td\"])\n",
" row_data = [cell.get_text(strip=True) for cell in cells]\n",
" zipcode, num_students, percentage = row_data\n",
" list_zipcode_students_percentage.append( (zipcode, num_students, percentage) )\n",
" \n",
" return list_zipcode_students_percentage\n",
"\n",
"def get_num_students_trend(soup):\n",
" # Step 1: Locate the <aantal-leerlingen-trend-line-chart> tag\n",
" trend_chart_tag = soup.find(\"aantal-leerlingen-trend-line-chart\")\n",
"\n",
" if trend_chart_tag:\n",
" # Step 2: Extract the 'leerlingen-trend-data' attribute\n",
" trend_data_attr = trend_chart_tag.get(\"leerlingen-trend-data\")\n",
" \n",
" if trend_data_attr:\n",
" # Step 3: Parse the JSON string into a Python object\n",
" trend_data = json.loads(trend_data_attr)\n",
" #print(\"Extracted leerlingen-trend-data:\")\n",
" #print(json.dumps(trend_data, indent=4)) # Pretty-print the JSON data\n",
" return [ (e.get(\"key\"), e.get(\"aantal\") ) for e in trend_data]\n",
"\n",
"def get_num_students_per_age_and_group(soup):\n",
" num_students_per_group, num_students_per_age = [], []\n",
" ############################################################################\n",
" # Step 1: Locate the <aantal-leerlingen-leeftijd-bar-chart> tag\n",
" chart_tag = soup.find('aantal-leerlingen-leeftijd-bar-chart', attrs={'aantal-per-leeftijd': True})\n",
" # Step 2: Extract the 'aantal-per-leeftijd' attribute\n",
" raw_data = chart_tag['aantal-per-leeftijd']\n",
"\n",
" # Step 3: Parse the JSON data\n",
" try:\n",
" data = json.loads(raw_data)\n",
" # Step 4: Print the extracted data\n",
" # print(\"Aantal per Leeftijd:\")\n",
" for entry in data:\n",
" age = entry['key']\n",
" num_students = entry['aantal']\n",
" # school_data[\"num_students_age_{}\".format(age)] = num_students\n",
" num_students_per_age.append( (age, num_students) )\n",
" # print(f\"Age {entry['key']}: {entry['aantal']} leerlingen\")\n",
" except json.JSONDecodeError as e:\n",
" print(f\"Failed to parse JSON data: {e}\")\n",
"\n",
" ############################################################################\n",
" # Step 1: Locate the <aantal-leerlingen-leerjaar-bar-chart> tag\n",
" chart_tag = soup.find('aantal-leerlingen-leerjaar-bar-chart', attrs={'aantal-per-leerjaar': True})\n",
"\n",
" if not chart_tag:\n",
" print(\"Could not find the 'aantal per leerjaar' section.\")\n",
" else:\n",
" # Step 2: Extract the 'aantal-per-leerjaar' attribute\n",
" raw_data = chart_tag['aantal-per-leerjaar']\n",
" \n",
" # Step 3: Parse the JSON data\n",
" try:\n",
" data = json.loads(raw_data)\n",
" # Step 4: Print the extracted data\n",
" # print(\"Aantal per Leerjaar:\")\n",
" for entry in data:\n",
" group = entry['key']\n",
" num_students = entry['aantal']\n",
" # school_data[\"num_students_group_{}\".format(group)] = num_students\n",
" num_students_per_group.append( (group, num_students) )\n",
" # print(f\"Groep {entry['key']}: {entry['aantal']} leerlingen\")\n",
" except json.JSONDecodeError as e:\n",
" print(f\"Failed to parse JSON data: {e}\")\n",
" ############################################################################\n",
" return num_students_per_group, num_students_per_age\n",
"\n",
"\n",
"def update_school_data(school_url, school_data):\n",
" try:\n",
" # Process school (request contact details)\n",
" response = requests.get(os.path.join(school_url, \"contact/#inhoud\"), headers=headers)\n",
" response.raise_for_status() # Raise an exception for HTTP errors\n",
" # Parse the HTML content using BeautifulSoup\n",
" soup_school = BeautifulSoup(response.text, 'html.parser')\n",
"\n",
" # School details\n",
" school_details = soup_school.find(class_=\"school-details\")\n",
" for category_idx, li_detail in enumerate(school_details.find_all(\"li\")):\n",
" data = li_detail.find('span', class_='infotip-term')['data-dfn']\n",
" text = li_detail.get_text(strip=True)\n",
" # Set data\n",
" school_data[\"category_{}\".format(category_idx)] = text\n",
" school_data[\"category_{}_description\".format(category_idx)] = data\n",
" \n",
" school_address = soup_school.find(class_=\"school-adres\").get_text(strip=True)\n",
" school_postcode_city = soup_school.find(class_=\"school-postcode-woonplaats\").get_text(strip=True)\n",
" school_postcode = \"\".join(school_postcode_city.split(\" \")[:2])\n",
" school_city = \" \".join(school_postcode_city.split(\" \")[2:])\n",
"\n",
" school_data[\"city\"] = school_city\n",
" school_data[\"postcode\"] = school_postcode\n",
" school_data[\"address\"] = school_address\n",
"\n",
" try:\n",
" school_data[\"website\"] = find_website(soup_school) # soup_school.find(class_=\"button button-primary\").get('href')\n",
" except Exception as e:\n",
" pass\n",
" try:\n",
" school_data[\"phone\"] = soup_school.find('a', href=lambda href: href and href.startswith('tel:')).text\n",
" except Exception as e:\n",
" pass\n",
" try:\n",
" school_data[\"email\"] = extract_emails(soup_school)\n",
" except Exception as e:\n",
" pass\n",
"\n",
" # Process school main site\n",
" response = requests.get(os.path.join(school_url), headers=headers)\n",
" response.raise_for_status() # Raise an exception for HTTP errors\n",
" # Parse the HTML content using BeautifulSoup\n",
" soup_school = BeautifulSoup(response.text, 'html.parser')\n",
"\n",
" try:\n",
" school_data[\"students_per_zipcode\"] = get_num_students_per_zipcode(soup_school)\n",
" except Exception as e:\n",
" pass\n",
" try:\n",
" school_data[\"students_per_year_trend\"] = get_num_students_trend(soup_school)\n",
" except Exception as e:\n",
" pass\n",
"\n",
" if (school_data.get(\"category\").lower() == \"basisscholen\"):\n",
" try:\n",
" num_students_per_group, num_students_per_age = get_num_students_per_age_and_group(soup_school)\n",
" school_data[\"num_students_per_group\"] = num_students_per_group if len(num_students_per_group)>0 else None\n",
" school_data[\"num_students_per_age\"] = num_students_per_age if len(num_students_per_age)>0 else None\n",
" except Exception as e:\n",
" pass\n",
" \n",
" except Exception as e:\n",
" print(school_url, str(e))\n",
"\n", "\n",
"def main():\n", "def main():\n",
" list_urls = [\n", " list_urls = [\n",
@@ -128,54 +279,26 @@
" \"url\": school_url,\n", " \"url\": school_url,\n",
" }\n", " }\n",
"\n", "\n",
" try:\n", " update_school_data(school_url, school_data)\n",
" # Process school (request contact details)\n",
" response = requests.get(os.path.join(school_url, \"contact/#inhoud\"), headers=headers)\n",
" response.raise_for_status() # Raise an exception for HTTP errors\n",
"\n",
" # Parse the HTML content using BeautifulSoup\n",
" soup_school = BeautifulSoup(response.text, 'html.parser')\n",
"\n",
" # School details\n",
" school_details = soup_school.find(class_=\"school-details\")\n",
" for category_idx, li_detail in enumerate(school_details.find_all(\"li\")):\n",
" data = li_detail.find('span', class_='infotip-term')['data-dfn']\n",
" text = li_detail.get_text(strip=True)\n",
" # Set data\n",
" school_data[\"category_{}\".format(category_idx)] = text\n",
" school_data[\"category_{}_description\".format(category_idx)] = data\n",
" \n",
" school_address = soup_school.find(class_=\"school-adres\").get_text(strip=True)\n",
" school_postcode_city = soup_school.find(class_=\"school-postcode-woonplaats\").get_text(strip=True)\n",
" school_postcode = \"\".join(school_postcode_city.split(\" \")[:2])\n",
" school_city = \" \".join(school_postcode_city.split(\" \")[2:])\n",
"\n",
" school_data[\"city\"] = school_city\n",
" school_data[\"postcode\"] = school_postcode\n",
" school_data[\"address\"] = school_address\n",
"\n",
" try:\n",
" school_data[\"website\"] = find_website(soup_school) # soup_school.find(class_=\"button button-primary\").get('href')\n",
" except Exception as e:\n",
" pass\n",
" try:\n",
" school_data[\"phone\"] = soup_school.find('a', href=lambda href: href and href.startswith('tel:')).text\n",
" except Exception as e:\n",
" pass\n",
" try:\n",
" school_data[\"email\"] = extract_emails(soup_school)\n",
" except Exception as e:\n",
" pass\n",
" \n",
" except Exception as e:\n",
" print(school_url, str(e))\n",
" # assert False\n",
"\n", "\n",
" list_school_data_dicts.append(school_data)\n", " list_school_data_dicts.append(school_data)\n",
"\n", "\n",
" df = pd.DataFrame(list_school_data_dicts)\n", " # Save per processed school to track progress\n",
" df.to_csv(\"scholenopdekaart.csv\")\n", " df = pd.DataFrame(list_school_data_dicts)\n",
" df.to_csv(\"scholenopdekaart_tmp.csv\", encoding=\"utf-8\", quoting=csv.QUOTE_ALL)\n",
"\n", "\n",
" df = pd.DataFrame(list_school_data_dicts)\n",
" df.to_csv(\"scholenopdekaart.csv\", encoding=\"utf-8\", quoting=csv.QUOTE_ALL)\n",
" # Without extra columns\n",
" df.drop(columns=[\"students_per_zipcode\", \"students_per_year_trend\", \"num_students_per_group\", \"num_students_per_age\"]).to_csv(\"scholenopdekaart_.csv\", encoding=\"utf-8\", quoting=csv.QUOTE_ALL)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\"\"\" # Issues with URL:\n", "\"\"\" # Issues with URL:\n",
"https://scholenopdekaart.nl/middelbare-scholen/grave/1900/merletcollege-grave/\n", "https://scholenopdekaart.nl/middelbare-scholen/grave/1900/merletcollege-grave/\n",
"https://scholenopdekaart.nl/middelbare-scholen/lent/4386/citadel-college-locatie-griftdijk/\n", "https://scholenopdekaart.nl/middelbare-scholen/lent/4386/citadel-college-locatie-griftdijk/\n",
@@ -212,25 +335,9 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"'''\n",
"school_url = \"https://scholenopdekaart.nl/basisscholen/aalden/9661/christelijke-basisschool-de-schutse/\"\n",
"response = requests.get(os.path.join(school_url, \"contact/#inhoud\"), headers=headers)\n",
"# Parse the HTML content using BeautifulSoup\n",
"soup_school = BeautifulSoup(response.text, 'html.parser')\n",
"soup_school\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.read_csv(\"scholenopdekaart.csv\", index_col=0)\n", "df = pd.read_csv(\"scholenopdekaart.csv\", index_col=0)\n",
"df.loc[0, \"category_3\"]" "\n",
"df.head()"
] ]
}, },
{ {
@@ -238,76 +345,8 @@
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [ "source": [
"import requests\n", "df.tail()"
"from bs4 import BeautifulSoup\n",
"\n",
"# Step 1: Fetch the webpage\n",
"url = \"https://scholenopdekaart.nl/basisscholen/aagtekerke/25963/jhr-willem-versluijsschool/\"\n",
"headers = {\n",
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36\"\n",
"}\n",
"response = requests.get(url, headers=headers)\n",
"\n",
"# Check if the request was successful\n",
"if response.status_code != 200:\n",
" print(f\"Failed to retrieve the page. Status code: {response.status_code}\")\n",
" exit()\n",
"\n",
"# Step 2: Parse the HTML content\n",
"soup = BeautifulSoup(response.text, 'html.parser')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Aantal per Leerjaar:\n",
"Groep 1: 29 leerlingen\n",
"Groep 2: 28 leerlingen\n",
"Groep 3: 30 leerlingen\n",
"Groep 4: 25 leerlingen\n",
"Groep 5: 19 leerlingen\n",
"Groep 6: 26 leerlingen\n",
"Groep 7: 22 leerlingen\n",
"Groep 8: 20 leerlingen\n"
]
}
],
"source": [
"import json\n",
"\n",
"# Step 1: Locate the <aantal-leerlingen-leerjaar-bar-chart> tag\n",
"chart_tag = soup.find('aantal-leerlingen-leerjaar-bar-chart', attrs={'aantal-per-leerjaar': True})\n",
"\n",
"if not chart_tag:\n",
" print(\"Could not find the 'aantal per leerjaar' section.\")\n",
"else:\n",
" # Step 2: Extract the 'aantal-per-leerjaar' attribute\n",
" raw_data = chart_tag['aantal-per-leerjaar']\n",
" \n",
" # Step 3: Parse the JSON data\n",
" try:\n",
" data = json.loads(raw_data)\n",
" \n",
" # Step 4: Print the extracted data\n",
" print(\"Aantal per Leerjaar:\")\n",
" for entry in data:\n",
" print(f\"Groep {entry['key']}: {entry['aantal']} leerlingen\")\n",
" except json.JSONDecodeError as e:\n",
" print(f\"Failed to parse JSON data: {e}\")"
] ]
} }
], ],
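The refactor above moves the per-school scraping into update_school_data, which fills the school_data dict in place. A minimal usage sketch, assuming the headers and helper functions defined earlier in the notebook and reusing a school URL that already appears in it:

    school_url = "https://scholenopdekaart.nl/basisscholen/aagtekerke/25963/jhr-willem-versluijsschool/"
    school_data = {"url": school_url, "category": "basisscholen"}
    update_school_data(school_url, school_data)
    print(school_data.get("city"), school_data.get("postcode"), school_data.get("website"))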

182
utils/Summary.ipynb Normal file
View File

@@ -0,0 +1,182 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# docker exec -it ollama_npu bash\n",
"# rkllama pull\n",
"#\n",
"# c01zaut/Llama-3.2-3B-Instruct-rk3588-1.1.4\n",
"# Llama-3.2-3B-Instruct-rk3588-w8a8-opt-0-hybrid-ratio-0.0.rkllm\n",
"# Llama-3.2-3B-Instruct-rk3588-w8a8-opt-0-hybrid-ratio-0.5.rkllm\n",
"# Llama-3.2-3B-Instruct-rk3588-w8a8-opt-1-hybrid-ratio-0.0.rkllm\n",
"# Llama-3.2-3B-Instruct-rk3588-w8a8-opt-1-hybrid-ratio-0.5.rkllm\n",
"# Llama-3.2-3B-Instruct-rk3588-w8a8_g512-opt-1-hybrid-ratio-0.5.rkllm\n",
"#\n",
"# c01zaut/Qwen2.5-3B-Instruct-RK3588-1.1.4\n",
"# Qwen2.5-3B-Instruct-rk3588-w8a8-opt-1-hybrid-ratio-0.0.rkllm\n",
"# Qwen2.5-3B-Instruct-rk3588-w8a8-opt-1-hybrid-ratio-1.0.rkllm\n",
"# Qwen2.5-3B-Instruct-rk3588-w8a8_g256-opt-1-hybrid-ratio-1.0.rkllm\n",
"#"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"import ollama\n",
"import os\n",
"import requests\n",
"import json\n",
"from pprint import pprint\n",
"\n",
"# endpoint = \"https://ollamamodelnpu.matitos.org\"\n",
"endpoint = \"https://ollamamodel.matitos.org\"\n",
"model = \"qwen3:0.6b\"\n",
"model = \"qwen3:1.7b\"\n",
"client = ollama.Client(endpoint)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"r = requests.post( os.path.join(endpoint, \"unload_model\") )\n",
"r.status_code, r.json()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"r = requests.get( os.path.join(endpoint, \"models\") )\n",
"r.json().get(\"models\"), [ m.model for m in client.list().get(\"models\") ]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = \"llama3-instruct:3b\"\n",
"model = \"qwen2.5-instruct:3b\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"article_content = \"Kevin Sutherland's message to Rowan Lumsden told of his agony at what he believed were malicious rumours about his life. The best friend of tragic Kevin Sutherland has revealed a heartbreaking message sent in the last hours of his life. Rowan Lumsden, 35, says Kevins death would have been avoided if his request for anonymity in the Scottish Child Abuse Inquiry had been accepted. Mum-of-one Rowan told how her friend sent a 17-minute voice message that culminated as he stood on the Forth Road Bridge, where he is thought to have plunged to his death on December 19. The Daily Record has told how Kevin, 33, had ticked a box to say he approved of his testimony of historic abuse that he suffered to be published online. Kevins family later revealed an email sent to the inquiry, in which he begged for his real name to be redacted, suggesting he may take his own life if he was not given that protection. His appeal was dismissed by SCAI chair Lady Smith. Rowan told how Kevin left a harrowing final message, telling of his agony at what he believed to be malicious rumours that plagued his life. Rowan said: “I was asleep when the messages came in and it was devastating to hear his voice, knowing where he was and what was going to happen. I just wish I could have helped. “Kevin was pushed to the limit and he was so troubled about what people were saying about him. “He lived in fear his testimony would be used by people to make him out to be at fault or misconstrued and he bitterly regretted his decision to allow it to be made public. “I have no doubt that he would be alive today if he was allowed to to retract his on story from the record.” Rowan, 35, said Lady Smiths decision was wrong “in so many ways”. She said: “He begged her to let him be anonymous and he said that he would take his life if she refused. “But she said, No. I cannot see any way that can be explained away. He just needed the time it took to get the right interventions to turn his mental health and his life around. “Lady Smith was the top person in the inquiry. She knew she was dealing with a hugely vulnerable person as all victims are. She knew that he was having suicidal thoughts.” Kevin suffered trauma, including sexual abuse, in his childhood. In his final message to Rowan, in the hours before his suspected death, Kevin didnt refer directly to the SCAI inquiry but stated: “Its just coming from the ­absolute f****** heart and I just cannot cope with this life any more. “Its just been so f****** unbelievably brutal. I kind of feel like, whats the point? People have got their preconceived ideas and malicious gossip has served such a toxic contribution to this final decision that Ive made. “Thats me on the bridge. End of the road, eh? End of the road to all the liars and doubters and gossip mongrels.” Kevins sister Melanie Watson, who recently revealed the text of Kevins final appeal for anonymity, said she was aware of his final messages to friends. She added: “He was very fixated with the fear that people would make false assumptions about him, based on reading his testimony on Google.” The inquirys handling of Kevin is now part of an independent inquiry. An SCAI spokesperson said: “SCAI has commissioned an independent review to consider all aspects of its interactions with Kevin.”\"\n",
"article_content = \"Child services visited a Bronx apartment while a 4-year-old girl was trapped inside with the corpses of her troubled mom and brother but walked away after knocking, neighbors said. Lisa Cotton, 38, and her 8-year-old son, Nazir Millien, 8, had been dead for at least two weeks before relatives found them and the toddler inside the house of horrors Friday, one day after reps for the Administration for Childrens Services dropped the ball, neighbor Sabrina Coleson said. “They didnt do st,” Coleson said Sunday. “They were here ringing peoples bells the day before the wellness check. They were here, but they didnt do st. “One rang my bell and asked if I had any concerns for upstairs. And then a man opened his door and started yelling,” she said. “Lisa was a very cool girl. I never saw her son with her, only the girl. Its terrible.” Concerned relatives finally checked on the family on Friday and found the 4-year-old, Promise, alone, starving and in horrid condition on her mothers bed — as bugs crawled over her dead family. Cottons father, Hubert, 71, had sent his oldest granddaughter to check the apartment at East 231st Street — with the woman grabbing her young sibling and fleeing the putrid home to call police. ACS wasnt the only city agency to leave Promise trapped in hellish conditions — neighbors said cops were also called to the apartment on Tuesday but left after not sensing the stench reported by others. Hubert Cotton said the toddler survived by “feeding herself with chocolate.” Law enforcement sources said Lisa Cotton had a history of erratic behavior, and had a pending ACS case for alleged child neglect before she was found dead. She was arrested in 2021 on child abandonment charges after police said she was caught swinging her then-infant daughter around in a stroller and lighting a wig on fire on White Plains Road, sources said. When cops arrived she was allegedly walking away, leaving Promise behind. The outcome of the case was not available because the file is sealed. One neighbor said the mom had “episodes” in the past. Sources said police believe Lisa Cotton, who suffered from asthma, may have died from cardiac arrest, while her son, who was born prematurely and had a feeding tube, may have starved to death. A spokesperson for ACS declined to comment on the case on Sunday other than to say the agency is “investigating this tragedy with the NYPD.”\"\n",
"\n",
"# prompt = \"Rewrite the content below into a clear and concise summary of one paragraph maximum, presenting the key points as if they are newly written insights. Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states'. Write in a natural, standalone format that feels like an original explanation. Keep it brief, engaging, informative, in the style of a news article:\\n\\n{}\".format(article_content)\n",
"# prompt = \"Provide a summary of the content below, presenting the key points as if they are newly written insights. Write in a natural, standalone format that feels like an original explanation. Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states'. Keep it brief, engaging, informative, in the style of a news article, and in one single paragraph:\\n\\n{}\".format(article_content)\n",
"# prompt = \"Provide a summary of the content below, writing in a natural and standalone format that feels like an original explanation. Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states'. Keep it brief, engaging, informative, in the style of a news article, and in one single paragraph:\\n\\n{}\".format(article_content)\n",
"\n",
"# in one sentence each\n",
"prompt = \"First, provide a summary of the content below in one paragraph. Second, specify the Who, What, When, Where and Why of the story:\\n\\n{}\".format(article_content)\n",
"# prompt = \"Provide the 5W (Who, What, When, Where, Why) and a detailed summary of the content below:\\n\\n{}\".format(article_content)\n",
"# Only answer with the location or address which can be extracted from this description\n",
"\n",
"prompt = \"Provide, in one sentence each, the who, what, when, where, why, and a detailed summary of the content below:\\n\\n{}\".format(article_content)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{}\n"
]
}
],
"source": [
"options = {\"temperature\": 0, \"seed\": 51029}\n",
"resp = client.generate(model=model, prompt=prompt, format=\"json\", options=options)\n",
"r = requests.post( os.path.join(endpoint, \"unload_model\") )\n",
"\n",
"response_dict = json.loads(resp.response)\n",
"pprint(response_dict)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'{\\n\\n\\n}'"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"resp.response"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\"<think>\\nOkay, let's tackle this query. The user wants a one-sentence summary for each element: who, what, when, where, why, and a detailed summary.\\n\\nFirst, the main event is the child services visiting a Bronx apartment with a 4-year-old trapped, but the neighbors say they knocked out the corpses. So for the first sentence, I need to include who (child services), what (visited the apartment), when (Friday), where (the apartment), why (neighbors said they didn't do it), and a summary. \\n\\nThen, for the second part, the user might want more details. Let me check the content. The summary needs to include the specific details like the family members, the days they were found dead, the agencies involved, and the outcomes. Also, mention the sources like ACS and the neighbors' statements. I need to make sure each sentence is concise and covers all the points without being too lengthy. Let me structure each sentence to fit the required format.\\n</think>\\n\\n**Who:** Child services in the Bronx, **What:** Visited an apartment containing a 4-year-old trapped with a dead mom and brother, **When:** Friday, **Where:** East 231st Street, **Why:** Neighbors reported the agencys actions were inadequate, **Summary:** Child services visited a Bronx apartment with a 4-year-old trapped and dead, neighbors say they knocked out the corpses, and the incident is attributed to the agencys failure to address the situation, with the family surviving by feeding themselves and the case being sealed.\""
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#resp = client.generate(model=model, prompt=prompt, format=\"json\")\n",
"resp = client.generate(model=model, prompt=prompt)\n",
"resp.response"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "matitos_urls",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
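With format="json" the small qwen3 model above returns an empty object, and without it the answer is prefixed by a <think>...</think> reasoning block. A minimal post-processing sketch, assuming resp.response holds the raw string from the generate call above:

    import re

    def strip_think(text):
        # Drop a leading <think>...</think> reasoning block, if present
        return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

    answer = strip_think(resp.response)
    print(answer)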