Valid content filter, language detect on min chars, fetch missingkids.org
README.md | 34
@@ -1 +1,33 @@
# Matitos

- Scheduled tasks
  - Fetcher -> Inserts raw URLs
    - Fetch parsing URL host
    - Fetch from RSS feed
    - Fetch searching (Google search & news, DuckDuckGo, ...)
  - Process URLs -> Updates raw URLs
    - Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date
    - Determines whether the content is a valid article
  - Valid URLs
    - Generate summary
    - Classification
      - 5W: Who, What, When, Where, Why of a story
      - Related to child abuse?
    - ...
- Visualization of URLs
  - Filter URLs
    - By status, search, source, language
  - Charts

- Content generation
  - Select URLs:
    - Valid content
    - language=en
    - published_date during last_week
  - Use classifications
  - Merge summaries, ...
app_selenium/Dev.ipynb | 46 (new file)
@@ -0,0 +1,46 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "\n",
    "endpoint = \"http://localhost:80/get_missing_kids?pages=2\"\n",
    "r = requests.get(endpoint)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "r.text"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "matitos_urls",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
app_selenium/Dockerfile | 16 (new file)
@@ -0,0 +1,16 @@
FROM python:3.12

RUN apt update && apt install -y --no-install-recommends chromium chromium-driver curl
RUN apt autoclean && rm -rf /var/lib/apt/lists/*

WORKDIR /opt/app
RUN pip install --no-cache-dir selenium fastapi "uvicorn[standard]"
COPY . /opt/app/

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "80"]

# docker build -f Dockerfile -t selenium_app .
# docker run --rm -it --shm-size=512m --name selenium_app selenium_app

# docker exec -it selenium_app bash -c "curl localhost:80/get_missing_kids/?pages=5"
# docker exec -it selenium_app bash -c "curl localhost:80/get_missing_kids/?pages=-1"
app_selenium/README.md
@@ -1,3 +1,8 @@
 # Selenium app

+* Missing kids posters fetch (num_pages=X)
 * ...

+```
+SELENIUM_SLEEP_PER_PAGE=4
+PATH_LOGS_PARAMETERIZATION="logs/log_app_selenium_{}.log"
+```
app_selenium/app.py | 14 (new file)
@@ -0,0 +1,14 @@
from fastapi import FastAPI
from missing_kids import MissingKidsFetcher

from logger import get_logger
logger = get_logger()


app = FastAPI()


@app.get("/get_missing_kids/")
def get_missing_kids(pages: int = -1):
    try:
        res = {"list_urls": MissingKidsFetcher().get_missing_kids_urls(first_n_pages=pages)}
    except Exception as e:
        # Log instead of failing silently; return an empty payload on error
        logger.warning("get_missing_kids failed: {}".format(e))
        res = {}
    return res
app_selenium/logger.py | 34 (new file)
@@ -0,0 +1,34 @@
import logging
import logging.handlers
import os

# Get env var
path_logs_parameterization = os.getenv("PATH_LOGS_PARAMETERIZATION", "logs/log_app_selenium_{}.log")

# Directory of logs
directory = '/'.join(path_logs_parameterization.split("/")[:-1])
os.makedirs(directory, exist_ok=True)

logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
logger.setLevel(logging.DEBUG)

# File log: DEBUG and above
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("debug"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)

# File log: INFO and above
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("info"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.INFO)
logger.addHandler(fh)

# File log: WARNING and above
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("warning"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.WARNING)
logger.addHandler(fh)


def get_logger():
    return logger
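A quick usage sketch of the level routing set up above (filenames resolve via PATH_LOGS_PARAMETERIZATION; the messages are illustrative):

```
from logger import get_logger

logger = get_logger()
logger.debug("lands in the debug file only")      # below the INFO and WARNING handler levels
logger.info("lands in the debug and info files")  # below the WARNING handler level
logger.warning("lands in all three files")        # passes every handler's level
```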
app_selenium/missing_kids.py | 83 (new file)
@@ -0,0 +1,83 @@
from selenium import webdriver
from selenium.webdriver.common.by import By
from utils import get_chrome_options
import time
import os

from logger import get_logger
logger = get_logger()


class MissingKidsFetcher():

    def __init__(self) -> None:
        pass

    def get_missing_kids_urls(self, first_n_pages=-1):
        # Poster URL
        url = "https://www.missingkids.org/gethelpnow/search/poster-search-results"
        # URLs
        set_urls = set()

        try:
            # Initialize
            driver = webdriver.Chrome(options=get_chrome_options())
            # Go to URL
            driver.get(url)
            # Iterate
            i, continue_iterating, num_exceptions = 1, True, 0
            while (continue_iterating):
                logger.debug("Processing page: {}...".format(i))

                try:
                    # Env vars are strings: cast before sleeping
                    time.sleep(int(os.getenv("SELENIUM_SLEEP_PER_PAGE", 4)))  # driver.implicitly_wait(3)
                    # Fetch poster URLs
                    for element_type in ["a"]:  # ["a", "p", "div"]:
                        for elem in driver.find_elements(By.TAG_NAME, element_type):
                            href = elem.get_attribute('href')
                            if (href is not None) and ("missingkids.org/poster" in href):
                                set_urls.add(href)

                    logger.debug("#URLS: {}".format(len(set_urls)))

                    # Next page
                    elem = driver.find_element(By.LINK_TEXT, str(i+1))
                    logger.debug("Clicking: {}...".format(elem.text))
                    elem.click()
                    # Ok
                    processed_ok = True
                except Exception as e:
                    # +1 exception
                    num_exceptions += 1
                    processed_ok = False

                    if (num_exceptions == 3):
                        continue_iterating = False
                    else:
                        logger.info("Exception while clicking page {}, retrying...".format(i+1))

                    # Log the pagination links currently visible (between "<<" and ">>")
                    start_print = False
                    for e in driver.find_elements(By.PARTIAL_LINK_TEXT, ""):
                        if (e.text == "<<"):
                            start_print = True
                        if (e.text == ">>"):
                            break
                        if (start_print):
                            logger.info(e.text)

                    # driver.refresh()
                    time.sleep(int(os.getenv("SELENIUM_SLEEP_PER_PAGE", 4)))

                if (i == first_n_pages):
                    continue_iterating = False
                if (processed_ok):
                    i += 1
                    num_exceptions = 0

        except Exception as e:
            logger.warning("Exception while clicking page {}. {}".format(i+1, str(e)), exc_info=True)
        finally:
            try:
                # quit() ends the session (close() would leave chromedriver running)
                driver.quit()
            except Exception:
                pass

        return set_urls
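A minimal local usage sketch of the fetcher above (assumes Chromium and chromedriver are available, as installed in the Dockerfile):

```
from missing_kids import MissingKidsFetcher

# Crawl only the first two result pages; -1 would iterate until pagination fails
urls = MissingKidsFetcher().get_missing_kids_urls(first_n_pages=2)
print(len(urls), "poster URLs")
```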
app_selenium/utils.py | 14 (new file)
@@ -0,0 +1,14 @@
from selenium.webdriver.chrome.options import Options


def get_chrome_options():
    """Sets Chrome options for Selenium.

    Headless browser mode is enabled and image loading is disabled.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_prefs = {}
    chrome_options.experimental_options["prefs"] = chrome_prefs
    chrome_prefs["profile.default_content_settings"] = {"images": 2}
    return chrome_options
@@ -2,7 +2,7 @@
  "cells": [
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -11,16 +11,42 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "db_postgres\n",
+      "db_redis\n",
+      "\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 2/0\n",
+      " ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
+      " ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
+      "\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 2/4\n",
+      " ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
+      " ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
+      "\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 4/4\u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container db_redis \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
+      "\u001b[?25h"
+     ]
+    }
+   ],
   "source": [
-    "!docker rm -f db_postgres db_redis; docker compose -f ../docker/docker-compose.yml up -d ; sleep 5"
+    "!docker rm -f db_postgres db_redis; docker compose -f ../docker/docker-compose.yml up -d ; sleep 5\n",
+    "!rm logs/*"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -37,7 +63,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -163,9 +189,41 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\t urls\n",
+      "[]\n",
+      "\t urls_duplicate\n",
+      "[]\n",
+      "\t urls_source_search\n",
+      "[]\n",
+      "\t source\n",
+      "[]\n",
+      "\t search\n",
+      "[(1,\n",
+      "  'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC',\n",
+      "  'rss_feed'),\n",
+      " (2, 'missingkids.org/poster', 'url_host'),\n",
+      " (3, 'missingkids.org/new-poster', 'url_host'),\n",
+      " (4, 'breitbart.com', 'url_host'),\n",
+      " (5, 'child abuse', 'keyword_search')]\n",
+      "\t status_pattern_matching\n",
+      "[('.*youtube\\\\.com/.*', 50, 'invalid'),\n",
+      " ('.*tiktok\\\\.com/.*', 50, 'invalid'),\n",
+      " ('.*twitter\\\\.com/.*', 50, 'invalid'),\n",
+      " ('.*reddit\\\\.com/.*', 50, 'invalid'),\n",
+      " ('.*libreddit\\\\.de/.*', 50, 'invalid'),\n",
+      " ('.*radio\\\\.foxnews\\\\.com/.*', 50, 'invalid')]\n",
+      "\t url_content\n",
+      "[]\n"
+     ]
+    }
+   ],
   "source": [
    "# Connect to an existing database\n",
    "with psycopg.connect(connection_info) as conn:\n",
@@ -182,9 +240,23 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[(1,\n",
+      "  'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC',\n",
+      "  'rss_feed'),\n",
+      " (2, 'missingkids.org/poster', 'url_host'),\n",
+      " (3, 'missingkids.org/new-poster', 'url_host'),\n",
+      " (4, 'breitbart.com', 'url_host'),\n",
+      " (5, 'child abuse', 'keyword_search')]\n"
+     ]
+    }
+   ],
   "source": [
    "# Connect to an existing database\n",
    "with psycopg.connect(connection_info) as conn:\n",
@@ -195,9 +267,17 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[]\n"
+     ]
+    }
+   ],
   "source": [
    "# Connect to an existing database\n",
    "with psycopg.connect(connection_info) as conn:\n",
@@ -209,9 +289,20 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'\\n!docker rm -f db_redis; docker compose -f ../docker/docker-compose.yml up -d\\n\\n# Connect to an existing database\\nwith psycopg.connect(connection_info) as conn:\\n # Open a cursor to perform database operations\\n with conn.cursor() as cur:\\n pprint( cur.execute(\"TRUNCATE URLS, URL_CONTENT, URLS_SOURCE_SEARCH, URLS_DUPLICATE;\") )\\n # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES (\\'missingkids.org\\', \\'url_host\\');\" )\\n'"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
    "'''\n",
    "!docker rm -f db_redis; docker compose -f ../docker/docker-compose.yml up -d\n",
@@ -96,6 +96,9 @@ FETCHER_GNEWS_DECODE_SLEEP=2
 FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=4
 FETCHER_BETWEEN_SEARCHES_SLEEP=5
 FETCHER_URL_HOST_SLEEP=5
+FETCHER_LANGUAGE_DETECTION_MIN_CHAR=100
+
+SELENIUM_ENDPOINT="http://selenium_app:80"
 ```

 * Deploy
app_urls/api/src/fetch_missing_kids.py | 42 (new file)
@@ -0,0 +1,42 @@
from .db_utils import DB_Handler
from ..models import Search, Source
import os
import requests
import json
import traceback
from .logger import get_logger
logger = get_logger()


class FetchMissingKids():

    def __init__(self) -> None:
        logger.debug("Initializing Fetcher MissingKids")

    def run(self, number_pages=-1):
        try:
            logger.debug("Starting MissingKids.run(), processing #{} pages".format(number_pages))

            # Get source object
            obj_source, created = Source.objects.get_or_create(source="missingkids.org")
            # Get search object
            obj_search, created = Search.objects.get_or_create(search="missingkids.org/poster", type=Search.TYPE_ENUM.URL_HOST)

            try:
                # Missing kids fetching endpoint, parameterized by the number of pages to fetch
                # (plain string formatting: os.path.join is meant for filesystem paths, not URLs)
                missingkids_fetch_endpoint = "{}/get_missing_kids/?pages={}".format(os.getenv("SELENIUM_ENDPOINT", "http://localhost:80"), number_pages)
                # Timeout
                if (number_pages > 15) or (number_pages == -1):
                    timeout = 60*90  # 1.5h
                else:
                    timeout = 60*10  # 10 min
                # Request
                r = requests.get(missingkids_fetch_endpoint, timeout=timeout)
                # Decode
                urls_fetched = json.loads(r.text).get("list_urls", [])
            except Exception as e:
                logger.warning("Request failed: {}. {}".format(missingkids_fetch_endpoint, str(e)))
                urls_fetched = []

            # Write to DB
            DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
        except Exception as e:
            logger.warning("Exception in MissingKids.run(): {}\n{}".format(e, traceback.format_exc()))
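For reference, a hypothetical manual trigger (the import path below is a guess; it depends on how the Django project is laid out):

```
# Hypothetical: run the fetcher by hand, e.g. from a Django shell
from api.src.fetch_missing_kids import FetchMissingKids

FetchMissingKids().run(number_pages=5)  # 5 pages -> 10 min request timeout
```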
@@ -1,4 +1,3 @@
-import traceback
 import os
 from django.core.cache import cache
 from .logger import get_logger
@@ -30,7 +29,7 @@ def decode_gnews_urls(encoded_urls, interval=int(os.getenv("FETCHER_GNEWS_DECODE
                 # Cache decoded URL
                 cache.set("gnews_decode_{}".format(url), decoded_url, timeout=60*60*12)
             else:
-                logger.info("Bad status while decoding news.google.com, URL {}".format(url))
+                logger.info("Bad status while decoding news.google.com, URL {}\n{}".format(url, decoded_url_dict.get("message")))
         except Exception as e:
             logger.warning("Error decoding news.google.com, URL: {}".format(url))
     return list_decoded_urls
@@ -69,6 +69,16 @@ def process_url(url):
     except Exception as e:
         logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
         return None

+    try:
+        content_merged = "\n".join([article.title, article.meta_description, article.text])
+        if (len(content_merged) > int(os.getenv("FETCHER_LANGUAGE_DETECTION_MIN_CHAR", 100))):
+            language = langdetect.detect(content_merged)
+        else:
+            language = None
+    except Exception as e:
+        logger.info("Could not detect language: {}\n{}".format(url, str(e)))
+        language = None

     dict_data = {
         "url": url,
@@ -76,8 +86,7 @@ def process_url(url):
         "url_host": article.source_url,
         "site_name": article.meta_site_name,
         "publish_date": article.publish_date,
-        # article.meta_lang -> Not always reliable
-        "language": langdetect.detect("\n".join([article.title, article.meta_description, article.text])),
+        "language": language,  # article.meta_lang -> Not always reliable
         "title": article.title,
         "description": article.meta_description,
         "content": article.text,
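One note on the guard added above: langdetect is noisy on very short inputs, which is why detection is skipped below FETCHER_LANGUAGE_DETECTION_MIN_CHAR. A standalone sketch of the same logic (sample strings are illustrative):

```
import langdetect
from langdetect.lang_detect_exception import LangDetectException

MIN_CHAR = 100  # mirrors FETCHER_LANGUAGE_DETECTION_MIN_CHAR

def detect_language(text):
    if len(text) <= MIN_CHAR:
        return None  # too short: detection would be unreliable
    try:
        return langdetect.detect(text)
    except LangDetectException:
        return None

print(detect_language("A longer English paragraph about a news article. " * 4))  # likely 'en'
print(detect_language("ok"))  # None
```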
@@ -3,10 +3,8 @@ from scheduler import job
 from .src.fetch_feed import FetchFeeds
 from .src.fetch_parser import FetchParser
 from .src.fetch_search import FetchSearcher
+from .src.fetch_missing_kids import FetchMissingKids
 from .src.db_utils import DB_Handler
-'''
-from src.missing_kids_fetch import MissingKidsFetch
-'''

 from .src.logger import get_logger
 logger = get_logger()
@@ -32,7 +30,19 @@ def fetch_search():
     FetchSearcher().run()
     logger.info("Task completed: {}".format(task))


-# TODO: fetch_missing_kids()
+@job('default')
+def fetch_missing_kids(number_pages=5):
+    task = "Fetch MissingKids"
+    logger.info("Task triggered: {}".format(task))
+    FetchMissingKids().run(number_pages)
+    logger.info("Task completed: {}".format(task))
+
+
+@job('default')
+def fetch_missing_kids_all(number_pages=-1):
+    task = "Fetch MissingKids"
+    logger.info("Task triggered: {}".format(task))
+    FetchMissingKids().run(number_pages)
+    logger.info("Task completed: {}".format(task))


 @job('default')
 def process_raw_urls(batch_size=50):
@@ -77,8 +87,15 @@ def background_task(process_type: str):
             FetchParser().run()
         elif (process_type == "fetch_search"):
             FetchSearcher().run()
+        #elif (process_type == "fetch_missingkids"):
+        #    FetchMissingKids().run()
+        elif (process_type == "fetch_missingkids_all"):
+            FetchMissingKids().run(number_pages=-1)
+        elif ("fetch_missingkids" in process_type):
+            # number_pages encoded in URL
+            try:
+                number_pages = int(process_type.split("_")[-1])
+            except Exception as e:
+                number_pages = -1
+            FetchMissingKids().run(number_pages=number_pages)
         elif ("process_" in process_type):
             # Batch size encoded in URL
             try:
@@ -95,14 +112,6 @@ def background_task(process_type: str):
         else:
             logger.info("Task unknown!: {}".format(process_type))

-        '''
-        # Selenium based
-        elif (process_type == "fetch_missing_kids_reduced"):
-            MissingKidsFetch(db_handler, num_pages=4).run()
-        elif (process_type == "fetch_missing_kids_full"):
-            MissingKidsFetch(db_handler, num_pages=100000).run()
-        '''

         logger.info("Task completed: {}".format(process_type))
     except Exception as e:
         logger.error(e)
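Since the page count rides along in the task name, here is a standalone sketch of the decoding used above:

```
# How background_task recovers number_pages from a task name like "fetch_missingkids_5"
for process_type in ["fetch_missingkids_5", "fetch_missingkids"]:
    try:
        number_pages = int(process_type.split("_")[-1])
    except ValueError:
        number_pages = -1  # no numeric suffix: fetch all pages
    print(process_type, "->", number_pages)  # 5, then -1
```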
@@ -258,7 +258,7 @@ input[type="checkbox"] {
 <span id="offText" class="off-text">OFF</span>
 </span>
 </div>
--->
+-->

 <!-- Pages Per Page Dropdown -->
 <h3>Pages Per Page</h3>
@@ -297,6 +297,17 @@ input[type="checkbox"] {
 </label><br>
 {% endfor %}

+<!-- Filter by valid content -->
+<h3>Valid content</h3>
+<button type="button" class="toggle-all-btn" data-toggle="valid_content">Toggle All</button><br>
+{% for vc in valid_contents %}
+<label>
+    <input type="checkbox" name="valid_content" value="{{ vc }}"
+    {% if vc|stringformat:"s" in selected_valid_contents or 'all' in selected_valid_contents %}checked{% endif %}>
+    {{ vc|truncatechars:50 }}
+</label><br>
+{% endfor %}

 <!-- Filter by Search -->
 <h3>Search</h3>
 <button type="button" class="toggle-all-btn" data-toggle="search">Toggle All</button><br>
@@ -329,7 +340,7 @@ input[type="checkbox"] {
     {{ lang|truncatechars:50 }}
 </label><br>
 {% endfor %}


 </form>
 </div>
@@ -526,10 +537,6 @@ input[type="checkbox"] {
 const checkboxes = document.querySelectorAll(`[name='${section}']`);
 const allChecked = Array.from(checkboxes).every(checkbox => checkbox.checked);
 checkboxes.forEach(cb => cb.checked = !allChecked);
-/*
-// Automatically submit the form when a checkbox is toggled
-document.getElementById('filterForm').submit();
-*/
 updateFormParameter(section);
 }
@@ -545,9 +552,6 @@ input[type="checkbox"] {
 // Automatically submit the form when any checkbox changes
 document.querySelectorAll('input[type="checkbox"]').forEach(function(checkbox) {
     checkbox.addEventListener('change', function() {
-        /*
-        document.getElementById('filterForm').submit();
-        */
         updateFormParameter(this.name);
     });
 });
@@ -15,7 +15,7 @@ def trigger_task(request, task):
 ####################################################################################################
 def link_list(request):
     prefix = "http://localhost:8000/task"
-    links = ["fetch_feeds", "fetch_parser", "fetch_search", "process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"]
+    links = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all", "process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"]

     list_links = [
         # DB
@@ -212,21 +212,26 @@ def filtered_urls(request):
     # TODO: Cache languages, update once every N
     languages = list(UrlContent.objects.distinct('language').values_list('language', flat=True))
     # Null for visualization
-    languages = ["Null"] + [l for l in languages if l is not None]
+    languages = ["Unknown"] + [l for l in languages if l is not None]
+    valid_contents = ["True", "False", "Unknown"]

     # Get selected parameters
     selected_status = request.GET.getlist('status', ["null"])
     selected_search = request.GET.getlist('search', ["null"])
     selected_source = request.GET.getlist('source', ["null"])
     selected_language = request.GET.getlist('language', ["null"])
+    selected_valid_contents = request.GET.getlist('valid_content', ["null"])
     selected_days = request.GET.get("days", 30)
     per_page = request.GET.get('per_page', 100)  # Default is X URLs per page
     page_number = request.GET.get('page')  # Get the current page number

     all_status = [str(status[0]) for status in statuses]
     all_search = [str(search.id) for search in searches]
     all_source = [str(source.id) for source in sources]
     all_languages = languages
+    all_valid_contents = valid_contents

     # Override with default filters? [Case: no params update on URL] -> Only on "Home" click, or "Next page"
     if (len(request.GET.keys()) == 0) or ((len(request.GET.keys()) == 1) and ("page" in request.GET.keys())):
@@ -234,23 +239,22 @@ def filtered_urls(request):
         selected_search = ["all"]
         selected_source = ["all"]
         selected_language = ["all"]
-
-        # print(set(selected_status), set(all_status))
-        """
-        # List of TODO remove...
-        if (set(selected_status) == set(all_status)):
-            selected_status = ["all"]
-        if (set(selected_search) == set(all_search)):
-            selected_search = ["all"]
-        if (set(selected_source) == set(all_source)):
-            selected_source = ["all"]
-        if (set(selected_language) == set(languages)):
-            selected_language = ["all"]"
-        """
+        selected_valid_contents = ["all"]
+    else:
+        # Non-default parameters: if a list contains all elements, replace it with "all" to avoid a heavy query
+        if (set(selected_status) == set(all_status)):
+            selected_status = ["all"]
+        if (set(selected_search) == set(all_search)):
+            selected_search = ["all"]
+        if (set(selected_source) == set(all_source)):
+            selected_source = ["all"]
+        if (set(selected_language) == set(all_languages)):
+            selected_language = ["all"]
+        if (set(selected_valid_contents) == set(all_valid_contents)):
+            selected_valid_contents = ["all"]

     # Filter URLs based on selected filters
-    if ('null' in selected_status) or ('null' in selected_search) or ('null' in selected_source) or ('null' in selected_language):
+    if ('null' in selected_status) or ('null' in selected_search) or ('null' in selected_source) or ('null' in selected_language) or ('null' in selected_valid_contents):
         urls = []
     else:
         # Filter by date
@@ -262,18 +266,36 @@ def filtered_urls(request):
             query &= Q(urlssourcesearch__id_source__in=selected_source)
         if ("all" not in selected_search):
             query &= Q(urlssourcesearch__id_search__in=selected_search)
         if ("all" not in selected_language):
             # URLs with selected languages
             subquery = Q(urlcontent__language__in=selected_language)
-            if ("Null" in selected_language):
+            if ("Unknown" in selected_language):
                 # URLs with NULL language
                 subquery |= Q(urlcontent__language__isnull=True)
                 # URLs with no UrlContent record at all (similar to URLs with NULL language)
                 subquery |= Q(urlcontent__id_url__isnull=True)
             # Update query
             query &= (subquery)
+        if ("all" not in selected_valid_contents):
+            # Boolean array
+            bool_array = []
+            if ('True' in selected_valid_contents):
+                bool_array.append(True)
+            if ('False' in selected_valid_contents):
+                bool_array.append(False)
+            # URLs with selected valid_contents
+            subquery = Q(urlcontent__valid_content__in=bool_array)
+            if ("Unknown" in selected_valid_contents):
+                # URLs with NULL valid_content
+                subquery |= Q(urlcontent__valid_content__isnull=True)
+                # URLs with no UrlContent record at all (similar to URLs with NULL valid_content)
+                subquery |= Q(urlcontent__id_url__isnull=True)
+            # Update query
+            query &= (subquery)

         # Run query
         urls = Urls.objects.filter(query).distinct()  # .order_by('-ts_fetch')
         # print(urls.query)

         # Pagination
         paginator = Paginator(urls, per_page)  # Paginate the filtered URLs
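A compact illustration of how the Unknown option above folds NULL values and missing UrlContent rows into one OR-ed filter (Q objects compose without hitting the database):

```
from django.db.models import Q

selected_language = ["en", "Unknown"]
subquery = Q(urlcontent__language__in=selected_language)
if "Unknown" in selected_language:
    subquery |= Q(urlcontent__language__isnull=True)   # NULL language
    subquery |= Q(urlcontent__id_url__isnull=True)     # no UrlContent row at all
print(subquery)  # later applied via Urls.objects.filter(query).distinct()
```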
@@ -300,11 +322,13 @@ def filtered_urls(request):
         'searches': sorted(searches, key=lambda x: (x.type, x.search)),
         'sources': sorted(sources, key=lambda x: x.source),
         'languages': sorted(languages, key=lambda x: (x is None, x)),
+        'valid_contents': valid_contents,
         # Selection
         'selected_status': selected_status,
         'selected_search': selected_search,
         'selected_source': selected_source,
         'selected_language': selected_language,
+        'selected_valid_contents': selected_valid_contents,
         "selected_days": selected_days,
         # Map
         "sources_map": sources_map,
@@ -2,7 +2,48 @@ version: '3.9'
 services:

-  matitos_db:
+  fetcher_selenium:
+    build:
+      context: ./app_selenium
+    container_name: selenium_app
+    restart: unless-stopped
+    shm_size: 512mb
+    environment:
+      - SELENIUM_SLEEP_PER_PAGE=4
+      # Unquoted: in list-style environment entries, quotes become part of the value
+      - PATH_LOGS_PARAMETERIZATION=logs/log_app_selenium_{}.log
+    ports:
+      - 80
+
+  fetcher_urls_app:
+    build:
+      context: ./app_urls
+    container_name: urls_app
+    restart: unless-stopped
+    environment:
+      #- name=value
+      # Database
+      - DB_NAME=${DB_NAME:-matitos}
+      - DB_USER=${DB_USER:-supermatitos}
+      - DB_PASSWORD=${DB_PASSWORD:-supermatitos}
+      - DB_HOST=${DB_HOST:-localhost} # db_postgres
+      - DB_PORT=${DB_PORT:-5432}
+      - REDIS_HOST=${REDIS_HOST:-localhost}
+      - REDIS_PORT=${REDIS_PORT:-6379}
+      # Job timeout: 30 min
+      - JOB_DEFAULT_TIMEOUT=${RQ_DEFAULT_TIMEOUT:-1800}
+      # Logs path
+      - PATH_LOGS_PARAMETERIZATION=logs/log_app_fetcher_{}.log
+      # Fetcher
+      - FETCHER_GNEWS_DECODE_SLEEP=2
+      - FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=4
+      - FETCHER_BETWEEN_SEARCHES_SLEEP=5
+      - FETCHER_URL_HOST_SLEEP=5
+      # Selenium
+      - SELENIUM_ENDPOINT=http://selenium_app:80
+    ports:
+      - 80
+
+  fetcher_db:
     image: postgres:17
     container_name: db_postgres
     restart: unless-stopped
@@ -18,7 +59,7 @@ services:
     ports:
       - 5432:5432

-  matitos_redis:
+  fetcher_redis:
     image: redis:alpine
     container_name: db_redis
     restart: unless-stopped
@@ -27,7 +68,7 @@ services:
     #expose:
     #  - 6379

-  matitos_adminer:
+  fetcher_adminer:
     # http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public
     image: adminer
     container_name: adminer
@@ -41,7 +82,7 @@ services:
     ports:
       - 8080:8080

-  matitos_dozzle:
+  fetcher_dozzle:
     container_name: dozzle
     image: amir20/dozzle:latest
     volumes:
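With the wiring above, the urls app reaches the Selenium service by container name. A hypothetical smoke test from inside the urls_app container:

```
# Hypothetical connectivity check against the Selenium service
import requests

r = requests.get("http://selenium_app:80/get_missing_kids/", params={"pages": 1}, timeout=600)
print(r.status_code, len(r.json().get("list_urls", [])))
```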