Better language detection, fetch parser error handling, Google News RSS search

This commit is contained in:
Luciano Gervasoni
2025-03-31 17:44:53 +02:00
parent b3f896b35a
commit 077219fcb6
6 changed files with 201 additions and 140 deletions

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
@@ -11,7 +11,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 24,
"metadata": {},
"outputs": [
{
@@ -20,108 +20,20 @@
"text": [
"db_postgres\n",
"db_redis\n",
"\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 0/0\n",
" ⠙ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.1s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠹ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.2s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠸ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.3s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠼ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠴ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.5s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠦ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.6s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠧ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.7s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠇ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.8s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠏ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.9s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠋ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠙ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.1s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠹ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.2s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠸ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.3s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠼ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.4s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠴ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.5s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠦ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.6s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠧ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.7s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠇ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m\u001b[0m] 0B/0B Pulling \u001b[39m\u001b[0m \u001b[34m1.8s \u001b[0m\n",
" ⠋ b5b68a794063 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" ⠋ 764914624645 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" ⠋ 82780b9b6d69 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/4\n",
" ⠏ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m\u001b[0m] 0B/0B Pulling \u001b[39m\u001b[0m \u001b[34m1.9s \u001b[0m\n",
" ⠙ b5b68a794063 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" ⠙ 764914624645 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" ⠙ 82780b9b6d69 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.1s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/4\n",
" ⠋ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m\u001b[0m] 0B/0B Pulling \u001b[39m\u001b[0m \u001b[34m2.0s \u001b[0m\n",
" ⠹ b5b68a794063 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" ⠹ 764914624645 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" ⠹ 82780b9b6d69 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.2s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/4\n",
" ⠙ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m\u001b[0m] 0B/0B Pulling \u001b[39m\u001b[0m \u001b[34m2.1s \u001b[0m\n",
" ⠸ b5b68a794063 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.3s \u001b[0m\n",
" ⠸ 764914624645 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.3s \u001b[0m\n",
" ⠸ 82780b9b6d69 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.3s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 1/4\n",
" ⠹ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m⣿\u001b[0m] 0B/0B Pulling \u001b[39m\u001b[0m \u001b[34m2.2s \u001b[0m\n",
" \u001b[32m✔\u001b[0m b5b68a794063 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" ⠼ 764914624645 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" ⠼ 82780b9b6d69 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 2/4\n",
" ⠸ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m⣿⣿\u001b[0m] 166.8kB/16.38MB Pulling \u001b[39m\u001b[0m \u001b[34m2.3s \u001b[0m\n",
" \u001b[32m✔\u001b[0m b5b68a794063 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" \u001b[32m✔\u001b[0m 764914624645 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" ⠴ 82780b9b6d69 Downloading \u001b[39m 166.8kB/16.38MB\u001b[0m \u001b[34m0.5s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 2/4\n",
" ⠼ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m⣿⣿⣤\u001b[0m] 9.833MB/16.38MB Pulling \u001b[39m\u001b[0m \u001b[34m2.4s \u001b[0m\n",
" \u001b[32m✔\u001b[0m b5b68a794063 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" \u001b[32m✔\u001b[0m 764914624645 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" ⠦ 82780b9b6d69 Downloading \u001b[39m 9.833MB/16.38MB\u001b[0m \u001b[34m0.6s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 2/4\n",
" ⠴ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m⣿⣿\u001b[0m] 163.8kB/16.38MB Pulling \u001b[39m\u001b[0m \u001b[34m2.5s \u001b[0m\n",
" \u001b[32m✔\u001b[0m b5b68a794063 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" \u001b[32m✔\u001b[0m 764914624645 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" ⠿ 82780b9b6d69 Extracting \u001b[39m 163.8kB/16.38MB\u001b[0m \u001b[34m0.7s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 2/4\n",
" ⠦ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m⣿⣿⣤\u001b[0m] 9.667MB/16.38MB Pulling \u001b[39m\u001b[0m \u001b[34m2.6s \u001b[0m\n",
" \u001b[32m✔\u001b[0m b5b68a794063 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" \u001b[32m✔\u001b[0m 764914624645 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" ⠿ 82780b9b6d69 Extracting \u001b[39m 9.667MB/16.38MB\u001b[0m \u001b[34m0.8s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 4/4\u001b[0m\n",
" \u001b[32m✔\u001b[0m matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m⣿⣿⣿\u001b[0m] 0B/0B Pulled \u001b[32m\u001b[0m \u001b[34m2.7s \u001b[0m\n",
" \u001b[32m✔\u001b[0m b5b68a794063 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" \u001b[32m✔\u001b[0m 764914624645 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" \u001b[32m✔\u001b[0m 82780b9b6d69 Pull complete \u001b[32m\u001b[0m \u001b[34m0.9s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 0/0\n",
" ⠋ Container db_redis \u001b[39mCreating\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" ⠋ Container db_postgres \u001b[39mCreating\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" ⠋ Container dozzle \u001b[39mCreating\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 1/3\n",
"\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 2/0\n",
" ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" Container dozzle \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 1/4\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 2/4\n",
" ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" Container dozzle \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 4/4\u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_redis \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container dozzle \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_redis \u001b[32mStarted\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h"
]
@@ -133,7 +45,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
@@ -145,8 +57,15 @@
"\n",
"from datetime import datetime, timezone\n",
"import re\n",
"from pprint import pprint\n",
"\n",
"from pprint import pprint"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"if INSERT_TABLES:\n",
" # Connect to an existing database\n",
" with psycopg.connect(connection_info) as conn:\n",
@@ -234,9 +153,9 @@
" # Feeds\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC', 'rss_feed');\" )\n",
" # Websites of interest\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('www.missingkids.org/poster', 'url_host');\" )\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('www.breitbart.com', 'url_host');\" )\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/poster', 'url_host');\" )\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/new-poster', 'url_host');\" )\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('breitbart.com', 'url_host');\" )\n",
" # Search keywords\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search');\" )\n",
" \n",
@@ -252,7 +171,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
@@ -304,7 +223,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 28,
"metadata": {},
"outputs": [
{
@@ -323,8 +242,10 @@
"[(1,\n",
" 'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC',\n",
" 'rss_feed'),\n",
" (2, 'www.breitbart.com', 'url_host'),\n",
" (3, 'child abuse', 'keyword_search')]\n",
" (2, 'missingkids.org/poster', 'url_host'),\n",
" (3, 'missingkids.org/new-poster', 'url_host'),\n",
" (4, 'breitbart.com', 'url_host'),\n",
" (5, 'child abuse', 'keyword_search')]\n",
"\t status_pattern_matching\n",
"[('.*youtube\\\\.com/.*', 50, 'invalid'),\n",
" ('.*tiktok\\\\.com/.*', 50, 'invalid'),\n",
@@ -353,7 +274,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 29,
"metadata": {},
"outputs": [
{
@@ -363,8 +284,10 @@
"[(1,\n",
" 'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC',\n",
" 'rss_feed'),\n",
" (2, 'www.breitbart.com', 'url_host'),\n",
" (3, 'child abuse', 'keyword_search')]\n"
" (2, 'missingkids.org/poster', 'url_host'),\n",
" (3, 'missingkids.org/new-poster', 'url_host'),\n",
" (4, 'breitbart.com', 'url_host'),\n",
" (5, 'child abuse', 'keyword_search')]\n"
]
}
],
@@ -378,7 +301,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 30,
"metadata": {},
"outputs": [
{
@@ -397,6 +320,49 @@
" pprint( cur.execute(\"SELECT * FROM URLS LIMIT 150;\").fetchall() )\n",
" #pprint( cur.execute(\"SELECT id_url, title, valid_content FROM URL_CONTENT LIMIT 10;\").fetchall() )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'\\n!docker rm -f db_redis; docker compose -f docker/docker-compose.yml up -d\\n\\n# Connect to an existing database\\nwith psycopg.connect(connection_info) as conn:\\n # Open a cursor to perform database operations\\n with conn.cursor() as cur:\\n pprint( cur.execute(\"TRUNCATE URLS, URL_CONTENT, URLS_SOURCE_SEARCH, URLS_DUPLICATE;\") )\\n # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES (\\'missingkids.org\\', \\'url_host\\');\" )\\n'"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'''\n",
"!docker rm -f db_redis; docker compose -f docker/docker-compose.yml up -d\n",
"\n",
"# Connect to an existing database\n",
"with psycopg.connect(connection_info) as conn:\n",
" # Open a cursor to perform database operations\n",
" with conn.cursor() as cur:\n",
" pprint( cur.execute(\"TRUNCATE URLS, URL_CONTENT, URLS_SOURCE_SEARCH, URLS_DUPLICATE;\") )\n",
" # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('missingkids.org', 'url_host');\" )\n",
"'''"
]
}
],
"metadata": {

View File

@@ -5,7 +5,7 @@ conda activate matitos_urls
# Core
pip install django psycopg[binary] django-redis django-tasks-scheduler
# Fetcher
pip install feedparser python-dateutil newspaper4k[all] lxml[html_clean] googlenewsdecoder gnews duckduckgo_search GoogleNews
pip install feedparser python-dateutil newspaper4k[all] lxml[html_clean] googlenewsdecoder gnews duckduckgo_search GoogleNews langdetect
# News visualization
pip install ollama
```
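The newly added `langdetect` dependency is what the fetcher code further down in this commit seeds for deterministic results; a minimal smoke test of that setup (the sample sentence is illustrative only):

```python
import langdetect

langdetect.DetectorFactory.seed = 0  # fixed seed -> reproducible detections
print(langdetect.detect("Missing child found safe after two days"))  # expected: 'en'
```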

View File

@@ -28,11 +28,18 @@ class FetchParser():
            # Make sure no requests made for the last X seconds
            url_host_slowdown(url_host_protocol, url_host_slowdown_seconds=5)
            # Source object
            url_host_built = newspaper.build(url_host_protocol)
            # Get articles URL list
            urls_fetched = url_host_built.article_urls()
            try:
                # Source object
                url_host_built = newspaper.build(url_host_protocol)
                # Get articles URL list
                urls_fetched = url_host_built.article_urls()
            except newspaper.exceptions.ArticleException as e:
                logger.debug("ArticleException while parsing input URL {}\n{}".format(url_host_protocol, str(e.args)))
                urls_fetched = []
            except Exception as e:
                logger.warning("Exception while parsing input URL {}\n{}".format(url_host_protocol, str(e)))
                urls_fetched = []
            # Write to DB
            DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
        except Exception as e:
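A self-contained sketch of the guarded build pattern this hunk introduces, assuming only newspaper4k as installed above; the host is an example, not a fixture:

```python
import newspaper

def fetch_article_urls(url_host_protocol):
    try:
        # Building the source and listing its article URLs can both fail on odd hosts
        return newspaper.build(url_host_protocol).article_urls()
    except newspaper.exceptions.ArticleException:
        return []  # known parse failure: worth only a debug log
    except Exception:
        return []  # anything unexpected: log at warning level

urls = fetch_article_urls("https://www.breitbart.com")
```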

View File

@@ -3,10 +3,36 @@ from ..models import Search, Source
from django.db.models import Q
import traceback
import time
from .fetch_search_utils import search_gnews, search_ddg, search_googlenews_general, search_googlenews_news
from .fetch_search_utils import search_gnews, search_ddg, search_googlenews_general, search_googlenews_news, search_googlenews_rss
from .logger import get_logger
logger = get_logger()
'''
from abc import ABC, abstractmethod

# Generic fetcher (fetches articles, writes to DB)
class FetcherAbstract(ABC):
    @abstractmethod
    def _fetch_raw_urls_list(self):
        pass

    def fetch_articles(self, db_writer):
        logger.debug("Starting fetch() for {}".format(self.name))
        # Fetch articles
        list_news = self._fetch()
        logger.info("Found #{} articles for search: {}".format(len(list_news), self.name))
        # Write to DB
        db_writer.write_batch(list_news, self.name)

self._fetch_raw_urls_list()
raw_urls, source = search_googlenews_rss(keyword_search, language="en", country="US")
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
'''
class FetchSearcher():
    def __init__(self) -> None:
        logger.debug("Initializing Fetcher Searcher")
@@ -18,6 +44,16 @@ class FetchSearcher():
        obj_source, created = Source.objects.get_or_create(source=source)
        return obj_source
    def _post_process_urls(self, raw_urls, obj_search):
        # Searching URL Host based? Make sure results belong to that site
        if (obj_search.type == Search.TYPE_ENUM.URL_HOST):
            # Get clean URL host
            url_host_clean = obj_search.search.replace("www.", "").replace("http://", "").replace("https://", "")
            # Ensure URL host in URL
            raw_urls = [u for u in raw_urls if url_host_clean in u]
        return raw_urls
    def run(self):
        try:
            logger.debug("Starting FetchSearcher.run()")
@@ -33,49 +69,55 @@ class FetchSearcher():
            # TODO: intitle: "child abuse"
            # Search
            keyword_search = "{}{}".format("site:" if obj_search.type is Search.TYPE_ENUM.URL_HOST else "", obj_search.search)
            keyword_search = "{}{}".format("site:" if obj_search.type == Search.TYPE_ENUM.URL_HOST else "", obj_search.search)
            logger.debug("Starting keyword search: {}".format(keyword_search))
            logger.debug("Search type: {}".format(obj_search.type))
            # news.google.com/rss
            time.sleep(5)
            raw_urls, source = search_googlenews_rss(keyword_search, language="en", country="US")
            raw_urls = self._post_process_urls(raw_urls, obj_search)
            # Write to DB
            DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
            # DDG News
            time.sleep(5)
            raw_urls, source = search_ddg(keyword_search, category="news", timelimit="d", max_results=None, region = "wt-wt")
            raw_urls, source = search_ddg(keyword_search, category="news", timelimit="d", max_results=None, region = "en-US")
            raw_urls = self._post_process_urls(raw_urls, obj_search)
            # Write to DB
            DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
            # GNews
            time.sleep(5)
            raw_urls, source = search_gnews(keyword_search, language="en", country="US")
            raw_urls = self._post_process_urls(raw_urls, obj_search)
            # Write to DB
            DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
            # DDG Text
            # DDG Text (day, 20 results)
            time.sleep(5)
            raw_urls, source = search_ddg(keyword_search, category="text", timelimit="d", max_results=None, region = "wt-wt")
            raw_urls, source = search_ddg(keyword_search, category="text", timelimit="d", max_results=20, region = "en-US")
            raw_urls = self._post_process_urls(raw_urls, obj_search)
            # Write to DB
            DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
            # GoogleNews news
            time.sleep(5)
            raw_urls, source = search_googlenews_news(keyword_search, period="1d", language="en", country="US")
            raw_urls = self._post_process_urls(raw_urls, obj_search)
            # Write to DB
            DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
            # GoogleNews general
            time.sleep(5)
            raw_urls, source = search_googlenews_general(keyword_search, period="1d", language="en", country="US", max_pages=5)
            raw_urls, source = search_googlenews_general(keyword_search, period="1d", language="en", country="US", max_pages=2)
            raw_urls = self._post_process_urls(raw_urls, obj_search)
            # Write to DB
            DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
            # TODO:
            # SearxNG
            """
            period = "day"
            for searx_instance in get_searxng_instances():
                dict_params_news = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "news", "period": period}
                dict_params_general = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "general", "period": period}
                # Append thread
                FetcherSearxNews(**dict_params_news).fetch_articles(self.db_handler)
                FetcherSearxNews(**dict_params_general).fetch_articles(self.db_handler)
            """
            # TODO: https://github.com/tasos-py/Search-Engines-Scraper/tree/master
        except Exception as e:
            logger.warning("Exception in FetchSearcher.run(): {}\n{}".format(e, traceback.format_exc()))

View File

@@ -2,6 +2,9 @@ from django.core.cache import cache
import traceback
import random
import time
import feedparser
import urllib
import dateutil
from .logger import get_logger
logger = get_logger()
@@ -33,7 +36,7 @@ def decode_gnews_urls(encoded_urls, interval=2):
                # Cache decoded URL
                cache.set("gnews_decode_{}".format(url), decoded_url, timeout=60*60*12)
            else:
                logger.warning("Error decoding news.google.com, URL {}\nMessage: {}".format(url, decoded_url["message"]))
                logger.warning("Error decoding news.google.com, URL {}\nMessage: {}".format(url, str(decoded_url)))
        except Exception as e:
            logger.warning("Error decoding news.google.com, URL: {}\n{}".format(url, traceback.format_exc()))
    return list_decoded_urls
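The logging change above exists because the failure payload is not guaranteed to carry a "message" key; a hedged sketch of the decode call it wraps, assuming googlenewsdecoder's `new_decoderv1` interface (the encoded URL is a placeholder):

```python
from googlenewsdecoder import new_decoderv1

encoded_url = "https://news.google.com/rss/articles/CBMiExample"  # placeholder, not a real article
decoded_url = new_decoderv1(encoded_url, interval=2)
if decoded_url.get("status"):
    print(decoded_url["decoded_url"])
else:
    # shape of the failure payload varies, so log the whole object
    print("decode failed: {}".format(str(decoded_url)))
```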
@@ -124,7 +127,7 @@ def search_googlenews_general(keyword_search, period="1d", language="en", countr
    # Iterate pages
    for i in range(max_pages):
        time.sleep(random.uniform(1, 2.5))
        time.sleep(random.uniform(2, 4.5))
        num_before = len(set_links)
        # Get page
@@ -148,4 +151,47 @@ def search_googlenews_general(keyword_search, period="1d", language="en", countr
    return urls, source

###########################################################################
###########################################################################
def search_googlenews_rss(keyword_search, language="en", country="US"):
    # [source] [category] [period] [language-country] [max_results]
    source = "googlenews-rss {}-{}".format(language, country).replace("None", "").strip()
    logger.debug("Searching: {} --- Source: {}".format(keyword_search, source))
    # https://news.google.com/rss/search?q={}&hl=en-US&gl=US&ceid=US:en
    try:
        # Search URL with parameters filled
        search_url = "https://news.google.com/rss/search?q={}&hl={}&gl={}&ceid={}:{}".format(keyword_search, "{}-{}".format(language, country.upper()), country.upper(), country.upper(), language)
        # Control characters
        search_url = search_url.replace(" ", "+")  # urllib.parse.quote(search_url) # Issue: https%3A//news.google.com/rss/search%3Fq%3Dbreitbart.com%26hl%3Den-US%26gl%3DUS%26ceid%3DUS%3Aen
        # Initialize
        encoded_urls = []
        # Fetch feeds
        feeds = feedparser.parse(search_url)
        # Parse
        for f in feeds.get("entries", []):
            # Encoded URL
            encoded_url = f.get("link", None)
            '''
            # Available publish date?
            publish_date_parsed = f.get("published_parsed")
            if (publish_date_parsed is None):
                publish_date = f.get("published", None)
                if (publish_date is not None):
                    publish_date_parsed = dateutil.parser.parse(publish_date)
            # Published date
            urls_publish_date.append(publish_date_parsed)
            '''
            # Append
            encoded_urls.append(encoded_url)
        # Decode
        urls = decode_gnews_urls(encoded_urls)
    except Exception as e:
        logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc()))
        urls = []
    return urls, source
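As a usage sketch, this is the query URL the function builds for a `site:` search, reproduced with the same formatting as above:

```python
keyword_search, language, country = "site:breitbart.com", "en", "US"
search_url = "https://news.google.com/rss/search?q={}&hl={}&gl={}&ceid={}:{}".format(
    keyword_search, "{}-{}".format(language, country.upper()), country.upper(), country.upper(), language
).replace(" ", "+")
print(search_url)
# https://news.google.com/rss/search?q=site:breitbart.com&hl=en-US&gl=US&ceid=US:en
```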

View File

@@ -4,9 +4,8 @@ logger = get_logger()
import newspaper
import time
from urllib.parse import unquote
# pip install langdetect
#import langdetect
#langdetect.DetectorFactory.seed = 0
import langdetect
langdetect.DetectorFactory.seed = 0
def get_with_protocol(url):
    # http:// -> https://
@@ -76,7 +75,8 @@ def process_url(url):
"url_host": article.source_url,
"site_name": article.meta_site_name,
"publish_date": article.publish_date,
"language": article.meta_lang, # langdetect.detect(article.text)
# article.meta_lang -> Not always reliable
"language": langdetect.detect("\n".join([article.title, article.meta_description, article.text]) ),
"title": article.title,
"description": article.meta_description,
"content": article.text,