Better lang detect, fetch parser handling, rss google news search

This commit is contained in:
Luciano Gervasoni
2025-03-31 17:44:53 +02:00
parent b3f896b35a
commit 077219fcb6
6 changed files with 201 additions and 140 deletions

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
@@ -11,7 +11,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 24,
"metadata": {},
"outputs": [
{
@@ -20,108 +20,20 @@
"text": [
"db_postgres\n",
"db_redis\n",
"\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 0/0\n",
" ⠙ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.1s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠹ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.2s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠸ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.3s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠼ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠴ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.5s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠦ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.6s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠧ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.7s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠇ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.8s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠏ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.9s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠋ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠙ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.1s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠹ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.2s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠸ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.3s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠼ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.4s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠴ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.5s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠦ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.6s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠧ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.7s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠇ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m\u001b[0m] 0B/0B Pulling \u001b[39m\u001b[0m \u001b[34m1.8s \u001b[0m\n",
" ⠋ b5b68a794063 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" ⠋ 764914624645 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" ⠋ 82780b9b6d69 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/4\n",
" ⠏ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m\u001b[0m] 0B/0B Pulling \u001b[39m\u001b[0m \u001b[34m1.9s \u001b[0m\n",
" ⠙ b5b68a794063 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" ⠙ 764914624645 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" ⠙ 82780b9b6d69 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.1s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/4\n",
" ⠋ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m\u001b[0m] 0B/0B Pulling \u001b[39m\u001b[0m \u001b[34m2.0s \u001b[0m\n",
" ⠹ b5b68a794063 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" ⠹ 764914624645 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" ⠹ 82780b9b6d69 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.2s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/4\n",
" ⠙ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m\u001b[0m] 0B/0B Pulling \u001b[39m\u001b[0m \u001b[34m2.1s \u001b[0m\n",
" ⠸ b5b68a794063 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.3s \u001b[0m\n",
" ⠸ 764914624645 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.3s \u001b[0m\n",
" ⠸ 82780b9b6d69 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.3s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 1/4\n",
" ⠹ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m⣿\u001b[0m] 0B/0B Pulling \u001b[39m\u001b[0m \u001b[34m2.2s \u001b[0m\n",
" \u001b[32m✔\u001b[0m b5b68a794063 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" ⠼ 764914624645 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" ⠼ 82780b9b6d69 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 2/4\n",
" ⠸ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m⣿⣿\u001b[0m] 166.8kB/16.38MB Pulling \u001b[39m\u001b[0m \u001b[34m2.3s \u001b[0m\n",
" \u001b[32m✔\u001b[0m b5b68a794063 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" \u001b[32m✔\u001b[0m 764914624645 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" ⠴ 82780b9b6d69 Downloading \u001b[39m 166.8kB/16.38MB\u001b[0m \u001b[34m0.5s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 2/4\n",
" ⠼ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m⣿⣿⣤\u001b[0m] 9.833MB/16.38MB Pulling \u001b[39m\u001b[0m \u001b[34m2.4s \u001b[0m\n",
" \u001b[32m✔\u001b[0m b5b68a794063 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" \u001b[32m✔\u001b[0m 764914624645 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" ⠦ 82780b9b6d69 Downloading \u001b[39m 9.833MB/16.38MB\u001b[0m \u001b[34m0.6s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 2/4\n",
" ⠴ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m⣿⣿\u001b[0m] 163.8kB/16.38MB Pulling \u001b[39m\u001b[0m \u001b[34m2.5s \u001b[0m\n",
" \u001b[32m✔\u001b[0m b5b68a794063 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" \u001b[32m✔\u001b[0m 764914624645 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" ⠿ 82780b9b6d69 Extracting \u001b[39m 163.8kB/16.38MB\u001b[0m \u001b[34m0.7s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 2/4\n",
" ⠦ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m⣿⣿⣤\u001b[0m] 9.667MB/16.38MB Pulling \u001b[39m\u001b[0m \u001b[34m2.6s \u001b[0m\n",
" \u001b[32m✔\u001b[0m b5b68a794063 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" \u001b[32m✔\u001b[0m 764914624645 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" ⠿ 82780b9b6d69 Extracting \u001b[39m 9.667MB/16.38MB\u001b[0m \u001b[34m0.8s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 4/4\u001b[0m\n",
" \u001b[32m✔\u001b[0m matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m⣿⣿⣿\u001b[0m] 0B/0B Pulled \u001b[32m\u001b[0m \u001b[34m2.7s \u001b[0m\n",
" \u001b[32m✔\u001b[0m b5b68a794063 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" \u001b[32m✔\u001b[0m 764914624645 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" \u001b[32m✔\u001b[0m 82780b9b6d69 Pull complete \u001b[32m\u001b[0m \u001b[34m0.9s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 0/0\n",
" ⠋ Container db_redis \u001b[39mCreating\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" ⠋ Container db_postgres \u001b[39mCreating\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" ⠋ Container dozzle \u001b[39mCreating\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 1/3\n",
"\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 2/0\n",
" ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" Container dozzle \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 1/4\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 2/4\n",
" ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" Container dozzle \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 4/4\u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_redis \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container dozzle \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_redis \u001b[32mStarted\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h"
]
@@ -133,7 +45,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
@@ -145,8 +57,15 @@
"\n",
"from datetime import datetime, timezone\n",
"import re\n",
"from pprint import pprint\n",
"\n",
"from pprint import pprint"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"if INSERT_TABLES:\n",
" # Connect to an existing database\n",
" with psycopg.connect(connection_info) as conn:\n",
@@ -234,9 +153,9 @@
" # Feeds\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC', 'rss_feed');\" )\n",
" # Websites of interest\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('www.missingkids.org/poster', 'url_host');\" )\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('www.breitbart.com', 'url_host');\" )\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/poster', 'url_host');\" )\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/new-poster', 'url_host');\" )\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('breitbart.com', 'url_host');\" )\n",
" # Search keywords\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search');\" )\n",
" \n",
@@ -252,7 +171,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
@@ -304,7 +223,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 28,
"metadata": {},
"outputs": [
{
@@ -323,8 +242,10 @@
"[(1,\n",
" 'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC',\n",
" 'rss_feed'),\n",
" (2, 'www.breitbart.com', 'url_host'),\n",
" (3, 'child abuse', 'keyword_search')]\n",
" (2, 'missingkids.org/poster', 'url_host'),\n",
" (3, 'missingkids.org/new-poster', 'url_host'),\n",
" (4, 'breitbart.com', 'url_host'),\n",
" (5, 'child abuse', 'keyword_search')]\n",
"\t status_pattern_matching\n",
"[('.*youtube\\\\.com/.*', 50, 'invalid'),\n",
" ('.*tiktok\\\\.com/.*', 50, 'invalid'),\n",
@@ -353,7 +274,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 29,
"metadata": {},
"outputs": [
{
@@ -363,8 +284,10 @@
"[(1,\n",
" 'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC',\n",
" 'rss_feed'),\n",
" (2, 'www.breitbart.com', 'url_host'),\n",
" (3, 'child abuse', 'keyword_search')]\n"
" (2, 'missingkids.org/poster', 'url_host'),\n",
" (3, 'missingkids.org/new-poster', 'url_host'),\n",
" (4, 'breitbart.com', 'url_host'),\n",
" (5, 'child abuse', 'keyword_search')]\n"
]
}
],
@@ -378,7 +301,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 30,
"metadata": {},
"outputs": [
{
@@ -397,6 +320,49 @@
" pprint( cur.execute(\"SELECT * FROM URLS LIMIT 150;\").fetchall() )\n",
" #pprint( cur.execute(\"SELECT id_url, title, valid_content FROM URL_CONTENT LIMIT 10;\").fetchall() )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'\\n!docker rm -f db_redis; docker compose -f docker/docker-compose.yml up -d\\n\\n# Connect to an existing database\\nwith psycopg.connect(connection_info) as conn:\\n # Open a cursor to perform database operations\\n with conn.cursor() as cur:\\n pprint( cur.execute(\"TRUNCATE URLS, URL_CONTENT, URLS_SOURCE_SEARCH, URLS_DUPLICATE;\") )\\n # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES (\\'missingkids.org\\', \\'url_host\\');\" )\\n'"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'''\n",
"!docker rm -f db_redis; docker compose -f docker/docker-compose.yml up -d\n",
"\n",
"# Connect to an existing database\n",
"with psycopg.connect(connection_info) as conn:\n",
" # Open a cursor to perform database operations\n",
" with conn.cursor() as cur:\n",
" pprint( cur.execute(\"TRUNCATE URLS, URL_CONTENT, URLS_SOURCE_SEARCH, URLS_DUPLICATE;\") )\n",
" # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('missingkids.org', 'url_host');\" )\n",
"'''"
]
}
],
"metadata": {