424 lines
28 KiB
Plaintext
424 lines
28 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# !pip install psycopg[binary]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"db_postgres\n",
|
||
"db_redis\n",
|
||
"\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 0/0\n",
|
||
" ⠙ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.1s \u001b[0m\n",
|
||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
|
||
" ⠹ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.2s \u001b[0m\n",
|
||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
|
||
" ⠸ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.3s \u001b[0m\n",
|
||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
|
||
" ⠼ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
|
||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
|
||
" ⠴ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.5s \u001b[0m\n",
|
||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
|
||
" ⠦ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.6s \u001b[0m\n",
|
||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
|
||
" ⠧ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.7s \u001b[0m\n",
|
||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
|
||
" ⠇ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.8s \u001b[0m\n",
|
||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
|
||
" ⠏ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.9s \u001b[0m\n",
|
||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
|
||
" ⠋ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.0s \u001b[0m\n",
|
||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
|
||
" ⠙ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.1s \u001b[0m\n",
|
||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
|
||
" ⠹ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.2s \u001b[0m\n",
|
||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
|
||
" ⠸ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.3s \u001b[0m\n",
|
||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
|
||
" ⠼ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.4s \u001b[0m\n",
|
||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
|
||
" ⠴ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.5s \u001b[0m\n",
|
||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
|
||
" ⠦ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.6s \u001b[0m\n",
|
||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
|
||
" ⠧ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.7s \u001b[0m\n",
|
||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
|
||
" ⠇ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m⠀⠀⠀\u001b[0m] 0B/0B Pulling \u001b[39m\u001b[0m \u001b[34m1.8s \u001b[0m\n",
|
||
" ⠋ b5b68a794063 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.0s \u001b[0m\n",
|
||
" ⠋ 764914624645 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.0s \u001b[0m\n",
|
||
" ⠋ 82780b9b6d69 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.0s \u001b[0m\n",
|
||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/4\n",
|
||
" ⠏ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m⠀⠀⠀\u001b[0m] 0B/0B Pulling \u001b[39m\u001b[0m \u001b[34m1.9s \u001b[0m\n",
|
||
" ⠙ b5b68a794063 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.1s \u001b[0m\n",
|
||
" ⠙ 764914624645 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.1s \u001b[0m\n",
|
||
" ⠙ 82780b9b6d69 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.1s \u001b[0m\n",
|
||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/4\n",
|
||
" ⠋ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m⠀⠀⠀\u001b[0m] 0B/0B Pulling \u001b[39m\u001b[0m \u001b[34m2.0s \u001b[0m\n",
|
||
" ⠹ b5b68a794063 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.2s \u001b[0m\n",
|
||
" ⠹ 764914624645 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.2s \u001b[0m\n",
|
||
" ⠹ 82780b9b6d69 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.2s \u001b[0m\n",
|
||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/4\n",
|
||
" ⠙ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m⠀⠀⠀\u001b[0m] 0B/0B Pulling \u001b[39m\u001b[0m \u001b[34m2.1s \u001b[0m\n",
|
||
" ⠸ b5b68a794063 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.3s \u001b[0m\n",
|
||
" ⠸ 764914624645 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.3s \u001b[0m\n",
|
||
" ⠸ 82780b9b6d69 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.3s \u001b[0m\n",
|
||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 1/4\n",
|
||
" ⠹ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m⣿⠀⠀\u001b[0m] 0B/0B Pulling \u001b[39m\u001b[0m \u001b[34m2.2s \u001b[0m\n",
|
||
" \u001b[32m✔\u001b[0m b5b68a794063 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
|
||
" ⠼ 764914624645 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
|
||
" ⠼ 82780b9b6d69 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
|
||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 2/4\n",
|
||
" ⠸ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m⣿⣿⠀\u001b[0m] 166.8kB/16.38MB Pulling \u001b[39m\u001b[0m \u001b[34m2.3s \u001b[0m\n",
|
||
" \u001b[32m✔\u001b[0m b5b68a794063 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
|
||
" \u001b[32m✔\u001b[0m 764914624645 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
|
||
" ⠴ 82780b9b6d69 Downloading \u001b[39m 166.8kB/16.38MB\u001b[0m \u001b[34m0.5s \u001b[0m\n",
|
||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 2/4\n",
|
||
" ⠼ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m⣿⣿⣤\u001b[0m] 9.833MB/16.38MB Pulling \u001b[39m\u001b[0m \u001b[34m2.4s \u001b[0m\n",
|
||
" \u001b[32m✔\u001b[0m b5b68a794063 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
|
||
" \u001b[32m✔\u001b[0m 764914624645 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
|
||
" ⠦ 82780b9b6d69 Downloading \u001b[39m 9.833MB/16.38MB\u001b[0m \u001b[34m0.6s \u001b[0m\n",
|
||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 2/4\n",
|
||
" ⠴ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m⣿⣿⠀\u001b[0m] 163.8kB/16.38MB Pulling \u001b[39m\u001b[0m \u001b[34m2.5s \u001b[0m\n",
|
||
" \u001b[32m✔\u001b[0m b5b68a794063 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
|
||
" \u001b[32m✔\u001b[0m 764914624645 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
|
||
" ⠿ 82780b9b6d69 Extracting \u001b[39m 163.8kB/16.38MB\u001b[0m \u001b[34m0.7s \u001b[0m\n",
|
||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 2/4\n",
|
||
" ⠦ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m⣿⣿⣤\u001b[0m] 9.667MB/16.38MB Pulling \u001b[39m\u001b[0m \u001b[34m2.6s \u001b[0m\n",
|
||
" \u001b[32m✔\u001b[0m b5b68a794063 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
|
||
" \u001b[32m✔\u001b[0m 764914624645 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
|
||
" ⠿ 82780b9b6d69 Extracting \u001b[39m 9.667MB/16.38MB\u001b[0m \u001b[34m0.8s \u001b[0m\n",
|
||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 4/4\u001b[0m\n",
|
||
" \u001b[32m✔\u001b[0m matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m⣿⣿⣿\u001b[0m] 0B/0B Pulled \u001b[32m\u001b[0m \u001b[34m2.7s \u001b[0m\n",
|
||
" \u001b[32m✔\u001b[0m b5b68a794063 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
|
||
" \u001b[32m✔\u001b[0m 764914624645 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
|
||
" \u001b[32m✔\u001b[0m 82780b9b6d69 Pull complete \u001b[32m\u001b[0m \u001b[34m0.9s \u001b[0m\n",
|
||
"\u001b[?25h\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 0/0\n",
|
||
" ⠋ Container db_redis \u001b[39mCreating\u001b[0m \u001b[34m0.0s \u001b[0m\n",
|
||
" ⠋ Container db_postgres \u001b[39mCreating\u001b[0m \u001b[34m0.0s \u001b[0m\n",
|
||
" ⠋ Container dozzle \u001b[39mCreating\u001b[0m \u001b[34m0.0s \u001b[0m\n",
|
||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 1/3\n",
|
||
" ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
|
||
" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
|
||
" ⠿ Container dozzle \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
|
||
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
|
||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 1/4\n",
|
||
" ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
|
||
" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
|
||
" ⠿ Container dozzle \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
|
||
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
|
||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 4/4\u001b[0m\n",
|
||
" \u001b[32m✔\u001b[0m Container db_redis \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
|
||
" \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
|
||
" \u001b[32m✔\u001b[0m Container dozzle \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
|
||
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
|
||
"\u001b[?25h"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"!docker rm -f db_postgres db_redis; docker compose -f docker/docker-compose.yml up -d ; sleep 5"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"INSERT_TABLES = True\n",
|
||
"INSERT_SAMPLE_DATA = False\n",
|
||
"\n",
|
||
"import psycopg\n",
|
||
"connection_info = \"host={} port={} user={} password={} dbname={}\".format(\"localhost\", \"5432\", \"supermatitos\", \"supermatitos\", \"matitos\")\n",
|
||
"\n",
|
||
"from datetime import datetime, timezone\n",
|
||
"import re\n",
|
||
"from pprint import pprint\n",
|
||
"\n",
|
||
"if INSERT_TABLES:\n",
|
||
" # Connect to an existing database\n",
|
||
" with psycopg.connect(connection_info) as conn:\n",
|
||
" # Open a cursor to perform database operations\n",
|
||
" with conn.cursor() as cur:\n",
|
||
" # Autocommit at end of transaction (Atomic insert of URLs and sources)\n",
|
||
" with conn.transaction() as tx:\n",
|
||
" # Create URLs table\n",
|
||
" c = cur.execute(\"\"\"\n",
|
||
" CREATE TYPE URL_STATUS AS ENUM ('raw', 'error', 'valid', 'unknown', 'invalid', 'duplicate');\n",
|
||
"\n",
|
||
" CREATE TABLE URLS (\n",
|
||
" id SERIAL PRIMARY KEY,\n",
|
||
" url TEXT NOT NULL UNIQUE,\n",
|
||
" ts_fetch TIMESTAMPTZ NOT NULL DEFAULT NOW(),\n",
|
||
" status URL_STATUS NOT NULL DEFAULT 'raw' -- ,\n",
|
||
" -- status_wendy WENDY_STATUS DEFAULT NULL,\n",
|
||
" -- ts_wendy TIMESTAMPTZ DEFAULT NULL\n",
|
||
" );\n",
|
||
" CREATE INDEX idx_urls_status ON urls(status);\n",
|
||
" CREATE INDEX idx_urls_ts_fetch ON urls(ts_fetch);\n",
|
||
"\n",
|
||
" CREATE TABLE URLS_DUPLICATE (\n",
|
||
" id_url_canonical INTEGER REFERENCES URLS(id),\n",
|
||
" id_url_duplicated INTEGER REFERENCES URLS(id),\n",
|
||
" PRIMARY KEY (id_url_canonical, id_url_duplicated)\n",
|
||
" );\n",
|
||
" \n",
|
||
" CREATE TYPE SEARCH_TYPE AS ENUM ('rss_feed', 'keyword_search', 'url_host');\n",
|
||
" CREATE TABLE SEARCH (\n",
|
||
" id SMALLSERIAL PRIMARY KEY,\n",
|
||
" search TEXT NOT NULL UNIQUE,\n",
|
||
" type SEARCH_TYPE NOT NULL\n",
|
||
" );\n",
|
||
" CREATE INDEX idx_search_type ON SEARCH(type);\n",
|
||
" \n",
|
||
" CREATE TABLE SOURCE (\n",
|
||
" id SMALLSERIAL PRIMARY KEY,\n",
|
||
" source TEXT NOT NULL UNIQUE\n",
|
||
" );\n",
|
||
" \n",
|
||
" CREATE TABLE URLS_SOURCE_SEARCH (\n",
|
||
" id_url INTEGER REFERENCES URLS(id),\n",
|
||
" id_source SMALLINT REFERENCES SOURCE(id) ON UPDATE CASCADE ON DELETE RESTRICT,\n",
|
||
" id_search SMALLINT REFERENCES SEARCH(id) ON UPDATE CASCADE ON DELETE RESTRICT,\n",
|
||
" PRIMARY KEY(id_url, id_source, id_search)\n",
|
||
" );\n",
|
||
" CREATE INDEX idx_source ON URLS_SOURCE_SEARCH(id_source);\n",
|
||
" CREATE INDEX idx_search ON URLS_SOURCE_SEARCH(id_search);\n",
|
||
"\n",
|
||
" CREATE TABLE STATUS_PATTERN_MATCHING (\n",
|
||
" pattern TEXT PRIMARY KEY,\n",
|
||
" priority SMALLINT NOT NULL,\n",
|
||
" status URL_STATUS NOT NULL\n",
|
||
" );\n",
|
||
" \n",
|
||
" \n",
|
||
" CREATE TABLE URL_CONTENT (\n",
|
||
" id_url INTEGER PRIMARY KEY REFERENCES URLS(id),\n",
|
||
" date_published TIMESTAMPTZ DEFAULT NOW(),\n",
|
||
" title TEXT,\n",
|
||
" description TEXT,\n",
|
||
" content TEXT,\n",
|
||
" valid_content BOOLEAN,\n",
|
||
" language CHAR(2), -- ISO 639-1 Code\n",
|
||
" keywords TEXT[],\n",
|
||
" tags TEXT[],\n",
|
||
" authors TEXT[],\n",
|
||
" image_main_url TEXT,\n",
|
||
" images_url TEXT[],\n",
|
||
" videos_url TEXT[],\n",
|
||
" url_host TEXT, -- www.breitbart.com\n",
|
||
" site_name TEXT -- Breitbart News\n",
|
||
" );\n",
|
||
" CREATE INDEX idx_tags ON URL_CONTENT USING GIN(tags);\n",
|
||
" CREATE INDEX idx_authors ON URL_CONTENT USING GIN(authors);\n",
|
||
" CREATE INDEX idx_date_published ON URL_CONTENT (date_published);\n",
|
||
" CREATE INDEX idx_valid_content ON URL_CONTENT (valid_content);\n",
|
||
" CREATE INDEX idx_language ON URL_CONTENT (language);\n",
|
||
" CREATE INDEX idx_url_host ON URL_CONTENT (url_host);\n",
|
||
" \"\"\")\n",
|
||
"\n",
|
||
" ### Default insert values\n",
|
||
" \n",
|
||
" # Feeds\n",
|
||
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC', 'rss_feed');\" )\n",
|
||
" # Websites of interest\n",
|
||
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('www.missingkids.org/poster', 'url_host');\" )\n",
|
||
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('www.breitbart.com', 'url_host');\" )\n",
|
||
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/poster', 'url_host');\" )\n",
|
||
" # Search keywords\n",
|
||
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search');\" )\n",
|
||
" \n",
|
||
" # Status update based on pattern matching (with priority to apply in order). Regex test https://regex101.com/\n",
|
||
" # cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 75, 'valid');\".format(\".*{}.*\".format(re.escape(\"missingkids.org/poster/\"))) )\n",
|
||
" cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"youtube.com/\"))) )\n",
|
||
" cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"tiktok.com/\"))) )\n",
|
||
" cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"twitter.com/\"))) )\n",
|
||
" cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"reddit.com/\"))) )\n",
|
||
" cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"libreddit.de/\"))) )\n",
|
||
" cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"radio.foxnews.com/\"))) )"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"if INSERT_SAMPLE_DATA:\n",
|
||
" # Connect to an existing database\n",
|
||
" with psycopg.connect(connection_info) as conn:\n",
|
||
" # Open a cursor to perform database operations\n",
|
||
" with conn.cursor() as cur:\n",
|
||
" # Autocommit at end of transaction (Atomic insert of URLs and sources)\n",
|
||
" with conn.transaction() as tx:\n",
|
||
" # Valid\n",
|
||
" cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.foxnews.com/us/husband-ruby-franke-utah-mommy-blogger-convicted-child-abuse-regrets-wifes-fall-fame', 'valid')\")\n",
|
||
" cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.bbc.com/news/articles/ckg843y8y7no', 'valid')\")\n",
|
||
" cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.wilx.com/2025/03/05/lenawee-county-man-arrested-possessing-child-abuse-material/', 'valid')\")\n",
|
||
" cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.dw.com/en/trauma-how-child-abuse-victims-deal-with-parenthood/a-71833895', 'valid')\")\n",
|
||
" cur.execute(\"INSERT INTO URLS (url, status) values ('https://nypost.com/2025/03/06/us-news/colorado-day-care-worker-hit-with-51-charges-of-child-abuse-harassment-for-slapping-toddler/', 'valid')\")\n",
|
||
" cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.fox35orlando.com/news/tavares-police-florida-boys-10-9-abused-sheer-brutality', 'valid')\")\n",
|
||
" cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.google.com', 'invalid')\")\n",
|
||
"\n",
|
||
" cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.missingkids.org/poster/USVA/VA25-0820/1', 'valid')\")\n",
|
||
" cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.missingkids.org/poster/NCMC/2045193/1', 'valid')\")\n",
|
||
"\n",
|
||
" cur.execute(\"INSERT INTO SOURCE (source) values ('news.google.com')\")\n",
|
||
" cur.execute(\"INSERT INTO SOURCE (source) values ('qwant.com')\")\n",
|
||
"\n",
|
||
" cur.execute(\"INSERT INTO URLS_SOURCE (id_url, id_source, id_search) values (1, 1, 1)\")\n",
|
||
"\n",
|
||
" for j in range(5):\n",
|
||
" import time\n",
|
||
" time.sleep(0.25)\n",
|
||
" cur.execute(\"INSERT INTO URLS (url, status) values ('www.super_{}.org', 'invalid')\".format(j))\n",
|
||
" \n",
|
||
" # Long URLs \n",
|
||
" cur.execute(\"INSERT INTO URLS (url, status) values ('www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf_23dj9sdgj9sdgj8sdf8ds8f.html', 'invalid')\".format(j))\n",
|
||
" cur.execute(\"INSERT INTO URLS (url, status) values ('www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf.html', 'invalid')\".format(j))\n",
|
||
"\n",
|
||
" # URL Content\n",
|
||
" language, content = \"en\", \"Bla Bla Bla!!!\"*25\n",
|
||
" cur.execute(\"INSERT INTO URL_CONTENT (id_url, date_published, title, description, content, language, tags, authors, images_url) values (%s, %s, 'Mommy blogger turned child abuser', %s, 'Hello there!', %s, %s, %s, %s)\", \n",
|
||
" (1, datetime.now(tz=timezone.utc), content, language, [\"child abuse\", \"social media\"], [\"Audrey Conklin\"], [\"https://a57.foxnews.com/static.foxnews.com/foxnews.com/content/uploads/2023/08/1440/810/image-58.jpg?ve=1&tl=1\"]))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\t urls\n",
|
||
"[]\n",
|
||
"\t urls_duplicate\n",
|
||
"[]\n",
|
||
"\t urls_source_search\n",
|
||
"[]\n",
|
||
"\t source\n",
|
||
"[]\n",
|
||
"\t search\n",
|
||
"[(1,\n",
|
||
" 'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC',\n",
|
||
" 'rss_feed'),\n",
|
||
" (2, 'www.breitbart.com', 'url_host'),\n",
|
||
" (3, 'child abuse', 'keyword_search')]\n",
|
||
"\t status_pattern_matching\n",
|
||
"[('.*youtube\\\\.com/.*', 50, 'invalid'),\n",
|
||
" ('.*tiktok\\\\.com/.*', 50, 'invalid'),\n",
|
||
" ('.*twitter\\\\.com/.*', 50, 'invalid'),\n",
|
||
" ('.*reddit\\\\.com/.*', 50, 'invalid'),\n",
|
||
" ('.*libreddit\\\\.de/.*', 50, 'invalid'),\n",
|
||
" ('.*radio\\\\.foxnews\\\\.com/.*', 50, 'invalid')]\n",
|
||
"\t url_content\n",
|
||
"[]\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Connect to an existing database\n",
|
||
"with psycopg.connect(connection_info) as conn:\n",
|
||
" # Open a cursor to perform database operations\n",
|
||
" with conn.cursor() as cur:\n",
|
||
" # Get tables\n",
|
||
" cur.execute(\"SELECT table_name FROM information_schema.tables WHERE table_schema='public';\")\n",
|
||
" tables = [t[0] for t in cur.fetchall()]\n",
|
||
"\n",
|
||
" for t in tables:\n",
|
||
" print(\"\\t\", t)\n",
|
||
" pprint( cur.execute(\"SELECT * FROM {} LIMIT 50;\".format(t)).fetchall() )"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"[(1,\n",
|
||
" 'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC',\n",
|
||
" 'rss_feed'),\n",
|
||
" (2, 'www.breitbart.com', 'url_host'),\n",
|
||
" (3, 'child abuse', 'keyword_search')]\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Connect to an existing database\n",
|
||
"with psycopg.connect(connection_info) as conn:\n",
|
||
" # Open a cursor to perform database operations\n",
|
||
" with conn.cursor() as cur:\n",
|
||
" pprint( cur.execute(\"SELECT * FROM SEARCH;\").fetchall() )"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"[]\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Connect to an existing database\n",
|
||
"with psycopg.connect(connection_info) as conn:\n",
|
||
" # Open a cursor to perform database operations\n",
|
||
" with conn.cursor() as cur:\n",
|
||
" pprint( cur.execute(\"SELECT * FROM URLS LIMIT 150;\").fetchall() )\n",
|
||
" #pprint( cur.execute(\"SELECT id_url, title, valid_content FROM URL_CONTENT LIMIT 10;\").fetchall() )"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "matitos",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.12.9"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|