{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# PostgreSQL schema bootstrap\n",
    "\n",
    "Creates the URL tracking schema (types, tables, indexes), seeds the `SEARCH` and `STATUS_PATTERN_MATCHING` tables, optionally inserts sample rows, and finally dumps the tables for a visual check.\n",
    "\n",
    "The notebook is meant to be executed top to bottom on a fresh kernel. The `CREATE` statements are not idempotent: they expect a database in which these types and tables do not exist yet."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip install \"psycopg[binary]\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Start the containers\n",
    "\n",
    "Removes the `db_postgres` and `db_redis` containers, starts the compose stack again and waits a few seconds before the cells below try to connect."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!docker rm -f db_postgres db_redis; docker compose -f docker/docker-compose.yml up -d ; sleep 5"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports and configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import time\n",
    "from datetime import datetime, timezone\n",
    "from pprint import pprint\n",
    "\n",
    "import psycopg\n",
    "from psycopg import sql"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Switches for the two write cells below\n",
    "INSERT_TABLES = True\n",
    "INSERT_SAMPLE_DATA = False\n",
    "\n",
    "# Connection string. Every setting can be overridden through the matching libpq\n",
    "# environment variable; the fallbacks are the values this notebook used before.\n",
    "connection_info = \"host={} port={} user={} password={} dbname={}\".format(\n",
    "    os.environ.get(\"PGHOST\", \"localhost\"),\n",
    "    os.environ.get(\"PGPORT\", \"5432\"),\n",
    "    os.environ.get(\"PGUSER\", \"supermatitos\"),\n",
    "    os.environ.get(\"PGPASSWORD\", \"supermatitos\"),\n",
    "    os.environ.get(\"PGDATABASE\", \"matitos\"),\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create the schema and seed the configuration tables\n",
    "\n",
    "Everything in this cell runs inside one transaction, so a failure in any statement leaves the database untouched."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if INSERT_TABLES:\n",
    "    # Connect to an existing database\n",
    "    with psycopg.connect(connection_info) as conn:\n",
    "        # Open a cursor to perform database operations\n",
    "        with conn.cursor() as cur:\n",
    "            # Committed when the block exits normally, rolled back if any statement raises\n",
    "            with conn.transaction():\n",
    "                # Create types, tables and indexes.\n",
    "                # Several statements in one execute() are only accepted because no parameters are passed.\n",
    "                cur.execute(\"\"\"\n",
    "                    CREATE TYPE URL_STATUS AS ENUM ('raw', 'error', 'valid', 'unknown', 'invalid', 'duplicate');\n",
    "\n",
    "                    CREATE TABLE URLS (\n",
    "                        id SERIAL PRIMARY KEY,\n",
    "                        url TEXT NOT NULL UNIQUE,\n",
    "                        ts_fetch TIMESTAMPTZ NOT NULL DEFAULT NOW(),\n",
    "                        status URL_STATUS NOT NULL DEFAULT 'raw' -- ,\n",
    "                        -- status_wendy WENDY_STATUS DEFAULT NULL,\n",
    "                        -- ts_wendy TIMESTAMPTZ DEFAULT NULL\n",
    "                    );\n",
    "                    CREATE INDEX idx_urls_status ON urls(status);\n",
    "                    CREATE INDEX idx_urls_ts_fetch ON urls(ts_fetch);\n",
    "\n",
    "                    CREATE TABLE URLS_DUPLICATE (\n",
    "                        id_url_canonical INTEGER REFERENCES URLS(id),\n",
    "                        id_url_duplicated INTEGER REFERENCES URLS(id),\n",
    "                        PRIMARY KEY (id_url_canonical, id_url_duplicated)\n",
    "                    );\n",
    "\n",
    "                    CREATE TYPE SEARCH_TYPE AS ENUM ('rss_feed', 'keyword_search', 'url_host');\n",
    "                    CREATE TABLE SEARCH (\n",
    "                        id SMALLSERIAL PRIMARY KEY,\n",
    "                        search TEXT NOT NULL UNIQUE,\n",
    "                        type SEARCH_TYPE NOT NULL\n",
    "                    );\n",
    "                    CREATE INDEX idx_search_type ON SEARCH(type);\n",
    "\n",
    "                    CREATE TABLE SOURCE (\n",
    "                        id SMALLSERIAL PRIMARY KEY,\n",
    "                        source TEXT NOT NULL UNIQUE\n",
    "                    );\n",
    "\n",
    "                    CREATE TABLE URLS_SOURCE_SEARCH (\n",
    "                        id_url INTEGER REFERENCES URLS(id),\n",
    "                        id_source SMALLINT REFERENCES SOURCE(id) ON UPDATE CASCADE ON DELETE RESTRICT,\n",
    "                        id_search SMALLINT REFERENCES SEARCH(id) ON UPDATE CASCADE ON DELETE RESTRICT,\n",
    "                        PRIMARY KEY(id_url, id_source)\n",
    "                    );\n",
    "                    -- The index must name the table created just above (there is no URLS_SOURCE table)\n",
    "                    CREATE INDEX idx_source ON URLS_SOURCE_SEARCH(id_source);\n",
    "\n",
    "                    CREATE TABLE STATUS_PATTERN_MATCHING (\n",
    "                        pattern TEXT PRIMARY KEY,\n",
    "                        priority SMALLINT NOT NULL,\n",
    "                        status URL_STATUS NOT NULL\n",
    "                    );\n",
    "\n",
    "                    CREATE TABLE URL_CONTENT (\n",
    "                        id_url INTEGER PRIMARY KEY REFERENCES URLS(id),\n",
    "                        date_published TIMESTAMPTZ DEFAULT NOW(),\n",
    "                        title TEXT,\n",
    "                        description TEXT,\n",
    "                        content TEXT,\n",
    "                        valid_content BOOLEAN,\n",
    "                        language CHAR(2), -- ISO 639-1 Code\n",
    "                        keywords TEXT[],\n",
    "                        tags TEXT[],\n",
    "                        authors TEXT[],\n",
    "                        image_main_url TEXT,\n",
    "                        images_url TEXT[],\n",
    "                        videos_url TEXT[],\n",
    "                        url_host TEXT, -- www.breitbart.com\n",
    "                        site_name TEXT -- Breitbart News\n",
    "                    );\n",
    "                    CREATE INDEX idx_tags ON URL_CONTENT USING GIN(tags);\n",
    "                    CREATE INDEX idx_authors ON URL_CONTENT USING GIN(authors);\n",
    "                    CREATE INDEX idx_date_published ON URL_CONTENT (date_published);\n",
    "                    CREATE INDEX idx_valid_content ON URL_CONTENT (valid_content);\n",
    "                    CREATE INDEX idx_language ON URL_CONTENT (language);\n",
    "                    CREATE INDEX idx_url_host ON URL_CONTENT (url_host);\n",
    "                \"\"\")\n",
    "\n",
    "                # Feeds\n",
    "                cur.execute(\"INSERT INTO SEARCH (search, type) VALUES ('https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC', 'rss_feed');\")\n",
    "                # Websites of interest\n",
    "                cur.execute(\"INSERT INTO SEARCH (search, type) VALUES ('www.unicef.org', 'url_host');\")\n",
    "                cur.execute(\"INSERT INTO SEARCH (search, type) VALUES ('www.breitbart.com', 'url_host');\")\n",
    "                # Search keywords\n",
    "                cur.execute(\"INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search');\")\n",
    "\n",
    "                # Status update based on pattern matching (with priority to apply in order). Regex test https://regex101.com/\n",
    "                # Currently disabled rule: \"missingkids.org/poster/\" with priority 75 and status 'valid'\n",
    "                invalid_url_fragments = [\n",
    "                    \"youtube.com/\",\n",
    "                    \"tiktok.com/\",\n",
    "                    \"twitter.com/\",\n",
    "                    \"reddit.com/\",\n",
    "                    \"libreddit.de/\",\n",
    "                    \"radio.foxnews.com/\",\n",
    "                ]\n",
    "                # The regex is sent as a bound parameter instead of being pasted into the SQL text\n",
    "                cur.executemany(\n",
    "                    \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES (%s, 50, 'invalid');\",\n",
    "                    [(\".*{}.*\".format(re.escape(fragment)),) for fragment in invalid_url_fragments],\n",
    "                )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Insert sample data (optional)\n",
    "\n",
    "Only runs when `INSERT_SAMPLE_DATA` is `True`. The hard-coded ids in the link rows assume the `URLS` and `SOURCE` tables were empty and freshly created."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if INSERT_SAMPLE_DATA:\n",
    "    # Connect to an existing database\n",
    "    with psycopg.connect(connection_info) as conn:\n",
    "        # Open a cursor to perform database operations\n",
    "        with conn.cursor() as cur:\n",
    "            # Autocommit at end of transaction (Atomic insert of URLs and sources)\n",
    "            with conn.transaction():\n",
    "                # Inserted one by one, in list order, because the link rows below refer to\n",
    "                # these rows through the SERIAL ids they receive.\n",
    "                sample_urls = [\n",
    "                    (\"https://www.foxnews.com/us/husband-ruby-franke-utah-mommy-blogger-convicted-child-abuse-regrets-wifes-fall-fame\", \"valid\"),\n",
    "                    (\"https://www.bbc.com/news/articles/ckg843y8y7no\", \"valid\"),\n",
    "                    (\"https://www.wilx.com/2025/03/05/lenawee-county-man-arrested-possessing-child-abuse-material/\", \"valid\"),\n",
    "                    (\"https://www.dw.com/en/trauma-how-child-abuse-victims-deal-with-parenthood/a-71833895\", \"valid\"),\n",
    "                    (\"https://nypost.com/2025/03/06/us-news/colorado-day-care-worker-hit-with-51-charges-of-child-abuse-harassment-for-slapping-toddler/\", \"valid\"),\n",
    "                    (\"https://www.fox35orlando.com/news/tavares-police-florida-boys-10-9-abused-sheer-brutality\", \"valid\"),\n",
    "                    (\"https://www.google.com\", \"invalid\"),\n",
    "                    (\"https://www.missingkids.org/poster/USVA/VA25-0820/1\", \"valid\"),\n",
    "                    (\"https://www.missingkids.org/poster/NCMC/2045193/1\", \"valid\"),\n",
    "                ]\n",
    "                for url, status in sample_urls:\n",
    "                    cur.execute(\"INSERT INTO URLS (url, status) VALUES (%s, %s)\", (url, status))\n",
    "\n",
    "                for source in (\"news.google.com\", \"qwant.com\"):\n",
    "                    cur.execute(\"INSERT INTO SOURCE (source) VALUES (%s)\", (source,))\n",
    "\n",
    "                # URLs 1-7 are linked to source 1, URLs 1-3 also to source 2.\n",
    "                # id_search is not provided, so it is stored as NULL.\n",
    "                url_source_links = [(id_url, 1) for id_url in range(1, 8)]\n",
    "                url_source_links += [(id_url, 2) for id_url in range(1, 4)]\n",
    "                cur.executemany(\n",
    "                    \"INSERT INTO URLS_SOURCE_SEARCH (id_url, id_source) VALUES (%s, %s)\",\n",
    "                    url_source_links,\n",
    "                )\n",
    "\n",
    "                for j in range(5):\n",
    "                    # NOTE(review): ts_fetch defaults to NOW(), which PostgreSQL fixes at transaction start,\n",
    "                    # so this pause does not produce different timestamps inside this single transaction.\n",
    "                    # Confirm whether staggered timestamps were intended.\n",
    "                    time.sleep(0.25)\n",
    "                    cur.execute(\n",
    "                        \"INSERT INTO URLS (url, status) VALUES (%s, 'invalid')\",\n",
    "                        (\"www.super_{}.org\".format(j),),\n",
    "                    )\n",
    "\n",
    "                # Long URLs\n",
    "                long_urls = [\n",
    "                    \"www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf_23dj9sdgj9sdgj8sdf8ds8f.html\",\n",
    "                    \"www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf.html\",\n",
    "                ]\n",
    "                for long_url in long_urls:\n",
    "                    cur.execute(\"INSERT INTO URLS (url, status) VALUES (%s, 'invalid')\", (long_url,))\n",
    "\n",
    "                # URL Content\n",
    "                # NOTE(review): the long filler text is bound to the description column while the content\n",
    "                # column receives the literal 'Hello there!' - kept as in the original, confirm it is intended.\n",
    "                language, content = \"en\", \"Bla Bla Bla!!!\" * 25\n",
    "                cur.execute(\n",
    "                    \"INSERT INTO URL_CONTENT (id_url, date_published, title, description, content, language, tags, authors, images_url) values (%s, %s, 'Mommy blogger turned child abuser', %s, 'Hello there!', %s, %s, %s, %s)\",\n",
    "                    (\n",
    "                        1,\n",
    "                        datetime.now(tz=timezone.utc),\n",
    "                        content,\n",
    "                        language,\n",
    "                        [\"child abuse\", \"social media\"],\n",
    "                        [\"Audrey Conklin\"],\n",
    "                        [\"https://a57.foxnews.com/static.foxnews.com/foxnews.com/content/uploads/2023/08/1440/810/image-58.jpg?ve=1&tl=1\"],\n",
    "                    ),\n",
    "                )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Inspect the database\n",
    "\n",
    "Read-only checks: the configured searches, the first rows of every table in the `public` schema, and the stored URLs."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Connect to an existing database\n",
    "with psycopg.connect(connection_info) as conn:\n",
    "    # Open a cursor to perform database operations\n",
    "    with conn.cursor() as cur:\n",
    "        pprint(cur.execute(\"SELECT * FROM SEARCH;\").fetchall())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Connect to an existing database\n",
    "with psycopg.connect(connection_info) as conn:\n",
    "    # Open a cursor to perform database operations\n",
    "    with conn.cursor() as cur:\n",
    "        # Get tables\n",
    "        cur.execute(\"SELECT table_name FROM information_schema.tables WHERE table_schema='public';\")\n",
    "        tables = [row[0] for row in cur.fetchall()]\n",
    "\n",
    "        for table_name in tables:\n",
    "            print(\"\\t\", table_name)\n",
    "            # Identifier() quotes the table name instead of pasting it into the SQL text\n",
    "            query = sql.SQL(\"SELECT * FROM {} LIMIT 50;\").format(sql.Identifier(table_name))\n",
    "            pprint(cur.execute(query).fetchall())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Connect to an existing database\n",
    "with psycopg.connect(connection_info) as conn:\n",
    "    # Open a cursor to perform database operations\n",
    "    with conn.cursor() as cur:\n",
    "        pprint(cur.execute(\"SELECT * FROM URLS LIMIT 150;\").fetchall())\n",
    "        #pprint( cur.execute(\"SELECT id_url, title, valid_content FROM URL_CONTENT LIMIT 10;\").fetchall() )"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "matitos",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}