{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Matitos database bootstrap\n",
    "\n",
    "Creates the PostgreSQL schema (`INSERT_TABLES`), optionally loads sample rows (`INSERT_SAMPLE_DATA`), and finally prints up to 50 rows of every table in the `public` schema."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install psycopg[binary]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import time\n",
    "from pprint import pprint\n",
    "\n",
    "import psycopg\n",
    "from psycopg import sql\n",
    "from psycopg.conninfo import make_conninfo\n",
    "\n",
    "# One-off setup switches; leave both False to only inspect the database.\n",
    "INSERT_TABLES = False\n",
    "INSERT_SAMPLE_DATA = False\n",
    "\n",
    "# Connection string. Each field can be overridden with the matching libpq\n",
    "# environment variable; the fallbacks are the local development values.\n",
    "connection_info = make_conninfo(\n",
    "    host=os.environ.get(\"PGHOST\", \"localhost\"),\n",
    "    port=os.environ.get(\"PGPORT\", \"5432\"),\n",
    "    user=os.environ.get(\"PGUSER\", \"supermatitos\"),\n",
    "    password=os.environ.get(\"PGPASSWORD\", \"supermatitos\"),\n",
    "    dbname=os.environ.get(\"PGDATABASE\", \"matitos\"),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if INSERT_TABLES:\n",
    "    # Domains to filter\n",
    "    hosts_to_filter = [\n",
    "        \"yewtu.be\",\n",
    "        \"twitter.com\",\n",
    "        \"libreddit.de\",\n",
    "        \"youtube.com\",\n",
    "        \"tiktok.com\",\n",
    "        \"radio.foxnews.com\",\n",
    "    ]\n",
    "\n",
    "    # Connect to an existing database\n",
    "    with psycopg.connect(connection_info) as conn:\n",
    "        # Open a cursor to perform database operations\n",
    "        with conn.cursor() as cur:\n",
    "            # One transaction: the schema and its seed rows are committed together or not at all\n",
    "            with conn.transaction():\n",
    "                # Create the enum type, tables and indexes\n",
    "                cur.execute(\"\"\"\n",
    "                    CREATE TYPE URL_STATUS AS ENUM ('raw', 'error', 'valid', 'unknown', 'invalid', 'duplicate');\n",
    "\n",
    "                    CREATE TABLE URLS (\n",
    "                        id SERIAL PRIMARY KEY,\n",
    "                        url TEXT NOT NULL UNIQUE,\n",
    "                        ts_fetch TIMESTAMPTZ NOT NULL DEFAULT NOW(),\n",
    "                        status URL_STATUS NOT NULL DEFAULT 'raw' -- ,\n",
    "                        -- status_wendy WENDY_STATUS DEFAULT NULL,\n",
    "                        -- ts_wendy TIMESTAMPTZ DEFAULT NULL\n",
    "                    );\n",
    "                    CREATE INDEX idx_urls_status ON urls(status);\n",
    "                    CREATE INDEX idx_urls_ts_fetch ON urls(ts_fetch);\n",
    "\n",
    "                    CREATE TABLE URLS_DUPLICATE (\n",
    "                        id_url_canonical INTEGER REFERENCES URLS(id),\n",
    "                        id_url_duplicated INTEGER REFERENCES URLS(id),\n",
    "                        PRIMARY KEY (id_url_canonical, id_url_duplicated)\n",
    "                    );\n",
    "\n",
    "                    CREATE TABLE FEED (\n",
    "                        id SMALLSERIAL PRIMARY KEY,\n",
    "                        rss_feed TEXT NOT NULL UNIQUE\n",
    "                    );\n",
    "                    CREATE TABLE WEBSITE_OF_INTEREST (\n",
    "                        id SMALLSERIAL PRIMARY KEY,\n",
    "                        url_host TEXT NOT NULL UNIQUE\n",
    "                    );\n",
    "                    CREATE TABLE SEARCH (\n",
    "                        id SMALLSERIAL PRIMARY KEY,\n",
    "                        keyword_search TEXT NOT NULL UNIQUE\n",
    "                    );\n",
    "                    CREATE TABLE SOURCE (\n",
    "                        id SMALLSERIAL PRIMARY KEY,\n",
    "                        source TEXT NOT NULL UNIQUE\n",
    "                    );\n",
    "\n",
    "                    CREATE TABLE URLS_SOURCE (\n",
    "                        id_url INTEGER REFERENCES URLS(id),\n",
    "                        id_source SMALLINT REFERENCES SOURCE(id) ON UPDATE CASCADE ON DELETE RESTRICT, -- Source encodes search information\n",
    "                        PRIMARY KEY(id_url, id_source)\n",
    "                    );\n",
    "                    CREATE INDEX idx_source ON urls_source(id_source);\n",
    "\n",
    "                    CREATE TABLE WEBSITE_TO_FILTER (\n",
    "                        id SMALLSERIAL PRIMARY KEY,\n",
    "                        url_host TEXT NOT NULL UNIQUE\n",
    "                    );\n",
    "\n",
    "                    CREATE TABLE STATUS_PATTERN_MATCHING (\n",
    "                        pattern TEXT PRIMARY KEY,\n",
    "                        priority SMALLINT NOT NULL,\n",
    "                        status URL_STATUS NOT NULL\n",
    "                    );\n",
    "\n",
    "                    CREATE TABLE URL_CONTENT (\n",
    "                        id_url INTEGER REFERENCES URLS(id),\n",
    "                        date_published TIMESTAMPTZ NOT NULL DEFAULT NOW(),\n",
    "                        title TEXT,\n",
    "                        description TEXT,\n",
    "                        content TEXT,\n",
    "                        tags TEXT[],\n",
    "                        authors TEXT[],\n",
    "                        image_urls TEXT[]\n",
    "                    );\n",
    "                    CREATE INDEX idx_tags ON URL_CONTENT USING GIN(tags);\n",
    "                    CREATE INDEX idx_authors ON URL_CONTENT USING GIN(authors);\n",
    "                \"\"\")\n",
    "\n",
    "                # Feeds\n",
    "                cur.execute(\"INSERT INTO FEED (rss_feed) VALUES ('https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC');\")\n",
    "                # Websites of interest\n",
    "                cur.execute(\"INSERT INTO WEBSITE_OF_INTEREST (url_host) VALUES ('www.unicef.org');\")\n",
    "                # Search keywords\n",
    "                cur.execute(\"INSERT INTO SEARCH (keyword_search) VALUES ('child abuse');\")\n",
    "                # Domains to filter\n",
    "                for url_host in hosts_to_filter:\n",
    "                    cur.execute(\"INSERT INTO WEBSITE_TO_FILTER (url_host) VALUES (%s);\", (url_host,))\n",
    "                # Status update based on pattern matching (with priority to apply in order)\n",
    "                cur.execute(\"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*missingkids.org/poster/.*', 50, 'valid');\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if INSERT_SAMPLE_DATA:\n",
    "    valid_urls = [\n",
    "        \"https://www.foxnews.com/us/husband-ruby-franke-utah-mommy-blogger-convicted-child-abuse-regrets-wifes-fall-fame\",\n",
    "        \"https://www.bbc.com/news/articles/ckg843y8y7no\",\n",
    "        \"https://www.wilx.com/2025/03/05/lenawee-county-man-arrested-possessing-child-abuse-material/\",\n",
    "        \"https://www.dw.com/en/trauma-how-child-abuse-victims-deal-with-parenthood/a-71833895\",\n",
    "        \"https://nypost.com/2025/03/06/us-news/colorado-day-care-worker-hit-with-51-charges-of-child-abuse-harassment-for-slapping-toddler/\",\n",
    "        \"https://www.fox35orlando.com/news/tavares-police-florida-boys-10-9-abused-sheer-brutality\",\n",
    "    ]\n",
    "    long_invalid_urls = [\n",
    "        \"www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf_23dj9sdgj9sdgj8sdf8ds8f.html\",\n",
    "        \"www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf.html\",\n",
    "    ]\n",
    "\n",
    "    # The status stays a SQL literal so that only the URL is bound as a parameter\n",
    "    insert_valid_url = \"INSERT INTO URLS (url, status) VALUES (%s, 'valid') RETURNING id\"\n",
    "    insert_invalid_url = \"INSERT INTO URLS (url, status) VALUES (%s, 'invalid') RETURNING id\"\n",
    "    insert_source = \"INSERT INTO SOURCE (source) VALUES (%s) RETURNING id\"\n",
    "    insert_url_source = \"INSERT INTO URLS_SOURCE (id_url, id_source) VALUES (%s, %s)\"\n",
    "\n",
    "    # Connect to an existing database\n",
    "    with psycopg.connect(connection_info) as conn:\n",
    "        # Open a cursor to perform database operations\n",
    "        with conn.cursor() as cur:\n",
    "            # One transaction (atomic insert of URLs and sources)\n",
    "            with conn.transaction():\n",
    "                # Keep the ids assigned by the database instead of assuming they start at 1\n",
    "                url_ids = [cur.execute(insert_valid_url, (url,)).fetchone()[0] for url in valid_urls]\n",
    "                url_ids.append(cur.execute(insert_invalid_url, (\"https://www.google.com\",)).fetchone()[0])\n",
    "\n",
    "                id_news_google = cur.execute(insert_source, (\"news.google.com\",)).fetchone()[0]\n",
    "                id_qwant = cur.execute(insert_source, (\"qwant.com\",)).fetchone()[0]\n",
    "\n",
    "                # All seven URLs are linked to the first source ...\n",
    "                for id_url in url_ids:\n",
    "                    cur.execute(insert_url_source, (id_url, id_news_google))\n",
    "                # ... and the first three are also linked to the second one\n",
    "                for id_url in url_ids[:3]:\n",
    "                    cur.execute(insert_url_source, (id_url, id_qwant))\n",
    "\n",
    "                for j in range(15):\n",
    "                    # NOTE(review): NOW() is fixed at transaction start, so this pause does not give\n",
    "                    # the rows distinct ts_fetch defaults - confirm whether it is still needed.\n",
    "                    time.sleep(1)\n",
    "                    cur.execute(insert_invalid_url, (\"www.super_{}.org\".format(j),))\n",
    "\n",
    "                # Long URLs\n",
    "                for url in long_invalid_urls:\n",
    "                    cur.execute(insert_invalid_url, (url,))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Connect to an existing database\n",
    "with psycopg.connect(connection_info) as conn:\n",
    "    # Open a cursor to perform database operations\n",
    "    with conn.cursor() as cur:\n",
    "        # Get tables\n",
    "        cur.execute(\"SELECT table_name FROM information_schema.tables WHERE table_schema='public';\")\n",
    "        tables = [row[0] for row in cur.fetchall()]\n",
    "\n",
    "        for table_name in tables:\n",
    "            print(\"\\t\", table_name)\n",
    "            # Quote the identifier so mixed-case or reserved table names are handled correctly\n",
    "            query = sql.SQL(\"SELECT * FROM {} LIMIT 50;\").format(sql.Identifier(\"public\", table_name))\n",
    "            pprint(cur.execute(query).fetchall())"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}