{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Database setup\n",
    "\n",
    "Brings up the services declared in `../docker/docker-compose.yml`, creates the schema, inserts the default `SEARCH` and `STATUS_PATTERN_MATCHING` rows, and prints table contents for inspection.\n",
    "\n",
    "**Warning:** the container cell force-removes the `db_postgres` and `db_redis` containers before starting them again."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip install \"psycopg[binary]\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Start containers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!docker rm -f db_postgres db_redis; docker compose -f ../docker/docker-compose.yml up -d ; sleep 5"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports and configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "from datetime import datetime, timezone\n",
    "from pprint import pprint\n",
    "\n",
    "import psycopg\n",
    "from psycopg import sql\n",
    "from psycopg.conninfo import make_conninfo\n",
    "\n",
    "INSERT_TABLES = True\n",
    "INSERT_SAMPLE_DATA = False\n",
    "\n",
    "# Defaults target the database on localhost; set the MATITOS_DB_* environment\n",
    "# variables to override them without editing credentials in the notebook.\n",
    "# make_conninfo escapes each value, so passwords with spaces or quotes are safe.\n",
    "connection_info = make_conninfo(\n",
    "    host=os.environ.get(\"MATITOS_DB_HOST\", \"localhost\"),\n",
    "    port=os.environ.get(\"MATITOS_DB_PORT\", \"5432\"),\n",
    "    user=os.environ.get(\"MATITOS_DB_USER\", \"supermatitos\"),\n",
    "    password=os.environ.get(\"MATITOS_DB_PASSWORD\", \"supermatitos\"),\n",
    "    dbname=os.environ.get(\"MATITOS_DB_NAME\", \"matitos\"),\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create schema and insert default rows\n",
    "\n",
    "Runs only when `INSERT_TABLES` is true. The DDL and the default rows share one transaction, so either everything is committed or nothing is."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Schema definition. It is sent as one multi-statement string, so it must be\n",
    "# executed without query parameters.\n",
    "SCHEMA_DDL = \"\"\"\n",
    "    CREATE TYPE URL_STATUS AS ENUM ('raw', 'error', 'valid', 'unknown', 'invalid', 'duplicate');\n",
    "\n",
    "    CREATE TABLE URLS (\n",
    "        id SERIAL PRIMARY KEY,\n",
    "        url TEXT NOT NULL UNIQUE,\n",
    "        ts_fetch TIMESTAMPTZ NOT NULL DEFAULT NOW(),\n",
    "        status URL_STATUS NOT NULL DEFAULT 'raw' -- ,\n",
    "        -- status_wendy WENDY_STATUS DEFAULT NULL,\n",
    "        -- ts_wendy TIMESTAMPTZ DEFAULT NULL\n",
    "    );\n",
    "    CREATE INDEX idx_urls_status ON urls(status);\n",
    "    CREATE INDEX idx_urls_ts_fetch ON urls(ts_fetch);\n",
    "\n",
    "    CREATE TABLE URLS_DUPLICATE (\n",
    "        id_url_canonical INTEGER REFERENCES URLS(id),\n",
    "        id_url_duplicated INTEGER REFERENCES URLS(id),\n",
    "        PRIMARY KEY (id_url_canonical, id_url_duplicated)\n",
    "    );\n",
    "\n",
    "    CREATE TYPE SEARCH_TYPE AS ENUM ('rss_feed', 'keyword_search', 'url_host');\n",
    "    CREATE TABLE SEARCH (\n",
    "        id SMALLSERIAL PRIMARY KEY,\n",
    "        search TEXT NOT NULL UNIQUE,\n",
    "        type SEARCH_TYPE NOT NULL\n",
    "        -- language_country CHAR(5), -- Language: ISO 639-1 Code. Country: ISO 3166 ALPHA-2. e.g.: en-us. Required for search\n",
    "        -- UNIQUE(search, language_country)\n",
    "    );\n",
    "    CREATE INDEX idx_search_type ON SEARCH(type);\n",
    "\n",
    "    CREATE TABLE SOURCE (\n",
    "        id SMALLSERIAL PRIMARY KEY,\n",
    "        source TEXT NOT NULL UNIQUE\n",
    "    );\n",
    "\n",
    "    -- CREATE TABLE SEARCH_LANGUAGE (\n",
    "    --     language CHAR(2) NOT NULL, -- ISO 639-1 Code, e.g. \"en\"\n",
    "    --     country CHAR(2) NOT NULL, -- ISO 3166 ALPHA-2, e.g. \"us\"\n",
    "    --     PRIMARY KEY (language, country)\n",
    "    -- );\n",
    "\n",
    "    CREATE TABLE URLS_SOURCE_SEARCH (\n",
    "        id_url INTEGER REFERENCES URLS(id),\n",
    "        id_source SMALLINT REFERENCES SOURCE(id) ON UPDATE CASCADE ON DELETE RESTRICT,\n",
    "        id_search SMALLINT REFERENCES SEARCH(id) ON UPDATE CASCADE ON DELETE RESTRICT,\n",
    "        PRIMARY KEY(id_url, id_source, id_search)\n",
    "    );\n",
    "    CREATE INDEX idx_source ON URLS_SOURCE_SEARCH(id_source);\n",
    "    CREATE INDEX idx_search ON URLS_SOURCE_SEARCH(id_search);\n",
    "\n",
    "    CREATE TABLE STATUS_PATTERN_MATCHING (\n",
    "        pattern TEXT PRIMARY KEY,\n",
    "        priority SMALLINT NOT NULL,\n",
    "        status URL_STATUS NOT NULL\n",
    "    );\n",
    "\n",
    "\n",
    "    CREATE TABLE URL_CONTENT (\n",
    "        id_url INTEGER PRIMARY KEY REFERENCES URLS(id),\n",
    "        date_published TIMESTAMPTZ DEFAULT NOW(),\n",
    "        title TEXT,\n",
    "        description TEXT,\n",
    "        content TEXT,\n",
    "        valid_content BOOLEAN,\n",
    "        language CHAR(2), -- ISO 639-1 Code\n",
    "        keywords TEXT[],\n",
    "        tags TEXT[],\n",
    "        authors TEXT[],\n",
    "        image_main_url TEXT,\n",
    "        images_url TEXT[],\n",
    "        videos_url TEXT[],\n",
    "        url_host TEXT, -- www.breitbart.com\n",
    "        site_name TEXT -- Breitbart News\n",
    "    );\n",
    "    CREATE INDEX idx_tags ON URL_CONTENT USING GIN(tags);\n",
    "    CREATE INDEX idx_authors ON URL_CONTENT USING GIN(authors);\n",
    "    CREATE INDEX idx_date_published ON URL_CONTENT (date_published);\n",
    "    CREATE INDEX idx_valid_content ON URL_CONTENT (valid_content);\n",
    "    CREATE INDEX idx_language ON URL_CONTENT (language);\n",
    "    CREATE INDEX idx_url_host ON URL_CONTENT (url_host);\n",
    "\"\"\"\n",
    "\n",
    "### Default insert values\n",
    "\n",
    "# Rows for SEARCH as (search, type) pairs, in insertion order.\n",
    "DEFAULT_SEARCHES = [\n",
    "    # Feeds\n",
    "    (\"https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC\", \"rss_feed\"),\n",
    "    # Websites of interest\n",
    "    (\"missingkids.org/poster\", \"url_host\"),\n",
    "    (\"missingkids.org/new-poster\", \"url_host\"),\n",
    "    (\"breitbart.com\", \"url_host\"),\n",
    "    # Search keywords\n",
    "    (\"child abuse\", \"keyword_search\"),\n",
    "    # (\"child abuse\", \"keyword_search\", \"en-us\"),  # needs the language_country column\n",
    "    # (\"child abuse\", \"keyword_search\", \"en-gb\"),  # needs the language_country column\n",
    "]\n",
    "\n",
    "# Status update based on pattern matching (with priority to apply in order). Regex test https://regex101.com/\n",
    "# Each fragment is regex-escaped and wrapped in \".*\" on both sides before it is stored.\n",
    "INVALID_URL_FRAGMENTS = [\n",
    "    \"youtube.com/\",\n",
    "    \"tiktok.com/\",\n",
    "    \"twitter.com/\",\n",
    "    \"reddit.com/\",\n",
    "    \"libreddit.de/\",\n",
    "    \"radio.foxnews.com/\",\n",
    "]\n",
    "\n",
    "# Rows for STATUS_PATTERN_MATCHING as (pattern, priority, status) triples.\n",
    "# Disabled example: (\".*{}.*\".format(re.escape(\"missingkids.org/poster/\")), 75, \"valid\")\n",
    "STATUS_PATTERNS = [\n",
    "    (\".*{}.*\".format(re.escape(fragment)), 50, \"invalid\")\n",
    "    for fragment in INVALID_URL_FRAGMENTS\n",
    "]\n",
    "\n",
    "if INSERT_TABLES:\n",
    "    # Connect to an existing database\n",
    "    with psycopg.connect(connection_info) as conn:\n",
    "        # Open a cursor to perform database operations\n",
    "        with conn.cursor() as cur:\n",
    "            # One transaction: schema and default rows are committed together or rolled back together\n",
    "            with conn.transaction():\n",
    "                cur.execute(SCHEMA_DDL)\n",
    "\n",
    "                # Values are bound as query parameters, so quotes or backslashes\n",
    "                # inside them can never alter the SQL statement itself.\n",
    "                cur.executemany(\n",
    "                    \"INSERT INTO SEARCH (search, type) VALUES (%s, %s);\",\n",
    "                    DEFAULT_SEARCHES,\n",
    "                )\n",
    "                cur.executemany(\n",
    "                    \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES (%s, %s, %s);\",\n",
    "                    STATUS_PATTERNS,\n",
    "                )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Inspect tables\n",
    "\n",
    "Prints up to 50 rows of every table in the `public` schema."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Connect to an existing database\n",
    "with psycopg.connect(connection_info) as conn:\n",
    "    # Open a cursor to perform database operations\n",
    "    with conn.cursor() as cur:\n",
    "        # Get tables\n",
    "        cur.execute(\"SELECT table_name FROM information_schema.tables WHERE table_schema='public';\")\n",
    "        tables = [row[0] for row in cur.fetchall()]\n",
    "\n",
    "        for table_name in tables:\n",
    "            print(\"\\t\", table_name)\n",
    "            # Compose the identifier with psycopg.sql so it is quoted correctly\n",
    "            # instead of being pasted into the SQL text.\n",
    "            preview_query = sql.SQL(\"SELECT * FROM {} LIMIT 50;\").format(sql.Identifier(table_name))\n",
    "            pprint(cur.execute(preview_query).fetchall())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Connect to an existing database\n",
    "with psycopg.connect(connection_info) as conn:\n",
    "    # Open a cursor to perform database operations\n",
    "    with conn.cursor() as cur:\n",
    "        pprint(cur.execute(\"SELECT * FROM SEARCH;\").fetchall())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Connect to an existing database\n",
    "with psycopg.connect(connection_info) as conn:\n",
    "    # Open a cursor to perform database operations\n",
    "    with conn.cursor() as cur:\n",
    "        pprint(cur.execute(\"SELECT * FROM URLS LIMIT 50;\").fetchall())\n",
    "        # pprint(cur.execute(\"SELECT id_url, title, valid_content FROM URL_CONTENT LIMIT 10;\").fetchall())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Reset data (disabled)\n",
    "\n",
    "The cell below is wrapped in a string literal so that it does nothing when the notebook is run top to bottom. Remove the surrounding quotes to run it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "'''\n",
    "!docker rm -f db_redis; docker compose -f ../docker/docker-compose.yml up -d\n",
    "\n",
    "# Connect to an existing database\n",
    "with psycopg.connect(connection_info) as conn:\n",
    "    # Open a cursor to perform database operations\n",
    "    with conn.cursor() as cur:\n",
    "        pprint( cur.execute(\"TRUNCATE URLS, URL_CONTENT, URLS_SOURCE_SEARCH, URLS_DUPLICATE;\") )\n",
    "        # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('missingkids.org', 'url_host');\" )\n",
    "'''"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "matitos",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}