In [1]:
# Prerequisite (run once, in the kernel's environment): %pip install "psycopg[binary]"

In [2]:
# Force-remove any existing `db_postgres` / `db_redis` containers, then start the
# services defined in docker/docker-compose.yml in detached mode.
# The commands are chained with `;`, so each one runs even if the previous one failed
# (e.g. `docker rm` when the containers do not exist yet).
# The trailing `sleep 5` pauses the cell for five seconds - presumably to give the
# database time to accept connections before the next cell connects (TODO confirm
# that this is long enough on a cold start / image pull).
!docker rm -f db_postgres db_redis; docker compose -f docker/docker-compose.yml up -d ; sleep 5

db_postgres
db_redis
[1A[1B[0G[?25l[+] Running 0/0
 ⠙ matitos_dozzle Pulling [39m[0m                                                 [34m0.1s [0m
[?25h[1A[1A[0G[?25l[+] Running 0/1
 ⠹ matitos_dozzle Pulling [39m[0m                                                 [34m0.2s [0m
[?25h[1A[1A[0G[?25l[+] Running 0/1
 ⠸ matitos_dozzle Pulling [39m[0m                                                 [34m0.3s [0m
[?25h[1A[1A[0G[?25l[+] Running 0/1
 ⠼ matitos_dozzle Pulling [39m[0m                                                 [34m0.4s [0m
[?25h[1A[1A[0G[?25l[+] Running 0/1
 ⠴ matitos_dozzle Pulling [39m[0m                                                 [34m0.5s [0m
[?25h[1A[1A[0G[?25l[+] Running 0/1
 ⠦ matitos_dozzle Pulling [39m[0m                                                 [34m0.6s [0m
[?25h[1A[1A[0G[?25l[+] Running 0/1
 ⠧ matitos_dozzle Pulling [39m[0m                                                 [34m0.7s [0m
[?25h[1A[

In [3]:
# --- Imports (stdlib first, then third-party) ---
import os
import re
from datetime import datetime, timezone
from pprint import pprint

import psycopg

# --- Switches controlling which setup cells actually touch the database ---
INSERT_TABLES = True         # create the schema and insert the default rows
INSERT_SAMPLE_DATA = False   # additionally insert demo URLs / content

# --- Connection settings ---
# Each value can be overridden through an environment variable so that credentials
# do not have to live in the notebook; the defaults are the local docker-compose
# development values.
connection_info = "host={} port={} user={} password={} dbname={}".format(
    os.environ.get("MATITOS_DB_HOST", "localhost"),
    os.environ.get("MATITOS_DB_PORT", "5432"),
    os.environ.get("MATITOS_DB_USER", "supermatitos"),
    os.environ.get("MATITOS_DB_PASSWORD", "supermatitos"),
    os.environ.get("MATITOS_DB_NAME", "matitos"),
)

if INSERT_TABLES:
    # Default SEARCH rows as (search, type); `type` must be a SEARCH_TYPE label.
    # Insertion order is preserved, so the SMALLSERIAL ids follow this order.
    default_searches = [
        # Feeds
        ("https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC", "rss_feed"),
        # Websites of interest
        ("www.missingkids.org/poster", "url_host"),
        ("www.breitbart.com", "url_host"),
        ("missingkids.org/poster", "url_host"),
        # Search keywords
        ("child abuse", "keyword_search"),
    ]

    # Status update based on pattern matching (with priority to apply in order).
    # Regex test: https://regex101.com/
    # Each URL fragment is regex-escaped and wrapped in `.*` so it matches anywhere in a URL.
    # Disabled example: ("missingkids.org/poster/", 75, "valid")
    invalid_url_fragments = [
        "youtube.com/",
        "tiktok.com/",
        "twitter.com/",
        "reddit.com/",
        "libreddit.de/",
        "radio.foxnews.com/",
    ]
    status_patterns = [
        (".*{}.*".format(re.escape(fragment)), 50, "invalid")
        for fragment in invalid_url_fragments
    ]

    # Connect to an existing database
    with psycopg.connect(connection_info) as conn:
        # Open a cursor to perform database operations
        with conn.cursor() as cur:
            # Single transaction: schema and default rows are committed together or not at all
            with conn.transaction():
                # Create types, tables and indexes
                cur.execute("""
                    CREATE TYPE URL_STATUS AS ENUM ('raw', 'error', 'valid', 'unknown', 'invalid', 'duplicate');

                    CREATE TABLE URLS (
                        id SERIAL PRIMARY KEY,
                        url TEXT NOT NULL UNIQUE,
                        ts_fetch TIMESTAMPTZ NOT NULL DEFAULT NOW(),
                        status URL_STATUS NOT NULL DEFAULT 'raw' -- ,
                        -- status_wendy WENDY_STATUS DEFAULT NULL,
                        -- ts_wendy TIMESTAMPTZ DEFAULT NULL
                    );
                    CREATE INDEX idx_urls_status ON urls(status);
                    CREATE INDEX idx_urls_ts_fetch ON urls(ts_fetch);

                    CREATE TABLE URLS_DUPLICATE (
                        id_url_canonical INTEGER REFERENCES URLS(id),
                        id_url_duplicated INTEGER REFERENCES URLS(id),
                        PRIMARY KEY (id_url_canonical, id_url_duplicated)
                    );
                    
                    CREATE TYPE SEARCH_TYPE AS ENUM ('rss_feed', 'keyword_search', 'url_host');
                    CREATE TABLE SEARCH (
                        id SMALLSERIAL PRIMARY KEY,
                        search TEXT NOT NULL UNIQUE,
                        type SEARCH_TYPE NOT NULL
                    );
                    CREATE INDEX idx_search_type ON SEARCH(type);
                    
                    CREATE TABLE SOURCE (
                        id SMALLSERIAL PRIMARY KEY,
                        source TEXT NOT NULL UNIQUE
                    );
                                
                    CREATE TABLE URLS_SOURCE_SEARCH (
                        id_url INTEGER REFERENCES URLS(id),
                        id_source SMALLINT REFERENCES SOURCE(id) ON UPDATE CASCADE ON DELETE RESTRICT,
                        id_search SMALLINT REFERENCES SEARCH(id) ON UPDATE CASCADE ON DELETE RESTRICT,
                        PRIMARY KEY(id_url, id_source, id_search)
                    );
                    CREATE INDEX idx_source ON URLS_SOURCE_SEARCH(id_source);
                    CREATE INDEX idx_search ON URLS_SOURCE_SEARCH(id_search);

                    CREATE TABLE STATUS_PATTERN_MATCHING (
                        pattern TEXT PRIMARY KEY,
                        priority SMALLINT NOT NULL,
                        status URL_STATUS NOT NULL
                    );
                    
                    
                    CREATE TABLE URL_CONTENT (
                        id_url INTEGER PRIMARY KEY REFERENCES URLS(id),
                        date_published TIMESTAMPTZ DEFAULT NOW(),
                        title TEXT,
                        description TEXT,
                        content TEXT,
                        valid_content BOOLEAN,
                        language CHAR(2), -- ISO 639-1 Code
                        keywords TEXT[],
                        tags TEXT[],
                        authors TEXT[],
                        image_main_url TEXT,
                        images_url TEXT[],
                        videos_url TEXT[],
                        url_host TEXT,    -- www.breitbart.com
                        site_name TEXT    -- Breitbart News
                    );
                    CREATE INDEX idx_tags ON URL_CONTENT USING GIN(tags);
                    CREATE INDEX idx_authors ON URL_CONTENT USING GIN(authors);
                    CREATE INDEX idx_date_published ON URL_CONTENT (date_published);
                    CREATE INDEX idx_valid_content ON URL_CONTENT (valid_content);
                    CREATE INDEX idx_language ON URL_CONTENT (language);
                    CREATE INDEX idx_url_host ON URL_CONTENT (url_host);
                """)

                ### Default insert values
                # Values are passed as query parameters (not spliced into the SQL text),
                # so quotes or backslashes in a value cannot break the statement.
                cur.executemany(
                    "INSERT INTO SEARCH (search, type) VALUES (%s, %s::SEARCH_TYPE);",
                    default_searches,
                )
                cur.executemany(
                    "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES (%s, %s, %s::URL_STATUS);",
                    status_patterns,
                )

In [4]:
if INSERT_SAMPLE_DATA:
    # Sample URLS rows as (url, status), in insertion order.
    sample_urls = [
        # Valid
        ("https://www.foxnews.com/us/husband-ruby-franke-utah-mommy-blogger-convicted-child-abuse-regrets-wifes-fall-fame", "valid"),
        ("https://www.bbc.com/news/articles/ckg843y8y7no", "valid"),
        ("https://www.wilx.com/2025/03/05/lenawee-county-man-arrested-possessing-child-abuse-material/", "valid"),
        ("https://www.dw.com/en/trauma-how-child-abuse-victims-deal-with-parenthood/a-71833895", "valid"),
        ("https://nypost.com/2025/03/06/us-news/colorado-day-care-worker-hit-with-51-charges-of-child-abuse-harassment-for-slapping-toddler/", "valid"),
        ("https://www.fox35orlando.com/news/tavares-police-florida-boys-10-9-abused-sheer-brutality", "valid"),
        ("https://www.google.com", "invalid"),
        ("https://www.missingkids.org/poster/USVA/VA25-0820/1", "valid"),
        ("https://www.missingkids.org/poster/NCMC/2045193/1", "valid"),
    ]
    # Generated placeholder hosts
    sample_urls += [("www.super_{}.org".format(j), "invalid") for j in range(5)]
    # Long URLs
    sample_urls += [
        ("www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf_23dj9sdgj9sdgj8sdf8ds8f.html", "invalid"),
        ("www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf.html", "invalid"),
    ]

    sample_sources = [("news.google.com",), ("qwant.com",)]

    # Filler text for the sample URL_CONTENT row.
    # NOTE(review): this long text goes into `description` while `content` gets the
    # short literal 'Hello there!' - possibly swapped; kept as in the original sample.
    language, filler_text = "en", "Bla Bla Bla!!!"*25

    # Connect to an existing database
    with psycopg.connect(connection_info) as conn:
        # Open a cursor to perform database operations
        with conn.cursor() as cur:
            # Single transaction: all sample rows are committed together or not at all.
            # ts_fetch defaults to NOW(), which is fixed for the whole transaction, so
            # every URL inserted here gets the same timestamp.
            with conn.transaction():
                cur.executemany(
                    "INSERT INTO URLS (url, status) VALUES (%s, %s::URL_STATUS)",
                    sample_urls,
                )
                cur.executemany("INSERT INTO SOURCE (source) VALUES (%s)", sample_sources)

                # Link the first URL to the first source and the first search.
                # The literal ids assume freshly created tables whose serial counters
                # start at 1 (the SEARCH row comes from the schema/default-rows cell).
                cur.execute("INSERT INTO URLS_SOURCE_SEARCH (id_url, id_source, id_search) VALUES (1, 1, 1)")

                # URL Content (attached to URL id 1; same fresh-table assumption as above)
                cur.execute(
                    "INSERT INTO URL_CONTENT (id_url, date_published, title, description, content, language, tags, authors, images_url) values (%s, %s, 'Mommy blogger turned child abuser', %s, 'Hello there!', %s, %s, %s, %s)",
                    (
                        1,
                        datetime.now(tz=timezone.utc),
                        filler_text,
                        language,
                        ["child abuse", "social media"],
                        ["Audrey Conklin"],
                        ["https://a57.foxnews.com/static.foxnews.com/foxnews.com/content/uploads/2023/08/1440/810/image-58.jpg?ve=1&tl=1"],
                    ),
                )

In [5]:
# Print up to 50 rows from every table (or view) listed in the `public` schema.
from psycopg import sql

with psycopg.connect(connection_info) as conn:
    with conn.cursor() as cur:
        # Get table names
        cur.execute("SELECT table_name FROM information_schema.tables WHERE table_schema='public';")
        tables = [row[0] for row in cur.fetchall()]

        for table_name in tables:
            print("\t", table_name)
            # Compose the identifier with psycopg.sql so it is quoted correctly
            # (mixed-case or reserved-word table names would break plain string formatting).
            query = sql.SQL("SELECT * FROM {} LIMIT 50;").format(sql.Identifier("public", table_name))
            pprint( cur.execute(query).fetchall() )

	 urls
[]
	 urls_duplicate
[]
	 urls_source_search
[]
	 source
[]
	 search
[(1,
  'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC',
  'rss_feed'),
 (2, 'www.breitbart.com', 'url_host'),
 (3, 'child abuse', 'keyword_search')]
	 status_pattern_matching
[('.*youtube\\.com/.*', 50, 'invalid'),
 ('.*tiktok\\.com/.*', 50, 'invalid'),
 ('.*twitter\\.com/.*', 50, 'invalid'),
 ('.*reddit\\.com/.*', 50, 'invalid'),
 ('.*libreddit\\.de/.*', 50, 'invalid'),
 ('.*radio\\.foxnews\\.com/.*', 50, 'invalid')]
	 url_content
[]


In [6]:
# Show every row currently stored in the SEARCH table.
with psycopg.connect(connection_info) as conn, conn.cursor() as cur:
    cur.execute("SELECT * FROM SEARCH;")
    search_rows = cur.fetchall()
    pprint(search_rows)

[(1,
  'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC',
  'rss_feed'),
 (2, 'www.breitbart.com', 'url_host'),
 (3, 'child abuse', 'keyword_search')]


In [7]:
# Show up to 150 rows from the URLS table.
with psycopg.connect(connection_info) as conn, conn.cursor() as cur:
    cur.execute("SELECT * FROM URLS LIMIT 150;")
    url_rows = cur.fetchall()
    pprint(url_rows)
    # Alternative view (disabled): a few columns of the extracted content
    #pprint( cur.execute("SELECT id_url, title, valid_content FROM URL_CONTENT LIMIT 10;").fetchall() )

[]
