Working fetch feeds and process raw urls

This commit is contained in:
Luciano Gervasoni
2025-03-13 18:23:28 +01:00
parent 61c31ee9aa
commit 7d7bce1e72
11 changed files with 318 additions and 136 deletions

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 131,
"metadata": {},
"outputs": [],
"source": [
@@ -11,7 +11,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 132,
"metadata": {},
"outputs": [
{
@@ -19,23 +19,30 @@
"output_type": "stream",
"text": [
"db_postgres\n",
"db_redis\n",
"\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 1/0\n",
" ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_redis \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 2/2\u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_redis \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 1/3\n",
" Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 3/3\u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_redis \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h"
]
}
],
"source": [
"!docker rm -f db_postgres; docker compose -f docker/docker-compose.yml up -d ; sleep 10"
"!docker rm -f db_postgres db_redis; docker compose -f docker/docker-compose.yml up -d ; sleep 5"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 133,
"metadata": {},
"outputs": [],
"source": [
@@ -123,7 +130,7 @@
" keywords TEXT[],\n",
" tags TEXT[],\n",
" authors TEXT[],\n",
" image_main TEXT,\n",
" image_main_url TEXT,\n",
" images_url TEXT[],\n",
" videos_url TEXT[],\n",
" url_host TEXT, -- www.breitbart.com\n",
@@ -156,7 +163,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 134,
"metadata": {},
"outputs": [],
"source": [
@@ -191,9 +198,9 @@
" cur.execute(\"INSERT INTO URLS_SOURCE (id_url, id_source) values (2, 2)\")\n",
" cur.execute(\"INSERT INTO URLS_SOURCE (id_url, id_source) values (3, 2)\")\n",
"\n",
" for j in range(15):\n",
" for j in range(5):\n",
" import time\n",
" time.sleep(1)\n",
" time.sleep(0.25)\n",
" cur.execute(\"INSERT INTO URLS (url, status) values ('www.super_{}.org', 'invalid')\".format(j))\n",
" \n",
" # Long URLs \n",
@@ -202,13 +209,13 @@
"\n",
" # URL Content\n",
" language, content = \"en\", \"Bla Bla Bla!!!\"*25\n",
" cur.execute(\"INSERT INTO URL_CONTENT (id_url, date_published, title, description, content, language, tags, authors, image_urls) values (%s, %s, 'Mommy blogger turned child abuser', %s, 'Hello there!', %s, %s, %s, %s)\", \n",
" cur.execute(\"INSERT INTO URL_CONTENT (id_url, date_published, title, description, content, language, tags, authors, images_url) values (%s, %s, 'Mommy blogger turned child abuser', %s, 'Hello there!', %s, %s, %s, %s)\", \n",
" (1, datetime.now(tz=timezone.utc), content, language, [\"child abuse\", \"social media\"], [\"Audrey Conklin\"], [\"https://a57.foxnews.com/static.foxnews.com/foxnews.com/content/uploads/2023/08/1440/810/image-58.jpg?ve=1&tl=1\"]))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 135,
"metadata": {},
"outputs": [
{
@@ -218,99 +225,59 @@
"\t urls\n",
"[(1,\n",
" 'https://www.foxnews.com/us/husband-ruby-franke-utah-mommy-blogger-convicted-child-abuse-regrets-wifes-fall-fame',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (2,\n",
" 'https://www.bbc.com/news/articles/ckg843y8y7no',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (3,\n",
" 'https://www.wilx.com/2025/03/05/lenawee-county-man-arrested-possessing-child-abuse-material/',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (4,\n",
" 'https://www.dw.com/en/trauma-how-child-abuse-victims-deal-with-parenthood/a-71833895',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (5,\n",
" 'https://nypost.com/2025/03/06/us-news/colorado-day-care-worker-hit-with-51-charges-of-child-abuse-harassment-for-slapping-toddler/',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (6,\n",
" 'https://www.fox35orlando.com/news/tavares-police-florida-boys-10-9-abused-sheer-brutality',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (7,\n",
" 'https://www.google.com',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (8,\n",
" 'www.super_0.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (9,\n",
" 'www.super_1.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (10,\n",
" 'www.super_2.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (11,\n",
" 'www.super_3.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (12,\n",
" 'www.super_4.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (13,\n",
" 'www.super_5.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf_23dj9sdgj9sdgj8sdf8ds8f.html',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (14,\n",
" 'www.super_6.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (15,\n",
" 'www.super_7.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (16,\n",
" 'www.super_8.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (17,\n",
" 'www.super_9.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (18,\n",
" 'www.super_10.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (19,\n",
" 'www.super_11.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (20,\n",
" 'www.super_12.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (21,\n",
" 'www.super_13.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (22,\n",
" 'www.super_14.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (23,\n",
" 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf_23dj9sdgj9sdgj8sdf8ds8f.html',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (24,\n",
" 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf.html',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid')]\n",
"\t urls_duplicate\n",
"[]\n",
@@ -336,7 +303,7 @@
"[('.*missingkids.org/poster/.*', 50, 'valid')]\n",
"\t url_content\n",
"[(1,\n",
" datetime.datetime(2025, 3, 7, 16, 57, 38, 54447, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 13, 17, 19, 5, 639334, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'Mommy blogger turned child abuser',\n",
" 'Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla '\n",
" 'Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla '\n",
@@ -344,9 +311,16 @@
" 'Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla '\n",
" 'Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!',\n",
" 'Hello there!',\n",
" None,\n",
" 'en',\n",
" None,\n",
" ['child abuse', 'social media'],\n",
" ['Audrey Conklin'],\n",
" ['https://a57.foxnews.com/static.foxnews.com/foxnews.com/content/uploads/2023/08/1440/810/image-58.jpg?ve=1&tl=1'])]\n"
" None,\n",
" ['https://a57.foxnews.com/static.foxnews.com/foxnews.com/content/uploads/2023/08/1440/810/image-58.jpg?ve=1&tl=1'],\n",
" None,\n",
" None,\n",
" None)]\n"
]
}
],
@@ -365,6 +339,91 @@
" print(\"\\t\", t)\n",
" pprint( cur.execute(\"SELECT * FROM {} LIMIT 50;\".format(t)).fetchall() )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 136,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(1,\n",
" 'https://www.foxnews.com/us/husband-ruby-franke-utah-mommy-blogger-convicted-child-abuse-regrets-wifes-fall-fame',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (2,\n",
" 'https://www.bbc.com/news/articles/ckg843y8y7no',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (3,\n",
" 'https://www.wilx.com/2025/03/05/lenawee-county-man-arrested-possessing-child-abuse-material/',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (4,\n",
" 'https://www.dw.com/en/trauma-how-child-abuse-victims-deal-with-parenthood/a-71833895',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (5,\n",
" 'https://nypost.com/2025/03/06/us-news/colorado-day-care-worker-hit-with-51-charges-of-child-abuse-harassment-for-slapping-toddler/',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (6,\n",
" 'https://www.fox35orlando.com/news/tavares-police-florida-boys-10-9-abused-sheer-brutality',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (7,\n",
" 'https://www.google.com',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (8,\n",
" 'www.super_0.org',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (9,\n",
" 'www.super_1.org',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (10,\n",
" 'www.super_2.org',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (11,\n",
" 'www.super_3.org',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (12,\n",
" 'www.super_4.org',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (13,\n",
" 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf_23dj9sdgj9sdgj8sdf8ds8f.html',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (14,\n",
" 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf.html',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid')]\n"
]
}
],
"source": [
"from pprint import pprint\n",
"\n",
"# Connect to an existing database\n",
"with psycopg.connect(connection_info) as conn:\n",
" # Open a cursor to perform database operations\n",
" with conn.cursor() as cur:\n",
" pprint( cur.execute(\"SELECT * FROM URLS LIMIT 150;\").fetchall() )"
]
}
],
"metadata": {