Working fetch feeds and process raw urls

This commit is contained in:
Luciano Gervasoni
2025-03-13 18:23:28 +01:00
parent 61c31ee9aa
commit 7d7bce1e72
11 changed files with 318 additions and 136 deletions

View File

@@ -2,7 +2,7 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 131,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@@ -11,7 +11,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 132,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@@ -19,23 +19,30 @@
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"db_postgres\n", "db_postgres\n",
"db_redis\n",
"\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 1/0\n", "\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 1/0\n",
" ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n", " ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_redis \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n", " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 2/2\u001b[0m\n", "\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 1/3\n",
" \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.2s \u001b[0m\n", " Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_redis \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n", " Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 3/3\u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_redis \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h" "\u001b[?25h"
] ]
} }
], ],
"source": [ "source": [
"!docker rm -f db_postgres; docker compose -f docker/docker-compose.yml up -d ; sleep 10" "!docker rm -f db_postgres db_redis; docker compose -f docker/docker-compose.yml up -d ; sleep 5"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 133,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@@ -123,7 +130,7 @@
 " keywords TEXT[],\n",
 " tags TEXT[],\n",
 " authors TEXT[],\n",
-" image_main TEXT,\n",
+" image_main_url TEXT,\n",
 " images_url TEXT[],\n",
 " videos_url TEXT[],\n",
 " url_host TEXT, -- www.breitbart.com\n",
@@ -156,7 +163,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 4,
+"execution_count": 134,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -191,9 +198,9 @@
 " cur.execute(\"INSERT INTO URLS_SOURCE (id_url, id_source) values (2, 2)\")\n",
 " cur.execute(\"INSERT INTO URLS_SOURCE (id_url, id_source) values (3, 2)\")\n",
 "\n",
-" for j in range(15):\n",
+" for j in range(5):\n",
 " import time\n",
-" time.sleep(1)\n",
+" time.sleep(0.25)\n",
 " cur.execute(\"INSERT INTO URLS (url, status) values ('www.super_{}.org', 'invalid')\".format(j))\n",
 " \n",
 " # Long URLs \n",
@@ -202,13 +209,13 @@
 "\n",
 " # URL Content\n",
 " language, content = \"en\", \"Bla Bla Bla!!!\"*25\n",
-" cur.execute(\"INSERT INTO URL_CONTENT (id_url, date_published, title, description, content, language, tags, authors, image_urls) values (%s, %s, 'Mommy blogger turned child abuser', %s, 'Hello there!', %s, %s, %s, %s)\", \n",
+" cur.execute(\"INSERT INTO URL_CONTENT (id_url, date_published, title, description, content, language, tags, authors, images_url) values (%s, %s, 'Mommy blogger turned child abuser', %s, 'Hello there!', %s, %s, %s, %s)\", \n",
 " (1, datetime.now(tz=timezone.utc), content, language, [\"child abuse\", \"social media\"], [\"Audrey Conklin\"], [\"https://a57.foxnews.com/static.foxnews.com/foxnews.com/content/uploads/2023/08/1440/810/image-58.jpg?ve=1&tl=1\"]))"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": 5,
+"execution_count": 135,
 "metadata": {},
 "outputs": [
 {
@@ -218,99 +225,59 @@
"\t urls\n", "\t urls\n",
"[(1,\n", "[(1,\n",
" 'https://www.foxnews.com/us/husband-ruby-franke-utah-mommy-blogger-convicted-child-abuse-regrets-wifes-fall-fame',\n", " 'https://www.foxnews.com/us/husband-ruby-franke-utah-mommy-blogger-convicted-child-abuse-regrets-wifes-fall-fame',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n", " 'valid'),\n",
" (2,\n", " (2,\n",
" 'https://www.bbc.com/news/articles/ckg843y8y7no',\n", " 'https://www.bbc.com/news/articles/ckg843y8y7no',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n", " 'valid'),\n",
" (3,\n", " (3,\n",
" 'https://www.wilx.com/2025/03/05/lenawee-county-man-arrested-possessing-child-abuse-material/',\n", " 'https://www.wilx.com/2025/03/05/lenawee-county-man-arrested-possessing-child-abuse-material/',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n", " 'valid'),\n",
" (4,\n", " (4,\n",
" 'https://www.dw.com/en/trauma-how-child-abuse-victims-deal-with-parenthood/a-71833895',\n", " 'https://www.dw.com/en/trauma-how-child-abuse-victims-deal-with-parenthood/a-71833895',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n", " 'valid'),\n",
" (5,\n", " (5,\n",
" 'https://nypost.com/2025/03/06/us-news/colorado-day-care-worker-hit-with-51-charges-of-child-abuse-harassment-for-slapping-toddler/',\n", " 'https://nypost.com/2025/03/06/us-news/colorado-day-care-worker-hit-with-51-charges-of-child-abuse-harassment-for-slapping-toddler/',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n", " 'valid'),\n",
" (6,\n", " (6,\n",
" 'https://www.fox35orlando.com/news/tavares-police-florida-boys-10-9-abused-sheer-brutality',\n", " 'https://www.fox35orlando.com/news/tavares-police-florida-boys-10-9-abused-sheer-brutality',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n", " 'valid'),\n",
" (7,\n", " (7,\n",
" 'https://www.google.com',\n", " 'https://www.google.com',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n", " 'invalid'),\n",
" (8,\n", " (8,\n",
" 'www.super_0.org',\n", " 'www.super_0.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n", " 'invalid'),\n",
" (9,\n", " (9,\n",
" 'www.super_1.org',\n", " 'www.super_1.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n", " 'invalid'),\n",
" (10,\n", " (10,\n",
" 'www.super_2.org',\n", " 'www.super_2.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n", " 'invalid'),\n",
" (11,\n", " (11,\n",
" 'www.super_3.org',\n", " 'www.super_3.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n", " 'invalid'),\n",
" (12,\n", " (12,\n",
" 'www.super_4.org',\n", " 'www.super_4.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n", " 'invalid'),\n",
" (13,\n", " (13,\n",
" 'www.super_5.org',\n", " 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf_23dj9sdgj9sdgj8sdf8ds8f.html',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n", " 'invalid'),\n",
" (14,\n", " (14,\n",
" 'www.super_6.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (15,\n",
" 'www.super_7.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (16,\n",
" 'www.super_8.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (17,\n",
" 'www.super_9.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (18,\n",
" 'www.super_10.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (19,\n",
" 'www.super_11.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (20,\n",
" 'www.super_12.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (21,\n",
" 'www.super_13.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (22,\n",
" 'www.super_14.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (23,\n",
" 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf_23dj9sdgj9sdgj8sdf8ds8f.html',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (24,\n",
" 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf.html',\n", " 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf.html',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid')]\n", " 'invalid')]\n",
"\t urls_duplicate\n", "\t urls_duplicate\n",
"[]\n", "[]\n",
@@ -336,7 +303,7 @@
"[('.*missingkids.org/poster/.*', 50, 'valid')]\n", "[('.*missingkids.org/poster/.*', 50, 'valid')]\n",
"\t url_content\n", "\t url_content\n",
"[(1,\n", "[(1,\n",
" datetime.datetime(2025, 3, 7, 16, 57, 38, 54447, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", " datetime.datetime(2025, 3, 13, 17, 19, 5, 639334, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'Mommy blogger turned child abuser',\n", " 'Mommy blogger turned child abuser',\n",
" 'Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla '\n", " 'Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla '\n",
" 'Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla '\n", " 'Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla '\n",
@@ -344,9 +311,16 @@
" 'Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla '\n", " 'Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla '\n",
" 'Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!',\n", " 'Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!',\n",
" 'Hello there!',\n", " 'Hello there!',\n",
" None,\n",
" 'en',\n",
" None,\n",
" ['child abuse', 'social media'],\n", " ['child abuse', 'social media'],\n",
" ['Audrey Conklin'],\n", " ['Audrey Conklin'],\n",
" ['https://a57.foxnews.com/static.foxnews.com/foxnews.com/content/uploads/2023/08/1440/810/image-58.jpg?ve=1&tl=1'])]\n" " None,\n",
" ['https://a57.foxnews.com/static.foxnews.com/foxnews.com/content/uploads/2023/08/1440/810/image-58.jpg?ve=1&tl=1'],\n",
" None,\n",
" None,\n",
" None)]\n"
] ]
} }
], ],
@@ -365,6 +339,91 @@
" print(\"\\t\", t)\n", " print(\"\\t\", t)\n",
" pprint( cur.execute(\"SELECT * FROM {} LIMIT 50;\".format(t)).fetchall() )" " pprint( cur.execute(\"SELECT * FROM {} LIMIT 50;\".format(t)).fetchall() )"
] ]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 136,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(1,\n",
" 'https://www.foxnews.com/us/husband-ruby-franke-utah-mommy-blogger-convicted-child-abuse-regrets-wifes-fall-fame',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (2,\n",
" 'https://www.bbc.com/news/articles/ckg843y8y7no',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (3,\n",
" 'https://www.wilx.com/2025/03/05/lenawee-county-man-arrested-possessing-child-abuse-material/',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (4,\n",
" 'https://www.dw.com/en/trauma-how-child-abuse-victims-deal-with-parenthood/a-71833895',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (5,\n",
" 'https://nypost.com/2025/03/06/us-news/colorado-day-care-worker-hit-with-51-charges-of-child-abuse-harassment-for-slapping-toddler/',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (6,\n",
" 'https://www.fox35orlando.com/news/tavares-police-florida-boys-10-9-abused-sheer-brutality',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (7,\n",
" 'https://www.google.com',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (8,\n",
" 'www.super_0.org',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (9,\n",
" 'www.super_1.org',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (10,\n",
" 'www.super_2.org',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (11,\n",
" 'www.super_3.org',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (12,\n",
" 'www.super_4.org',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (13,\n",
" 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf_23dj9sdgj9sdgj8sdf8ds8f.html',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (14,\n",
" 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf.html',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid')]\n"
]
}
],
"source": [
"from pprint import pprint\n",
"\n",
"# Connect to an existing database\n",
"with psycopg.connect(connection_info) as conn:\n",
" # Open a cursor to perform database operations\n",
" with conn.cursor() as cur:\n",
" pprint( cur.execute(\"SELECT * FROM URLS LIMIT 150;\").fetchall() )"
]
} }
], ],
"metadata": { "metadata": {

View File

@@ -1,5 +1,34 @@
 {
 "cells": [
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"import newspaper\n",
+"url = \"http://www.missingkids.org/poster/NCMC/2045193/1\"\n",
+"#url = \"https://www.missingkids.org/new-poster/NCMC/2045193/1\"\n",
+"\n",
+"art = newspaper.article(url)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"art.__dict__"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": []
+},
 {
 "cell_type": "code",
 "execution_count": null,

View File

@@ -33,6 +33,16 @@ python manage.py inspectdb
 # Fields default:
 ts_fetch = models.DateTimeField(auto_now_add=True)
 status = models.TextField(default='raw') # This field type is a guess.
+# URLContent:
+from django.contrib.postgres.fields import ArrayField
+keywords = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
+tags = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
+authors = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
+image_main_url = models.TextField(blank=True, null=True)
+images_url = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
+videos_url = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
 ```
 * Environment variables
@@ -51,8 +61,8 @@ REDIS_PORT=${REDIS_PORT:-6379}
 ```
 # Generate content for models.py
 python manage.py inspectdb
-python manage.py makemigrations
-python manage.py migrate --fake
+# Migrations
+python manage.py makemigrations api; python manage.py migrate --fake-initial
 ```
 * Deploy

View File

@@ -1,4 +1,4 @@
-# Generated by Django 5.1.7 on 2025-03-07 16:56
+# Generated by Django 5.1.7 on 2025-03-13 17:01
 import django.db.models.deletion
 from django.db import migrations, models
@@ -62,8 +62,8 @@ class Migration(migrations.Migration):
             fields=[
                 ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                 ('url', models.TextField(unique=True)),
-                ('ts_fetch', models.DateTimeField()),
-                ('status', models.TextField()),
+                ('ts_fetch', models.DateTimeField(auto_now_add=True)),
+                ('status', models.TextField(choices=[('raw', 'Raw'), ('error', 'Error'), ('valid', 'Valid'), ('unknown', 'Unknown'), ('invalid', 'Invalid'), ('duplicate', 'Duplicate')], default='raw')),
             ],
             options={
                 'db_table': 'urls',
@@ -100,9 +100,16 @@ class Migration(migrations.Migration):
                 ('title', models.TextField(blank=True, null=True)),
                 ('description', models.TextField(blank=True, null=True)),
                 ('content', models.TextField(blank=True, null=True)),
+                ('valid_content', models.BooleanField(blank=True, null=True)),
+                ('language', models.CharField(blank=True, max_length=2, null=True)),
+                ('keywords', models.TextField(blank=True, null=True)),
                 ('tags', models.TextField(blank=True, null=True)),
                 ('authors', models.TextField(blank=True, null=True)),
-                ('image_urls', models.TextField(blank=True, null=True)),
+                ('image_main', models.TextField(blank=True, null=True)),
+                ('images_url', models.TextField(blank=True, null=True)),
+                ('videos_url', models.TextField(blank=True, null=True)),
+                ('url_host', models.TextField(blank=True, null=True)),
+                ('site_name', models.TextField(blank=True, null=True)),
             ],
             options={
                 'db_table': 'url_content',

View File

@@ -1,4 +1,5 @@
 from django.db import models
+from django.contrib.postgres.fields import ArrayField
 # Create your models here.
 class Feed(models.Model):
@@ -44,9 +45,16 @@ class UrlContent(models.Model):
     title = models.TextField(blank=True, null=True)
     description = models.TextField(blank=True, null=True)
     content = models.TextField(blank=True, null=True)
-    tags = models.TextField(blank=True, null=True) # This field type is a guess.
-    authors = models.TextField(blank=True, null=True) # This field type is a guess.
-    image_urls = models.TextField(blank=True, null=True) # This field type is a guess.
+    valid_content = models.BooleanField(blank=True, null=True)
+    language = models.CharField(max_length=2, blank=True, null=True)
+    keywords = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
+    tags = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
+    authors = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
+    image_main_url = models.TextField(blank=True, null=True)
+    images_url = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
+    videos_url = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
+    url_host = models.TextField(blank=True, null=True)
+    site_name = models.TextField(blank=True, null=True)
     class Meta:
         managed = False
@@ -54,9 +62,17 @@ class UrlContent(models.Model):
 class Urls(models.Model):
+    class STATUS_ENUM(models.TextChoices):
+        RAW = "raw", "Raw"
+        ERROR = "error", "Error"
+        VALID = "valid", "Valid"
+        UNKNOWN = "unknown", "Unknown"
+        INVALID = "invalid", "Invalid"
+        DUPLICATE = "duplicate", "Duplicate"
     url = models.TextField(unique=True)
     ts_fetch = models.DateTimeField(auto_now_add=True)
-    status = models.TextField(default='raw') # This field type is a guess.
+    status = models.TextField(choices=STATUS_ENUM.choices, default=STATUS_ENUM.RAW) # This field type is a guess.
     class Meta:
         managed = False
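With status now a TextChoices enum and the list columns mapped to ArrayField, the rest of the commit filters by enum member and can use PostgreSQL array lookups. A short usage sketch (the `api` app label is taken from the README's makemigrations command; the queries themselves are illustrative, not code from this commit):

```python
# Sketch: querying the reworked models.
from api.models import Urls, UrlContent  # app label "api" assumed from the README

# Status is a TextChoices enum rather than a free-form string
raw_urls = Urls.objects.filter(status=Urls.STATUS_ENUM.RAW)[:50]

# ArrayField columns support membership/overlap lookups (django.contrib.postgres)
tagged = UrlContent.objects.filter(tags__contains=["child abuse"])
with_any_keyword = UrlContent.objects.filter(keywords__overlap=["abuse", "missing"])
```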

View File

@@ -2,6 +2,7 @@ from ..models import Urls, UrlContent, UrlsSource, Source, WebsiteToFilter, Stat
 from .url_processor import process_url
 from django.utils import timezone
 from django.core.cache import cache
+from django.db import IntegrityError
 import hashlib
 from datetime import timedelta
 import re
@@ -25,16 +26,32 @@ class DB_Handler():
         return cache.get(self._get_safe_cache_key(cache_key)) is not None
     def insert_raw_urls(self, urls, source):
+        def clean_protocol(url):
+            # http:// -> https://
+            url = url.replace("http://", "https://")
+            # "" -> https://
+            if not (url.startswith("https://")):
+                url = "https://" + url
+            return url
         try:
             logger.debug("Inserting raw URLs")
             # Empty?
             if (len(urls) == 0):
                 logger.debug("Empty batch of urls (not writing to DB) for source: {}".format(source))
                 return
+            # Default protocol https://
+            urls_clean = [clean_protocol(url) for url in urls]
+            # Get the source (create if not exists)
+            source_obj, created = Source.objects.get_or_create(source=source)
-            url_object_to_insert = []
+            urls_to_insert = []
             # Per URL
-            for url in urls:
+            for url in urls_clean:
                 ### Already processed URL?
                 if (self._is_cached_key(url)):
                     logger.debug("Already cached URL: {}".format(url))
@@ -42,25 +59,43 @@ class DB_Handler():
                     if (self._is_cached_key("{}{}".format(source, url))):
                         logger.debug("Already cached (source, URL): {} {}".format(source, url))
                     else:
-                        ### Insert source
-                        # Get the source (create if not exists)
-                        source_obj, created = Source.objects.get_or_create(source=source)
-                        # Get URL ID
-                        url_obj = Urls.objects.get(url=url)
-                        # Create (id_source, id_url)
-                        UrlsSource.objects.create(id_source=source_obj.id, id_url=url_obj.id)
+                        ### Insert (URL_id, source_id), since not cached
+                        # Get URL ID (should already be created)
+                        url_obj, created = Urls.objects.get_or_create(url=url)
+                        # Create (id_source, id_url) (shouldn't exist)
+                        UrlsSource.objects.get_or_create(id_source=source_obj, id_url=url_obj)
                 else:
                     # Add object to insert
-                    url_object_to_insert.append(Urls(url=url))
+                    # url_object_to_insert.append(Urls(url=url))
+                    urls_to_insert.append(url)
+            ### Insert URLs & (URL_id, source_id)
+            try:
+                ### Bulk insert, fails if duplicated URL (not returning IDs when using ignore_conflicts=True)
+                # URLs (ignore_conflicts=False to return IDs)
+                bulk_created_urls = Urls.objects.bulk_create([Urls(url=url) for url in urls_to_insert], ignore_conflicts=False)
+                # (URL_id, source_id)
+                UrlsSource.objects.bulk_create([UrlsSource(id_source=source_obj, id_url=url_obj) for url_obj in bulk_created_urls], ignore_conflicts=True)
+            except IntegrityError as e:
+                ### Fallback to one-by-one insert
+                logger.debug("bulk_create exception while inserting raw URLs, falling back to non-bulk method")
+                # One by one
+                for url in urls_to_insert:
+                    # URL
+                    url_obj, created = Urls.objects.get_or_create(url=url)
+                    # (URL, source)
+                    UrlsSource.objects.get_or_create(id_source=source_obj, id_url=url_obj)
+            except Exception as e:
+                logger.warning("bulk_create unknown exception while inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
+                # Avoid caching due to error on insertion
+                urls_clean = []
-            ### Bulk insert URLs, ignore conflicts if a url exists
-            bulk_created_obj = Urls.objects.bulk_create(url_object_to_insert, ignore_conflicts=True)
             # Insert or update cache
-            for url in urls:
+            for url in urls_clean:
                 self._cache_key(url)
                 self._cache_key("{}{}".format(source, url))
-            logger.info("Inserted #{} raw URLs".format(len(url_object_to_insert)))
+            logger.info("Inserted #{} raw URLs".format(len(urls_to_insert)))
         except Exception as e:
             logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
@@ -83,6 +118,12 @@ class DB_Handler():
             # Pattern matching not required or not found, original article status
             return article_status
+    def process_error_urls(self, batch_size=50):
+        # Get batch of URLs, status='error'
+        #error_urls = Urls.objects.SORTBY TS_FETCH....filter(status=Urls.STATUS_ENUM.RAW, ts_fetch__gte=time_delta_ts)[:batch_size]
+        pass
     def process_raw_urls(self, time_delta=timedelta(days=1), batch_size=50):
         try:
             logger.debug("Processing raw URLs")
@@ -95,19 +136,18 @@ class DB_Handler():
             # Fetched during last 24 hours
             time_delta_ts = timezone.now() - time_delta
             # Get batch of URLs, status='raw' and fetched X days ago
-            raw_urls = Urls.objects.filter(status='raw', ts_fetch__gte=time_delta_ts)[:batch_size]
+            raw_urls = Urls.objects.filter(status=Urls.STATUS_ENUM.RAW, ts_fetch__gte=time_delta_ts)[:batch_size]
             # List of objects to bulk update
             updating_urls = []
             # Per URL
             for obj_url in raw_urls:
                 ##### Any domain to filter included in URL? -> Invalid
                 if (any([d in obj_url.url for d in list_domains_to_filter])):
                     logger.debug("Domain filter applied to input URL: {}".format(obj_url.url))
                     # Update status
-                    obj_url.status = 'invalid'
-                    # Append to bulk update
+                    obj_url.status = Urls.STATUS_ENUM.INVALID
+                    obj_url.save()
                     updating_urls.append(obj_url)
                     # Next URL
                     continue
@@ -119,10 +159,10 @@ class DB_Handler():
                     # Not none or handle as exception
                     assert(dict_url_data is not None)
                 except Exception as e:
-                    logger.debug("Error processing URL: {}\n{}".format(obj_url.url, str(e)))
+                    logger.debug("Error processing URL: {}\n{}\n{}".format(obj_url.url, str(e), traceback.format_exc()))
                     # Update status
-                    obj_url.status = 'error'
-                    # Append to bulk update
+                    obj_url.status = Urls.STATUS_ENUM.ERROR
+                    obj_url.save()
                     updating_urls.append(obj_url)
                     # Next URL
                     continue
@@ -130,30 +170,30 @@ class DB_Handler():
                 ##### Canonical URL different? -> Duplicate
                 if (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
                     # Update status
-                    obj_url.status = 'duplicate'
-                    # Append to bulk update
+                    obj_url.status = Urls.STATUS_ENUM.DUPLICATE
+                    obj_url.save()
                     updating_urls.append(obj_url)
                     # Get or create URL with canonical form
-                    obj_url_canonical = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
+                    obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
+                    # Associate same sources to url -> url_canonical
                     # Get the sources id associated to obj_url.id
-                    url_sources = UrlsSource.objects.filter(id_url=obj_url.id)
+                    url_sources = UrlsSource.objects.filter(id_url=obj_url)
                     for url_source_obj in url_sources:
                         # Associate same sources to url_canonical (it might already exist)
-                        UrlsSource.objects.get_or_create(id_source=url_source_obj.id_source, id_url=obj_url_canonical.id)
+                        UrlsSource.objects.get_or_create(id_source=url_source_obj.id_source, id_url=obj_url_canonical)
                     # Next URL
                     continue
                 ##### Valid URL
                 # Update status
-                obj_url.status = 'valid'
-                # Append to bulk update
+                obj_url.status = Urls.STATUS_ENUM.VALID
+                obj_url.save()
                 updating_urls.append(obj_url)
                 # Create extracted URL data
-                UrlContent.objects.create_or_update(
-                    id_url=obj_url.id,
+                UrlContent.objects.create(
+                    id_url=obj_url,
                     date_published=dict_url_data.get("publish_date"),
                     title=dict_url_data.get("title"),
                     description=dict_url_data.get("description"),
@@ -163,7 +203,7 @@ class DB_Handler():
                     keywords=dict_url_data.get("keywords"),
                     tags=dict_url_data.get("tags"),
                     authors=dict_url_data.get("authors"),
-                    image_main=dict_url_data.get("image_main"),
+                    image_main_url=dict_url_data.get("image_main_url"),
                     images_url=dict_url_data.get("images_url"),
                     videos_url=dict_url_data.get("videos_url"),
                     url_host=dict_url_data.get("url_host"),
@@ -180,11 +220,11 @@ class DB_Handler():
                     logger.debug("Pattern matching, overriding with status {} for URL: {}".format(status_pattern_matching, obj_url.url))
                     # Update, no need to append to updating_urls, already included
                     obj_url.status = status_pattern_matching
+                    obj_url.save()
-            # Bulk update
-            Urls.objects.bulk_update(updating_urls, ['status'])
-            logger.debug("Finished processing raw URLs")
+            # TODO: Fix enum type issue. Bulk update
+            # Urls.objects.bulk_update(updating_urls, ['status'])
+            logger.info("Updated #{} raw URLs".format(len(updating_urls)))
         except Exception as e:
             logger.warning("Exception processing raw URLs: {}\n{}".format(e, traceback.format_exc()))

View File

@@ -1,6 +1,7 @@
 from .logger import get_logger
 logger = get_logger()
 import newspaper
+from urllib.parse import unquote
 # pip install langdetect
 #import langdetect
 #langdetect.DetectorFactory.seed = 0
@@ -30,9 +31,9 @@ def process_url(url):
         "keywords": [k for k in set(article.keywords + article.meta_keywords) if k!=""],
         "tags": article.tags,
         "authors": article.authors,
-        "image_main": article.top_image, # article.meta_img
-        "images": article.images,
-        "videos": article.videos,
+        "image_main_url": article.top_image, # article.meta_img
+        "images_url": article.images,
+        "videos_url": article.movies,
     }
     '''
@@ -46,13 +47,16 @@ def process_url(url):
     # Sanity check
     for k in dict_data.keys():
-        if (type(k) is list):
-            # Remove empty string
-            dict_data[k] = [ e for e in dict_data[k] if e != "" ]
+        if (type(dict_data[k]) is list):
+            # Remove empty string, unquote special characters, e.g. "%20" -> " "
+            dict_data[k] = [ unquote(e) for e in dict_data[k] if e != "" ]
             # NULL instead of empty list
             if (len(dict_data[k]) == 0):
                 dict_data[k] = None
-        else:
+        elif (type(dict_data[k]) is str):
+            # Unquote special characters
+            if (dict_data[k] is not None):
+                dict_data[k] = unquote(dict_data[k])
             # NULL instead of empty string
             if (dict_data[k] == ""):
                 dict_data[k] = None
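The sanity-check loop now also percent-decodes values: list fields drop empty strings and unquote each element, string fields are unquoted, and anything left empty is stored as NULL. A standalone sketch of that cleanup, separated from process_url() purely for illustration:

```python
from urllib.parse import unquote

def clean_extracted_fields(dict_data):
    """Standalone sketch of the sanity-check loop in process_url()."""
    for k in dict_data:
        value = dict_data[k]
        if isinstance(value, list):
            # Drop empty strings and decode %-escapes, e.g. "%20" -> " "
            value = [unquote(e) for e in value if e != ""]
            dict_data[k] = value or None          # NULL instead of empty list
        elif isinstance(value, str):
            value = unquote(value)
            dict_data[k] = value or None          # NULL instead of empty string
    return dict_data

# Example:
# clean_extracted_fields({"title": "A%20headline", "tags": ["", "news"]})
# -> {"title": "A headline", "tags": ["news"]}
```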

View File

@@ -15,18 +15,20 @@ from src.credentials import db_connect_info, redis_connect_info
 db_handler = DB_Handler(db_connect_info, redis_connect_info)
 '''
-import logging
-logger = logging.getLogger(__name__)
+from .src.logger import get_logger
+logger = get_logger()
 @job
 def background_task(process_type: str):
     logger.info("Task triggered: {}".format(process_type))
     try:
-        FetchFeeds().run()
-        # DB_Handler().process_raw_urls()
+        if (process_type == "fetch_feeds"):
+            FetchFeeds().run()
+        elif (process_type == "process_raw_urls"):
+            DB_Handler().process_raw_urls(batch_size=3)
+        else:
+            logger.info("Task unknown!: {}".format(process_type))
 '''
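background_task now dispatches on the task name, so besides the HTTP view below it can also be enqueued directly. A small sketch, assuming django-rq's @job decorator exposes .delay() (as the underlying rq decorator does) and that a 'default' queue is configured in settings; the import path is illustrative:

```python
# Sketch: enqueueing the dispatching task from a shell or another module.
from api.tasks import background_task  # import path is an assumption

background_task.delay("fetch_feeds")        # runs FetchFeeds().run() in a worker
background_task.delay("process_raw_urls")   # runs DB_Handler().process_raw_urls(batch_size=3)
```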

View File

@@ -2,5 +2,5 @@ from django.urls import path
 from .views import trigger_task
 urlpatterns = [
-    path('fetch', trigger_task, name='trigger_task')
+    path('<str:task>', trigger_task, name='trigger_task'),
 ]

View File

@@ -1,10 +1,11 @@
 import django_rq
 from django.http import JsonResponse
 from .tasks import background_task
+from .src.logger import get_logger
+logger = get_logger()
-def trigger_task(request):
+def trigger_task(request, task):
     """View that enqueues a task."""
     queue = django_rq.get_queue('default') # Get the default queue
-    job = queue.enqueue(background_task, "Hello from Django RQ!")
+    job = queue.enqueue(background_task, task)
     return JsonResponse({"message": "Task has been enqueued!", "job_id": job.id})

View File

@@ -27,6 +27,20 @@ services:
     #expose:
     # - 6379
+  matitos_adminer:
+    # http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public
+    image: adminer
+    container_name: adminer
+    restart: unless-stopped
+    environment:
+      - ADMINER_DEFAULT_DB_DRIVER=pgsql
+      #- ADMINER_DEFAULT_DB_HOST
+      #- ADMINER_DEFAULT_DB_NAME
+    depends_on:
+      - matitos_db
+    ports:
+      - 8080:8080
 # django:
 # Env: DB_HOST=matitos_db
 # DJANGO_DB_NAME=${DB_DATABASE_NAME:-matitos}