Working fetch feeds and process raw URLs
1-DB.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 131,
"metadata": {},
"outputs": [],
"source": [
@@ -11,7 +11,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 132,
"metadata": {},
"outputs": [
{
@@ -19,23 +19,30 @@
"output_type": "stream",
"text": [
"db_postgres\n",
"db_redis\n",
"\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 1/0\n",
" ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_redis \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 2/2\u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_redis \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 1/3\n",
" ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 3/3\u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_redis \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h"
]
}
],
"source": [
"!docker rm -f db_postgres; docker compose -f docker/docker-compose.yml up -d ; sleep 10"
"!docker rm -f db_postgres db_redis; docker compose -f docker/docker-compose.yml up -d ; sleep 5"
]
},
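The cell above swaps a fixed `sleep 10` for `sleep 5` after `docker compose up`. A fixed sleep only guesses at readiness; a minimal sketch of polling Postgres until it accepts connections instead (assuming the notebook's `psycopg` import and `connection_info` string; the timeout value is arbitrary):

```
import time
import psycopg

def wait_for_postgres(conninfo, timeout=30.0):
    # Retry until Postgres accepts a connection, or give up after `timeout` seconds
    deadline = time.monotonic() + timeout
    while True:
        try:
            with psycopg.connect(conninfo, connect_timeout=2):
                return
        except psycopg.OperationalError:
            if time.monotonic() > deadline:
                raise
            time.sleep(0.5)

# wait_for_postgres(connection_info)  # connection_info as defined earlier in the notebook
```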
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 133,
"metadata": {},
"outputs": [],
"source": [
@@ -123,7 +130,7 @@
" keywords TEXT[],\n",
" tags TEXT[],\n",
" authors TEXT[],\n",
" image_main TEXT,\n",
" image_main_url TEXT,\n",
" images_url TEXT[],\n",
" videos_url TEXT[],\n",
" url_host TEXT, -- www.breitbart.com\n",
@@ -156,7 +163,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 134,
"metadata": {},
"outputs": [],
"source": [
@@ -191,9 +198,9 @@
" cur.execute(\"INSERT INTO URLS_SOURCE (id_url, id_source) values (2, 2)\")\n",
" cur.execute(\"INSERT INTO URLS_SOURCE (id_url, id_source) values (3, 2)\")\n",
"\n",
" for j in range(15):\n",
" for j in range(5):\n",
" import time\n",
" time.sleep(1)\n",
" time.sleep(0.25)\n",
" cur.execute(\"INSERT INTO URLS (url, status) values ('www.super_{}.org', 'invalid')\".format(j))\n",
" \n",
" # Long URLs \n",
@@ -202,13 +209,13 @@
"\n",
" # URL Content\n",
" language, content = \"en\", \"Bla Bla Bla!!!\"*25\n",
" cur.execute(\"INSERT INTO URL_CONTENT (id_url, date_published, title, description, content, language, tags, authors, image_urls) values (%s, %s, 'Mommy blogger turned child abuser', %s, 'Hello there!', %s, %s, %s, %s)\", \n",
" cur.execute(\"INSERT INTO URL_CONTENT (id_url, date_published, title, description, content, language, tags, authors, images_url) values (%s, %s, 'Mommy blogger turned child abuser', %s, 'Hello there!', %s, %s, %s, %s)\", \n",
" (1, datetime.now(tz=timezone.utc), content, language, [\"child abuse\", \"social media\"], [\"Audrey Conklin\"], [\"https://a57.foxnews.com/static.foxnews.com/foxnews.com/content/uploads/2023/08/1440/810/image-58.jpg?ve=1&tl=1\"]))"
]
},
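The `image_urls` -> `images_url` rename above keeps the INSERT aligned with the `TEXT[]` columns. Worth noting: psycopg adapts Python lists to Postgres arrays, which is what lets `tags`, `authors`, and `images_url` be passed directly. A small illustrative sketch (values are made up; `connection_info` is the notebook's connection string):

```
from datetime import datetime, timezone
import psycopg

with psycopg.connect(connection_info) as conn:
    with conn.cursor() as cur:
        # Python lists are adapted to TEXT[] array values
        cur.execute(
            "INSERT INTO URL_CONTENT (id_url, date_published, tags, authors, images_url) "
            "VALUES (%s, %s, %s, %s, %s)",
            (1, datetime.now(tz=timezone.utc), ["tag_a"], ["Author X"], ["https://example.org/img.jpg"]),
        )
```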
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 135,
"metadata": {},
"outputs": [
{
@@ -218,99 +225,59 @@
"\t urls\n",
"[(1,\n",
" 'https://www.foxnews.com/us/husband-ruby-franke-utah-mommy-blogger-convicted-child-abuse-regrets-wifes-fall-fame',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (2,\n",
" 'https://www.bbc.com/news/articles/ckg843y8y7no',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (3,\n",
" 'https://www.wilx.com/2025/03/05/lenawee-county-man-arrested-possessing-child-abuse-material/',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (4,\n",
" 'https://www.dw.com/en/trauma-how-child-abuse-victims-deal-with-parenthood/a-71833895',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (5,\n",
" 'https://nypost.com/2025/03/06/us-news/colorado-day-care-worker-hit-with-51-charges-of-child-abuse-harassment-for-slapping-toddler/',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (6,\n",
" 'https://www.fox35orlando.com/news/tavares-police-florida-boys-10-9-abused-sheer-brutality',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (7,\n",
" 'https://www.google.com',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (8,\n",
" 'www.super_0.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (9,\n",
" 'www.super_1.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (10,\n",
" 'www.super_2.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (11,\n",
" 'www.super_3.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (12,\n",
" 'www.super_4.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (13,\n",
" 'www.super_5.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf_23dj9sdgj9sdgj8sdf8ds8f.html',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (14,\n",
" 'www.super_6.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (15,\n",
" 'www.super_7.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (16,\n",
" 'www.super_8.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (17,\n",
" 'www.super_9.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (18,\n",
" 'www.super_10.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (19,\n",
" 'www.super_11.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (20,\n",
" 'www.super_12.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (21,\n",
" 'www.super_13.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (22,\n",
" 'www.super_14.org',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (23,\n",
" 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf_23dj9sdgj9sdgj8sdf8ds8f.html',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (24,\n",
" 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf.html',\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid')]\n",
"\t urls_duplicate\n",
"[]\n",
@@ -336,7 +303,7 @@
"[('.*missingkids.org/poster/.*', 50, 'valid')]\n",
"\t url_content\n",
"[(1,\n",
" datetime.datetime(2025, 3, 7, 16, 57, 38, 54447, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 13, 17, 19, 5, 639334, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'Mommy blogger turned child abuser',\n",
" 'Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla '\n",
" 'Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla '\n",
@@ -344,9 +311,16 @@
" 'Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla '\n",
" 'Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!',\n",
" 'Hello there!',\n",
" None,\n",
" 'en',\n",
" None,\n",
" ['child abuse', 'social media'],\n",
" ['Audrey Conklin'],\n",
" ['https://a57.foxnews.com/static.foxnews.com/foxnews.com/content/uploads/2023/08/1440/810/image-58.jpg?ve=1&tl=1'])]\n"
" None,\n",
" ['https://a57.foxnews.com/static.foxnews.com/foxnews.com/content/uploads/2023/08/1440/810/image-58.jpg?ve=1&tl=1'],\n",
" None,\n",
" None,\n",
" None)]\n"
]
}
],
@@ -365,6 +339,91 @@
" print(\"\\t\", t)\n",
" pprint( cur.execute(\"SELECT * FROM {} LIMIT 50;\".format(t)).fetchall() )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 136,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(1,\n",
" 'https://www.foxnews.com/us/husband-ruby-franke-utah-mommy-blogger-convicted-child-abuse-regrets-wifes-fall-fame',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (2,\n",
" 'https://www.bbc.com/news/articles/ckg843y8y7no',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (3,\n",
" 'https://www.wilx.com/2025/03/05/lenawee-county-man-arrested-possessing-child-abuse-material/',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (4,\n",
" 'https://www.dw.com/en/trauma-how-child-abuse-victims-deal-with-parenthood/a-71833895',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (5,\n",
" 'https://nypost.com/2025/03/06/us-news/colorado-day-care-worker-hit-with-51-charges-of-child-abuse-harassment-for-slapping-toddler/',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (6,\n",
" 'https://www.fox35orlando.com/news/tavares-police-florida-boys-10-9-abused-sheer-brutality',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (7,\n",
" 'https://www.google.com',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (8,\n",
" 'www.super_0.org',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (9,\n",
" 'www.super_1.org',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (10,\n",
" 'www.super_2.org',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (11,\n",
" 'www.super_3.org',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (12,\n",
" 'www.super_4.org',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (13,\n",
" 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf_23dj9sdgj9sdgj8sdf8ds8f.html',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (14,\n",
" 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf.html',\n",
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid')]\n"
]
}
],
"source": [
"from pprint import pprint\n",
"\n",
"# Connect to an existing database\n",
"with psycopg.connect(connection_info) as conn:\n",
" # Open a cursor to perform database operations\n",
" with conn.cursor() as cur:\n",
" pprint( cur.execute(\"SELECT * FROM URLS LIMIT 150;\").fetchall() )"
]
}
],
"metadata": {

@@ -1,5 +1,34 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import newspaper\n",
"url = \"http://www.missingkids.org/poster/NCMC/2045193/1\"\n",
"#url = \"https://www.missingkids.org/new-poster/NCMC/2045193/1\"\n",
"\n",
"art = newspaper.article(url)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"art.__dict__"
]
},
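Instead of dumping `art.__dict__`, the individual attributes this commit maps into `URL_CONTENT` can be read directly; a short sketch using the same article object (attribute names as used in the url_processor changes below):

```
# Assumes `art` from the first cell (newspaper.article(url))
print(art.title)
print(art.authors)    # -> authors
print(art.tags)       # -> tags
print(art.top_image)  # -> image_main_url
print(art.images)     # -> images_url
print(art.movies)     # -> videos_url
```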
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,

@@ -33,6 +33,16 @@ python manage.py inspectdb
# Fields default:
ts_fetch = models.DateTimeField(auto_now_add=True)
status = models.TextField(default='raw') # This field type is a guess.

# URLContent:
from django.contrib.postgres.fields import ArrayField

keywords = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
tags = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
authors = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
image_main_url = models.TextField(blank=True, null=True)
images_url = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
videos_url = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
```

* Environment variables
@@ -51,8 +61,8 @@ REDIS_PORT=${REDIS_PORT:-6379}
```
# Generate content for models.py
python manage.py inspectdb
python manage.py makemigrations
python manage.py migrate --fake
# Migrations
python manage.py makemigrations api; python manage.py migrate --fake-initial
```

* Deploy

@@ -1,4 +1,4 @@
# Generated by Django 5.1.7 on 2025-03-07 16:56
# Generated by Django 5.1.7 on 2025-03-13 17:01

import django.db.models.deletion
from django.db import migrations, models
@@ -62,8 +62,8 @@ class Migration(migrations.Migration):
fields=[
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('url', models.TextField(unique=True)),
('ts_fetch', models.DateTimeField()),
('status', models.TextField()),
('ts_fetch', models.DateTimeField(auto_now_add=True)),
('status', models.TextField(choices=[('raw', 'Raw'), ('error', 'Error'), ('valid', 'Valid'), ('unknown', 'Unknown'), ('invalid', 'Invalid'), ('duplicate', 'Duplicate')], default='raw')),
],
options={
'db_table': 'urls',
@@ -100,9 +100,16 @@ class Migration(migrations.Migration):
('title', models.TextField(blank=True, null=True)),
('description', models.TextField(blank=True, null=True)),
('content', models.TextField(blank=True, null=True)),
('valid_content', models.BooleanField(blank=True, null=True)),
('language', models.CharField(blank=True, max_length=2, null=True)),
('keywords', models.TextField(blank=True, null=True)),
('tags', models.TextField(blank=True, null=True)),
('authors', models.TextField(blank=True, null=True)),
('image_urls', models.TextField(blank=True, null=True)),
('image_main', models.TextField(blank=True, null=True)),
('images_url', models.TextField(blank=True, null=True)),
('videos_url', models.TextField(blank=True, null=True)),
('url_host', models.TextField(blank=True, null=True)),
('site_name', models.TextField(blank=True, null=True)),
],
options={
'db_table': 'url_content',

@@ -1,4 +1,5 @@
from django.db import models
from django.contrib.postgres.fields import ArrayField

# Create your models here.
class Feed(models.Model):
@@ -44,9 +45,16 @@ class UrlContent(models.Model):
title = models.TextField(blank=True, null=True)
description = models.TextField(blank=True, null=True)
content = models.TextField(blank=True, null=True)
tags = models.TextField(blank=True, null=True) # This field type is a guess.
authors = models.TextField(blank=True, null=True) # This field type is a guess.
image_urls = models.TextField(blank=True, null=True) # This field type is a guess.
valid_content = models.BooleanField(blank=True, null=True)
language = models.CharField(max_length=2, blank=True, null=True)
keywords = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
tags = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
authors = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
image_main_url = models.TextField(blank=True, null=True)
images_url = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
videos_url = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
url_host = models.TextField(blank=True, null=True)
site_name = models.TextField(blank=True, null=True)

class Meta:
managed = False
@@ -54,9 +62,17 @@ class UrlContent(models.Model):

class Urls(models.Model):
class STATUS_ENUM(models.TextChoices):
RAW = "raw", "Raw"
ERROR = "error", "Error"
VALID = "valid", "Valid"
UNKNOWN = "unknown", "Unknown"
INVALID = "invalid", "Invalid"
DUPLICATE = "duplicate", "Duplicate"

url = models.TextField(unique=True)
ts_fetch = models.DateTimeField(auto_now_add=True)
status = models.TextField(default='raw') # This field type is a guess.
status = models.TextField(choices=STATUS_ENUM.choices, default=STATUS_ENUM.RAW) # This field type is a guess.

class Meta:
managed = False
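`STATUS_ENUM` is a Django `TextChoices`, so members compare equal to their plain string values, which is what the `status` column stores. A small usage sketch (the `api.models` import path is an assumption based on the app name in the README):

```
from api.models import Urls  # assumed import path

assert Urls.STATUS_ENUM.RAW == "raw"  # members are str subclasses
print(Urls.STATUS_ENUM.RAW.label)     # "Raw"
print(Urls.STATUS_ENUM.choices)       # [('raw', 'Raw'), ('error', 'Error'), ...]

# Filtering accepts either the member or the raw string
raw_urls = Urls.objects.filter(status=Urls.STATUS_ENUM.RAW)
```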

@@ -2,6 +2,7 @@ from ..models import Urls, UrlContent, UrlsSource, Source, WebsiteToFilter, Stat
from .url_processor import process_url
from django.utils import timezone
from django.core.cache import cache
from django.db import IntegrityError
import hashlib
from datetime import timedelta
import re
@@ -25,16 +26,32 @@ class DB_Handler():
return cache.get(self._get_safe_cache_key(cache_key)) is not None

def insert_raw_urls(self, urls, source):

def clean_protocol(url):
# http:// -> https://
url = url.replace("http://", "https://")
# "" -> https://
if not (url.startswith("https://")):
url = "https://" + url
return url

try:
logger.debug("Inserting raw URLs")
# Empty?
if (len(urls) == 0):
logger.debug("Empty batch of urls (not writing to DB) for source: {}".format(source))
return

# Default protocol https://
urls_clean = [clean_protocol(url) for url in urls]

# Get the source (create if not exists)
source_obj, created = Source.objects.get_or_create(source=source)

url_object_to_insert = []
urls_to_insert = []
# Per URL
for url in urls:
for url in urls_clean:

### Already processed URL?
if (self._is_cached_key(url)):
logger.debug("Already cached URL: {}".format(url))
@@ -42,25 +59,43 @@
if (self._is_cached_key("{}{}".format(source, url))):
logger.debug("Already cached (source, URL): {} {}".format(source, url))
else:
### Insert source
# Get the source (create if not exists)
source_obj, created = Source.objects.get_or_create(source=source)
# Get URL ID
url_obj = Urls.objects.get(url=url)
# Create (id_source, id_url)
UrlsSource.objects.create(id_source=source_obj.id, id_url=url_obj.id)
### Insert (URL_id, source_id), since not cached
# Get URL ID (should already be created)
url_obj, created = Urls.objects.get_or_create(url=url)
# Create (id_source, id_url) (shouldn't exist)
UrlsSource.objects.get_or_create(id_source=source_obj, id_url=url_obj)
else:
# Add object to insert
url_object_to_insert.append(Urls(url=url))
# url_object_to_insert.append(Urls(url=url))
urls_to_insert.append(url)

### Insert URLs & (URL_id, source_id)
try:
### Bulk insert, fails if duplicated URL (not returning IDs when using ignore_conflicts=True)
# URLs (ignore_conflicts=False to return IDs)
bulk_created_urls = Urls.objects.bulk_create([Urls(url=url) for url in urls_to_insert], ignore_conflicts=False)
# (URL_id, source_id)
UrlsSource.objects.bulk_create([UrlsSource(id_source=source_obj, id_url=url_obj) for url_obj in bulk_created_urls], ignore_conflicts=True)
except IntegrityError as e:
### Fallback to one-by-one insert
logger.debug("bulk_create exception while inserting raw URLs, falling back to non-bulk method")
# One by one
for url in urls_to_insert:
# URL
url_obj, created = Urls.objects.get_or_create(url=url)
# (URL, source)
UrlsSource.objects.get_or_create(id_source=source_obj, id_url=url_obj)
except Exception as e:
logger.warning("bulk_create unknown exception while inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
# Avoid caching due to error on insertion
urls_clean = []

### Bulk insert URLs, ignore conflicts if a url exists
bulk_created_obj = Urls.objects.bulk_create(url_object_to_insert, ignore_conflicts=True)
# Insert or update cache
for url in urls:
for url in urls_clean:
self._cache_key(url)
self._cache_key("{}{}".format(source, url))

logger.info("Inserted #{} raw URLs".format(len(url_object_to_insert)))
logger.info("Inserted #{} raw URLs".format(len(urls_to_insert)))

except Exception as e:
logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
@@ -83,6 +118,12 @@
# Pattern matching not required or not found, original article status
return article_status


def process_error_urls(self, batch_size=50):
# Get batch of URLs, status='error'
#error_urls = Urls.objects.SORTBY TS_FETCH....filter(status=Urls.STATUS_ENUM.RAW, ts_fetch__gte=time_delta_ts)[:batch_size]
pass

def process_raw_urls(self, time_delta=timedelta(days=1), batch_size=50):
try:
logger.debug("Processing raw URLs")
@@ -95,19 +136,18 @@
# Fetched during last 24 hours
time_delta_ts = timezone.now() - time_delta
# Get batch of URLs, status='raw' and fetched X days ago
raw_urls = Urls.objects.filter(status='raw', ts_fetch__gte=time_delta_ts)[:batch_size]
raw_urls = Urls.objects.filter(status=Urls.STATUS_ENUM.RAW, ts_fetch__gte=time_delta_ts)[:batch_size]
# List of objects to bulk update
updating_urls = []

# Per URL
for obj_url in raw_urls:

##### Any domain to filter included in URL? -> Invalid
if (any([d in obj_url.url for d in list_domains_to_filter])):
logger.debug("Domain filter applied to input URL: {}".format(obj_url.url))
# Update status
obj_url.status = 'invalid'
# Append to bulk update
obj_url.status = Urls.STATUS_ENUM.INVALID
obj_url.save()
updating_urls.append(obj_url)
# Next URL
continue
@@ -119,10 +159,10 @@
# Not none or handle as exception
assert(dict_url_data is not None)
except Exception as e:
logger.debug("Error processing URL: {}\n{}".format(obj_url.url, str(e)))
logger.debug("Error processing URL: {}\n{}\n{}".format(obj_url.url, str(e), traceback.format_exc()))
# Update status
obj_url.status = 'error'
# Append to bulk update
obj_url.status = Urls.STATUS_ENUM.ERROR
obj_url.save()
updating_urls.append(obj_url)
# Next URL
continue
@@ -130,30 +170,30 @@
##### Canonical URL different? -> Duplicate
if (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
# Update status
obj_url.status = 'duplicate'
# Append to bulk update
obj_url.status = Urls.STATUS_ENUM.DUPLICATE
obj_url.save()
updating_urls.append(obj_url)

# Get or create URL with canonical form
obj_url_canonical = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
# Associate same sources to url -> url_canonical

obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
# Get the sources id associated to obj_url.id
url_sources = UrlsSource.objects.filter(id_url=obj_url.id)
url_sources = UrlsSource.objects.filter(id_url=obj_url)
for url_source_obj in url_sources:
# Associate same sources to url_canonical (it might already exist)
UrlsSource.objects.get_or_create(id_source=url_source_obj.id_source, id_url=obj_url_canonical.id)
UrlsSource.objects.get_or_create(id_source=url_source_obj.id_source, id_url=obj_url_canonical)

# Next URL
continue

##### Valid URL
# Update status
obj_url.status = 'valid'
# Append to bulk update
obj_url.status = Urls.STATUS_ENUM.VALID
obj_url.save()
updating_urls.append(obj_url)

# Create extracted URL data
UrlContent.objects.create_or_update(
id_url=obj_url.id,
UrlContent.objects.create(
id_url=obj_url,
date_published=dict_url_data.get("publish_date"),
title=dict_url_data.get("title"),
description=dict_url_data.get("description"),
@@ -163,7 +203,7 @@
keywords=dict_url_data.get("keywords"),
tags=dict_url_data.get("tags"),
authors=dict_url_data.get("authors"),
image_main=dict_url_data.get("image_main"),
image_main_url=dict_url_data.get("image_main_url"),
images_url=dict_url_data.get("images_url"),
videos_url=dict_url_data.get("videos_url"),
url_host=dict_url_data.get("url_host"),
@@ -180,11 +220,11 @@
logger.debug("Pattern matching, overriding with status {} for URL: {}".format(status_pattern_matching, obj_url.url))
# Update, no need to append to updating_urls, already included
obj_url.status = status_pattern_matching
obj_url.save()

# Bulk update
Urls.objects.bulk_update(updating_urls, ['status'])
# TODO: Fix enum type issue. Bulk update
# Urls.objects.bulk_update(updating_urls, ['status'])

logger.debug("Finished processing raw URLs")
logger.info("Updated #{} raw URLs".format(len(updating_urls)))
except Exception as e:
logger.warning("Exception processing raw URLs: {}\n{}".format(e, traceback.format_exc()))


@@ -1,6 +1,7 @@
from .logger import get_logger
logger = get_logger()
import newspaper
from urllib.parse import unquote
# pip install langdetect
#import langdetect
#langdetect.DetectorFactory.seed = 0
@@ -30,9 +31,9 @@ def process_url(url):
"keywords": [k for k in set(article.keywords + article.meta_keywords) if k!=""],
"tags": article.tags,
"authors": article.authors,
"image_main": article.top_image, # article.meta_img
"images": article.images,
"videos": article.videos,
"image_main_url": article.top_image, # article.meta_img
"images_url": article.images,
"videos_url": article.movies,
}

'''
@@ -46,13 +47,16 @@ def process_url(url):

# Sanity check
for k in dict_data.keys():
if (type(k) is list):
# Remove empty string
dict_data[k] = [ e for e in dict_data[k] if e != "" ]
if (type(dict_data[k]) is list):
# Remove empty string, unquote special characters, e.g. "%20" -> " "
dict_data[k] = [ unquote(e) for e in dict_data[k] if e != "" ]
# NULL instead of empty list
if (len(dict_data[k]) == 0):
dict_data[k] = None
else:
elif (type(dict_data[k]) is str):
# Unquote special characters
if (dict_data[k] is not None):
dict_data[k] = unquote(dict_data[k])
# NULL instead of empty string
if (dict_data[k] == ""):
dict_data[k] = None
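`unquote` percent-decodes URL-encoded text, which is why the reworked sanity check applies it to list elements and plain strings alike; for example:

```
from urllib.parse import unquote

print(unquote("Hello%20World"))              # Hello World
print(unquote("caf%C3%A9"))                  # café
print([unquote(e) for e in ["a%20b", "c"]])  # ['a b', 'c']
```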

@@ -15,18 +15,20 @@ from src.credentials import db_connect_info, redis_connect_info
db_handler = DB_Handler(db_connect_info, redis_connect_info)
'''

import logging
logger = logging.getLogger(__name__)

from .src.logger import get_logger
logger = get_logger()

@job
def background_task(process_type: str):
logger.info("Task triggered: {}".format(process_type))

try:
FetchFeeds().run()

# DB_Handler().process_raw_urls()
if (process_type == "fetch_feeds"):
FetchFeeds().run()
elif (process_type == "process_raw_urls"):
DB_Handler().process_raw_urls(batch_size=3)
else:
logger.info("Task unknown!: {}".format(process_type))


'''

@@ -2,5 +2,5 @@ from django.urls import path
from .views import trigger_task

urlpatterns = [
path('fetch', trigger_task, name='trigger_task')
path('<str:task>', trigger_task, name='trigger_task'),
]

@@ -1,10 +1,11 @@
import django_rq
from django.http import JsonResponse
from .tasks import background_task
from .src.logger import get_logger
logger = get_logger()

def trigger_task(request):
def trigger_task(request, task):
"""View that enqueues a task."""
queue = django_rq.get_queue('default') # Get the default queue
job = queue.enqueue(background_task, "Hello from Django RQ!")

job = queue.enqueue(background_task, task)
return JsonResponse({"message": "Task has been enqueued!", "job_id": job.id})
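With the route now `<str:task>`, the task name in the URL is passed straight through to `background_task`. A hypothetical smoke test (host, port, and any URL prefix depend on how these `urlpatterns` are mounted in the project):

```
import requests

for task in ("fetch_feeds", "process_raw_urls"):
    r = requests.get("http://localhost:8000/{}".format(task))
    print(r.json())  # {"message": "Task has been enqueued!", "job_id": "..."}
```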

@@ -27,6 +27,20 @@ services:
#expose:
# - 6379

matitos_adminer:
# http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public
image: adminer
container_name: adminer
restart: unless-stopped
environment:
- ADMINER_DEFAULT_DB_DRIVER=pgsql
#- ADMINER_DEFAULT_DB_HOST
#- ADMINER_DEFAULT_DB_NAME
depends_on:
- matitos_db
ports:
- 8080:8080

# django:
# Env: DB_HOST=matitos_db
# DJANGO_DB_NAME=${DB_DATABASE_NAME:-matitos}