Working fetch feeds and parser; process raw and error URLs
1-DB.ipynb: 238 lines changed
@@ -2,7 +2,7 @@
|
|||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 131,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@@ -11,38 +11,16 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 132,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"db_postgres\n",
|
|
||||||
"db_redis\n",
|
|
||||||
"\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 1/0\n",
|
|
||||||
" ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
|
|
||||||
" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
|
|
||||||
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
|
|
||||||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 1/3\n",
|
|
||||||
" ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
|
|
||||||
" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
|
|
||||||
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
|
|
||||||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 3/3\u001b[0m\n",
|
|
||||||
" \u001b[32m✔\u001b[0m Container db_redis \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
|
|
||||||
" \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
|
|
||||||
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
|
|
||||||
"\u001b[?25h"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"!docker rm -f db_postgres db_redis; docker compose -f docker/docker-compose.yml up -d ; sleep 5"
|
"!docker rm -f db_postgres db_redis; docker compose -f docker/docker-compose.yml up -d ; sleep 5"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 133,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@@ -53,7 +31,7 @@
|
|||||||
"connection_info = \"host={} port={} user={} password={} dbname={}\".format(\"localhost\", \"5432\", \"supermatitos\", \"supermatitos\", \"matitos\")\n",
|
"connection_info = \"host={} port={} user={} password={} dbname={}\".format(\"localhost\", \"5432\", \"supermatitos\", \"supermatitos\", \"matitos\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"from datetime import datetime, timezone\n",
|
"from datetime import datetime, timezone\n",
|
||||||
"\n",
|
"import re\n",
|
||||||
"\n",
|
"\n",
|
||||||
"if INSERT_TABLES:\n",
|
"if INSERT_TABLES:\n",
|
||||||
" # Connect to an existing database\n",
|
" # Connect to an existing database\n",
|
||||||
@@ -107,11 +85,6 @@
|
|||||||
" );\n",
|
" );\n",
|
||||||
" CREATE INDEX idx_source ON urls_source(id_source);\n",
|
" CREATE INDEX idx_source ON urls_source(id_source);\n",
|
||||||
"\n",
|
"\n",
|
||||||
" CREATE TABLE WEBSITE_TO_FILTER (\n",
|
|
||||||
" id SMALLSERIAL PRIMARY KEY,\n",
|
|
||||||
" url_host TEXT NOT NULL UNIQUE\n",
|
|
||||||
" );\n",
|
|
||||||
"\n",
|
|
||||||
" CREATE TABLE STATUS_PATTERN_MATCHING (\n",
|
" CREATE TABLE STATUS_PATTERN_MATCHING (\n",
|
||||||
" pattern TEXT PRIMARY KEY,\n",
|
" pattern TEXT PRIMARY KEY,\n",
|
||||||
" priority SMALLINT NOT NULL,\n",
|
" priority SMALLINT NOT NULL,\n",
|
||||||
@@ -148,22 +121,23 @@
|
|||||||
" cur.execute( \"INSERT INTO FEED (rss_feed) VALUES ('https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC');\" )\n",
|
" cur.execute( \"INSERT INTO FEED (rss_feed) VALUES ('https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC');\" )\n",
|
||||||
" # Websites of interest\n",
|
" # Websites of interest\n",
|
||||||
" cur.execute( \"INSERT INTO WEBSITE_OF_INTEREST (url_host) VALUES ('www.unicef.org');\" )\n",
|
" cur.execute( \"INSERT INTO WEBSITE_OF_INTEREST (url_host) VALUES ('www.unicef.org');\" )\n",
|
||||||
|
" cur.execute( \"INSERT INTO WEBSITE_OF_INTEREST (url_host) VALUES ('www.breitbart.com/');\" )\n",
|
||||||
" # Search keywords\n",
|
" # Search keywords\n",
|
||||||
" cur.execute( \"INSERT INTO SEARCH (keyword_search) VALUES ('child abuse');\" )\n",
|
" cur.execute( \"INSERT INTO SEARCH (keyword_search) VALUES ('child abuse');\" )\n",
|
||||||
" # Domains to filter\n",
|
" \n",
|
||||||
" cur.execute( \"INSERT INTO WEBSITE_TO_FILTER (url_host) VALUES ('yewtu.be');\" )\n",
|
" # Status update based on pattern matching (with priority to apply in order). Regex test https://regex101.com/\n",
|
||||||
" cur.execute( \"INSERT INTO WEBSITE_TO_FILTER (url_host) VALUES ('twitter.com');\" )\n",
|
" # cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 75, 'valid');\".format(\".*{}.*\".format(re.escape(\"missingkids.org/poster/\"))) )\n",
|
||||||
" cur.execute( \"INSERT INTO WEBSITE_TO_FILTER (url_host) VALUES ('libreddit.de');\" )\n",
|
" cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"youtube.com/\"))) )\n",
|
||||||
" cur.execute( \"INSERT INTO WEBSITE_TO_FILTER (url_host) VALUES ('youtube.com');\" )\n",
|
" cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"tiktok.com/\"))) )\n",
|
||||||
" cur.execute( \"INSERT INTO WEBSITE_TO_FILTER (url_host) VALUES ('tiktok.com');\" )\n",
|
" cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"twitter.com/\"))) )\n",
|
||||||
" cur.execute( \"INSERT INTO WEBSITE_TO_FILTER (url_host) VALUES ('radio.foxnews.com');\" )\n",
|
" cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"reddit.com/\"))) )\n",
|
||||||
" # Status update based on pattern matching (with priority to apply in order)\n",
|
" cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"libreddit.de/\"))) )\n",
|
||||||
" cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*missingkids.org/poster/.*', 50, 'valid');\" )"
|
" cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"radio.foxnews.com/\"))) )"
|
||||||
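A quick way to sanity-check the generated rows above: each pattern wraps an re.escape()-ed host fragment in '.*<host>.*' and is later applied with re.match() by the DB handler. A minimal sketch in plain Python, using hypothetical URLs:

```python
import re

# Mirror of one generated STATUS_PATTERN_MATCHING row from the cell above.
pattern = ".*{}.*".format(re.escape("youtube.com/"))   # -> '.*youtube\\.com/.*'

# The handler applies re.match(), so the leading '.*' is what lets the host
# match anywhere inside the URL, not only at the start.
print(bool(re.match(pattern, "https://www.youtube.com/watch?v=abc")))  # True  -> status 'invalid'
print(bool(re.match(pattern, "https://example.org/some-article")))     # False -> no override
```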
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 134,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@@ -215,115 +189,9 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 135,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"\t urls\n",
|
|
||||||
"[(1,\n",
|
|
||||||
" 'https://www.foxnews.com/us/husband-ruby-franke-utah-mommy-blogger-convicted-child-abuse-regrets-wifes-fall-fame',\n",
|
|
||||||
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
|
||||||
" 'valid'),\n",
|
|
||||||
" (2,\n",
|
|
||||||
" 'https://www.bbc.com/news/articles/ckg843y8y7no',\n",
|
|
||||||
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
|
||||||
" 'valid'),\n",
|
|
||||||
" (3,\n",
|
|
||||||
" 'https://www.wilx.com/2025/03/05/lenawee-county-man-arrested-possessing-child-abuse-material/',\n",
|
|
||||||
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
|
||||||
" 'valid'),\n",
|
|
||||||
" (4,\n",
|
|
||||||
" 'https://www.dw.com/en/trauma-how-child-abuse-victims-deal-with-parenthood/a-71833895',\n",
|
|
||||||
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
|
||||||
" 'valid'),\n",
|
|
||||||
" (5,\n",
|
|
||||||
" 'https://nypost.com/2025/03/06/us-news/colorado-day-care-worker-hit-with-51-charges-of-child-abuse-harassment-for-slapping-toddler/',\n",
|
|
||||||
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
|
||||||
" 'valid'),\n",
|
|
||||||
" (6,\n",
|
|
||||||
" 'https://www.fox35orlando.com/news/tavares-police-florida-boys-10-9-abused-sheer-brutality',\n",
|
|
||||||
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
|
||||||
" 'valid'),\n",
|
|
||||||
" (7,\n",
|
|
||||||
" 'https://www.google.com',\n",
|
|
||||||
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
|
||||||
" 'invalid'),\n",
|
|
||||||
" (8,\n",
|
|
||||||
" 'www.super_0.org',\n",
|
|
||||||
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
|
||||||
" 'invalid'),\n",
|
|
||||||
" (9,\n",
|
|
||||||
" 'www.super_1.org',\n",
|
|
||||||
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
|
||||||
" 'invalid'),\n",
|
|
||||||
" (10,\n",
|
|
||||||
" 'www.super_2.org',\n",
|
|
||||||
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
|
||||||
" 'invalid'),\n",
|
|
||||||
" (11,\n",
|
|
||||||
" 'www.super_3.org',\n",
|
|
||||||
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
|
||||||
" 'invalid'),\n",
|
|
||||||
" (12,\n",
|
|
||||||
" 'www.super_4.org',\n",
|
|
||||||
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
|
||||||
" 'invalid'),\n",
|
|
||||||
" (13,\n",
|
|
||||||
" 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf_23dj9sdgj9sdgj8sdf8ds8f.html',\n",
|
|
||||||
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
|
||||||
" 'invalid'),\n",
|
|
||||||
" (14,\n",
|
|
||||||
" 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf.html',\n",
|
|
||||||
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
|
||||||
" 'invalid')]\n",
|
|
||||||
"\t urls_duplicate\n",
|
|
||||||
"[]\n",
|
|
||||||
"\t feed\n",
|
|
||||||
"[(1,\n",
|
|
||||||
" 'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC')]\n",
|
|
||||||
"\t website_of_interest\n",
|
|
||||||
"[(1, 'www.unicef.org')]\n",
|
|
||||||
"\t search\n",
|
|
||||||
"[(1, 'child abuse')]\n",
|
|
||||||
"\t urls_source\n",
|
|
||||||
"[(1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (1, 2), (2, 2), (3, 2)]\n",
|
|
||||||
"\t source\n",
|
|
||||||
"[(1, 'news.google.com'), (2, 'qwant.com')]\n",
|
|
||||||
"\t website_to_filter\n",
|
|
||||||
"[(1, 'yewtu.be'),\n",
|
|
||||||
" (2, 'twitter.com'),\n",
|
|
||||||
" (3, 'libreddit.de'),\n",
|
|
||||||
" (4, 'youtube.com'),\n",
|
|
||||||
" (5, 'tiktok.com'),\n",
|
|
||||||
" (6, 'radio.foxnews.com')]\n",
|
|
||||||
"\t status_pattern_matching\n",
|
|
||||||
"[('.*missingkids.org/poster/.*', 50, 'valid')]\n",
|
|
||||||
"\t url_content\n",
|
|
||||||
"[(1,\n",
|
|
||||||
" datetime.datetime(2025, 3, 13, 17, 19, 5, 639334, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
|
||||||
" 'Mommy blogger turned child abuser',\n",
|
|
||||||
" 'Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla '\n",
|
|
||||||
" 'Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla '\n",
|
|
||||||
" 'Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla '\n",
|
|
||||||
" 'Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla '\n",
|
|
||||||
" 'Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!',\n",
|
|
||||||
" 'Hello there!',\n",
|
|
||||||
" None,\n",
|
|
||||||
" 'en',\n",
|
|
||||||
" None,\n",
|
|
||||||
" ['child abuse', 'social media'],\n",
|
|
||||||
" ['Audrey Conklin'],\n",
|
|
||||||
" None,\n",
|
|
||||||
" ['https://a57.foxnews.com/static.foxnews.com/foxnews.com/content/uploads/2023/08/1440/810/image-58.jpg?ve=1&tl=1'],\n",
|
|
||||||
" None,\n",
|
|
||||||
" None,\n",
|
|
||||||
" None)]\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"from pprint import pprint\n",
|
"from pprint import pprint\n",
|
||||||
"\n",
|
"\n",
|
||||||
@@ -349,72 +217,9 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 136,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"[(1,\n",
|
|
||||||
" 'https://www.foxnews.com/us/husband-ruby-franke-utah-mommy-blogger-convicted-child-abuse-regrets-wifes-fall-fame',\n",
|
|
||||||
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
|
||||||
" 'valid'),\n",
|
|
||||||
" (2,\n",
|
|
||||||
" 'https://www.bbc.com/news/articles/ckg843y8y7no',\n",
|
|
||||||
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
|
||||||
" 'valid'),\n",
|
|
||||||
" (3,\n",
|
|
||||||
" 'https://www.wilx.com/2025/03/05/lenawee-county-man-arrested-possessing-child-abuse-material/',\n",
|
|
||||||
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
|
||||||
" 'valid'),\n",
|
|
||||||
" (4,\n",
|
|
||||||
" 'https://www.dw.com/en/trauma-how-child-abuse-victims-deal-with-parenthood/a-71833895',\n",
|
|
||||||
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
|
||||||
" 'valid'),\n",
|
|
||||||
" (5,\n",
|
|
||||||
" 'https://nypost.com/2025/03/06/us-news/colorado-day-care-worker-hit-with-51-charges-of-child-abuse-harassment-for-slapping-toddler/',\n",
|
|
||||||
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
|
||||||
" 'valid'),\n",
|
|
||||||
" (6,\n",
|
|
||||||
" 'https://www.fox35orlando.com/news/tavares-police-florida-boys-10-9-abused-sheer-brutality',\n",
|
|
||||||
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
|
||||||
" 'valid'),\n",
|
|
||||||
" (7,\n",
|
|
||||||
" 'https://www.google.com',\n",
|
|
||||||
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
|
||||||
" 'invalid'),\n",
|
|
||||||
" (8,\n",
|
|
||||||
" 'www.super_0.org',\n",
|
|
||||||
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
|
||||||
" 'invalid'),\n",
|
|
||||||
" (9,\n",
|
|
||||||
" 'www.super_1.org',\n",
|
|
||||||
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
|
||||||
" 'invalid'),\n",
|
|
||||||
" (10,\n",
|
|
||||||
" 'www.super_2.org',\n",
|
|
||||||
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
|
||||||
" 'invalid'),\n",
|
|
||||||
" (11,\n",
|
|
||||||
" 'www.super_3.org',\n",
|
|
||||||
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
|
||||||
" 'invalid'),\n",
|
|
||||||
" (12,\n",
|
|
||||||
" 'www.super_4.org',\n",
|
|
||||||
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
|
||||||
" 'invalid'),\n",
|
|
||||||
" (13,\n",
|
|
||||||
" 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf_23dj9sdgj9sdgj8sdf8ds8f.html',\n",
|
|
||||||
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
|
||||||
" 'invalid'),\n",
|
|
||||||
" (14,\n",
|
|
||||||
" 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf.html',\n",
|
|
||||||
" datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
|
||||||
" 'invalid')]\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"from pprint import pprint\n",
|
"from pprint import pprint\n",
|
||||||
"\n",
|
"\n",
|
||||||
@@ -422,7 +227,8 @@
|
|||||||
"with psycopg.connect(connection_info) as conn:\n",
|
"with psycopg.connect(connection_info) as conn:\n",
|
||||||
" # Open a cursor to perform database operations\n",
|
" # Open a cursor to perform database operations\n",
|
||||||
" with conn.cursor() as cur:\n",
|
" with conn.cursor() as cur:\n",
|
||||||
" pprint( cur.execute(\"SELECT * FROM URLS LIMIT 150;\").fetchall() )"
|
" pprint( cur.execute(\"SELECT * FROM URLS LIMIT 150;\").fetchall() )\n",
|
||||||
|
" #pprint( cur.execute(\"SELECT id_url, title, valid_content FROM URL_CONTENT LIMIT 10;\").fetchall() )"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|||||||
@@ -72,7 +72,10 @@ python manage.py runserver
|
|||||||
|
|
||||||
# Worker
|
# Worker
|
||||||
python manage.py rqworker default
|
python manage.py rqworker default
|
||||||
while true; do python manage.py rqworker default --burst; sleep 5; done
|
while true; do python manage.py rqworker default --burst -v 0; sleep 5; done
|
||||||
|
|
||||||
|
# Visualize DB
|
||||||
|
http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id
|
||||||
```
|
```
|
||||||
|
|
||||||
* Utils
|
* Utils
|
||||||
|
|||||||
@@ -106,12 +106,3 @@ class WebsiteOfInterest(models.Model):
|
|||||||
class Meta:
|
class Meta:
|
||||||
managed = False
|
managed = False
|
||||||
db_table = 'website_of_interest'
|
db_table = 'website_of_interest'
|
||||||
|
|
||||||
|
|
||||||
class WebsiteToFilter(models.Model):
|
|
||||||
id = models.SmallAutoField(primary_key=True)
|
|
||||||
url_host = models.TextField(unique=True)
|
|
||||||
|
|
||||||
class Meta:
|
|
||||||
managed = False
|
|
||||||
db_table = 'website_to_filter'
|
|
||||||
@@ -1,11 +1,10 @@
|
|||||||
from ..models import Urls, UrlContent, UrlsSource, Source, WebsiteToFilter, StatusPatternMatching
|
from ..models import Urls, UrlContent, UrlsSource, UrlsDuplicate, Source, StatusPatternMatching
|
||||||
from .url_processor import process_url
|
from .url_processor import process_url
|
||||||
from django.utils import timezone
|
|
||||||
from django.core.cache import cache
|
from django.core.cache import cache
|
||||||
from django.db import IntegrityError
|
from django.db import IntegrityError
|
||||||
import hashlib
|
import hashlib
|
||||||
from datetime import timedelta
|
|
||||||
import re
|
import re
|
||||||
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
from .logger import get_logger
|
from .logger import get_logger
|
||||||
logger = get_logger()
|
logger = get_logger()
|
||||||
@@ -13,17 +12,29 @@ logger = get_logger()
|
|||||||
class DB_Handler():
|
class DB_Handler():
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
logger.debug("Initializing URL DB Handler")
|
logger.debug("Initializing URL DB Handler")
|
||||||
|
# Inserting raw URL, cache time: 1 day
|
||||||
|
self._cache_timeout_insert_url = 86400
|
||||||
|
# Processing error URL, cache time: 2 days
|
||||||
|
self._cache_timeout_error_url = 86400*2
|
||||||
|
# URL host slowdown
|
||||||
|
self.url_host_slowdown_seconds = 5
|
||||||
|
|
||||||
def _get_safe_cache_key(self, raw_key):
|
def _get_safe_cache_key(self, raw_key):
|
||||||
"""Generate a safe cache key using an MD5 hash"""
|
"""Generate a safe cache key using an MD5 hash"""
|
||||||
return hashlib.md5(raw_key.encode()).hexdigest()
|
return hashlib.md5(raw_key.encode()).hexdigest()
|
||||||
|
|
||||||
def _cache_key(self, cache_key, cache_timeout=86400):
|
def _cache_key(self, cache_key, hash_encode, cache_timeout):
|
||||||
cache.set(self._get_safe_cache_key(cache_key), True, timeout=cache_timeout)
|
if (hash_encode):
|
||||||
|
cache.set(self._get_safe_cache_key(cache_key), True, timeout=cache_timeout)
|
||||||
|
else:
|
||||||
|
cache.set(cache_key, True, timeout=cache_timeout)
|
||||||
|
|
||||||
def _is_cached_key(self, cache_key):
|
def _is_cached_key(self, cache_key, hash_encoded):
|
||||||
# Returns True if cached
|
# Returns True if cached
|
||||||
return cache.get(self._get_safe_cache_key(cache_key)) is not None
|
if (hash_encoded):
|
||||||
|
return cache.get(self._get_safe_cache_key(cache_key)) is not None
|
||||||
|
else:
|
||||||
|
return cache.get(cache_key) is not None
|
||||||
|
|
||||||
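The hash_encode/hash_encoded flags added above separate two kinds of cache keys: raw URLs (and source+URL pairs), which are MD5-hashed so arbitrary characters and lengths stay backend-safe, and short internal keys such as "error_<id>", which are stored verbatim. A minimal, self-contained sketch of that convention; the standalone in-memory Django cache setup is only for illustration:

```python
import hashlib

import django
from django.conf import settings

# Illustration only: configure an in-memory cache so the snippet runs outside the project.
settings.configure(CACHES={"default": {"BACKEND": "django.core.cache.backends.locmem.LocMemCache"}})
django.setup()
from django.core.cache import cache

def safe_key(raw_key: str) -> str:
    # URLs may contain characters/lengths some cache backends reject, so hash them.
    return hashlib.md5(raw_key.encode()).hexdigest()

cache.set(safe_key("https://example.org/a?b=1"), True, timeout=86400)   # hashed URL key, 1 day
cache.set("error_42", True, timeout=86400 * 2)                          # plain internal key, 2 days
print(cache.get(safe_key("https://example.org/a?b=1")) is not None)     # True
print(cache.get("error_42") is not None)                                # True
```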
def insert_raw_urls(self, urls, source):
|
def insert_raw_urls(self, urls, source):
|
||||||
|
|
||||||
@@ -53,10 +64,10 @@ class DB_Handler():
|
|||||||
for url in urls_clean:
|
for url in urls_clean:
|
||||||
|
|
||||||
### Already processed URL?
|
### Already processed URL?
|
||||||
if (self._is_cached_key(url)):
|
if (self._is_cached_key(url, hash_encoded=True)):
|
||||||
logger.debug("Already cached URL: {}".format(url))
|
logger.debug("Already cached URL: {}".format(url))
|
||||||
|
|
||||||
if (self._is_cached_key("{}{}".format(source, url))):
|
if (self._is_cached_key("{}{}".format(source, url), hash_encoded=True)):
|
||||||
logger.debug("Already cached (source, URL): {} {}".format(source, url))
|
logger.debug("Already cached (source, URL): {} {}".format(source, url))
|
||||||
else:
|
else:
|
||||||
### Insert (URL_id, source_id), since not cached
|
### Insert (URL_id, source_id), since not cached
|
||||||
@@ -92,139 +103,189 @@ class DB_Handler():
|
|||||||
|
|
||||||
# Insert or update cache
|
# Insert or update cache
|
||||||
for url in urls_clean:
|
for url in urls_clean:
|
||||||
self._cache_key(url)
|
# Hash encode URLs for special characters
|
||||||
self._cache_key("{}{}".format(source, url))
|
self._cache_key(url, hash_encode=True, cache_timeout=self._cache_timeout_insert_url)
|
||||||
|
self._cache_key("{}{}".format(source, url), hash_encode=True, cache_timeout=self._cache_timeout_insert_url)
|
||||||
|
|
||||||
logger.info("Inserted #{} raw URLs".format(len(urls_to_insert)))
|
logger.info("Inserted #{} raw URLs".format(len(urls_to_insert)))
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
|
logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
|
||||||
|
|
||||||
def _get_status_pattern_matching(self, url, article_status, list_pattern_status_tuple):
|
def _get_url_host(self, url):
|
||||||
# Sort pattern tuples by priority. (pattern, priority, status)
|
        # URL host: drop the protocol and take the substring before the first '/'
|
||||||
list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True)
|
url_host = url.replace("https://", "").replace("http://", "").split("/")[0]
|
||||||
|
return url_host
|
||||||
|
|
||||||
|
def _url_host_slowdown(self, url, url_host_slowdown_seconds):
|
||||||
|
### Avoid (frequent) too many requests to the same URL host
|
||||||
|
# Get URL host
|
||||||
|
url_host = self._get_url_host(url)
|
||||||
|
# Recently processed URL host? -> Slow down required
|
||||||
|
last_cached_timestamp = cache.get("processed_{}".format(url_host), None)
|
||||||
|
if last_cached_timestamp:
|
||||||
|
# Get time since last processed URL host (in seconds)
|
||||||
|
time_since_last_processed = time.time() - last_cached_timestamp
|
||||||
|
# Amount of time required to sleep?
|
||||||
|
slowdown_required = max(0, url_host_slowdown_seconds - time_since_last_processed)
|
||||||
|
logger.debug("Slow down (sleeping {:.2f}) for URL host {}".format(slowdown_required, url_host))
|
||||||
|
# Sleep
|
||||||
|
time.sleep(slowdown_required)
|
||||||
|
        # About to process this URL host; cache the current timestamp
|
||||||
|
cache.set("processed_{}".format(url_host), time.time(), timeout=60*5) # Expire after 5 minutes
|
||||||
|
|
||||||
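The per-host slowdown above boils down to arithmetic on a cached timestamp; a worked example with hypothetical numbers:

```python
import time

# Hypothetical: this host was last processed ~2.3 s ago, with a 5 s per-host budget.
url_host_slowdown_seconds = 5
last_cached_timestamp = time.time() - 2.3
time_since_last_processed = time.time() - last_cached_timestamp           # ~2.3
slowdown_required = max(0, url_host_slowdown_seconds - time_since_last_processed)
print(round(slowdown_required, 1))   # ~2.7 -> sleep this long before fetching again
```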
# Regex pattern to update status on "valid", "invalid", and "unknown" status only
|
def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error):
|
||||||
# Status "raw", "duplicated" and "error" should remain the way they are
|
##### Filter URL? -> Invalid
|
||||||
# Assumption: List of patterns sorted by importance
|
if (status_pattern_match == "invalid"):
|
||||||
if (article_status in ["valid", "invalid", "unknown"]):
|
logger.debug("Domain filter applied to input URL: {}".format(obj_url.url))
|
||||||
# Regular expression pattern matching: https://regexr.com/
|
# Update status
|
||||||
for regex_pattern, regex_priority, status_if_match in list_pattern_status_tuple:
|
obj_url.status = Urls.STATUS_ENUM.INVALID
|
||||||
# Matching? Update article status
|
obj_url.save()
|
||||||
if bool(re.match(regex_pattern, url)):
|
# updating_urls.append(obj_url)
|
||||||
if (status_if_match != article_status):
|
# Next URL
|
||||||
logger.debug("Regex pattern found, updating status from '{}' to '{}' for URL: {}".format(article_status, status_if_match, url))
|
return
|
||||||
|
|
||||||
|
##### Process URL
|
||||||
|
try:
|
||||||
|
# Slow down if required to avoid too many requests error
|
||||||
|
self._url_host_slowdown(obj_url.url, self.url_host_slowdown_seconds)
|
||||||
|
# Get data
|
||||||
|
dict_url_data = process_url(obj_url.url)
|
||||||
|
# Not none or handle as exception
|
||||||
|
assert(dict_url_data is not None)
|
||||||
|
except Exception as e:
|
||||||
|
if (raise_exception_on_error):
|
||||||
|
# Simply raise exception
|
||||||
|
raise Exception("Error processing URL")
|
||||||
|
else:
|
||||||
|
# Set status to error
|
||||||
|
logger.debug("Error processing URL: {}\n{}\n".format(obj_url.url, str(e), traceback.format_exc()))
|
||||||
|
# Update status
|
||||||
|
obj_url.status = Urls.STATUS_ENUM.ERROR
|
||||||
|
obj_url.save()
|
||||||
|
# updating_urls.append(obj_url)
|
||||||
|
# Next URL
|
||||||
|
return
|
||||||
|
|
||||||
|
##### Canonical URL different? -> Duplicate
|
||||||
|
if (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
|
||||||
|
# Update status
|
||||||
|
obj_url.status = Urls.STATUS_ENUM.DUPLICATE
|
||||||
|
obj_url.save()
|
||||||
|
# updating_urls.append(obj_url)
|
||||||
|
|
||||||
|
# Get or create URL with canonical form
|
||||||
|
obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
|
||||||
|
# Get the sources id associated to obj_url.id
|
||||||
|
url_sources = UrlsSource.objects.filter(id_url=obj_url)
|
||||||
|
for url_source_obj in url_sources:
|
||||||
|
# Associate same sources to url_canonical (it might already exist)
|
||||||
|
obj_urls_source, created = UrlsSource.objects.get_or_create(id_source=url_source_obj.id_source, id_url=obj_url_canonical)
|
||||||
|
|
||||||
|
            # URLs duplicate association
|
||||||
|
obj_urls_duplicate, created = UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url)
|
||||||
|
|
||||||
|
# Next URL
|
||||||
|
return
|
||||||
|
|
||||||
|
##### Valid URL
|
||||||
|
# Update status
|
||||||
|
obj_url.status = Urls.STATUS_ENUM.VALID
|
||||||
|
obj_url.save()
|
||||||
|
# updating_urls.append(obj_url)
|
||||||
|
|
||||||
|
# Create or update extracted URL data
|
||||||
|
UrlContent.objects.update_or_create(
|
||||||
|
id_url=obj_url,
|
||||||
|
defaults = {
|
||||||
|
"date_published" : dict_url_data.get("publish_date"),
|
||||||
|
"title" : dict_url_data.get("title"),
|
||||||
|
"description" : dict_url_data.get("description"),
|
||||||
|
"content" : dict_url_data.get("content"),
|
||||||
|
"valid_content" : dict_url_data.get("valid_content"),
|
||||||
|
"language" : dict_url_data.get("language"),
|
||||||
|
"keywords" : dict_url_data.get("keywords"),
|
||||||
|
"tags" : dict_url_data.get("tags"),
|
||||||
|
"authors" : dict_url_data.get("authors"),
|
||||||
|
"image_main_url" : dict_url_data.get("image_main_url"),
|
||||||
|
"images_url" : dict_url_data.get("images_url"),
|
||||||
|
"videos_url" : dict_url_data.get("videos_url"),
|
||||||
|
"url_host" : dict_url_data.get("url_host"),
|
||||||
|
"site_name" : dict_url_data.get("site_name"),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
def process_raw_urls(self, batch_size):
|
||||||
|
|
||||||
|
def _get_status_pattern_matching(url, list_pattern_status_tuple):
|
||||||
|
""" Be careful: Regex pattern should update status on "valid", "invalid", and "unknown" status only
|
||||||
|
"""
|
||||||
|
# Sort pattern tuples by priority. (pattern, priority, status)
|
||||||
|
for regex_pattern, regex_priority, status_if_match in sorted(list_pattern_status_tuple, key=lambda tup: tup[1], reverse=True):
|
||||||
|
# Regular expression pattern matching: https://regexr.com/
|
||||||
|
                if bool(re.match(regex_pattern, url)):
|
||||||
|
logger.debug("Regex pattern found, status '{}' for URL: {}".format(status_if_match, url))
|
||||||
return status_if_match
|
return status_if_match
|
||||||
# Pattern matching not required or not found, original article status
|
return None
|
||||||
return article_status
|
|
||||||
|
|
||||||
|
|
||||||
def process_error_urls(self, batch_size=50):
|
|
||||||
# Get batch of URLs, status='error'
|
|
||||||
#error_urls = Urls.objects.SORTBY TS_FETCH....filter(status=Urls.STATUS_ENUM.RAW, ts_fetch__gte=time_delta_ts)[:batch_size]
|
|
||||||
pass
|
|
||||||
|
|
||||||
def process_raw_urls(self, time_delta=timedelta(days=1), batch_size=50):
|
|
||||||
try:
|
try:
|
||||||
logger.debug("Processing raw URLs")
|
logger.debug("Processing raw URLs")
|
||||||
|
|
||||||
# Get list of domains to filter
|
# Get batch of URLs, status='raw'
|
||||||
list_domains_to_filter = WebsiteToFilter.objects.values_list('url_host', flat=True)
|
raw_urls = Urls.objects.order_by("-ts_fetch").filter(status=Urls.STATUS_ENUM.RAW)[:batch_size]
|
||||||
|
|
||||||
|
if (len(raw_urls) == 0):
|
||||||
|
logger.debug("No raw URLs to process")
|
||||||
|
return
|
||||||
|
|
||||||
# Get list of (pattern, priority, status) tuples to override status if required
|
# Get list of (pattern, priority, status) tuples to override status if required
|
||||||
list_pattern_status_tuple = list(StatusPatternMatching.objects.values_list("pattern", "priority", "status"))
|
list_pattern_status_tuple = list(StatusPatternMatching.objects.values_list("pattern", "priority", "status"))
|
||||||
|
|
||||||
# Fetched during last 24 hours
|
|
||||||
time_delta_ts = timezone.now() - time_delta
|
|
||||||
# Get batch of URLs, status='raw' and fetched X days ago
|
|
||||||
raw_urls = Urls.objects.filter(status=Urls.STATUS_ENUM.RAW, ts_fetch__gte=time_delta_ts)[:batch_size]
|
|
||||||
# List of objects to bulk update
|
# List of objects to bulk update
|
||||||
updating_urls = []
|
# updating_urls = []
|
||||||
|
|
||||||
# Per URL
|
# Per URL
|
||||||
for obj_url in raw_urls:
|
for obj_url in raw_urls:
|
||||||
##### Any domain to filter included in URL? -> Invalid
|
# Override status if pattern matching?
|
||||||
if (any([d in obj_url.url for d in list_domains_to_filter])):
|
status_pattern_match = _get_status_pattern_matching(obj_url.url, list_pattern_status_tuple)
|
||||||
logger.debug("Domain filter applied to input URL: {}".format(obj_url.url))
|
# Process URL
|
||||||
# Update status
|
self._process_single_url(obj_url, status_pattern_match, raise_exception_on_error=False)
|
||||||
obj_url.status = Urls.STATUS_ENUM.INVALID
|
|
||||||
obj_url.save()
|
|
||||||
updating_urls.append(obj_url)
|
|
||||||
# Next URL
|
|
||||||
continue
|
|
||||||
|
|
||||||
##### Process URL
|
|
||||||
try:
|
|
||||||
# Get data
|
|
||||||
dict_url_data = process_url(obj_url.url)
|
|
||||||
# Not none or handle as exception
|
|
||||||
assert(dict_url_data is not None)
|
|
||||||
except Exception as e:
|
|
||||||
logger.debug("Error processing URL: {}\n{}\n".format(obj_url.url, str(e), traceback.format_exc()))
|
|
||||||
# Update status
|
|
||||||
obj_url.status = Urls.STATUS_ENUM.ERROR
|
|
||||||
obj_url.save()
|
|
||||||
updating_urls.append(obj_url)
|
|
||||||
# Next URL
|
|
||||||
continue
|
|
||||||
|
|
||||||
##### Canonical URL different? -> Duplicate
|
# TODO: Fix enum type issue. Bulk update instead of .save() for each object
|
||||||
if (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
|
|
||||||
# Update status
|
|
||||||
obj_url.status = Urls.STATUS_ENUM.DUPLICATE
|
|
||||||
obj_url.save()
|
|
||||||
updating_urls.append(obj_url)
|
|
||||||
|
|
||||||
# Get or create URL with canonical form
|
|
||||||
obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
|
|
||||||
# Get the sources id associated to obj_url.id
|
|
||||||
url_sources = UrlsSource.objects.filter(id_url=obj_url)
|
|
||||||
for url_source_obj in url_sources:
|
|
||||||
# Associate same sources to url_canonical (it might already exist)
|
|
||||||
UrlsSource.objects.get_or_create(id_source=url_source_obj.id_source, id_url=obj_url_canonical)
|
|
||||||
|
|
||||||
# Next URL
|
|
||||||
continue
|
|
||||||
|
|
||||||
##### Valid URL
|
|
||||||
# Update status
|
|
||||||
obj_url.status = Urls.STATUS_ENUM.VALID
|
|
||||||
obj_url.save()
|
|
||||||
updating_urls.append(obj_url)
|
|
||||||
|
|
||||||
# Create extracted URL data
|
|
||||||
UrlContent.objects.create(
|
|
||||||
id_url=obj_url,
|
|
||||||
date_published=dict_url_data.get("publish_date"),
|
|
||||||
title=dict_url_data.get("title"),
|
|
||||||
description=dict_url_data.get("description"),
|
|
||||||
content=dict_url_data.get("content"),
|
|
||||||
valid_content=dict_url_data.get("valid_content"),
|
|
||||||
language=dict_url_data.get("language"),
|
|
||||||
keywords=dict_url_data.get("keywords"),
|
|
||||||
tags=dict_url_data.get("tags"),
|
|
||||||
authors=dict_url_data.get("authors"),
|
|
||||||
image_main_url=dict_url_data.get("image_main_url"),
|
|
||||||
images_url=dict_url_data.get("images_url"),
|
|
||||||
videos_url=dict_url_data.get("videos_url"),
|
|
||||||
url_host=dict_url_data.get("url_host"),
|
|
||||||
site_name=dict_url_data.get("site_name"),
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
##### Override status if pattern matching?
|
|
||||||
for obj_url in updating_urls:
|
|
||||||
# Check if article status needs to be updated with pattern matching
|
|
||||||
status_pattern_matching = self._get_status_pattern_matching(obj_url.url, obj_url.status, list_pattern_status_tuple)
|
|
||||||
# Update status?
|
|
||||||
if (status_pattern_matching != obj_url.status):
|
|
||||||
logger.debug("Pattern matching, overriding with status {} for URL: {}".format(status_pattern_matching, obj_url.url))
|
|
||||||
# Update, no need to append to updating_urls, already included
|
|
||||||
obj_url.status = status_pattern_matching
|
|
||||||
obj_url.save()
|
|
||||||
|
|
||||||
# TODO: Fix enum type issue. Bulk update
|
|
||||||
# Urls.objects.bulk_update(updating_urls, ['status'])
|
# Urls.objects.bulk_update(updating_urls, ['status'])
|
||||||
|
|
||||||
logger.info("Updated #{} raw URLs".format(len(updating_urls)))
|
logger.info("Updated #{} raw URLs".format(len(raw_urls)))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("Exception processing raw URLs: {}\n{}".format(e, traceback.format_exc()))
|
logger.warning("Exception processing raw URLs: {}\n{}".format(e, traceback.format_exc()))
|
||||||
|
|
||||||
|
def process_error_urls(self, batch_size):
|
||||||
|
try:
|
||||||
|
logger.debug("Processing error URLs")
|
||||||
|
|
||||||
|
# Keep track of processed and skipped "error" URLs
|
||||||
|
num_urls_skipped, num_urls_processed = 0, 0
|
||||||
|
# Get batch of URLs, status='error'
|
||||||
|
error_urls = Urls.objects.order_by("-ts_fetch").filter(status=Urls.STATUS_ENUM.ERROR)[num_urls_skipped:batch_size+num_urls_skipped]
|
||||||
|
|
||||||
|
while ((len(error_urls) > 0) and (num_urls_processed < batch_size)):
|
||||||
|
# Per URL
|
||||||
|
for obj_url in error_urls:
|
||||||
|
# URL ID cached? -> Tried to process recently already, skip
|
||||||
|
if (self._is_cached_key("error_{}".format(obj_url.id), hash_encoded=False)):
|
||||||
|
num_urls_skipped += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Process URL
|
||||||
|
self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True)
|
||||||
|
num_urls_processed += 1
|
||||||
|
except Exception as e:
|
||||||
|
# Error, cache to avoid re-processing for X time
|
||||||
|
self._cache_key("error_{}".format(obj_url.id), hash_encode=False, cache_timeout=self._cache_timeout_error_url)
|
||||||
|
num_urls_skipped += 1
|
||||||
|
|
||||||
|
# Get following batch of URLs, status='error'
|
||||||
|
error_urls = Urls.objects.order_by("-ts_fetch").filter(status=Urls.STATUS_ENUM.ERROR)[num_urls_skipped:batch_size+num_urls_skipped]
|
||||||
|
|
||||||
|
logger.info("Updated #{}, skipped #{} error URLs".format(num_urls_processed, num_urls_skipped))
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Exception processing error URLs: {}\n{}".format(e, traceback.format_exc()))
|
||||||
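The windowing in process_error_urls above is easy to misread: URLs that get fixed change status and drop out of the 'error' filter, while skipped (still-cached) ones remain, so the next queryset slice is offset by the running skip count. A small worked example with hypothetical counts:

```python
# Hypothetical two passes with batch_size=50 to show how the slice advances.
batch_size = 50
num_urls_skipped, num_urls_processed = 0, 0

first_slice = (num_urls_skipped, batch_size + num_urls_skipped)    # [0:50]
# Suppose 40 URLs are processed (leave 'error' status) and 10 are skipped
# because their "error_<id>" key is still cached.
num_urls_processed, num_urls_skipped = 40, 10

second_slice = (num_urls_skipped, batch_size + num_urls_skipped)   # [10:60]
print(first_slice, second_slice)   # the 10 skipped rows are stepped over next time
```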
@@ -8,15 +8,15 @@ logger = get_logger()
|
|||||||
|
|
||||||
class FetchFeeds():
|
class FetchFeeds():
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
logger.debug("Initializing News feed")
|
logger.debug("Initializing Fetcher Feeds")
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
try:
|
try:
|
||||||
logger.debug("Starting NewsFeed.run()")
|
logger.debug("Starting FetchFeeds.run()")
|
||||||
|
|
||||||
# Get feeds
|
# Get feeds
|
||||||
list_url_feeds = list(Feed.objects.values_list('rss_feed', flat=True))
|
list_url_feeds = list(Feed.objects.values_list('rss_feed', flat=True))
|
||||||
logger.debug("Fetching news from feeds: {}".format(list_url_feeds))
|
logger.debug("Fetching from feeds: {}".format(list_url_feeds))
|
||||||
|
|
||||||
# Process via RSS feeds
|
# Process via RSS feeds
|
||||||
for url_feed in list_url_feeds:
|
for url_feed in list_url_feeds:
|
||||||
@@ -47,4 +47,4 @@ class FetchFeeds():
|
|||||||
# Write to DB
|
# Write to DB
|
||||||
DB_Handler().insert_raw_urls(urls_fetched, source)
|
DB_Handler().insert_raw_urls(urls_fetched, source)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("Exception in NewsFeed.run(): {}\n{}".format(e, traceback.format_exc()))
|
logger.warning("Exception in FetchFeeds.run(): {}\n{}".format(e, traceback.format_exc()))
|
||||||
|
|||||||
app_urls/api/src/fetch_parser.py: new file, 39 lines
@@ -0,0 +1,39 @@
|
|||||||
|
from .db_utils import DB_Handler
|
||||||
|
from ..models import WebsiteOfInterest
|
||||||
|
import newspaper
|
||||||
|
import traceback
|
||||||
|
from .logger import get_logger
|
||||||
|
logger = get_logger()
|
||||||
|
|
||||||
|
class FetchParser():
|
||||||
|
def __init__(self) -> None:
|
||||||
|
logger.debug("Initializing Fetcher Parser")
|
||||||
|
|
||||||
|
def run(self):
|
||||||
|
try:
|
||||||
|
logger.debug("Starting FetchParser.run() for {}")
|
||||||
|
|
||||||
|
# Get URL hosts
|
||||||
|
list_url_host = list(WebsiteOfInterest.objects.values_list('url_host', flat=True))
|
||||||
|
logger.debug("Fetching news by parsing URL hosts: {}".format(list_url_host))
|
||||||
|
|
||||||
|
# Process newspaper4k build method
|
||||||
|
for url_host_feed in list_url_host:
|
||||||
|
# Protocol
|
||||||
|
if not (url_host_feed.startswith("http")):
|
||||||
|
url_host_feed_formatted = "https://" + url_host_feed
|
||||||
|
else:
|
||||||
|
url_host_feed_formatted = url_host_feed
|
||||||
|
|
||||||
|
logger.debug("Fetching newspaper4k parsing based on URL: {}".format(url_host_feed_formatted))
|
||||||
|
# Source object
|
||||||
|
url_host_built = newspaper.build(url_host_feed_formatted)
|
||||||
|
# Get articles URL list
|
||||||
|
urls_fetched = url_host_built.article_urls()
|
||||||
|
|
||||||
|
# URL fetching source
|
||||||
|
source = "newspaper4k {}".format(url_host_feed)
|
||||||
|
# Write to DB
|
||||||
|
DB_Handler().insert_raw_urls(urls_fetched, source)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Exception in FetchParser.run(): {}\n{}".format(e, traceback.format_exc()))
|
||||||
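For reference, the newspaper4k calls that the new FetchParser relies on can be exercised in isolation; a minimal sketch (network access required; the host is just the one seeded as WEBSITE_OF_INTEREST in the notebook):

```python
import newspaper

# Same calls FetchParser.run() makes per URL host, shown standalone.
url_host_built = newspaper.build("https://www.unicef.org")
urls_fetched = url_host_built.article_urls()    # article URLs discovered on the site
print(len(urls_fetched), urls_fetched[:3])
```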
@@ -1,3 +1,4 @@
|
|||||||
|
from django.core.cache import cache
|
||||||
from .logger import get_logger
|
from .logger import get_logger
|
||||||
logger = get_logger()
|
logger = get_logger()
|
||||||
import newspaper
|
import newspaper
|
||||||
@@ -6,6 +7,7 @@ from urllib.parse import unquote
|
|||||||
#import langdetect
|
#import langdetect
|
||||||
#langdetect.DetectorFactory.seed = 0
|
#langdetect.DetectorFactory.seed = 0
|
||||||
|
|
||||||
|
|
||||||
def process_url(url):
|
def process_url(url):
|
||||||
try:
|
try:
|
||||||
# Process
|
# Process
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
from django_rq import job
|
from django_rq import job
|
||||||
|
|
||||||
from .src.fetch_feed import FetchFeeds
|
from .src.fetch_feed import FetchFeeds
|
||||||
|
from .src.fetch_parser import FetchParser
|
||||||
from .src.db_utils import DB_Handler
|
from .src.db_utils import DB_Handler
|
||||||
'''
|
'''
|
||||||
from src.fetch_parser import FetchParser
|
from src.fetch_parser import FetchParser
|
||||||
@@ -8,16 +9,13 @@ from src.fetch_search import FetchSearcher
|
|||||||
from src.missing_kids_fetch import MissingKidsFetch
|
from src.missing_kids_fetch import MissingKidsFetch
|
||||||
from src.missing_kids_status import MissingKidsStatus
|
from src.missing_kids_status import MissingKidsStatus
|
||||||
from src.url_status import UpdateErrorURLs
|
from src.url_status import UpdateErrorURLs
|
||||||
from src.db_utils import DB_Handler
|
|
||||||
from src.credentials import db_connect_info, redis_connect_info
|
|
||||||
|
|
||||||
# DB Handler
|
|
||||||
db_handler = DB_Handler(db_connect_info, redis_connect_info)
|
|
||||||
'''
|
'''
|
||||||
|
|
||||||
from .src.logger import get_logger
|
from .src.logger import get_logger
|
||||||
logger = get_logger()
|
logger = get_logger()
|
||||||
|
|
||||||
|
# TODO: Queues with priorities, process_raw_urls least priority due to slowdown...
|
||||||
|
|
||||||
@job
|
@job
|
||||||
def background_task(process_type: str):
|
def background_task(process_type: str):
|
||||||
logger.info("Task triggered: {}".format(process_type))
|
logger.info("Task triggered: {}".format(process_type))
|
||||||
@@ -25,18 +23,17 @@ def background_task(process_type: str):
|
|||||||
try:
|
try:
|
||||||
if (process_type == "fetch_feeds"):
|
if (process_type == "fetch_feeds"):
|
||||||
FetchFeeds().run()
|
FetchFeeds().run()
|
||||||
|
elif (process_type == "fetch_parser"):
|
||||||
|
FetchParser().run()
|
||||||
elif (process_type == "process_raw_urls"):
|
elif (process_type == "process_raw_urls"):
|
||||||
DB_Handler().process_raw_urls(batch_size=3)
|
DB_Handler().process_raw_urls(batch_size=50)
|
||||||
|
elif (process_type == "process_error_urls"):
|
||||||
|
DB_Handler().process_error_urls(batch_size=50)
|
||||||
else:
|
else:
|
||||||
logger.info("Task unknown!: {}".format(process_type))
|
logger.info("Task unknown!: {}".format(process_type))
|
||||||
|
|
||||||
|
|
||||||
'''
|
'''
|
||||||
if (process_type == "fetch_feeds"):
|
|
||||||
FetchFeeds(db_handler).run()
|
|
||||||
|
|
||||||
elif (process_type == "fetch_parser"):
|
|
||||||
FetchParser(db_handler).run()
|
|
||||||
|
|
||||||
elif (process_type == "search") or (process_type == "search_full"):
|
elif (process_type == "search") or (process_type == "search_full"):
|
||||||
FetchSearcher(cred.db_connect_info, cred.redis_connect_info, full=True).run()
|
FetchSearcher(cred.db_connect_info, cred.redis_connect_info, full=True).run()
|
||||||
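Besides the HTTP trigger, django_rq's @job decorator exposes .delay(), so the same tasks can be enqueued directly from code; a minimal sketch (the app_urls.api module path is an assumption based on the file layout):

```python
# Assumed import path; adjust to the actual Django app layout.
from app_urls.api.tasks import background_task

background_task.delay("fetch_feeds")         # RSS feeds
background_task.delay("fetch_parser")        # newspaper4k site parsing
background_task.delay("process_raw_urls")    # uses batch_size=50 inside the task
background_task.delay("process_error_urls")
```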
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
from django.urls import path
|
from django.urls import path
|
||||||
from .views import trigger_task
|
from .views import trigger_task, link_list
|
||||||
|
|
||||||
urlpatterns = [
|
urlpatterns = [
|
||||||
|
path('links', link_list, name='link_list'),
|
||||||
path('<str:task>', trigger_task, name='trigger_task'),
|
path('<str:task>', trigger_task, name='trigger_task'),
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
import django_rq
|
import django_rq
|
||||||
from django.http import JsonResponse
|
from django.http import JsonResponse
|
||||||
from .tasks import background_task
|
from .tasks import background_task
|
||||||
|
import os
|
||||||
from .src.logger import get_logger
|
from .src.logger import get_logger
|
||||||
logger = get_logger()
|
logger = get_logger()
|
||||||
|
|
||||||
@@ -9,3 +10,8 @@ def trigger_task(request, task):
|
|||||||
queue = django_rq.get_queue('default') # Get the default queue
|
queue = django_rq.get_queue('default') # Get the default queue
|
||||||
job = queue.enqueue(background_task, task)
|
job = queue.enqueue(background_task, task)
|
||||||
return JsonResponse({"message": "Task has been enqueued!", "job_id": job.id})
|
return JsonResponse({"message": "Task has been enqueued!", "job_id": job.id})
|
||||||
|
|
||||||
|
def link_list(request):
|
||||||
|
prefix = "http://localhost:8000/api"
|
||||||
|
links = ["fetch_feeds", "fetch_parser", "process_raw_urls", "process_error_urls"]
|
||||||
|
return JsonResponse({"links": ["http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id"] + [os.path.join(prefix, l) for l in links]})
|
||||||
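Together with the existing trigger_task route, the new link_list view allows a quick smoke test from Python; a sketch assuming the dev server runs on localhost:8000 with the API mounted under /api/ (the prefix link_list itself uses):

```python
import json
from urllib.request import urlopen

# Hypothetical smoke test of the endpoints wired up above.
print(json.load(urlopen("http://localhost:8000/api/links")))         # discoverable trigger links
print(json.load(urlopen("http://localhost:8000/api/fetch_feeds")))   # enqueues the fetch_feeds job
```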