Towards django RQ
This commit is contained in:
63
1-DB.ipynb
63
1-DB.ipynb
@@ -19,10 +19,12 @@
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"db_postgres\n",
|
||||
"\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 0/0\n",
|
||||
"\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 1/0\n",
|
||||
" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
|
||||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 1/1\u001b[0m\n",
|
||||
" \u001b[32m✔\u001b[0m Container db_redis \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
|
||||
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 2/2\u001b[0m\n",
|
||||
" \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.2s \u001b[0m\n",
|
||||
" \u001b[32m✔\u001b[0m Container db_redis \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
|
||||
"\u001b[?25h"
|
||||
]
|
||||
}
|
||||
@@ -116,12 +118,14 @@
|
||||
" title TEXT,\n",
|
||||
" description TEXT,\n",
|
||||
" content TEXT,\n",
|
||||
" language CHAR(2), -- ISO 639-1 Code\n",
|
||||
" tags TEXT[],\n",
|
||||
" authors TEXT[],\n",
|
||||
" image_urls TEXT[]\n",
|
||||
" );\n",
|
||||
" CREATE INDEX idx_tags ON URL_CONTENT USING GIN(tags);\n",
|
||||
" CREATE INDEX idx_authors ON URL_CONTENT USING GIN(authors);\n",
|
||||
" CREATE INDEX idx_language ON URL_CONTENT (language);\n",
|
||||
" \"\"\")\n",
|
||||
"\n",
|
||||
" # Feeds\n",
|
||||
@@ -188,8 +192,9 @@
|
||||
" cur.execute(\"INSERT INTO URLS (url, status) values ('www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf.html', 'invalid')\".format(j))\n",
|
||||
"\n",
|
||||
" # URL Content\n",
|
||||
" content = \"Bla Bla Bla!!!\"*25\n",
|
||||
" cur.execute(\"INSERT INTO URL_CONTENT (id_url, date_published, title, description, content, tags, authors, image_urls) values (%s, %s, 'Mommy blogger turned child abuser', %s, 'Hello there!', %s, %s, %s)\", (1, datetime.now(tz=timezone.utc), content, [\"child abuse\", \"social media\"], [\"Audrey Conklin\"], [\"https://a57.foxnews.com/static.foxnews.com/foxnews.com/content/uploads/2023/08/1440/810/image-58.jpg?ve=1&tl=1\"]))"
|
||||
" language, content = \"en\", \"Bla Bla Bla!!!\"*25\n",
|
||||
" cur.execute(\"INSERT INTO URL_CONTENT (id_url, date_published, title, description, content, language, tags, authors, image_urls) values (%s, %s, 'Mommy blogger turned child abuser', %s, 'Hello there!', %s, %s, %s, %s)\", \n",
|
||||
" (1, datetime.now(tz=timezone.utc), content, language, [\"child abuse\", \"social media\"], [\"Audrey Conklin\"], [\"https://a57.foxnews.com/static.foxnews.com/foxnews.com/content/uploads/2023/08/1440/810/image-58.jpg?ve=1&tl=1\"]))"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -204,99 +209,99 @@
|
||||
"\t urls\n",
|
||||
"[(1,\n",
|
||||
" 'https://www.foxnews.com/us/husband-ruby-franke-utah-mommy-blogger-convicted-child-abuse-regrets-wifes-fall-fame',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'valid'),\n",
|
||||
" (2,\n",
|
||||
" 'https://www.bbc.com/news/articles/ckg843y8y7no',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'valid'),\n",
|
||||
" (3,\n",
|
||||
" 'https://www.wilx.com/2025/03/05/lenawee-county-man-arrested-possessing-child-abuse-material/',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'valid'),\n",
|
||||
" (4,\n",
|
||||
" 'https://www.dw.com/en/trauma-how-child-abuse-victims-deal-with-parenthood/a-71833895',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'valid'),\n",
|
||||
" (5,\n",
|
||||
" 'https://nypost.com/2025/03/06/us-news/colorado-day-care-worker-hit-with-51-charges-of-child-abuse-harassment-for-slapping-toddler/',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'valid'),\n",
|
||||
" (6,\n",
|
||||
" 'https://www.fox35orlando.com/news/tavares-police-florida-boys-10-9-abused-sheer-brutality',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'valid'),\n",
|
||||
" (7,\n",
|
||||
" 'https://www.google.com',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (8,\n",
|
||||
" 'www.super_0.org',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (9,\n",
|
||||
" 'www.super_1.org',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (10,\n",
|
||||
" 'www.super_2.org',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (11,\n",
|
||||
" 'www.super_3.org',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (12,\n",
|
||||
" 'www.super_4.org',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (13,\n",
|
||||
" 'www.super_5.org',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (14,\n",
|
||||
" 'www.super_6.org',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (15,\n",
|
||||
" 'www.super_7.org',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (16,\n",
|
||||
" 'www.super_8.org',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (17,\n",
|
||||
" 'www.super_9.org',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (18,\n",
|
||||
" 'www.super_10.org',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (19,\n",
|
||||
" 'www.super_11.org',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (20,\n",
|
||||
" 'www.super_12.org',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (21,\n",
|
||||
" 'www.super_13.org',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (22,\n",
|
||||
" 'www.super_14.org',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (23,\n",
|
||||
" 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf_23dj9sdgj9sdgj8sdf8ds8f.html',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid'),\n",
|
||||
" (24,\n",
|
||||
" 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf.html',\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'invalid')]\n",
|
||||
"\t urls_duplicate\n",
|
||||
"[]\n",
|
||||
@@ -322,7 +327,7 @@
|
||||
"[('.*missingkids.org/poster/.*', 50, 'valid')]\n",
|
||||
"\t url_content\n",
|
||||
"[(1,\n",
|
||||
" datetime.datetime(2025, 3, 6, 23, 4, 37, 654130, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" datetime.datetime(2025, 3, 7, 16, 57, 38, 54447, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
|
||||
" 'Mommy blogger turned child abuser',\n",
|
||||
" 'Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla '\n",
|
||||
" 'Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla '\n",
|
||||
|
||||
@@ -103,13 +103,26 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 54,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"('https://foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert',\n",
|
||||
" 'foxnews.com')"
|
||||
]
|
||||
},
|
||||
"execution_count": 54,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# !pip install trafilatura trafilatura[all] cchardet\n",
|
||||
"import courlan\n",
|
||||
"url = \"https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n",
|
||||
"url = \"https://foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n",
|
||||
"courlan.check_url(url)"
|
||||
]
|
||||
},
|
||||
@@ -125,13 +138,65 @@
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 48,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import newspaper\n",
|
||||
"\n",
|
||||
"article = newspaper.article(url)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 49,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"datetime.datetime(2025, 3, 4, 4, 0, 31, tzinfo=tzoffset(None, -18000))"
|
||||
]
|
||||
},
|
||||
"execution_count": 49,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"article.publish_date"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# !pip install trafilatura\n",
|
||||
"import trafilatura\n",
|
||||
"from pprint import pprint\n",
|
||||
"\n",
|
||||
"url = \"https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n",
|
||||
"url = \"https://www.missingkids.org/poster/USVA/VA25-0820/1\"\n",
|
||||
"# url = \"https://www.missingkids.org/poster/USVA/VA25-0820/1\"\n",
|
||||
"\n",
|
||||
"# Fetch\n",
|
||||
"doc = trafilatura.fetch_url(url)\n",
|
||||
@@ -142,9 +207,40 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'author': 'Audrey Conklin',\n",
|
||||
" 'body': <Element body at 0x7e22813ce400>,\n",
|
||||
" 'categories': [],\n",
|
||||
" 'comments': None,\n",
|
||||
" 'commentsbody': <Element body at 0x7e22813ce180>,\n",
|
||||
" 'date': '2025-03-03',\n",
|
||||
" 'description': \"Disgraced parenting blogger and mom of six Ruby Franke's \"\n",
|
||||
" '\"power\" and public image\" allowed her crimes against her '\n",
|
||||
" 'children to go \"unchecked,\" according to a defense attorney.',\n",
|
||||
" 'filedate': '2025-03-08',\n",
|
||||
" 'fingerprint': None,\n",
|
||||
" 'hostname': 'foxnews.com',\n",
|
||||
" 'id': None,\n",
|
||||
" 'image': 'https://static.foxnews.com/foxnews.com/content/uploads/2024/03/967e1c1b-Franke.jpg',\n",
|
||||
" 'language': None,\n",
|
||||
" 'license': None,\n",
|
||||
" 'pagetype': 'article',\n",
|
||||
" 'raw_text': None,\n",
|
||||
" 'sitename': 'Fox News',\n",
|
||||
" 'tags': [],\n",
|
||||
" 'text': None,\n",
|
||||
" 'title': \"Utah mommy blogger Ruby Franke's power, public image allowed child \"\n",
|
||||
" \"abuse to go 'unchecked': expert\",\n",
|
||||
" 'url': 'https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"pprint(metadata.as_dict())"
|
||||
]
|
||||
@@ -165,6 +261,64 @@
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"CPU times: user 18.6 ms, sys: 40 μs, total: 18.7 ms\n",
|
||||
"Wall time: 18 ms\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'en'"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"'''\n",
|
||||
"!pip install lingua-language-detector\n",
|
||||
"import lingua\n",
|
||||
"ld = lingua.LanguageDetectorBuilder.from_all_languages().build()\n",
|
||||
"l = ld.detect_language_of(content)\n",
|
||||
"'''\n",
|
||||
"# !pip install langdetect \n",
|
||||
"import langdetect\n",
|
||||
"langdetect.DetectorFactory.seed = 0\n",
|
||||
"langdetect.detect(content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
34
app_urls/README.md
Normal file
34
app_urls/README.md
Normal file
@@ -0,0 +1,34 @@
|
||||
* Dependencies
|
||||
```
|
||||
conda create -n matitos_urls python=3.12
|
||||
conda activate matitos_urls
|
||||
pip install django psycopg[binary] django-rq
|
||||
```
|
||||
|
||||
* Environment variables
|
||||
```
|
||||
DB_NAME=${DB_NAME:-matitos}
|
||||
DB_USER=${DB_USER:-supermatitos}
|
||||
DB_PASSWORD=${DB_PASSWORD:-supermatitos}
|
||||
DB_HOST=${DB_HOST:-localhost}
|
||||
DB_PORT=${DB_PORT:-5432}
|
||||
|
||||
REDIS_HOST=${REDIS_HOST:-localhost}
|
||||
REDIS_PORT=${REDIS_PORT:-6379}
|
||||
```
|
||||
|
||||
* Django DB
|
||||
```
|
||||
# Generate content for models.py
|
||||
python manage.py inspectdb
|
||||
python manage.py makemigrations
|
||||
python manage.py migrate --fake
|
||||
```
|
||||
|
||||
|
||||
```
|
||||
# Server
|
||||
python manage.py runserver
|
||||
# Worker
|
||||
python manage.py rqworker default
|
||||
```
|
||||
0
app_urls/api/__init__.py
Normal file
0
app_urls/api/__init__.py
Normal file
3
app_urls/api/admin.py
Normal file
3
app_urls/api/admin.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from django.contrib import admin
|
||||
|
||||
# Register your models here.
|
||||
6
app_urls/api/apps.py
Normal file
6
app_urls/api/apps.py
Normal file
@@ -0,0 +1,6 @@
|
||||
from django.apps import AppConfig
|
||||
|
||||
|
||||
class ApiConfig(AppConfig):
|
||||
default_auto_field = 'django.db.models.BigAutoField'
|
||||
name = 'api'
|
||||
132
app_urls/api/migrations/0001_initial.py
Normal file
132
app_urls/api/migrations/0001_initial.py
Normal file
@@ -0,0 +1,132 @@
|
||||
# Generated by Django 5.1.7 on 2025-03-07 16:56
|
||||
|
||||
import django.db.models.deletion
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
initial = True
|
||||
|
||||
dependencies = [
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.CreateModel(
|
||||
name='Feed',
|
||||
fields=[
|
||||
('id', models.SmallAutoField(primary_key=True, serialize=False)),
|
||||
('rss_feed', models.TextField(unique=True)),
|
||||
],
|
||||
options={
|
||||
'db_table': 'feed',
|
||||
'managed': False,
|
||||
},
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='Search',
|
||||
fields=[
|
||||
('id', models.SmallAutoField(primary_key=True, serialize=False)),
|
||||
('keyword_search', models.TextField(unique=True)),
|
||||
],
|
||||
options={
|
||||
'db_table': 'search',
|
||||
'managed': False,
|
||||
},
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='Source',
|
||||
fields=[
|
||||
('id', models.SmallAutoField(primary_key=True, serialize=False)),
|
||||
('source', models.TextField(unique=True)),
|
||||
],
|
||||
options={
|
||||
'db_table': 'source',
|
||||
'managed': False,
|
||||
},
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='StatusPatternMatching',
|
||||
fields=[
|
||||
('pattern', models.TextField(primary_key=True, serialize=False)),
|
||||
('priority', models.SmallIntegerField()),
|
||||
('status', models.TextField()),
|
||||
],
|
||||
options={
|
||||
'db_table': 'status_pattern_matching',
|
||||
'managed': False,
|
||||
},
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='Urls',
|
||||
fields=[
|
||||
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
||||
('url', models.TextField(unique=True)),
|
||||
('ts_fetch', models.DateTimeField()),
|
||||
('status', models.TextField()),
|
||||
],
|
||||
options={
|
||||
'db_table': 'urls',
|
||||
'managed': False,
|
||||
},
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='WebsiteOfInterest',
|
||||
fields=[
|
||||
('id', models.SmallAutoField(primary_key=True, serialize=False)),
|
||||
('url_host', models.TextField(unique=True)),
|
||||
],
|
||||
options={
|
||||
'db_table': 'website_of_interest',
|
||||
'managed': False,
|
||||
},
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='WebsiteToFilter',
|
||||
fields=[
|
||||
('id', models.SmallAutoField(primary_key=True, serialize=False)),
|
||||
('url_host', models.TextField(unique=True)),
|
||||
],
|
||||
options={
|
||||
'db_table': 'website_to_filter',
|
||||
'managed': False,
|
||||
},
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='UrlContent',
|
||||
fields=[
|
||||
('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='api.urls')),
|
||||
('date_published', models.DateTimeField(blank=True, null=True)),
|
||||
('title', models.TextField(blank=True, null=True)),
|
||||
('description', models.TextField(blank=True, null=True)),
|
||||
('content', models.TextField(blank=True, null=True)),
|
||||
('tags', models.TextField(blank=True, null=True)),
|
||||
('authors', models.TextField(blank=True, null=True)),
|
||||
('image_urls', models.TextField(blank=True, null=True)),
|
||||
],
|
||||
options={
|
||||
'db_table': 'url_content',
|
||||
'managed': False,
|
||||
},
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='UrlsDuplicate',
|
||||
fields=[
|
||||
('id_url_canonical', models.OneToOneField(db_column='id_url_canonical', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='api.urls')),
|
||||
],
|
||||
options={
|
||||
'db_table': 'urls_duplicate',
|
||||
'managed': False,
|
||||
},
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='UrlsSource',
|
||||
fields=[
|
||||
('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='api.urls')),
|
||||
],
|
||||
options={
|
||||
'db_table': 'urls_source',
|
||||
'managed': False,
|
||||
},
|
||||
),
|
||||
]
|
||||
0
app_urls/api/migrations/__init__.py
Normal file
0
app_urls/api/migrations/__init__.py
Normal file
101
app_urls/api/models.py
Normal file
101
app_urls/api/models.py
Normal file
@@ -0,0 +1,101 @@
|
||||
from django.db import models
|
||||
|
||||
# Create your models here.
|
||||
class Feed(models.Model):
|
||||
id = models.SmallAutoField(primary_key=True)
|
||||
rss_feed = models.TextField(unique=True)
|
||||
|
||||
class Meta:
|
||||
managed = False
|
||||
db_table = 'feed'
|
||||
|
||||
|
||||
class Search(models.Model):
|
||||
id = models.SmallAutoField(primary_key=True)
|
||||
keyword_search = models.TextField(unique=True)
|
||||
|
||||
class Meta:
|
||||
managed = False
|
||||
db_table = 'search'
|
||||
|
||||
|
||||
class Source(models.Model):
|
||||
id = models.SmallAutoField(primary_key=True)
|
||||
source = models.TextField(unique=True)
|
||||
|
||||
class Meta:
|
||||
managed = False
|
||||
db_table = 'source'
|
||||
|
||||
|
||||
class StatusPatternMatching(models.Model):
|
||||
pattern = models.TextField(primary_key=True)
|
||||
priority = models.SmallIntegerField()
|
||||
status = models.TextField() # This field type is a guess.
|
||||
|
||||
class Meta:
|
||||
managed = False
|
||||
db_table = 'status_pattern_matching'
|
||||
|
||||
|
||||
class UrlContent(models.Model):
|
||||
id_url = models.OneToOneField('Urls', models.DO_NOTHING, db_column='id_url', primary_key=True)
|
||||
date_published = models.DateTimeField(blank=True, null=True)
|
||||
title = models.TextField(blank=True, null=True)
|
||||
description = models.TextField(blank=True, null=True)
|
||||
content = models.TextField(blank=True, null=True)
|
||||
tags = models.TextField(blank=True, null=True) # This field type is a guess.
|
||||
authors = models.TextField(blank=True, null=True) # This field type is a guess.
|
||||
image_urls = models.TextField(blank=True, null=True) # This field type is a guess.
|
||||
|
||||
class Meta:
|
||||
managed = False
|
||||
db_table = 'url_content'
|
||||
|
||||
|
||||
class Urls(models.Model):
|
||||
url = models.TextField(unique=True)
|
||||
ts_fetch = models.DateTimeField()
|
||||
status = models.TextField() # This field type is a guess.
|
||||
|
||||
class Meta:
|
||||
managed = False
|
||||
db_table = 'urls'
|
||||
|
||||
|
||||
class UrlsDuplicate(models.Model):
|
||||
id_url_canonical = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url_canonical', primary_key=True) # The composite primary key (id_url_canonical, id_url_duplicated) found, that is not supported. The first column is selected.
|
||||
id_url_duplicated = models.ForeignKey(Urls, models.DO_NOTHING, db_column='id_url_duplicated', related_name='urlsduplicate_id_url_duplicated_set')
|
||||
|
||||
class Meta:
|
||||
managed = False
|
||||
db_table = 'urls_duplicate'
|
||||
unique_together = (('id_url_canonical', 'id_url_duplicated'),)
|
||||
|
||||
|
||||
class UrlsSource(models.Model):
|
||||
id_url = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url', primary_key=True) # The composite primary key (id_url, id_source) found, that is not supported. The first column is selected.
|
||||
id_source = models.ForeignKey(Source, models.DO_NOTHING, db_column='id_source')
|
||||
|
||||
class Meta:
|
||||
managed = False
|
||||
db_table = 'urls_source'
|
||||
unique_together = (('id_url', 'id_source'),)
|
||||
|
||||
|
||||
class WebsiteOfInterest(models.Model):
|
||||
id = models.SmallAutoField(primary_key=True)
|
||||
url_host = models.TextField(unique=True)
|
||||
|
||||
class Meta:
|
||||
managed = False
|
||||
db_table = 'website_of_interest'
|
||||
|
||||
|
||||
class WebsiteToFilter(models.Model):
|
||||
id = models.SmallAutoField(primary_key=True)
|
||||
url_host = models.TextField(unique=True)
|
||||
|
||||
class Meta:
|
||||
managed = False
|
||||
db_table = 'website_to_filter'
|
||||
13
app_urls/api/tasks.py
Normal file
13
app_urls/api/tasks.py
Normal file
@@ -0,0 +1,13 @@
|
||||
from django_rq import job
|
||||
import time
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@job
def task_1(message):
    """Demo background job: log ``message``, sleep five seconds, print completion.

    Args:
        message: Value echoed in the log record and in the completion line.

    Returns:
        None. An exception raised by the simulated work is logged and
        swallowed rather than re-raised.
    """
    # Lazy %-style arguments: the record is only formatted if it is emitted.
    logger.info("Message: %s", message)
    try:
        time.sleep(5)  # Simulate a long-running task
        print(f"Task completed: {message}")
    except Exception:
        # Still best-effort (no re-raise), but logger.exception records the
        # full traceback; logger.error(e) kept only the exception message.
        logger.exception("task_1 failed for message %r", message)
|
||||
3
app_urls/api/tests.py
Normal file
3
app_urls/api/tests.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from django.test import TestCase
|
||||
|
||||
# Create your tests here.
|
||||
6
app_urls/api/urls.py
Normal file
6
app_urls/api/urls.py
Normal file
@@ -0,0 +1,6 @@
|
||||
from django.urls import path
|
||||
from .views import trigger_task
|
||||
|
||||
urlpatterns = [
|
||||
path('trigger_task/', trigger_task, name='trigger_task')
|
||||
]
|
||||
10
app_urls/api/views.py
Normal file
10
app_urls/api/views.py
Normal file
@@ -0,0 +1,10 @@
|
||||
import django_rq
|
||||
from django.http import JsonResponse
|
||||
from .tasks import task_1
|
||||
|
||||
def trigger_task(request):
    """Enqueue ``task_1`` on the 'default' queue and return the job id as JSON."""
    enqueued = django_rq.get_queue('default').enqueue(task_1, "Hello from Django RQ!")
    payload = {
        "message": "Task has been enqueued!",
        "job_id": enqueued.id,
    }
    return JsonResponse(payload)
|
||||
0
app_urls/core/__init__.py
Normal file
0
app_urls/core/__init__.py
Normal file
16
app_urls/core/asgi.py
Normal file
16
app_urls/core/asgi.py
Normal file
@@ -0,0 +1,16 @@
|
||||
"""
|
||||
ASGI config for core project.
|
||||
|
||||
It exposes the ASGI callable as a module-level variable named ``application``.
|
||||
|
||||
For more information on this file, see
|
||||
https://docs.djangoproject.com/en/5.1/howto/deployment/asgi/
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
from django.core.asgi import get_asgi_application
|
||||
|
||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
|
||||
|
||||
application = get_asgi_application()
|
||||
142
app_urls/core/settings.py
Normal file
142
app_urls/core/settings.py
Normal file
@@ -0,0 +1,142 @@
|
||||
"""
|
||||
Django settings for core project.
|
||||
|
||||
Generated by 'django-admin startproject' using Django 5.1.7.
|
||||
|
||||
For more information on this file, see
|
||||
https://docs.djangoproject.com/en/5.1/topics/settings/
|
||||
|
||||
For the full list of settings and their values, see
|
||||
https://docs.djangoproject.com/en/5.1/ref/settings/
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
import os
|
||||
|
||||
# Build paths inside the project like this: BASE_DIR / 'subdir'.
|
||||
BASE_DIR = Path(__file__).resolve().parent.parent
|
||||
|
||||
|
||||
# Quick-start development settings - unsuitable for production
|
||||
# See https://docs.djangoproject.com/en/5.1/howto/deployment/checklist/
|
||||
|
||||
# SECURITY WARNING: keep the secret key used in production secret!
|
||||
SECRET_KEY = 'django-insecure-kc0jj#_=7i$_79p(n5)p3taxvhnq=w*ori-%%iu_a6wye@$(*n'
|
||||
|
||||
# SECURITY WARNING: don't run with debug turned on in production!
|
||||
DEBUG = True
|
||||
|
||||
ALLOWED_HOSTS = []
|
||||
|
||||
|
||||
# Application definition
|
||||
|
||||
INSTALLED_APPS = [
|
||||
'django.contrib.admin',
|
||||
'django.contrib.auth',
|
||||
'django.contrib.contenttypes',
|
||||
'django.contrib.sessions',
|
||||
'django.contrib.messages',
|
||||
'django.contrib.staticfiles',
|
||||
# 'rest_framework',
|
||||
'django_rq',
|
||||
'api',
|
||||
]
|
||||
|
||||
MIDDLEWARE = [
|
||||
'django.middleware.security.SecurityMiddleware',
|
||||
'django.contrib.sessions.middleware.SessionMiddleware',
|
||||
'django.middleware.common.CommonMiddleware',
|
||||
'django.middleware.csrf.CsrfViewMiddleware',
|
||||
'django.contrib.auth.middleware.AuthenticationMiddleware',
|
||||
'django.contrib.messages.middleware.MessageMiddleware',
|
||||
'django.middleware.clickjacking.XFrameOptionsMiddleware',
|
||||
]
|
||||
|
||||
ROOT_URLCONF = 'core.urls'
|
||||
|
||||
TEMPLATES = [
|
||||
{
|
||||
'BACKEND': 'django.template.backends.django.DjangoTemplates',
|
||||
'DIRS': [],
|
||||
'APP_DIRS': True,
|
||||
'OPTIONS': {
|
||||
'context_processors': [
|
||||
'django.template.context_processors.debug',
|
||||
'django.template.context_processors.request',
|
||||
'django.contrib.auth.context_processors.auth',
|
||||
'django.contrib.messages.context_processors.messages',
|
||||
],
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
WSGI_APPLICATION = 'core.wsgi.application'
|
||||
|
||||
|
||||
# Database
|
||||
# https://docs.djangoproject.com/en/5.1/ref/settings/#databases
|
||||
|
||||
DATABASES = {
|
||||
'default': {
|
||||
'ENGINE': 'django.db.backends.postgresql',
|
||||
'NAME': os.environ.get("DB_NAME", "matitos"),
|
||||
'USER': os.environ.get("DB_USER", "supermatitos"),
|
||||
'PASSWORD': os.environ.get("DB_PASSWORD", "supermatitos"),
|
||||
'HOST': os.environ.get("DB_HOST", "localhost"),
|
||||
'PORT': os.environ.get("DB_PORT", "5432"),
|
||||
#'OPTIONS': {
|
||||
# 'options': '-c default_transaction_read_only=on'
|
||||
#}
|
||||
}
|
||||
}
|
||||
|
||||
# Redis connection settings for the 'default' RQ queue, overridable through
# environment variables. Values read from the environment are strings while
# the fallbacks are ints, so PORT and DB are normalised to int to give the
# setting one consistent type either way.
RQ_QUEUES = {
    'default': {
        'HOST': os.environ.get("REDIS_HOST", "localhost"),
        'PORT': int(os.environ.get("REDIS_PORT", 6379)),
        'DB': int(os.environ.get("REDIS_DB", 0)),
        # Left unconverted on purpose.
        # NOTE(review): RQ appears to accept unit-suffixed timeout strings
        # (e.g. "5m"), which int() would reject -- confirm before coercing.
        'DEFAULT_TIMEOUT': os.environ.get("REDIS_DEFAULT_TIMEOUT", 360),
    }
}
|
||||
|
||||
# Password validation
|
||||
# https://docs.djangoproject.com/en/5.1/ref/settings/#auth-password-validators
|
||||
|
||||
AUTH_PASSWORD_VALIDATORS = [
|
||||
{
|
||||
'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
|
||||
},
|
||||
{
|
||||
'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
|
||||
},
|
||||
{
|
||||
'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
|
||||
},
|
||||
{
|
||||
'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
# Internationalization
|
||||
# https://docs.djangoproject.com/en/5.1/topics/i18n/
|
||||
|
||||
LANGUAGE_CODE = 'en-us'
|
||||
|
||||
TIME_ZONE = 'UTC'
|
||||
|
||||
USE_I18N = True
|
||||
|
||||
USE_TZ = True
|
||||
|
||||
|
||||
# Static files (CSS, JavaScript, Images)
|
||||
# https://docs.djangoproject.com/en/5.1/howto/static-files/
|
||||
|
||||
STATIC_URL = 'static/'
|
||||
|
||||
# Default primary key field type
|
||||
# https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field
|
||||
|
||||
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
|
||||
23
app_urls/core/urls.py
Normal file
23
app_urls/core/urls.py
Normal file
@@ -0,0 +1,23 @@
|
||||
"""
|
||||
URL configuration for core project.
|
||||
|
||||
The `urlpatterns` list routes URLs to views. For more information please see:
|
||||
https://docs.djangoproject.com/en/5.1/topics/http/urls/
|
||||
Examples:
|
||||
Function views
|
||||
1. Add an import: from my_app import views
|
||||
2. Add a URL to urlpatterns: path('', views.home, name='home')
|
||||
Class-based views
|
||||
1. Add an import: from other_app.views import Home
|
||||
2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
|
||||
Including another URLconf
|
||||
1. Import the include() function: from django.urls import include, path
|
||||
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
|
||||
"""
|
||||
from django.contrib import admin
|
||||
from django.urls import path, include
|
||||
|
||||
urlpatterns = [
|
||||
path('admin/', admin.site.urls),
|
||||
path('api/', include('api.urls'))
|
||||
]
|
||||
16
app_urls/core/wsgi.py
Normal file
16
app_urls/core/wsgi.py
Normal file
@@ -0,0 +1,16 @@
|
||||
"""
|
||||
WSGI config for core project.
|
||||
|
||||
It exposes the WSGI callable as a module-level variable named ``application``.
|
||||
|
||||
For more information on this file, see
|
||||
https://docs.djangoproject.com/en/5.1/howto/deployment/wsgi/
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
from django.core.wsgi import get_wsgi_application
|
||||
|
||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
|
||||
|
||||
application = get_wsgi_application()
|
||||
22
app_urls/manage.py
Executable file
22
app_urls/manage.py
Executable file
@@ -0,0 +1,22 @@
|
||||
#!/usr/bin/env python
|
||||
"""Django's command-line utility for administrative tasks."""
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
def main():
|
||||
"""Run administrative tasks."""
|
||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
|
||||
try:
|
||||
from django.core.management import execute_from_command_line
|
||||
except ImportError as exc:
|
||||
raise ImportError(
|
||||
"Couldn't import Django. Are you sure it's installed and "
|
||||
"available on your PYTHONPATH environment variable? Did you "
|
||||
"forget to activate a virtual environment?"
|
||||
) from exc
|
||||
execute_from_command_line(sys.argv)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Reference in New Issue
Block a user