URLs web visualization, cleaning obsolete code
1-DB.ipynb
@@ -2,7 +2,7 @@
 "cells": [
 {
 "cell_type": "code",
-"execution_count": 8,
+"execution_count": 1,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -11,7 +11,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 9,
+"execution_count": 2,
 "metadata": {},
 "outputs": [
 {
@@ -20,17 +20,108 @@
 "text": [
 "db_postgres\n",
 "db_redis\n",
-"\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 1/0\n",
-" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
-" ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
-" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
+"\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 0/0\n",
+" ⠙ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.1s \u001b[0m\n",
+"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
+" ⠹ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.2s \u001b[0m\n",
[84 further added output lines omitted: docker compose spinner frames while the matitos_dozzle image layers b5b68a794063, 764914624645 and 82780b9b6d69 are pulled and the db_redis, db_postgres and dozzle containers are created]
 "\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 1/3\n",
-" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
-" ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
+" ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
+" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
+" ⠿ Container dozzle \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
 " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
-"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 3/3\u001b[0m\n",
-" \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.2s \u001b[0m\n",
-" \u001b[32m✔\u001b[0m Container db_redis \u001b[32mStarted\u001b[0m \u001b[34m0.2s \u001b[0m\n",
+"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 1/4\n",
+" ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
+" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
+" ⠿ Container dozzle \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
+" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
+"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 4/4\u001b[0m\n",
+" \u001b[32m✔\u001b[0m Container db_redis \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
+" \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
+" \u001b[32m✔\u001b[0m Container dozzle \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
 " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
 "\u001b[?25h"
 ]
@@ -42,7 +133,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 10,
+"execution_count": 3,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -143,6 +234,7 @@
 " # Feeds\n",
 " cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC', 'rss_feed');\" )\n",
 " # Websites of interest\n",
+" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('www.missingkids.org/poster', 'url_host');\" )\n",
 " cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('www.breitbart.com', 'url_host');\" )\n",
 " # Search keywords\n",
 " cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search');\" )\n",
@@ -159,7 +251,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 11,
+"execution_count": 4,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -211,7 +303,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 12,
+"execution_count": 5,
 "metadata": {},
 "outputs": [
 {
@@ -260,7 +352,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 13,
+"execution_count": 6,
 "metadata": {},
 "outputs": [
 {
@@ -285,7 +377,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 14,
+"execution_count": 7,
 "metadata": {},
 "outputs": [
 {
@@ -1,46 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"conda create -n matitos_fetcher python=3.12\n",
"conda activate matitos_fetcher\n",
"conda install -c conda-forge curl\n",
"pip install ipykernel \"psycopg[binary]\" git+https://github.com/ranahaani/GNews.git GoogleNews duckduckgo_search newspaper4k numpy beautifulsoup4 requests feedparser pytz redis fastapi uvicorn fastapi-utils lxml[html_clean]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!uvicorn app:app --host 0.0.0.0 --port 5000 --reload"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "matitos_fetcher",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@@ -1,17 +0,0 @@
FROM continuumio/miniconda3:25.1.1-2

# App repository
COPY . /opt/app/

RUN conda install -c conda-forge curl
RUN pip install --no-cache-dir --upgrade "psycopg[binary]" git+https://github.com/ranahaani/GNews.git GoogleNews duckduckgo_search newspaper4k numpy beautifulsoup4 requests feedparser pytz redis fastapi uvicorn fastapi-utils lxml[html_clean]
RUN pip freeze
# GoogleNews-1.6.10 Pillow-10.1.0 PyYAML-6.0.1 aiofiles-23.2.1 anyio-3.7.1 beautifulsoup4-4.9.3 bs4-0.0.1 click-8.1.7 cssselect-1.2.0 dateparser-1.2.0 dnspython-1.16.0 duckduckgo_search-3.9.8 fastapi-0.104.1 fastapi-utils-0.2.1 feedfinder2-0.0.4 feedparser-6.0.10 filelock-3.13.1 gnews-0.3.6 greenlet-3.0.1 h11-0.14.0 h2-4.1.0 hpack-4.0.0 httpcore-1.0.2 httpx-0.25.2 hyperframe-6.0.1 jieba3k-0.35.1 joblib-1.3.2 lxml-4.9.3 newspaper3k-0.2.8 nltk-3.8.1 numpy-1.26.2 psycopg-3.1.13 psycopg-binary-3.1.13 pydantic-1.10.13 pymongo-3.12.3 python-dateutil-2.8.2 python-dotenv-0.19.2 pytz-2023.3.post1 redis-5.0.1 regex-2023.10.3 requests-2.26.0 requests-file-1.5.1 sgmllib3k-1.0.0 six-1.16.0 sniffio-1.3.0 socksio-1.0.0 soupsieve-2.5 sqlalchemy-1.4.50 starlette-0.27.0 tinysegmenter-0.3 tldextract-5.1.1 typing-extensions-4.8.0 tzlocal-5.2 uvicorn-0.24.0.post1

WORKDIR /opt/app

# https://www.uvicorn.org/settings/#resource-limits
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "80"]

# docker build -t fetch_app .
# docker run --rm --name container_fetch_app fetch_app
@@ -1,20 +0,0 @@
# Fetcher

```
conda create -n matitos_fetcher python=3.12
conda activate matitos_fetcher
conda install -c conda-forge curl
pip install ipykernel "psycopg[binary]" git+https://github.com/ranahaani/GNews.git GoogleNews duckduckgo_search newspaper4k numpy beautifulsoup4 requests feedparser pytz redis fastapi uvicorn fastapi-utils lxml[html_clean]
```


* Fetcher app
  - Contains several endpoints to perform a specific fetching type task
  - For more details, check in [app.py](app.py) /{fetch_type}

* Build and run
  - Important: To be deployed with other micro-services, [docker-compose.yml](../docker-compose.yml)
```
docker build -t fetch_app .
docker run --rm --name container_fetch_app fetch_app
```
@@ -1,79 +0,0 @@
from src.fetch_feed import FetchFeed
from src.fetch_parser import FetchParser
from src.fetch_search import FetchSearch

from src.missing_kids_fetch import MissingKidsFetch
from src.missing_kids_status import MissingKidsStatus

from src.url_status import UpdateErrorURLs
from src.db_utils import DB_Handler

import src.credentials as cred
from logging_ import get_logger

from fastapi import FastAPI, BackgroundTasks
##################################################################################################

logger = get_logger()
logger.info("Environment: {}".format(cred.ENVIRONMENT))

db_handler = DB_Handler(cred.db_connect_info, cred.redis_connect_info)

app = FastAPI()

@app.get("/")
def hello_world():
    return {"message": "Ok"}

@app.get("/{process_type}")
async def process(background_tasks: BackgroundTasks, process_type: str):
    # Concurrent job running
    logger.info("Triggered: {}".format(process_type))

    if (process_type == "fetch_feeds"):
        task_run = FetchFeed(db_handler).run
    elif (process_type == "fetch_parser"):
        task_run = FetchParser(db_handler).run
    elif (process_type == "search") or (process_type == "search_full"):
        task_run = FetchSearch(cred.db_connect_info, cred.redis_connect_info, full=True).run
    elif (process_type == "search_reduced"):
        task_run = FetchSearch(cred.db_connect_info, cred.redis_connect_info, full=False).run

    # Selenium based
    elif (process_type == "fetch_missing_kids_reduced"):
        task_run = MissingKidsFetch(db_handler, num_pages=4).run
    elif (process_type == "fetch_missing_kids_full"):
        task_run = MissingKidsFetch(db_handler, num_pages=100000).run

    elif (process_type == "update_missing_kids_status_reduced"):
        task_run = MissingKidsStatus(cred.db_connect_info, cred.redis_connect_info, num_urls=50).update_missing_kids_status
    elif (process_type == "update_missing_kids_status_full"):
        task_run = MissingKidsStatus(cred.db_connect_info, cred.redis_connect_info, num_urls=None).update_missing_kids_status

    elif (process_type == "update_error_urls"):
        task_run = UpdateErrorURLs(cred.db_connect_info, cred.redis_connect_info, num_urls=100).update_error_urls_status
    else:
        return {"message": "ERROR. Unknown fetcher type!"}

    # Run task
    background_tasks.add_task(task_run)
    # Return message
    return {"message": "Started {}: Ok".format(process_type)}

"""
# TODO: Instead of background tasks!

import rq
import redis

# Redis connection
redis_conn = redis.Redis(host='localhost', port=6379, db=0)
queue = rq.Queue(connection=redis_conn)

# ...
# Queue the processing task
dict_args= {"db_handler": db_handler, }
queue.enqueue(task_run, **dict_args)

# https://python-rq.org/
"""
@@ -1,502 +0,0 @@
import psycopg
import redis
import traceback
import random
import requests
import json
import os
from .url_utils import process_article
from .logger import get_logger
logger = get_logger()

# TODO: URL_DB_HANDLER, _get_search_list, _get_url_host, _get_url_host_list, ...
# The rest, elsewhere

class DB_Handler():
    def __init__(self, db_connect_info, redis_connect_info):
        logger.debug("Initializing URL DB writer")
        self.db_connect_info = db_connect_info
        self.redis_instance = redis.Redis(host=redis_connect_info.get("host"), port=redis_connect_info.get("port"))
        self.redis_expiry_seconds = redis_connect_info.get("expiry_seconds", 172800) # Default: 48 hours

        try:
            self.redis_instance.ping()
logger.debug("Succesfully pinged Redis")
|
|
||||||
        except Exception as e:
            logger.warning("Error trying to ping Redis: {}".format(str(e)))

    def get_urls_count(self, last_minutes_check):
        #####################
        ### Get number of URLs within last X minutes
        #####################
        try:
            # Update
            with psycopg.connect(self.db_connect_info) as conn:
                # Open cursor
                cursor = conn.cursor()
                num_urls = cursor.execute("SELECT COUNT(*) FROM URLS WHERE ts_fetch >= current_timestamp - interval '{} minutes';".format(last_minutes_check)).fetchone()[0]
        except Exception as e:
            logger.warning("Error updating URLs status: {}".format(str(e)))
            num_urls = None
        return num_urls

    def _get_url_host_list(self):
        try:
            with psycopg.connect(self.db_connect_info) as conn:
                # List of URL host
                list_url_host = [l[0] for l in conn.execute("SELECT url_host FROM WEBSITE_OF_INTEREST;").fetchall()]
                # Clean http / https from URLs
                list_url_host = [l.replace("https://", "").replace("http://", "") for l in list_url_host]
                # Clean last slash if exists
                list_url_host = [ l if not l.endswith("/") else l[:-1] for l in list_url_host]
        except Exception as e:
            logger.warning("Exception fetching URL host list: " + str(e))
            list_url_host = []
        return list_url_host

    def _get_search_list(self):
        try:
            with psycopg.connect(self.db_connect_info) as conn:
                # List of keyword searches
                list_search_text = [l[0] for l in conn.execute("SELECT keyword_search FROM SEARCH;").fetchall()]
        except Exception as e:
            logger.warning("Exception fetching searches list: " + str(e))
            list_search_text = []
        return list_search_text

    def _get_feed_urls(self):
        try:
            with psycopg.connect(self.db_connect_info) as conn:
                list_url_feeds = conn.execute("SELECT rss_feed FROM FEED;").fetchall()
                # Decode (tuple with 1 element)
                list_url_feeds = [l[0] for l in list_url_feeds]
        except Exception as e:
            logger.warning("Exception fetching RSS sites: " + str(e))
            list_url_feeds = []
        return list_url_feeds

    def _get_url_hosts(self):
        try:
            with psycopg.connect(self.db_connect_info) as conn:
                list_url_hosts = conn.execute("SELECT url_host FROM WEBSITE_OF_INTEREST;").fetchall()
                # Decode (tuple with 1 element)
                list_url_hosts = [l[0] for l in list_url_hosts]
        except Exception as e:
            logger.warning("Exception fetching RSS sites: " + str(e))
            list_url_hosts = []
        return list_url_hosts

    def _format(self, values):
        # Replace single quote ' with ''. Based on https://stackoverflow.com/a/12320729
        # String -> 'string', Int -> '1' (string-based), None -> NULL (no quotes for pgSQL to interpret Null value)
        if (type(values) == list) or (type(values) == tuple):
            insert_args = "(" + ", ".join([ "NULL" if v is None else "'" + str(v).replace("'", "''") + "'" for v in values]) + ")"
        elif (type(values) == str):
            insert_args = "({})".format( "NULL" if values is None else "'" + values.replace("'", "''") + "'" )
        else:
            logger.warning("Error formatting input values: {}".format(values))
            assert False
        return insert_args

    def _get_cached_canonical_url(self, url):
        ### Redis: URL processed recently? -> Avoid increasing SERIAL counter & efficiency of DB
        try:
            filter_url = self.redis_instance.get(url)
            if (filter_url is not None):
                filter_url = filter_url.decode("utf-8")
        except Exception as e:
            logger.warning("Exception querying Redis: {}".format(str(e)))
            filter_url = None
        return filter_url

    def _update_urls_status(self, dict_status_ids):
        #####################
        ### Update status to array of URL IDs
        #####################
        try:
            # Update
            with psycopg.connect(self.db_connect_info) as conn:
                # Open cursor
                cursor = conn.cursor()
                # Autocommit at end of transaction (Atomic insert of URLs and sources)
                with conn.transaction() as tx:
                    for key_status, value_ids in dict_status_ids.items():
                        cursor.execute("UPDATE URLS SET status='{}' WHERE id IN ({});".format(key_status, ",".join([str(v) for v in value_ids])))
        except Exception as e:
            logger.warning("Error updating URLs status: {}".format(str(e)))

    def _get_missing_kids_urls(self, num_urls=None):
        #####################
        ### Get list of Missing Kids URLs
        #####################
        try:
            missing_kids_ids_and_urls = []
            if (num_urls is None):
                limit = 500
            else:
                limit = num_urls
            offset = 0
            with psycopg.connect(self.db_connect_info) as conn:
                # Open cursor
                cursor = conn.cursor()
                while True:
                    # Query
                    missing_kids_ids_and_urls_query = cursor.execute("SELECT id, url, status FROM URLS WHERE url LIKE '%missingkids.org/poster%' ORDER BY ts_fetch DESC LIMIT {} OFFSET {};".format(limit, offset)).fetchall()
                    # Finished?
                    if (len(missing_kids_ids_and_urls_query) == 0):
                        break
                    # Extend
                    missing_kids_ids_and_urls = missing_kids_ids_and_urls + missing_kids_ids_and_urls_query
                    # Offset
                    offset += len(missing_kids_ids_and_urls_query)
                    # Stop?
                    if (num_urls is not None) and (len(missing_kids_ids_and_urls) >= num_urls):
                        break

        except Exception as e:
            logger.warning("Error getting Missing Kids URLs: {}".format(str(e)))
            missing_kids_ids_and_urls = []
        return missing_kids_ids_and_urls

    def _get_error_urls(self, num_urls=None):
        #####################
        ### Get list of Missing Kids URLs
        #####################
        try:
            error_urls = []
            if (num_urls is None):
                limit = 500
            else:
                limit = num_urls
            offset = 0
            with psycopg.connect(self.db_connect_info) as conn:
                # Open cursor
                cursor = conn.cursor()
                while True:
                    # Query
                    error_urls_query = cursor.execute("SELECT id, url FROM URLS WHERE status='error' ORDER BY ts_fetch DESC LIMIT {} OFFSET {};".format(limit, offset)).fetchall()
                    # Finished?
                    if (len(error_urls_query) == 0):
                        break
                    # Extend
                    error_urls = error_urls + error_urls_query
                    # Offset
                    offset += len(error_urls_query)
                    # Stop?
                    if (num_urls is not None) and (len(error_urls) >= num_urls):
                        break

        except Exception as e:
            logger.warning("Error getting Error URLs: {}".format(str(e)))
            error_urls = []
        return error_urls

    def _decode_urls(self, urls_fetched, list_domains_to_filter, list_pattern_status_tuple): # TODO: language for urls_fetched...
        """
        # TODO: REFACTOR
        For each input url

        Already processed?
        -> Update on Redis expire time
        -> Associate to source
        Not processed? Get main URL:
        -> URL Canonical valid?
        -> Rely on this as main URL
        -> URL Canonical not valid?
        -> Use input url, unless it's a news.google.com link
        -> If news.google.com link, filter out. REDIS?
        Main URL processing:
        -> Update in REDIS, association url -> url_canonical
        -> url != url_canonical: Add in duplicate table
        If both != news.google.com
        """

        # URLs to insert, URLs duplicated association, URL to Canonical form
        list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical = [], [], {}

        # URL VS CANONICAL:
        # News URL returned: https://news.google.com/articles/CBMifmh0dHBzOi8vd3d3LmJyZWl0YmFydC5jb20vMm5kLWFtZW5kbWVudC8yMDIzLzA0LzAzL2dvdi1kZXNhbnRpcy1zaWducy1iaWxsLW1ha2luZy1mbG9yaWRhLXRoZS0yNnRoLWNvbnN0aXR1dGlvbmFsLWNhcnJ5LXN0YXRlL9IBAA?hl=en-US&gl=US&ceid=US%3Aen
        # Corresponds to canonical URL: https://www.breitbart.com/2nd-amendment/2023/04/03/gov-desantis-signs-bill-making-florida-the-26th-constitutional-carry-state/

        for url in urls_fetched:
            # Domain to filter? Input url
            filter_due_to_domain = False
            for domain_to_filter in list_domains_to_filter:
                if (domain_to_filter in url):
                    logger.debug("Domain filter applied based on {} for input URL: {}".format(domain_to_filter, url))
                    filter_due_to_domain = True
            if (filter_due_to_domain):
                continue

            # URL processed recently? -> Filter and avoid increasing SERIAL counter & efficiency of DB
            cached_canonical_url = self._get_cached_canonical_url(url)
            if (cached_canonical_url is not None):
                # Even if url processed, need to add url_canonical to list_filtered_urls, so as to associate search source to canonical URL (canonical is the main URL entry)
                dict_full_urls_to_canonical[url] = cached_canonical_url # X -> Y
                # If url has been processed, so was its canonical form
                logger.debug("Filtering out already inserted (processed) URL and its canonical form: {} {}".format(url, cached_canonical_url))
                continue

            # Process TODO: Add language...
            url_canonical, article_elements, article_status = process_article(url, list_pattern_status_tuple)
            # TODO: Store article_elements information to insert into OS after inserted into DB (and therefore having associated url_id)

            # Could not retrieve redirection for news.google.com based URL? Continue (avoid inserting in DB)
            if (url_canonical is None) and ("news.google.com" in url):
                logger.debug("Filtering empty canonical link for base URL based on news.google.com: {}".format(url))
                continue
            # Canonical URL still news.google.com? Continue (avoid inserting in DB)
            if (url_canonical is not None) and ("news.google.com" in url_canonical):
                logger.debug("Filtering canonical news.google.com based URL: {}".format(url_canonical))
                continue

            # Domain to filter? Input canonical_url
            filter_due_to_domain = False
            for domain_to_filter in list_domains_to_filter:
                if (url_canonical is not None) and (domain_to_filter in url_canonical):
                    filter_due_to_domain = True
            if (filter_due_to_domain):
                logger.info("Filtering due to domain input URL, Canonical_URL: {} {}".format(url, url_canonical))
                continue

            if (url_canonical is None) or (article_status == "error"):
                logger.debug("Processing failed for URL: {}".format(url))
                # Still insert URL with "error"? -> If processed later, might have inconsistent sources (url vs url_canonical). Only store if not news.google.com based
                if ("news.google.com" in url) or ("consent.google.com" in url):
logging.debug("Not able to process Google News link, skipping: {}".format(url))
|
|
||||||
                else:
                    dict_full_urls_to_canonical[url] = url # X -> X
                    list_insert_url_tuple_args.append( (url, article_status) )
                continue

            # URL was not processed (not sure canonical yet). Generate URL_CANONICAL <-> URL_ORIGINAL association if they're different
            if (url_canonical != url):
                list_tuple_canonical_duplicate_urls.append( (url_canonical, url) )
            # Dict: url -> canonical (update association)
            dict_full_urls_to_canonical[url] = url_canonical # X -> Y or X

            # Canonical URL processed recently? -> Filter and avoid increasing SERIAL counter & efficiency of DB
            if (self._get_cached_canonical_url(url_canonical) is not None):
                # Canonical URL was already processed
                logger.debug("Filtering out already inserted (processed) URL canonical: {}".format(url_canonical))
            else:
                # Insert url_canonical to DB formatted
                list_insert_url_tuple_args.append( (url_canonical, article_status) )
            # Canonical URL different? Process
            if (url_canonical != url):
                if ("news.google.com" in url) or ("consent.google.com" in url):
logging.debug("Not adding google.news.com based link, skipping: {}".format(url))
|
|
||||||
                else:
                    # Fetched url -> duplicate (using canonical as main link)
                    article_status = "duplicate"
                    # Insert url (non-canonical) to DB formatted
                    list_insert_url_tuple_args.append( (url, article_status) )

        return list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical

    def _insert_urls(self, cursor, list_insert_url_tuple_args):
        #####################
        ### Insert URLs with status
        #####################
        if (len(list_insert_url_tuple_args) > 0):
            insert_args = ', '.join( [ self._format(t) for t in list_insert_url_tuple_args] )
            # Insert. (url_1, status_1), (url_2, status_2), ...
            sql_code = "INSERT INTO URLS {} VALUES {} ON CONFLICT (url) DO NOTHING;".format("(url, status)", insert_args)
            # logger.debug("SQL CODE: {}".format(sql_code))
            c = cursor.execute(sql_code)
            # NOTE: Not using "RETURNING id" since previously inserted URLs are not returned (ON CONFLICT)
            # https://stackoverflow.com/questions/35949877/how-to-include-excluded-rows-in-returning-from-insert-on-conflict/35953488#35953488

    def _insert_urls_duplicated(self, cursor, list_tuple_canonical_duplicate_urls):
        #####################
        ### Insert duplicated URLs
        #####################
        if (len(list_tuple_canonical_duplicate_urls) > 0):
            # Flatten, format, set to remove duplicates
            args_duplicated_urls_set = "(" + ', '.join( set( [ "'" + str(y).replace("'", "''") + "'" for x in list_tuple_canonical_duplicate_urls for y in x] ) ) + ")"

            # Dict: url -> id
            dict_url_to_id = {}
            # Get url -> id association to populate duplicated URLs
            for (id_, url_) in cursor.execute("SELECT id, url FROM URLS WHERE url IN {};".format(args_duplicated_urls_set)).fetchall():
                dict_url_to_id[url_] = id_

            # Convert tuples (url_canonical, url) -> (id_url_canonical, id_url) to insert in DB
            # ORIGINAL CODE. Issue, might not have found association to all urls
            ### list_tuple_canonical_duplicate_urls_ids = [ (dict_url_to_id[t[0]], dict_url_to_id[t[1]]) for t in list_tuple_canonical_duplicate_urls]

            list_tuple_canonical_duplicate_urls_ids = []
            for (url_1, url_2) in list_tuple_canonical_duplicate_urls:
                id_url_1, id_url_2 = dict_url_to_id.get(url_1), dict_url_to_id.get(url_2)
                if (id_url_1 is None) or (id_url_2 is None):
                    logger.debug("Skipping duplicate association due to no url -> id_url mapping available for tuple: {} {}".format(url_1, url_2))
                else:
                    list_tuple_canonical_duplicate_urls_ids.append( (id_url_1, id_url_2) )

            if (len(list_tuple_canonical_duplicate_urls_ids) > 0):
                insert_args = ', '.join( [ self._format(t) for t in list_tuple_canonical_duplicate_urls_ids] )
                # Insert. (id_url_canonical_1, id_url_1), ...
                sql_code = "INSERT INTO URLS_DUPLICATE {} VALUES {} ON CONFLICT DO NOTHING;".format("(id_url_canonical, id_url_duplicated)", insert_args)
                # logger.debug("SQL CODE: {}".format(sql_code))
                c = cursor.execute(sql_code)

    def _get_pattern_status_list(self):
        #####################
        ### Get list of domains to filter
        #####################
        # TODO: Cache on redis and query once every N hours? ...
        try:
            with psycopg.connect(self.db_connect_info) as conn:
                # Open cursor
                cursor = conn.cursor()
                # TODO: Cache on Redis
                list_pattern_status = cursor.execute("SELECT pattern, priority, status FROM STATUS_PATTERN_MATCHING;").fetchall()
        except Exception as e:
            logger.warning("Error getting pattern status list: {}".format(str(e)))
            list_pattern_status = []
        return list_pattern_status

    def _get_domains_to_filter(self):
        #####################
        ### Get list of domains to filter
        #####################
        # TODO: Cache on redis and query once every N hours? ...
        try:
            with psycopg.connect(self.db_connect_info) as conn:
                # Open cursor
                cursor = conn.cursor()
                # TODO: Cache on Redis
                sites_to_filter = [e[0] for e in cursor.execute("SELECT url_host FROM WEBSITE_TO_FILTER;").fetchall() ]
        except Exception as e:
            logger.warning("Error getting domains to filter: {}".format(str(e)))
            sites_to_filter = []
        return sites_to_filter

    def _get_cached_source_id(self, source):
        ### Redis: URL processed recently? -> Avoid increasing SERIAL counter & efficiency of DB
        try:
            source_id = self.redis_instance.get(source)
            if (source_id is not None):
                source_id = source_id.decode("utf-8")
        except Exception as e:
            logger.warning("Exception querying Redis: {}".format(str(e)))
            source_id = None
        return source_id

    def _get_source_id(self, cursor, source):
        #####################
        ### Get source corresponding id
        #####################
        # Cached?
        id_source = self._get_cached_source_id(source)
        if (id_source is None):
            c = cursor.execute("SELECT id FROM SOURCE WHERE source='{}'".format(source.replace("'", "''"))).fetchone()
            if (c is None) or (len(c) == 0):
                # Source does not exist, insert and get id
                c = cursor.execute("INSERT INTO SOURCE (source) VALUES ('{}') RETURNING id;".format(source.replace("'", "''"))).fetchone()
            # Decode source id
            id_source = c[0]
            # Cache
            print("*"*10, source, id_source)
            self.redis_instance.set(source, id_source, ex=self.redis_expiry_seconds)
        return id_source

    def _get_urls_id(self, cursor, urls_full):
        #####################
        ### Get id of inserted and filtered URLs
        #####################
        # TODO: Cache url -> url_id, url_canonical
        if (len(urls_full) == 0):
            return []
        # Get inserted and filtered URL ids (unnested). Filtered URLs are also retrieved since they might have been fetched from a new source
        in_inserted_filtered_urls = "(" + ', '.join(["'" + u.replace("'", "''") + "'" for u in urls_full]) + ")"
        id_urls_related = [ i[0] for i in cursor.execute("SELECT id FROM URLS WHERE url IN {};".format(in_inserted_filtered_urls)).fetchall() ]
        return id_urls_related

    def _insert_urls_source(self, cursor, id_urls_related, id_source):
        #####################
        ### Insert URL sources: (id_url_1, id_source), (id_url_2, id_source), ...
        #####################
        if (len(id_urls_related) == 0) or (id_source is None):
            return
        columns = "(id_url, id_source)"
        insert_args = ', '.join( [ self._format([id_url, id_source]) for id_url in id_urls_related ] )
        # Insert
        sql_code = "INSERT INTO URLS_SOURCE {} VALUES {} ON CONFLICT DO NOTHING;".format(columns, insert_args)
        # logger.debug("SQL CODE: {}".format(sql_code))
        c = cursor.execute(sql_code)

    def write_batch(self, urls_fetched, source):
        # Chunks of 50 elements
        n = 50
        # Divide in small chunks
        urls_fetched_chunks = [urls_fetched[i:i + n] for i in range(0, len(urls_fetched), n)]
        # Process
        for urls_fetched_chunk_i in urls_fetched_chunks:
            self._write_small_batch(urls_fetched_chunk_i, source)

    def _write_small_batch(self, urls_fetched, source):
        try:
            logger.info("Fetched #{} URLs, source: {}".format(len(urls_fetched), source))

            if (len(urls_fetched) == 0):
                logger.debug("Empty batch of urls (not writing to DB) for source: {}".format(source))
                return

            # Shuffle URLs to reduce continuous URLs of same URL host (minimize chance of being blocked for too many continuous requests)
            random.shuffle(urls_fetched)

            # Get list of domains to filter
            list_domains_to_filter = self._get_domains_to_filter()
            # Get list of (pattern, priority, status) tuples to override status if required
            list_pattern_status_tuple = self._get_pattern_status_list()
            # Sort pattern tuples by priority
            list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True)

            # Process URLs to update DB
            list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical = self._decode_urls(urls_fetched, list_domains_to_filter, list_pattern_status_tuple)
            # Full set of URL and its canonical form (to associate them to a search), both to insert and filter
            urls_full = set(dict_full_urls_to_canonical.keys()).union( set(dict_full_urls_to_canonical.values()) )

            # Insert
            with psycopg.connect(self.db_connect_info) as conn:
                # Open cursor
                cursor = conn.cursor()
                # Autocommit at end of transaction (Atomic insert of URLs and sources)
                with conn.transaction() as tx:
                    # Insert processed URLs
                    self._insert_urls(cursor, list_insert_url_tuple_args)
                    # Insert URLs duplicated (canonical != fetched url)
                    self._insert_urls_duplicated(cursor, list_tuple_canonical_duplicate_urls)

                    # Get source id in DB
                    id_source = self._get_source_id(cursor, source)
                    # Get IDs of all related URLs
                    id_urls_related = self._get_urls_id(cursor, urls_full)
                    # Insert search source associated to URLs
                    self._insert_urls_source(cursor, id_urls_related, id_source)

            # Update Redis status of inserted and filtered URLs after writing to DB
            for url, url_canonical in dict_full_urls_to_canonical.items():
                try:
                    # Set with updated expiry time
                    self.redis_instance.set(url, url_canonical, ex=self.redis_expiry_seconds)
                    if (url != url_canonical):
                        self.redis_instance.set(url_canonical, url_canonical, ex=self.redis_expiry_seconds)
                except Exception as e:
                    logger.warning("Exception running set in Redis: {}".format(str(e)))

            if (len(list_insert_url_tuple_args) > 0):
                try:
                    webhook_token = os.environ.get("CLIQ_WEBHOOK_TOKEN")
                    endpoint_message = "https://cliq.zoho.com/api/v2/channelsbyname/urlretrievalbot/message?zapikey={}".format(webhook_token)

                    payload = json.dumps({"text": "Fetched #{} new URLs, source: {}".format(len(list_insert_url_tuple_args), source) })
                    r = requests.post(endpoint_message, data=payload)
                except Exception as e:
                    logger.warning("Webhook failed: {}".format(str(e)))

            logger.debug("URL DB write finished")
        except Exception as e:
            logger.warning( "Exception writing to URL_DB:\n{}".format(traceback.format_exc()) )
            logger.debug( "Exception --- List of URLs: {}".format(str(urls_fetched)) )
@@ -1,48 +0,0 @@
from .db_utils import DB_Handler
import feedparser
import dateutil.parser
from .logger import get_logger
logger = get_logger()

class FetchFeed():
    def __init__(self, db_handler: DB_Handler) -> None:
        logger.debug("Initializing News feed")
        self.db_handler = db_handler

    def run(self):
        try:
            logger.debug("Starting NewsFeed.run()")
            # Get feeds
            list_url_feeds = self.db_handler._get_feed_urls()
            logger.debug("Fetching news from feeds: {}".format(str(list_url_feeds)))

            # Process via RSS feeds
            for url_feed in list_url_feeds:
                # Initialize
                urls_fetched, urls_publish_date = [], []
                # Fetch feeds
                feeds = feedparser.parse(url_feed)
                # Parse
                for f in feeds.get("entries", []):
                    # Get URL
                    url = f.get("link", None)
                    # Process?
                    if (url is not None):
                        # Available publish date?
                        publish_date_parsed = f.get("published_parsed")
                        if (publish_date_parsed is None):
                            publish_date = f.get("published", None)
                            if (publish_date is not None):
                                publish_date_parsed = dateutil.parser.parse(publish_date)

                        # Published date
                        urls_publish_date.append(publish_date_parsed)
                        # URL
                        urls_fetched.append(url)

                # URL fetching source
                source = "feed {}".format(url_feed)
                # Write to DB
                self.db_handler.write_batch(urls_fetched, source)
        except Exception as e:
            logger.warning("Exception in NewsFeed.run(): {}".format(str(e)))
@@ -1,45 +0,0 @@
from .db_utils import DB_Handler
import newspaper
from .logger import get_logger
logger = get_logger()

class FetchParser():
    def __init__(self, db_handler: DB_Handler) -> None:
        logger.debug("Initializing News SiteParsing newspaper4k")
        self.db_handler = db_handler

    # TODO: MOVE LOGIC ELSEWHERE!
    def _postprocess(self, article_urls):
        return [url.replace("#comment-stream", "") for url in article_urls]

    def run(self):
        try:
            logger.debug("Starting NewsSiteParsing.run() for {}")

            # Get URL hosts
            list_url_hosts = self.db_handler._get_url_hosts()
            logger.info("Fetching news by parsing URL hosts: {}".format(str(list_url_hosts)))

            # Process newspaper4k build method
            for url_host_feed in list_url_hosts:
                # Protocol
                if not (url_host_feed.startswith("http")):
                    url_host_feed_formatted = "https://" + url_host_feed
                else:
                    url_host_feed_formatted = url_host_feed

                logger.debug("Fetching newspaper4k parsing based on URL: {}".format(url_host_feed_formatted))
                # Source object
                url_host_built = newspaper.build(url_host_feed_formatted)
                # Get articles URL list
                urls_fetched = url_host_built.article_urls()
                # TODO: MOVE!
                # Post-processing
                urls_fetched = self._postprocess(urls_fetched)

                # URL fetching source
                source = "newspaper4k {}".format(url_host_feed)
                # Write to DB
                self.db_handler.write_batch(urls_fetched, source)
        except Exception as e:
            logger.warning("Exception in NewsSiteParsing.run(): {}".format(str(e)))
@@ -1,73 +0,0 @@
from .db_utils import DB_Handler
from .utils import get_searxng_instances
from .fetch_search_sources import FetcherDuckDuckGo, FetcherGNews, FetcherGoogleNews, FetcherSearxNews, FetcherPreSearch
from .logger import get_logger
logger = get_logger()

class FetchSearch():
    def __init__(self, db_handler: DB_Handler, full=True) -> None:
        logger.debug("Initializing News feed")
        self.db_handler = db_handler
        self.full_search = full

    def _run_fetching(self, search_text):
        logger.debug("Starting _run_fetching() for {}".format(search_text))

        # Common parameters
        lang, region = "en", "US"

        ### PreSearch
        dict_params_news = {"search": search_text}
        FetcherPreSearch(**dict_params_news).fetch_articles(self.db_handler)

        ### DuckDuckGo
        period = "d"
        dict_params_news = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "news", "period": period}
        FetcherDuckDuckGo(**dict_params_news).fetch_articles(self.db_handler)
        dict_params_general = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "general", "period": period}
        FetcherDuckDuckGo(**dict_params_general).fetch_articles(self.db_handler)

        if (self.full_search):
            # Avoid site:{} search due to G-Bypass required time
            if ("site:" not in search_text):
                ### GNews
                dict_params = {"search": search_text, "lang": "wt", "region": "wt", "period": period}
                FetcherGNews(**dict_params).fetch_articles(self.db_handler)

                ### GoogleNews
                dict_params_news = {"search": search_text, "lang": lang, "region": region, "search_category": "news", "period": period}
                FetcherGoogleNews(**dict_params_news).fetch_articles(self.db_handler)
                # dict_params_general = {"search": search_text, "lang": lang, "region": region, "search_category": "general", "period": period}

        if False:
            ### SearxNG
            period = "day"
            for searx_instance in get_searxng_instances():
                dict_params_news = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "news", "period": period}
                dict_params_general = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "general", "period": period}
                # Append thread
                FetcherSearxNews(**dict_params_news).fetch_articles(self.db_handler)
                FetcherSearxNews(**dict_params_general).fetch_articles(self.db_handler)

        logger.debug("Finished _run_fetching()")

    def run(self):
        try:
            logger.info("Fetching text searches & URL hosts of interest")

            # Get text searches of interest
            list_search_text_of_interest = self.db_handler._get_search_list()

            # Get URL host of interest
            list_url_host = self.db_handler._get_url_host_list()
            # Get text searches for URL hosts
            list_search_text_url_host = ["site:{}".format(l) for l in list_url_host]

            for search_text in list_search_text_of_interest + list_search_text_url_host:
                logger.debug("Fetching news for search: {}".format(search_text))
                self._run_fetching(search_text)

            logger.info("Finished fetching text searches & URL hosts of interest")
        except Exception as e:
            logger.warning("Exception in NewsSearch.run(): {}".format(str(e)))
@@ -1,384 +0,0 @@
|
|||||||
from duckduckgo_search import DDGS
|
|
||||||
from gnews import GNews
|
|
||||||
from GoogleNews import GoogleNews
|
|
||||||
|
|
||||||
import requests
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
import os
|
|
||||||
import time
|
|
||||||
import json
|
|
||||||
import numpy as np
|
|
||||||
import random
|
|
||||||
from .google_bypass import GoogleByPass
|
|
||||||
from abc import ABC, abstractmethod
|
|
||||||
from .logger import get_logger
|
|
||||||
logger = get_logger()
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Generic fetcher (fetches articles, writes to DB)
|
|
||||||
class FetcherAbstract(ABC):
|
|
||||||
@abstractmethod
|
|
||||||
def _fetch(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
def fetch_articles(self, db_writer):
|
|
||||||
logger.debug("Starting fetch() for {}".format(self.name))
|
|
||||||
# Fetch articles
|
|
||||||
list_news = self._fetch()
|
|
||||||
logger.info("Found #{} articles for search: {}".format(len(list_news), self.name))
|
|
||||||
# Write to DB
|
|
||||||
db_writer.write_batch(list_news, self.name)
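# Illustrative sketch only (not part of the original module; class name and URL are
# hypothetical): a concrete fetcher just implements _fetch(), and inherits
# fetch_articles(), which handles the logging and the DB write.
class FetcherStatic(FetcherAbstract):
    def __init__(self, urls):
        self.name = "static"
        self.urls = urls

    def _fetch(self):
        # Return the given article URLs unchanged
        return list(self.urls)

# Usage sketch: FetcherStatic(["https://example.com/article"]).fetch_articles(db_writer)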
|
|
||||||
|
|
||||||
# https://techblog.willshouse.com/2012/01/03/most-common-user-agents/
|
|
||||||
|
|
||||||
user_agents_list = [
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
|
|
||||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0",
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
|
|
||||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
|
|
||||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
|
|
||||||
"Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0",
|
|
||||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15",
|
|
||||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/111.0",
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0",
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; rv:111.0) Gecko/20100101 Firefox/111.0",
|
|
||||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
|
|
||||||
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0",
|
|
||||||
"Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0",
|
|
||||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15",
|
|
||||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
|
|
||||||
"Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0",
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.44",
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54",
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62",
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 OPR/96.0.0.0",
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 OPR/97.0.0.0",
|
|
||||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0",
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0",
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36",
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48",
|
|
||||||
"Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0",
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.34",
|
|
||||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0",
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.39",
|
|
||||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.2 Safari/605.1.15",
|
|
||||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; rv:112.0) Gecko/20100101 Firefox/112.0",
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.51",
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0",
|
|
||||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/112.0",
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
|
|
||||||
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0",
|
|
||||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/110.0",
|
|
||||||
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0",
|
|
||||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
|
|
||||||
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0",
|
|
||||||
"Mozilla/5.0 (X11; CrOS x86_64 14541.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.0.2246 Yowser/2.5 Safari/537.36",
|
|
||||||
"Mozilla/5.0 (X11; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0",
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
|
|
||||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
|
|
||||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
|
|
||||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.6.1 Safari/605.1.15",
|
|
||||||
"Mozilla/5.0 (Windows NT 6.1; rv:102.0) Gecko/20100101 Goanna/6.0 Firefox/102.0 PaleMoon/32.0.0",
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.41",
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; rv:110.0) Gecko/20100101 Firefox/110.0",
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 YaBrowser/23.1.5.708 Yowser/2.5 Safari/537.36",
|
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0",
|
|
||||||
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
|
|
||||||
]
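# Illustrative helper (not part of the original module; the function name is hypothetical):
# the list above is meant to be sampled when building request headers, as FetcherSearxNews
# does further down with np.random.choice. Reuses the module-level "random" import.
def random_user_agent_header():
    # Rotate user agents to reduce the chance of scraper detection
    return {"User-agent": random.choice(user_agents_list)}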
class FetcherPreSearch(FetcherAbstract):
|
|
||||||
def __init__(self, search):
|
|
||||||
"""
|
|
||||||
# period ->
|
|
||||||
- h = hours (eg: 12h)
|
|
||||||
- d = days (eg: 7d)
|
|
||||||
- m = months (eg: 6m)
|
|
||||||
- y = years (eg: 1y)
|
|
||||||
"""
|
|
||||||
self.search = search
|
|
||||||
self.period = "1d" # TODO Fixed for the moment
|
|
||||||
# self.lang = lang
|
|
||||||
# self.region = region
|
|
||||||
search_category = "news"
|
|
||||||
self.name = "presearch {} {} {}".format(search, search_category, self.period)
|
|
||||||
|
|
||||||
def _fetch(self):
|
|
||||||
try:
|
|
||||||
# PreSearch fetching endpoint, parameter search keyword
|
|
||||||
presearch_fetch_endpoint = "http://selenium_app:80/fetch_presearch/?search_keyword={}".format(self.search)
|
|
||||||
# Timeout: 15 minutes
|
|
||||||
r = requests.get(presearch_fetch_endpoint, timeout=900)
|
|
||||||
# Decode
|
|
||||||
list_news = json.loads(r.text).get("list_urls", [])
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning("Timeout on request: {}. {}".format(presearch_fetch_endpoint, str(e)))
|
|
||||||
list_news = []
|
|
||||||
return list_news
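# For reference (derived from the request above; example values are illustrative): the
# selenium_app service is expected to answer
#   GET http://selenium_app:80/fetch_presearch/?search_keyword=<search>
# with a JSON body of the form {"list_urls": ["https://publisher.example/story", ...]},
# so _fetch() returns a plain list of article URLs.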
class FetcherGNews(FetcherAbstract):
|
|
||||||
def __init__(self, search, period, lang="en", region="US"):
|
|
||||||
"""
|
|
||||||
# period ->
|
|
||||||
- h = hours (eg: 12h)
|
|
||||||
- d = days (eg: 7d)
|
|
||||||
- m = months (eg: 6m)
|
|
||||||
- y = years (eg: 1y)
|
|
||||||
"""
|
|
||||||
self.search = search
|
|
||||||
self.period = period
|
|
||||||
self.lang = lang
|
|
||||||
self.region = region
|
|
||||||
search_category = "news"
|
|
||||||
self.name = "gnews {} {} {} {}".format(search, search_category, period, "{}-{}".format(lang, region))
|
|
||||||
|
|
||||||
def _fetch(self):
|
|
||||||
try:
|
|
||||||
list_dict_news = GNews(self.lang, self.region, period=self.period).get_news(self.search)
|
|
||||||
# Decode
|
|
||||||
list_news = []
|
|
||||||
for l in list_dict_news:
|
|
||||||
list_news.append(l.get("url"))
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
|
|
||||||
list_news = []
|
|
||||||
|
|
||||||
# Bypass Google links
|
|
||||||
list_news_redirections = GoogleByPass().bypass_google_urls(list_news)
|
|
||||||
|
|
||||||
return list_news_redirections
|
|
||||||
|
|
||||||
class FetcherGoogleNews(FetcherAbstract):
|
|
||||||
def __init__(self, search, search_category="news", period="1d", lang="en", region="US"):
|
|
||||||
assert(search_category in ["news", "general"])
|
|
||||||
|
|
||||||
self.lang = lang
|
|
||||||
self.region = region
|
|
||||||
self.period = period
|
|
||||||
self.search_category = search_category
|
|
||||||
self.search = search
|
|
||||||
self.name = "googlenews {} {} {} {}".format(search, search_category, period, "{}-{}".format(lang, region))
|
|
||||||
|
|
||||||
def _fetch(self):
|
|
||||||
try:
|
|
||||||
# Initialize
|
|
||||||
g = GoogleNews(encode="utf-8", period=self.period, lang=self.lang, region=self.region)
|
|
||||||
g.enableException(True)
|
|
||||||
|
|
||||||
if (self.search_category == "general"):
|
|
||||||
set_links = set()
|
|
||||||
# Search
|
|
||||||
g.search(self.search)
|
|
||||||
|
|
||||||
# Iterate pages
|
|
||||||
MAX_ITER_PAGES = 15
|
|
||||||
for i in range(MAX_ITER_PAGES):
|
|
||||||
time.sleep(random.uniform(1, 1.5))
|
|
||||||
num_before = len(set_links)
|
|
||||||
|
|
||||||
# Get page
|
|
||||||
try:
|
|
||||||
links = g.page_at(i)
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning("Exception fetching page in GoogleNews {}: {}".format(self.name, str(e)))
|
|
||||||
break
|
|
||||||
# Links
|
|
||||||
for l in links:
|
|
||||||
# '/url?esrc=s&q=&rct=j&sa=U&url=https://www.breitbart.com/news/scent-of-luxury-indias-jasmine-infuses-global-perfume/&ved=2ahUKEwjOybGSiN-AAxX1gv0HHfqSBpMQxfQBegQICBAC&usg=AOvVaw06GdoHyzPbIopUaEuUSQPQ'
|
|
||||||
url = l.get("link").split("url=")[-1]
|
|
||||||
set_links.add(url)
|
|
||||||
|
|
||||||
num_after = len(set_links)
|
|
||||||
|
|
||||||
# Finished?
|
|
||||||
if (num_before == num_after):
|
|
||||||
logger.debug("Iterated {} pages on GoogleNews general search".format(i))
|
|
||||||
break
|
|
||||||
# To list
|
|
||||||
list_news = list(set_links)
|
|
||||||
elif (self.search_category == "news"):
|
|
||||||
# Search
|
|
||||||
g.get_news(self.search)
|
|
||||||
# Fetch
|
|
||||||
list_news = g.get_links()
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
|
|
||||||
list_news = []
|
|
||||||
|
|
||||||
# Bypass Google links
|
|
||||||
list_news_redirections = GoogleByPass().bypass_google_urls(list_news)
|
|
||||||
|
|
||||||
return list_news_redirections
|
|
||||||
|
|
||||||
class FetcherDuckDuckGo(FetcherAbstract):
|
|
||||||
def __init__(self, search, search_category, period, lang="wt", region="wt"):
|
|
||||||
assert(search_category in ["news", "general"])
|
|
||||||
assert(period in ["d", "w", "m", "y"])
|
|
||||||
self.search = search
|
|
||||||
self.search_category = search_category
|
|
||||||
self.period = period
|
|
||||||
self.lang_region = "{}-{}".format(lang, region)
|
|
||||||
self.name = "duckduckgo {} {} {} {}".format(search, search_category, "1{}".format(period), region)
|
|
||||||
|
|
||||||
def _fetch(self):
|
|
||||||
try:
|
|
||||||
list_news = []
|
|
||||||
with DDGS(timeout=10) as ddgs:
|
|
||||||
if (self.search_category == "general"):
|
|
||||||
generator_links = ddgs.text(keywords=self.search, timelimit=self.period, region=self.lang_region)
|
|
||||||
elif (self.search_category == "news"):
|
|
||||||
generator_links = ddgs.news(keywords=self.search, timelimit=self.period, region=self.lang_region)
|
|
||||||
|
|
||||||
for l in generator_links:
|
|
||||||
list_news.append( l.get("url", l.get("href")) )
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
|
|
||||||
list_news = []
|
|
||||||
return list_news
class FetcherSearxNews(FetcherAbstract):
|
|
||||||
def __init__(self, search="child abuse", searx_instance="https://serx.ml/", lang="en", region="US", search_category="news", period="day"):
|
|
||||||
assert(search_category in ["news", "general"])
|
|
||||||
assert(period in [None, "day", "week", "month", "year"])
|
|
||||||
# Random header (minimize probability of web-scraping detection)
|
|
||||||
self.headers = {
|
|
||||||
'User-agent': str(np.random.choice(user_agents_list)),
|
|
||||||
'Accept-Encoding': 'gzip, deflate',
|
|
||||||
'Accept': '*/*',
|
|
||||||
'Connection': 'keep-alive',
|
|
||||||
}
|
|
||||||
""" # Optional header
|
|
||||||
self.headers = {
|
|
||||||
'User-agent': str(np.random.choice(user_agents_list)),
|
|
||||||
'Accept-Encoding': 'gzip, deflate, br',
|
|
||||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
|
|
||||||
'Connection': 'keep-alive',
|
|
||||||
'Upgrade-Insecure-Requests': '1',
|
|
||||||
'TE': 'trailers',
|
|
||||||
'Sec-Fetch-Site': 'cross-site',
|
|
||||||
'Sec-Fetch-Mode': 'navigate',
|
|
||||||
'Sec-Fetch-Dest': 'document',
|
|
||||||
}
|
|
||||||
"""
|
|
||||||
self.search = search
|
|
||||||
self.searx_instance = searx_instance
|
|
||||||
self.lang_region = "{}-{}".format(lang, region)
|
|
||||||
self.search_category = search_category
|
|
||||||
self.period = period
|
|
||||||
self.t_sleep_lower, self.t_sleep_higher = 0.5, 1.5
|
|
||||||
self.request_timeout = 240
|
|
||||||
|
|
||||||
period_name_mapping = {
|
|
||||||
None: "no_date_range",
|
|
||||||
"day": "1d",
|
|
||||||
"week": "1w",
|
|
||||||
"month": "1m",
|
|
||||||
"year": "1y",
|
|
||||||
}
|
|
||||||
self.name = "searxng {} {} {} {} {}".format(searx_instance.replace("https://", "").replace("/", ""), search, search_category, period_name_mapping[period], self.lang_region)
|
|
||||||
logger.info("SearX - Initialized SearX fetcher: {}".format(self.name))
|
|
||||||
|
|
||||||
def _request_and_decode(self, url_search):
|
|
||||||
# Initial random time sleep (minimize chance of getting blocked)
|
|
||||||
time.sleep(random.uniform(self.t_sleep_lower, self.t_sleep_higher))
|
|
||||||
# Request
|
|
||||||
logger.debug("SearX - Searching: {}".format(url_search))
|
|
||||||
try:
|
|
||||||
r = requests.get(url_search, headers=self.headers, timeout=self.request_timeout)
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning("SearX - Exception in request: {}".format(url_search), "\n", str(e))
|
|
||||||
return []
|
|
||||||
|
|
||||||
if (r.status_code == 200):
|
|
||||||
# Status code Ok
|
|
||||||
pass
|
|
||||||
elif (r.status_code == 429):
|
|
||||||
# TooManyRequests, "Rate limit exceeded"
|
|
||||||
logger.warning("SearX {} - Too many requests while running: {}. Request output: {}".format(self.name, r.url, r.text))
|
|
||||||
return []
|
|
||||||
else:
logger.warning("SearX {} - Status code: {}. Request output: {}".format(self.name, r.status_code, r.text))
return []
|
|
||||||
|
|
||||||
# Decode request
|
|
||||||
soup = BeautifulSoup(r.text, 'html.parser')
|
|
||||||
page_url_set = set()
|
|
||||||
# h3 links
|
|
||||||
for elem in soup.find_all('h3'):
|
|
||||||
# Get url
|
|
||||||
url = elem.find('a').get('href')
|
|
||||||
page_url_set.add(url)
|
|
||||||
return page_url_set
|
|
||||||
|
|
||||||
def _get_news_list(self):
|
|
||||||
############################################################
|
|
||||||
# Domain & search parameter
|
|
||||||
search_domain = os.path.join(self.searx_instance, "search?q=")
|
|
||||||
# Search keywords
|
|
||||||
search_formatted = self.search.replace(" ", "+").replace(":", "%3A")
|
|
||||||
# Period formatted
|
|
||||||
period_formatted = "&time_range={}".format(self.period) if self.period is not None else ""
|
|
||||||
# Search parameters
|
|
||||||
search_parameters = "&category_{}=on&language={}{}".format(self.search_category, self.lang_region, period_formatted)
|
|
||||||
# Combined url search
|
|
||||||
url_search_nopage = "{}{}{}".format(search_domain, search_formatted, search_parameters)
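# Illustration only (hypothetical values): with searx_instance="https://searx.be/",
# search="child safety", search_category="news", lang-region "en-US" and period="day",
# the assembled URL is
#   https://searx.be/search?q=child+safety&category_news=on&language=en-US&time_range=day
# and the pagination loop below appends "&pageno=2", "&pageno=3", ...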
|
|
||||||
############################################################
|
|
||||||
|
|
||||||
# Request and decode on page=1
|
|
||||||
url_set = self._request_and_decode(url_search_nopage)
|
|
||||||
# No results?
|
|
||||||
if (len(url_set) == 0):
|
|
||||||
logger.warning("SearX {} - Empty results on search: {}".format(self.name, url_search_nopage))
|
|
||||||
return []
|
|
||||||
|
|
||||||
# Iterate pages
|
|
||||||
search_numpage = 2
|
|
||||||
while True:
|
|
||||||
# Combine url search with page number
|
|
||||||
url_search_with_page = "{}&pageno={}".format(url_search_nopage, search_numpage)
|
|
||||||
# Request and decode on page=X
|
|
||||||
url_set_i = self._request_and_decode(url_search_with_page)
|
|
||||||
|
|
||||||
# Length before merging
|
|
||||||
length_current = len(url_set)
|
|
||||||
# Merge
|
|
||||||
url_set = url_set.union(url_set_i)
|
|
||||||
# Length after merging
|
|
||||||
length_merged = len(url_set)
|
|
||||||
|
|
||||||
# No new elements?
|
|
||||||
if (length_current == length_merged):
|
|
||||||
logger.debug("SearX {} - Finished processing search, #pages: {}".format(self.name, search_numpage))
|
|
||||||
break
|
|
||||||
# Next page
|
|
||||||
search_numpage += 1
|
|
||||||
|
|
||||||
return list(url_set)
|
|
||||||
|
|
||||||
def _fetch(self):
|
|
||||||
try:
|
|
||||||
# Fetch news
|
|
||||||
list_news = self._get_news_list()
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
|
|
||||||
list_news = []
|
|
||||||
return list_news
|
|
||||||
@@ -1,26 +0,0 @@
|
|||||||
import requests
|
|
||||||
import json
|
|
||||||
from .logger import get_logger
|
|
||||||
logger = get_logger()
|
|
||||||
|
|
||||||
class GoogleByPass():
|
|
||||||
def __init__(self) -> None:
|
|
||||||
pass
|
|
||||||
|
|
||||||
def bypass_google_urls(self, list_urls):
|
|
||||||
if (len(list_urls) == 0):
|
|
||||||
return []
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Endpoint
|
|
||||||
gbypass_endpoint = "http://selenium_app:80/get_redirection"
|
|
||||||
# Timeout: 20 minutes
|
|
||||||
timeout = 60*20
|
|
||||||
r = requests.post(gbypass_endpoint, json={"list_urls": list_urls}, timeout=timeout)
|
|
||||||
# Decode
|
|
||||||
list_urls_redirections = json.loads(r.text).get("list_urls_redirections", [])
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning("Exception on request: {}. {}".format(gbypass_endpoint, str(e)))
|
|
||||||
list_urls_redirections = []
|
|
||||||
|
|
||||||
return list_urls_redirections
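# For reference (derived from the code above; the response value is illustrative): the
# bypass service is expected to answer
#   POST http://selenium_app:80/get_redirection  with body {"list_urls": [<google links>]}
# with a JSON body of the form {"list_urls_redirections": ["https://publisher.example/story", ...]},
# i.e. the resolved publisher URLs behind the Google redirections.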
|
|
||||||
@@ -1,22 +0,0 @@
|
|||||||
import logging
import logging.handlers
|
|
||||||
|
|
||||||
import os
|
|
||||||
os.makedirs("logs", exist_ok=True)
|
|
||||||
|
|
||||||
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
|
|
||||||
logger = logging.getLogger("news_fetcher")
|
|
||||||
logger.setLevel(logging.INFO)
|
|
||||||
|
|
||||||
# To file log: INFO / WARNING / ERROR
|
|
||||||
fh = logging.handlers.RotatingFileHandler(filename="logs/log_app_fetcher.log", mode="a", maxBytes=10000000, backupCount=4)
|
|
||||||
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
|
|
||||||
logger.addHandler(fh)
|
|
||||||
|
|
||||||
# To file log: WARNING / ERROR
|
|
||||||
fh_ = logging.handlers.RotatingFileHandler(filename="logs/log_app_fetcher_error.log", mode="a", maxBytes=10000000, backupCount=1)
|
|
||||||
fh_.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
|
|
||||||
fh_.setLevel(logging.WARNING)
|
|
||||||
logger.addHandler(fh_)
|
|
||||||
|
|
||||||
def get_logger():
|
|
||||||
return logger
|
|
||||||
@@ -1,36 +0,0 @@
|
|||||||
from .db_utils import DB_Handler
|
|
||||||
import requests
|
|
||||||
import json
|
|
||||||
from .logger import get_logger
|
|
||||||
logger = get_logger()
|
|
||||||
|
|
||||||
class MissingKidsFetch():
|
|
||||||
def __init__(self, db_handler: DB_Handler, num_pages) -> None:
|
|
||||||
logger.debug("Initializing News MissingKids")
|
|
||||||
self.db_handler = db_handler
|
|
||||||
self.num_pages = num_pages
|
|
||||||
self.missingkids_fetch_endpoint = "http://selenium_app:80/get_missing_kids/?pages={}"
|
|
||||||
|
|
||||||
def run(self):
|
|
||||||
try:
|
|
||||||
logger.debug("Starting NewsMissingKids.run()")
|
|
||||||
try:
|
|
||||||
# Timeout
|
|
||||||
if (self.num_pages > 15):
|
|
||||||
timeout = 60*90 # 1.5h
|
|
||||||
else:
|
|
||||||
timeout = 60*5 # 5 min
|
|
||||||
# Request
|
|
||||||
r = requests.get(self.missingkids_fetch_endpoint.format(self.num_pages), timeout=timeout)
|
|
||||||
# Decode
|
|
||||||
urls_fetched = json.loads(r.text).get("list_urls", [])
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning("Timeout on request: {}. {}".format(missingkids_fetch_endpoint, str(e)))
|
|
||||||
urls_fetched = []
|
|
||||||
|
|
||||||
# URL fetching source
|
|
||||||
source = "missingkids fetcher"
|
|
||||||
# Write to DB
|
|
||||||
self.db_handler.write_batch(urls_fetched, source)
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning("Exception in NewsMissingKids.run(): {}".format(str(e)))
|
|
||||||
@@ -1,98 +0,0 @@
|
|||||||
from .db_utils import URL_DB_Writer
|
|
||||||
from .url_utils import get_missing_kid_status
|
|
||||||
from .logger import get_logger
|
|
||||||
logger = get_logger()
|
|
||||||
|
|
||||||
|
|
||||||
def get_missing_kid_status(url, return_canonical_url=False):
|
|
||||||
import time
|
|
||||||
import requests
|
|
||||||
|
|
||||||
# Sleep
|
|
||||||
time.sleep(0.75)
|
|
||||||
try:
|
|
||||||
# Request
|
|
||||||
r = requests.get(url, timeout=300)
|
|
||||||
# Decode
|
|
||||||
status_code = r.status_code
|
|
||||||
# Canonical URL removing parameters
|
|
||||||
url_canonical = r.url
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning("Exception on get URL status request: {}. {}".format(url, str(e)))
|
|
||||||
status_code = None
|
|
||||||
url_canonical = url
|
|
||||||
|
|
||||||
if (status_code == 200):
|
|
||||||
status = "valid"
|
|
||||||
elif (status_code == 404):
|
|
||||||
status = "invalid"
|
|
||||||
else:
|
|
||||||
status = "unknown"
|
|
||||||
|
|
||||||
logger.debug("Missing Kid URL {} status: {}".format(url, status))
|
|
||||||
if (return_canonical_url):
|
|
||||||
return status, url_canonical
|
|
||||||
else:
|
|
||||||
return status
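# Usage sketch (the URL is illustrative):
#   status = get_missing_kid_status("https://www.missingkids.org/poster/NCMC/000000/1")
#   status, url_canonical = get_missing_kid_status(url, return_canonical_url=True)
# where status is "valid" (HTTP 200), "invalid" (HTTP 404) or "unknown" (anything else).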
|
|
||||||
|
|
||||||
class MissingKidsStatus():
|
|
||||||
def __init__(self, db_connect_info, redis_connect_info, num_urls) -> None:
|
|
||||||
self.num_urls = num_urls
|
|
||||||
self.db_connect_info = db_connect_info
|
|
||||||
self.redis_connect_info = redis_connect_info
|
|
||||||
self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info)
|
|
||||||
|
|
||||||
def update_missing_kids_status(self):
|
|
||||||
try:
|
|
||||||
logger.info("Starting updating status to Missing Kids URLs, limit #URLs: {}".format(self.num_urls))
|
|
||||||
# List of URLs
|
|
||||||
list_ids_and_urls = self.db_writer._get_missing_kids_urls(self.num_urls)
|
|
||||||
# Dict: status -> IDs to update to new status
|
|
||||||
dict_status_ids, dict_status_urls = {}, {}
|
|
||||||
# Check URLs with invalid status?
|
|
||||||
skip_invalid_check = False
|
|
||||||
|
|
||||||
flush_every, flush_current = 20, 0
|
|
||||||
# Iterate URLs
|
|
||||||
for (id, url, current_status) in list_ids_and_urls:
|
|
||||||
# Skip duplicate URLs
|
|
||||||
if (current_status == "duplicate"):
|
|
||||||
continue
|
|
||||||
# Skip invalid URLs?
|
|
||||||
if (skip_invalid_check):
|
|
||||||
if (current_status == "invalid"):
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Get status
|
|
||||||
new_status = get_missing_kid_status(url)
|
|
||||||
# Different? Update
|
|
||||||
if (current_status != new_status):
|
|
||||||
# Extend array
|
|
||||||
dict_status_ids[new_status] = dict_status_ids.get(new_status, []) + [id]
|
|
||||||
# Debugging dict
|
|
||||||
dict_status_urls[new_status] = dict_status_urls.get(new_status, []) + [url]
|
|
||||||
# +1 processed
|
|
||||||
flush_current += 1
|
|
||||||
|
|
||||||
# Flush batch?
|
|
||||||
if (flush_every == flush_current):
|
|
||||||
logger.info("Updating status to Missing Kids URLs: {}".format(dict_status_urls))
|
|
||||||
# Update DB
|
|
||||||
self.db_writer._update_urls_status(dict_status_ids)
|
|
||||||
# Reset
|
|
||||||
flush_current = 0
|
|
||||||
dict_status_ids, dict_status_urls = {}, {}
|
|
||||||
|
|
||||||
# Flush remaining batch
|
|
||||||
if (flush_current > 0):
|
|
||||||
logger.info("Updating status to Missing Kids URLs: {}".format(dict_status_urls))
|
|
||||||
# Update DB
|
|
||||||
self.db_writer._update_urls_status(dict_status_ids)
|
|
||||||
# Reset
|
|
||||||
flush_current = 0
|
|
||||||
dict_status_ids, dict_status_urls = {}, {}
|
|
||||||
|
|
||||||
logger.info("Finished updating status to Missing Kids URLs")
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning("Exception in MissingKidsStatus.run(): {}".format(str(e)))
|
|
||||||
|
|
||||||
@@ -1,62 +0,0 @@
|
|||||||
from .db_utils import URL_DB_Writer
|
|
||||||
from .url_utils import process_article
|
|
||||||
from .logger import get_logger
|
|
||||||
logger = get_logger()
|
|
||||||
|
|
||||||
class UpdateErrorURLs():
|
|
||||||
def __init__(self, db_connect_info, redis_connect_info, num_urls) -> None:
|
|
||||||
self.num_urls = num_urls
|
|
||||||
self.db_connect_info = db_connect_info
|
|
||||||
self.redis_connect_info = redis_connect_info
|
|
||||||
self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info)
|
|
||||||
|
|
||||||
def update_error_urls_status(self):
|
|
||||||
try:
|
|
||||||
logger.info("Starting updating status to URLs with error, limit #URLs: {}".format(self.num_urls))
|
|
||||||
# List of URLs with status 'error'
|
|
||||||
list_ids_and_urls = self.db_writer._get_error_urls(self.num_urls)
|
|
||||||
# Current status
|
|
||||||
current_status = "error"
|
|
||||||
# Dict: status -> IDs to update to new status
|
|
||||||
dict_status_ids, dict_status_urls = {}, {}
|
|
||||||
|
|
||||||
# Get list of (pattern, priority, status) tuples to override status if required
|
|
||||||
list_pattern_status_tuple = self.db_writer._get_pattern_status_list()
|
|
||||||
# Sort pattern tuples by priority
|
|
||||||
list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True)
|
|
||||||
|
|
||||||
flush_every, flush_current = 20, 0
|
|
||||||
# Iterate URLs
|
|
||||||
for (id, url) in list_ids_and_urls:
|
|
||||||
# Get status
|
|
||||||
url_canonical, article_elements, new_status = process_article(url, list_pattern_status_tuple)
|
|
||||||
# Different? Update
|
|
||||||
if (current_status != new_status):
|
|
||||||
# Extend array
|
|
||||||
dict_status_ids[new_status] = dict_status_ids.get(new_status, []) + [id]
|
|
||||||
# Debugging dict
|
|
||||||
dict_status_urls[new_status] = dict_status_urls.get(new_status, []) + [url]
|
|
||||||
# +1 processed
|
|
||||||
flush_current += 1
|
|
||||||
|
|
||||||
# Flush batch?
|
|
||||||
if (flush_every == flush_current):
|
|
||||||
logger.info("Updating status to URLs with error: {}".format(dict_status_urls))
|
|
||||||
# Update DB
|
|
||||||
self.db_writer._update_urls_status(dict_status_ids)
|
|
||||||
# Reset
|
|
||||||
flush_current = 0
|
|
||||||
dict_status_ids, dict_status_urls = {}, {}
|
|
||||||
|
|
||||||
# Flush remaining batch
|
|
||||||
if (flush_current > 0):
|
|
||||||
logger.info("Updating status to URLs with error: {}".format(dict_status_urls))
|
|
||||||
# Update DB
|
|
||||||
self.db_writer._update_urls_status(dict_status_ids)
|
|
||||||
# Reset
|
|
||||||
flush_current = 0
|
|
||||||
dict_status_ids, dict_status_urls = {}, {}
|
|
||||||
|
|
||||||
logger.info("Finished updating status to URLs with error")
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning("Exception in UpdateErrorURLs.run(): {}".format(str(e)))
|
|
||||||
@@ -1,262 +0,0 @@
|
|||||||
from gnews import GNews
|
|
||||||
import dateutil.parser
|
|
||||||
from datetime import datetime, timedelta
|
|
||||||
from .utils import remove_http_s
|
|
||||||
import time
|
|
||||||
import random
|
|
||||||
import traceback
|
|
||||||
import requests
|
|
||||||
import json
|
|
||||||
import re
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
|
|
||||||
from .logger import get_logger
|
|
||||||
logger = get_logger()
|
|
||||||
|
|
||||||
def get_published_date(article):
|
|
||||||
try:
|
|
||||||
"""
|
|
||||||
# Already fetched publish date information?
|
|
||||||
if (publish_date_ is not None):
|
|
||||||
return publish_date_
|
|
||||||
"""
|
|
||||||
|
|
||||||
# List of potential publish dates
|
|
||||||
potential_dates = []
|
|
||||||
# Publish date is the best match
|
|
||||||
potential_dates.append(article.publish_date)
|
|
||||||
# Publish date metadata is the following best match
|
|
||||||
potential_dates.append(article.meta_data.get('article', {}).get("published_time", None))
|
|
||||||
# Iterate remaining keys
|
|
||||||
for key in article.meta_data.keys():
|
|
||||||
if ("date" in key):
|
|
||||||
potential_dates.append(article.meta_data[key])
|
|
||||||
|
|
||||||
def invalid_date(p_date):
|
|
||||||
# Today + 2 days, article from the future?
|
|
||||||
today_plus_two = datetime.utcnow() + timedelta(days=2)
|
|
||||||
# Article from the future?
|
|
||||||
return p_date.timestamp() > today_plus_two.timestamp()
|
|
||||||
|
|
||||||
for date_ in potential_dates:
|
|
||||||
# String date? parse
|
|
||||||
if (type(date_) == str):
|
|
||||||
try:
|
|
||||||
date_ = dateutil.parser.parse(date_)
|
|
||||||
except Exception as e:
|
|
||||||
logger.info("Invalid date found while parsing potential date: {} for URL: {}".format(date_, article.url))
|
|
||||||
date_ = None
|
|
||||||
# Valid?
|
|
||||||
if (date_ is not None) and (not invalid_date(date_)):
|
|
||||||
return date_
|
|
||||||
|
|
||||||
logger.debug("Article with no published date: {}".format(article.url))
|
|
||||||
return None
|
|
||||||
except Exception as e:
|
|
||||||
logger.info("Error while retrieving published date for URL: {}".format(article.url))
|
|
||||||
return None
|
|
||||||
|
|
||||||
def get_url_host(article_source_url, url):
|
|
||||||
# https://www.blabla.com/blabla -> www.blabla.com
|
|
||||||
if (article_source_url != ""):
|
|
||||||
# Article source URL already extracted, save path if any
|
|
||||||
return remove_http_s(article_source_url) # .split("/")[0]
|
|
||||||
else:
|
|
||||||
return remove_http_s(url).split("/")[0]
|
|
||||||
|
|
||||||
def get_status_pattern_matching(url, article_status, list_pattern_status_tuple):
|
|
||||||
# Regex pattern to update status on "valid", "invalid", and "unknown" status only
|
|
||||||
# Status "raw", "duplicated" and "error" should remain the way they are
|
|
||||||
# Assumption: List of patterns sorted by importance
|
|
||||||
if (article_status in ["valid", "invalid", "unknown"]):
|
|
||||||
# Regular expression pattern matching: https://regexr.com/
|
|
||||||
for regex_pattern, regex_priority, status_if_match in list_pattern_status_tuple:
|
|
||||||
# Matching?
|
|
||||||
matching = bool(re.match(regex_pattern, url))
|
|
||||||
# Update article status
|
|
||||||
if (matching):
|
|
||||||
if (status_if_match != article_status):
|
|
||||||
logger.debug("Regex pattern found, updating status from '{}' to '{}' for URL: {}".format(article_status, status_if_match, url))
|
|
||||||
return status_if_match
|
|
||||||
# Pattern matching not required or not found, original article status
|
|
||||||
return article_status
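# Illustrative example (patterns and priorities are hypothetical, not the DB contents):
example_patterns = [
    (r"https?://(www\.)?youtube\.com/.*", 10, "invalid"),
    (r"https?://(www\.)?example\.org/news/.*", 5, "valid"),
]
# The caller sorts the tuples by priority (descending) and the first matching pattern wins:
# get_status_pattern_matching("https://www.youtube.com/watch?v=abc", "valid", example_patterns)
# -> "invalid"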
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def bypass_google_link(article_url):
|
|
||||||
|
|
||||||
def bypass_google_consent(article_url):
|
|
||||||
# Sample URL: https://consent.google.com/m?continue=https://news.google.com/rss/articles/CBMiMGh0dHBzOi8vd3d3Lm1pc3NpbmdraWRzLm9yZy9wb3N0ZXIvbmNtYy84NjAxMTkvMdIBAA?oc%3D5&gl=NL&m=0&pc=n&cm=2&hl=en-US&src=1
|
|
||||||
article_url_no_consent = article_url.replace("https://consent.google.com/m?continue=", "")
|
|
||||||
|
|
||||||
# https://stackoverflow.com/questions/76063646/how-can-i-have-redirection-link-from-google-news-link-using-requests
|
|
||||||
headers = {
|
|
||||||
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
|
|
||||||
}
|
|
||||||
cookies = {'CONSENT': 'YES+cb.20220419-08-p0.cs+FX+111'}
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Request
|
|
||||||
r = requests.get(article_url_no_consent, headers=headers, cookies=cookies, timeout=300)
|
|
||||||
# Decode
|
|
||||||
soup = BeautifulSoup(r.text, 'html.parser')
|
|
||||||
url_of_interest = soup.a['href']
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning("Exception on request trying to G_bypass with headers: {}. {}".format(article_url_no_consent, str(e)))
|
|
||||||
url_of_interest = None
|
|
||||||
|
|
||||||
# Not able to bypass?
|
|
||||||
if (url_of_interest == "") or ("support.google.com" in url_of_interest) or ("news.google.com" in url_of_interest):
|
|
||||||
url_of_interest = None
|
|
||||||
return url_of_interest
|
|
||||||
|
|
||||||
def bypass_google_using_service(article_url):
|
|
||||||
try:
|
|
||||||
# e.g.: url = "https://news.google.com/articles/CBMiX2h0dHBzOi8vd3d3LmZveGJ1c2luZXNzLmNvbS9wb2xpdGljcy9kaXNuZXktc3Vlcy1mbG9yaWRhLWdvdi1yb24tZGVzYW50aXMtbG9zcy1zcGVjaWFsLWRpc3RyaWN00gEA?hl=en-US&gl=US&ceid=US%3Aen"
|
|
||||||
gbypass_endpoint = "http://selenium_app:80/get_redirection"
|
|
||||||
# Timeout: 5 minutes
|
|
||||||
r = requests.post(gbypass_endpoint, json={"url": article_url}, timeout=300)
|
|
||||||
# Decode
|
|
||||||
redirect_url = json.loads(r.text).get("redirect_url", "")
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning("Exception on request: {}. {}".format(gbypass_endpoint, str(e)))
|
|
||||||
redirect_url = ""
|
|
||||||
|
|
||||||
return redirect_url
|
|
||||||
|
|
||||||
logger.debug("Starting gbypass_endpoint()")
|
|
||||||
|
|
||||||
article_url_bypassed = None
|
|
||||||
# Bypass using request
|
|
||||||
if ("consent.google.com" in article_url):
|
|
||||||
article_url_bypassed = bypass_google_consent(article_url)
|
|
||||||
# Not bypassed yet? Bypass using service
|
|
||||||
if (article_url_bypassed is None):
|
|
||||||
article_url_bypassed = bypass_google_using_service(article_url)
|
|
||||||
|
|
||||||
# if (article_url_bypassed is None) or (article_url_bypassed == "") or ("news.google.com" in article_url_bypassed):
|
|
||||||
if (article_url_bypassed == "") or (article_url_bypassed is None):
|
|
||||||
# Empty URL returned by Gbypass
|
|
||||||
logger.warning("Error while bypassing Gnews for URL: {}".format(article_url))
|
|
||||||
return None
|
|
||||||
else:
|
|
||||||
logger.debug("Correctly bypassed GNews to URL_redirect, from URL: {} {}".format(article_url_bypassed, article_url))
|
|
||||||
return article_url_bypassed
|
|
||||||
|
|
||||||
def process_article(article_url, list_pattern_status_tuple, language="en"):
|
|
||||||
# TODO:
|
|
||||||
"""
|
|
||||||
https://github.com/fhamborg/news-please
|
|
||||||
https://github.com/fhamborg/Giveme5W1H
|
|
||||||
https://github.com/santhoshse7en/news-fetch
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
logger.debug("Starting process_article()")
|
|
||||||
|
|
||||||
if ("news.google.com" in article_url) or ("consent.google.com" in article_url):
|
|
||||||
# Bypass to get redirection
|
|
||||||
article_url = bypass_google_link(article_url)
|
|
||||||
# Error?
|
|
||||||
if (article_url is None):
|
|
||||||
return None, {}, "error"
|
|
||||||
elif ("missingkids.org/poster" in article_url):
|
|
||||||
# Get status
|
|
||||||
article_status, url_canonical = get_missing_kid_status(article_url, return_canonical_url=True)
|
|
||||||
article_elements = {
|
|
||||||
"url_full": article_url,
|
|
||||||
"url_canonical": url_canonical
|
|
||||||
}
|
|
||||||
return url_canonical, article_elements, article_status
|
|
||||||
else:
|
|
||||||
# Avoid Too many requests (feeds, ...)
|
|
||||||
time.sleep(0.75)
|
|
||||||
|
|
||||||
logger.debug("Processing: {}".format(article_url))
|
|
||||||
|
|
||||||
# Default status unless something happens
|
|
||||||
article_status = "valid"
|
|
||||||
|
|
||||||
# Parse article
|
|
||||||
# TODO: :param proxy: The proxy parameter is a dictionary with a single key-value pair. self._proxy = {'http': proxy, 'https': proxy} if proxy else None
|
|
||||||
# TODO: Language per config
|
|
||||||
article = GNews(language).get_full_article(url=article_url)
|
|
||||||
|
|
||||||
# Article parsed?
|
|
||||||
if (article is None) or (not article.is_parsed):
|
|
||||||
logger.debug("Article not parsed: {}".format(article_url))
|
|
||||||
return article_url, {}, "error"
|
|
||||||
|
|
||||||
# Canonical link as main URL
|
|
||||||
url_canonical = article.canonical_link
|
|
||||||
# Empty canonical URL?
|
|
||||||
if (article.canonical_link is None) or (article.canonical_link == ""):
|
|
||||||
# URL with parameters? e.g. some zerohedge news fetched from newspaper3k end with #comment-stream -> Remove extra parameter in link
|
|
||||||
if ("?" in article.url) or (article.url.endswith("#comment-stream")) or (article.url.endswith("#disqus_thread")):
|
|
||||||
logger.debug("Article URL contains parameters, trying to clean URL: {}".format(article.url))
|
|
||||||
try:
|
|
||||||
# Remove text after parameter call
|
|
||||||
url = article.url.split("?")[0]
|
|
||||||
# Remove comment-stream
|
|
||||||
url = url.replace("#comment-stream", "").replace("#disqus_thread", "")
|
|
||||||
# Article
|
|
||||||
article_attempt = GNews(language).get_full_article(url=url)
|
|
||||||
# Retrieving same title? Update article based on clean URL
|
|
||||||
if (article_attempt is not None) and (article_attempt.title == article.title):
|
|
||||||
article = article_attempt
|
|
||||||
except Exception as e:
|
|
||||||
logger.info("Article parsing of URL without parameters failed: {}".format(article.url))
|
|
||||||
else: # Default behaviour
|
|
||||||
logger.debug("Article canonical link is empty, assuming URL=URL_CANONICAL: {}".format(article.url))
|
|
||||||
|
|
||||||
# By default, URL same as canonical
|
|
||||||
url_canonical = article.url
|
|
||||||
|
|
||||||
elif (article.url != article.canonical_link):
|
|
||||||
# If different, stick to canonical URL
|
|
||||||
logger.debug("Article URL and canonical link are different: {} {}".format(article.url, article.canonical_link))
|
|
||||||
else:
|
|
||||||
# If same, continue...
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Update config to determine if content is valid
|
|
||||||
article.config.MIN_WORD_COUNT = 150
|
|
||||||
article.config.MIN_SENT_COUNT = 6
|
|
||||||
|
|
||||||
# Valid URL?
|
|
||||||
if (not article.is_valid_url()):
|
|
||||||
logger.debug("Not a valid news article: {}".format(url_canonical))
|
|
||||||
article_status = "invalid"
|
|
||||||
# Is the article's body text long enough to meet standard article requirements?
|
|
||||||
if (not article.is_valid_body()):
|
|
||||||
logger.debug("Article body not valid: {}".format(url_canonical))
|
|
||||||
article_status = "unknown"
|
|
||||||
|
|
||||||
if (article.images != article.imgs):
|
|
||||||
logger.debug("Article images and imgs are different: {} {}".format(article.images, article.imgs))
|
|
||||||
|
|
||||||
# article.keywords, article.meta_keywords, article.summary
|
|
||||||
# article.movies
|
|
||||||
# article.top_image
|
|
||||||
|
|
||||||
# Check if article status needs to be updated
|
|
||||||
article_status = get_status_pattern_matching(url_canonical, article_status, list_pattern_status_tuple)
|
|
||||||
|
|
||||||
article_elements = {
|
|
||||||
'url_full': article.url, # https://www.breitbart.com/tech/2022/10/03/report-election-integrity-project-worked-with-feds-to-censor-news-sites-in-2020/
|
|
||||||
'url_host': get_url_host(article.source_url, url_canonical), # www.breitbart.com
|
|
||||||
'title': article.title, # Report: ‘Election Integrity’ Partnership Worked with Feds to Censor News Sites in 2020
|
|
||||||
'description': article.meta_description, # Coalition committed to respond in ‘early 2022’ but failed to do so, while Labor has not issued a full response since taking office
|
|
||||||
'text': article.text, # ${Article content}
|
|
||||||
'published_date': get_published_date(article), # python.datetime format, obtained from "YYYY-MM-DD" or '2022-10-03T20:54:17+00:00'
|
|
||||||
'authors': article.authors, # ['Christopher Knaus']
|
|
||||||
'language': article.meta_lang, # en
|
|
||||||
'tags': list(article.tags), # ['Wide Open Border', '’My Son Hunter’ Movie', ...]
|
|
||||||
'images': list(article.images), # [URL_IMAGE_1, URL_IMAGE_2, ...]
|
|
||||||
'url_canonical': url_canonical, # Canonical URL (redirection)
|
|
||||||
# 'html': article.html, # HTML article
|
|
||||||
}
|
|
||||||
logger.debug("Processing OK: {}".format(url_canonical))
|
|
||||||
return url_canonical, article_elements, article_status
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning("Exception processing url: {}\n{}".format(article_url, traceback.format_exc()))
|
|
||||||
return None, {}, "error"
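# Usage sketch (the URL is illustrative; the pattern list may be empty):
#   url_canonical, article_elements, article_status = process_article("https://publisher.example/story", [])
# article_status is one of "valid", "invalid", "unknown" or "error", and article_elements
# carries the parsed fields listed above (title, text, published_date, ...).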
|
|
||||||
@@ -1,33 +0,0 @@
|
|||||||
|
|
||||||
def remove_http_s(url):
|
|
||||||
url = url.replace("https://", "") if url.startswith("https://") else url
|
|
||||||
url = url.replace("http://", "") if url.startswith("http://") else url
|
|
||||||
return url
|
|
||||||
|
|
||||||
def is_valid_url(url):
|
|
||||||
if (url.startswith("https://")):
|
|
||||||
return True
|
|
||||||
else:
|
|
||||||
return False
|
|
||||||
|
|
||||||
def get_searxng_instances():
|
|
||||||
# SearxNG instances: https://searx.space/
|
|
||||||
searx_instances = set()
|
|
||||||
searx_instances.add("https://searx.work/")
|
|
||||||
searx_instances.add("https://search.ononoki.org/")
|
|
||||||
searx_instances.add("https://searxng.nicfab.eu/")
|
|
||||||
searx_instances.add("https://searx.be/")
|
|
||||||
|
|
||||||
# searx_instances.add("https://searx.fmac.xyz/")
|
|
||||||
# searx_instances.add("https://northboot.xyz/") # FIX
|
|
||||||
|
|
||||||
# searx_instances.add("https://serx.ml/") # Offline
|
|
||||||
# searx_instances.add("https://searx.ru/")
|
|
||||||
# searx_instances.add("https://searx.sp-codes.de/")
|
|
||||||
# searx_instances.add("https://searxng.nicfab.eu/")
|
|
||||||
# searx_instances.add("https://s.frlt.one/")
|
|
||||||
# searx_instances.add("https://search.sapti.me/")
|
|
||||||
|
|
||||||
# To list
|
|
||||||
list_searx_instances = list(searx_instances)
|
|
||||||
return list_searx_instances
|
|
||||||
3 app_selenium/README.md Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
|
||||||
|
* Missing kids posters fetch (num_pages=X)
|
||||||
|
* ...
|
||||||
@@ -17,7 +17,7 @@ class Search(models.Model):
|
|||||||
db_table = 'search'
|
db_table = 'search'
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return "[{}]->{}".format(self.type, self.search)
|
return "[{}: {}]".format(self.type, self.search)
|
||||||
|
|
||||||
class Source(models.Model):
|
class Source(models.Model):
|
||||||
id = models.SmallAutoField(primary_key=True)
|
id = models.SmallAutoField(primary_key=True)
|
||||||
|
|||||||
@@ -130,7 +130,7 @@ class DB_Handler():
|
|||||||
# Get or create URL with canonical form
|
# Get or create URL with canonical form
|
||||||
obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
|
obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
|
||||||
# Get the source-search IDs associated to obj_url.id
|
# Get the source-search IDs associated to obj_url.id
|
||||||
list_url_source_search = UrlsSourceSearch.objects.fiter(id_url=obj_url)
|
list_url_source_search = UrlsSourceSearch.objects.filter(id_url=obj_url)
|
||||||
for obj_url_source_search in list_url_source_search:
|
for obj_url_source_search in list_url_source_search:
|
||||||
# Associate same sources to url_canonical (it might already exist)
|
# Associate same sources to url_canonical (it might already exist)
|
||||||
UrlsSourceSearch.objects.get_or_create(id_url=obj_url_canonical, id_source=obj_url_source_search.id_source, id_search=obj_url_source_search.id_search)
|
UrlsSourceSearch.objects.get_or_create(id_url=obj_url_canonical, id_source=obj_url_source_search.id_source, id_search=obj_url_source_search.id_search)
|
||||||
|
|||||||
@@ -9,7 +9,7 @@
|
|||||||
|
|
||||||
<script>
|
<script>
|
||||||
|
|
||||||
function getQueryString(pageNumber, itemsNumber, sources, statuses){
|
function getQueryString(pageNumber, itemsNumber, sources, searches, statuses){
|
||||||
// Query parameters. If input is null, get most recent value
|
// Query parameters. If input is null, get most recent value
|
||||||
let queryParams = new URLSearchParams(window.location.search);
|
let queryParams = new URLSearchParams(window.location.search);
|
||||||
// page
|
// page
|
||||||
@@ -21,6 +21,9 @@
|
|||||||
// sources
|
// sources
|
||||||
if (sources == null) sources = queryParams.get("sources") ?? "all";
|
if (sources == null) sources = queryParams.get("sources") ?? "all";
|
||||||
queryParams.set("sources", sources);
|
queryParams.set("sources", sources);
|
||||||
|
// searches
|
||||||
|
if (searches == null) searches = queryParams.get("searches") ?? "all";
|
||||||
|
queryParams.set("searches", searches);
|
||||||
// status
|
// status
|
||||||
if (statuses == null) statuses = queryParams.get("status") ?? "all";
|
if (statuses == null) statuses = queryParams.get("status") ?? "all";
|
||||||
queryParams.set("status", statuses);
|
queryParams.set("status", statuses);
|
||||||
@@ -33,11 +36,11 @@
|
|||||||
return queryParamsString;
|
return queryParamsString;
|
||||||
}
|
}
|
||||||
|
|
||||||
function loadPage(pageNumber, itemsNumber, sources, statuses) {
|
function loadPage(pageNumber, itemsNumber, sources, searches, statuses) {
|
||||||
$("#item-list").fadeTo(100, 0.5); // Smooth fade effect
|
$("#item-list").fadeTo(100, 0.5); // Smooth fade effect
|
||||||
$("#loading").show();
|
$("#loading").show();
|
||||||
|
|
||||||
queryParamsString = getQueryString(pageNumber, itemsNumber, sources, statuses);
|
queryParamsString = getQueryString(pageNumber, itemsNumber, sources, searches, statuses);
|
||||||
|
|
||||||
$.ajax({
|
$.ajax({
|
||||||
url: "?" + queryParamsString,
|
url: "?" + queryParamsString,
|
||||||
@@ -58,7 +61,7 @@
|
|||||||
$(document).on("click", ".pagination a", function (event) {
|
$(document).on("click", ".pagination a", function (event) {
|
||||||
event.preventDefault();
|
event.preventDefault();
|
||||||
let page = $(this).attr("data-page");
|
let page = $(this).attr("data-page");
|
||||||
loadPage(pageNumber=page, itemsNumber=null, sources=null, statuses=null);
|
loadPage(pageNumber=page, itemsNumber=null, sources=null, searches=null, statuses=null);
|
||||||
});
|
});
|
||||||
|
|
||||||
$(document).ready(function () {
|
$(document).ready(function () {
|
||||||
@@ -68,25 +71,63 @@
|
|||||||
////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////
|
||||||
const sourcesToggleAll = $("#toggle-all-sources");
|
const sourcesToggleAll = $("#toggle-all-sources");
|
||||||
const sourcesCheckboxes = $(".source-checkbox");
|
const sourcesCheckboxes = $(".source-checkbox");
|
||||||
|
const searchesToggleAll = $("#toggle-all-searches");
|
||||||
|
const searchesCheckboxes = $(".search-checkbox");
|
||||||
const statusesToggleAll = $("#toggle-all-status");
|
const statusesToggleAll = $("#toggle-all-status");
|
||||||
const statusCheckboxes = $(".status-checkbox");
|
const statusCheckboxes = $(".status-checkbox");
|
||||||
|
|
||||||
function updateFilters() {
|
function updateFilters() {
|
||||||
// Get selected sources
|
// Get selected sources
|
||||||
let selectedSources = sourcesCheckboxes.filter(":checked").map(function () {
|
if (sourcesToggleAll.prop("checked")) {
|
||||||
return $(this).val();
|
selectedSources = "all";
|
||||||
}).get().join(",");
|
}
|
||||||
|
else {
|
||||||
|
if (sourcesCheckboxes.filter(":checked").length > 0 ){
|
||||||
|
selectedSources = sourcesCheckboxes.filter(":checked").map(function () {
|
||||||
|
return $(this).val();
|
||||||
|
}).get().join(",");
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
selectedSources = "none";
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get selected searches
|
||||||
|
if (searchesToggleAll.prop("checked")) {
|
||||||
|
selectedSearches = "all";
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
if (searchesCheckboxes.filter(":checked").length > 0 ){
|
||||||
|
selectedSearches = searchesCheckboxes.filter(":checked").map(function () {
|
||||||
|
return $(this).val();
|
||||||
|
}).get().join(",");
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
selectedSearches = "none";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Get selected URL statuses
|
// Get selected URL statuses
|
||||||
let selectedStatuses = statusCheckboxes.filter(":checked").map(function () {
|
if (statusesToggleAll.prop("checked")) {
|
||||||
return $(this).val();
|
selectedStatuses = "all";
|
||||||
}).get().join(",");
|
}
|
||||||
|
else {
|
||||||
|
if (statusCheckboxes.filter(":checked").length > 0 ){
|
||||||
|
selectedStatuses = statusCheckboxes.filter(":checked").map(function () {
|
||||||
|
return $(this).val();
|
||||||
|
}).get().join(",");
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
selectedStatuses = "none";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Get selected items per page
|
// Get selected items per page
|
||||||
let selectedItems = $("input[name='items']:checked").val();
|
let selectedItems = $("input[name='items']:checked").val();
|
||||||
|
|
||||||
// Update pagination and reload data
|
// Update pagination and reload data
|
||||||
loadPage(1, selectedItems, selectedSources, selectedStatuses);
|
loadPage(1, selectedItems, selectedSources, selectedSearches, selectedStatuses);
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////
|
||||||
@@ -101,6 +142,15 @@
|
|||||||
sourcesToggleAll.prop("checked", sourcesCheckboxes.length === sourcesCheckboxes.filter(":checked").length);
|
sourcesToggleAll.prop("checked", sourcesCheckboxes.length === sourcesCheckboxes.filter(":checked").length);
|
||||||
updateFilters();
|
updateFilters();
|
||||||
});
|
});
|
||||||
|
// Searches
|
||||||
|
searchesToggleAll.on("change", function () {
|
||||||
|
searchesCheckboxes.prop("checked", searchesToggleAll.prop("checked"));
|
||||||
|
updateFilters();
|
||||||
|
});
|
||||||
|
searchesCheckboxes.on("change", function () {
|
||||||
|
searchesToggleAll.prop("checked", searchesCheckboxes.length === searchesCheckboxes.filter(":checked").length);
|
||||||
|
updateFilters();
|
||||||
|
});
|
||||||
// Status
|
// Status
|
||||||
statusesToggleAll.on("change", function () {
|
statusesToggleAll.on("change", function () {
|
||||||
statusCheckboxes.prop("checked", statusesToggleAll.prop("checked"));
|
statusCheckboxes.prop("checked", statusesToggleAll.prop("checked"));
|
||||||
@@ -121,11 +171,15 @@
|
|||||||
// Sources
|
// Sources
|
||||||
sourcesCheckboxes.each(function () { $(this).prop("checked", true); });
|
sourcesCheckboxes.each(function () { $(this).prop("checked", true); });
|
||||||
sourcesToggleAll.prop("checked", true);
|
sourcesToggleAll.prop("checked", true);
|
||||||
|
// Searches
|
||||||
|
searchesCheckboxes.each(function () { $(this).prop("checked", true); });
|
||||||
|
searchesToggleAll.prop("checked", true);
|
||||||
// Statuses
|
// Statuses
|
||||||
statusCheckboxes.each(function () { $(this).prop("checked", true); });
|
statusCheckboxes.each(function () { $(this).prop("checked", true); });
|
||||||
statusesToggleAll.prop("checked", true);
|
statusesToggleAll.prop("checked", true);
|
||||||
// Items
|
// Items
|
||||||
$("input[name='items'][value='" + 15 + "']").prop("checked", true);
|
// $("input[name='items'][value='" + 15 + "']").prop("checked", true);
|
||||||
|
// loadPage(pageNumber=page, itemsNumber=null, sources=null, searches=null, statuses=null);
|
||||||
});
|
});
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////
|
||||||
@@ -148,6 +202,23 @@
|
|||||||
let savedTheme = localStorage.getItem("theme") ||
|
let savedTheme = localStorage.getItem("theme") ||
|
||||||
(window.matchMedia("(prefers-color-scheme: dark)").matches ? "dark" : "light");
|
(window.matchMedia("(prefers-color-scheme: dark)").matches ? "dark" : "light");
|
||||||
setTheme(savedTheme);
|
setTheme(savedTheme);
|
||||||
|
// Local browser timestamp aware for ts_fetch print
|
||||||
|
document.querySelectorAll(".timestamp").forEach(function (el) {
|
||||||
|
const ts = el.getAttribute("data-ts");
|
||||||
|
if (ts) {
|
||||||
|
const options = {
|
||||||
|
day: "2-digit",
|
||||||
|
month: "2-digit",
|
||||||
|
year: "numeric",
|
||||||
|
hour: "2-digit",
|
||||||
|
minute: "2-digit",
|
||||||
|
second: "2-digit",
|
||||||
|
hour12: false // Use 24-hour format
|
||||||
|
}; // "en-GB" for DD-MM-YYYY
|
||||||
|
const localDate = new Date(ts).toLocaleString("en-GB", options); // Adjust to browser's timezone
|
||||||
|
el.innerHTML = `${localDate}`;
|
||||||
|
}
|
||||||
|
});
|
||||||
});
|
});
|
||||||
////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////
|
||||||
</script>
|
</script>
|
||||||
@@ -174,6 +245,9 @@
|
|||||||
box-shadow: 2px 0 5px rgba(0, 0, 0, 0.1);
|
box-shadow: 2px 0 5px rgba(0, 0, 0, 0.1);
|
||||||
padding: 15px;
|
padding: 15px;
|
||||||
transition: width 0.3s ease;
|
transition: width 0.3s ease;
|
||||||
|
/* Enable scrolling */
|
||||||
|
overflow-y: auto;
|
||||||
|
max-height: 100vh;
|
||||||
}
|
}
|
||||||
|
|
||||||
#sidebar .nav-link {
|
#sidebar .nav-link {
|
||||||
@@ -313,10 +387,10 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
th:nth-child(1), td:nth-child(1) { width: 50%; } /* URL column */
|
th:nth-child(1), td:nth-child(1) { width: 50%; } /* URL column */
|
||||||
th:nth-child(2), td:nth-child(2) { width: 20%; } /* Fetch Date */
|
th:nth-child(2), td:nth-child(2) { width: 27.5%; } /* Fetch Date */
|
||||||
th:nth-child(3), td:nth-child(3) { width: 20%; } /* Sources */
|
th:nth-child(3), td:nth-child(3) { width: 10%; } /* Sources */
|
||||||
th:nth-child(4), td:nth-child(4) { width: 5%; } /* Status */
|
th:nth-child(4), td:nth-child(4) { width: 10%; } /* Searches */
|
||||||
th:nth-child(5), td:nth-child(5) { width: 5%; } /* Action */
|
th:nth-child(5), td:nth-child(5) { width: 2.5%; } /* Status */
|
||||||
|
|
||||||
/* ============================= */
|
/* ============================= */
|
||||||
/* Pagination Styling */
|
/* Pagination Styling */
|
||||||
@@ -407,33 +481,23 @@
|
|||||||
<span id="theme-icon">🌙</span>
|
<span id="theme-icon">🌙</span>
|
||||||
</button>
|
</button>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<!-- Sources -->
|
|
||||||
<div class="nav-item mt-3">
|
|
||||||
<strong>Select sources</strong>
|
|
||||||
<form id="source-filter-form">
|
|
||||||
<!-- Toggle All Checkbox -->
|
|
||||||
<div class="form-check">
|
|
||||||
<input class="form-check-input" type="checkbox" id="toggle-all-sources">
|
|
||||||
<label class="form-check-label fw-bold" for="toggle-all-sources">
|
|
||||||
Toggle all
|
|
||||||
</label>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<!-- Individual Source Checkboxes -->
|
<!-- URLs per page -->
|
||||||
{% for source in sources %}
|
<div class="nav-item mt-3">
|
||||||
<div class="form-check">
|
<strong>URLs per page</strong>
|
||||||
<input class="form-check-input source-checkbox" type="checkbox" value="{{ source.id }}" id="source-{{ source.id }}">
|
<div class="card-body">
|
||||||
<label class="form-check-label" for="source-{{ source.id }}">
|
<!-- URLs-per-page radio options -->
|
||||||
{{ source.source }}
|
{% for url_per_page in list_urls_per_page %}
|
||||||
</label>
|
<div class="items-form-check">
|
||||||
|
<input class="form-check-input items" type="radio" name="items" id="value-{{ url_per_page }}" value="{{ url_per_page }}">
|
||||||
|
<label class="form-check-label" for="value-{{ url_per_page }}">{{ url_per_page }}</label>
|
||||||
</div>
|
</div>
|
||||||
{% empty %}
|
{% empty %}
|
||||||
<tr>
|
<tr>
|
||||||
<td colspan="2" class="text-center">No sources available.</td>
|
<td colspan="2" class="text-center">No options available.</td>
|
||||||
</tr>
|
</tr>
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
</form>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<!-- Status -->
|
<!-- Status -->
|
||||||
@@ -457,6 +521,33 @@
|
|||||||
</label>
|
</label>
|
||||||
</div>
|
</div>
|
||||||
{% empty %}
|
{% empty %}
|
||||||
|
<tr>
|
||||||
|
<td colspan="2" class="text-center">No statuses available.</td>
|
||||||
|
</tr>
|
||||||
|
{% endfor %}
|
||||||
|
</form>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Sources -->
|
||||||
|
<div class="nav-item mt-3">
|
||||||
|
<strong>Select sources</strong>
|
||||||
|
<form id="source-filter-form">
|
||||||
|
<!-- Toggle All Checkbox -->
|
||||||
|
<div class="form-check">
|
||||||
|
<input class="form-check-input" type="checkbox" id="toggle-all-sources">
|
||||||
|
<label class="form-check-label fw-bold" for="toggle-all-sources">
|
||||||
|
Toggle all
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
|
<!-- Individual Source Checkboxes -->
|
||||||
|
{% for source in sources %}
|
||||||
|
<div class="form-check">
|
||||||
|
<input class="form-check-input source-checkbox" type="checkbox" value="{{ source.id }}" id="source-{{ source.id }}">
|
||||||
|
<label class="form-check-label" for="source-{{ source.id }}">
|
||||||
|
{{ source.source }}
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
|
{% empty %}
|
||||||
<tr>
|
<tr>
|
||||||
<td colspan="2" class="text-center">No sources available.</td>
|
<td colspan="2" class="text-center">No sources available.</td>
|
||||||
</tr>
|
</tr>
|
||||||
@@ -464,26 +555,34 @@
|
|||||||
</form>
|
</form>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<!-- URLs per page -->
|
<!-- Searches -->
|
||||||
<div class="nav-item mt-3">
|
<div class="nav-item mt-3">
|
||||||
<strong>URLs per page</strong>
|
<strong>Select searches</strong>
|
||||||
<div class="card-body">
|
<form id="search-filter-form">
|
||||||
<!-- Individual Status Checkboxes -->
|
<!-- Toggle All Checkbox -->
|
||||||
{% for url_per_page in list_urls_per_page %}
|
<div class="form-check">
|
||||||
<div class="items-form-check">
|
<input class="form-check-input" type="checkbox" id="toggle-all-searches">
|
||||||
<input class="form-check-input items" type="radio" name="items" id="value-{{ url_per_page }}" value="{{ url_per_page }}">
|
<label class="form-check-label fw-bold" for="toggle-all-searches">
|
||||||
<label class="form-check-label" for="value-{{ url_per_page }}">{{ url_per_page }}</label>
|
Toggle all
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
|
<!-- Individual Search Checkboxes -->
|
||||||
|
{% for search in searches %}
|
||||||
|
<div class="form-check">
|
||||||
|
<input class="form-check-input search-checkbox" type="checkbox" value="{{ search.id }}" id="search-{{ search.id }}">
|
||||||
|
<label class="form-check-label" for="search-{{ search.id }}">
|
||||||
|
[{{ search.type }}] {{ search.search }}
|
||||||
|
</label>
|
||||||
</div>
|
</div>
|
||||||
{% empty %}
|
{% empty %}
|
||||||
<tr>
|
<tr>
|
||||||
<td colspan="2" class="text-center">No options available.</td>
|
<td colspan="2" class="text-center">No search available.</td>
|
||||||
</tr>
|
</tr>
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
</div>
|
</form>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
</ul>
|
</ul>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|||||||
@@ -7,15 +7,18 @@
|
|||||||
<th scope="col"><strong>URL</strong></th>
|
<th scope="col"><strong>URL</strong></th>
|
||||||
<th scope="col"><strong>Fetch date</strong></th>
|
<th scope="col"><strong>Fetch date</strong></th>
|
||||||
<th scope="col"><strong>Sources</strong></th>
|
<th scope="col"><strong>Sources</strong></th>
|
||||||
|
<th scope="col"><strong>Search</strong></th>
|
||||||
<th scope="col"><strong>Status</strong></th>
|
<th scope="col"><strong>Status</strong></th>
|
||||||
<th scope="col"><strong>Action</strong></th>
|
|
||||||
</tr>
|
</tr>
|
||||||
</thead>
|
</thead>
|
||||||
<tbody>
|
<tbody>
|
||||||
{% for item in page_obj %}
|
{% for item in page_obj %}
|
||||||
<tr>
|
<tr>
|
||||||
<td><a href="{{ item.url }}/" target="_blank">{{ item.url }}</a></td>
|
<td>
|
||||||
<td>{{ item.ts_fetch }}</td>
|
<a href="./{{ item.id }}" class="btn btn-primary btn-sm" target="_blank">➤ </a>
|
||||||
|
<a href="{{ item.url }}/" target="_blank">{{ item.url }}</a>
|
||||||
|
</td>
|
||||||
|
<td class="timestamp" data-ts="{{ item.ts_fetch|date:'c' }}">{{ item.ts_fetch }}</td>
|
||||||
<td>
|
<td>
|
||||||
{% with sources_map|dict_get:item.id as sources %}
|
{% with sources_map|dict_get:item.id as sources %}
|
||||||
{% if sources %}
|
{% if sources %}
|
||||||
@@ -27,6 +30,17 @@
|
|||||||
{% endif %}
|
{% endif %}
|
||||||
{% endwith %}
|
{% endwith %}
|
||||||
</td>
|
</td>
|
||||||
|
<td>
|
||||||
|
{% with searches_map|dict_get:item.id as searches %}
|
||||||
|
{% if searches %}
|
||||||
|
{% for search in searches %}
|
||||||
|
<span class="badge bg-secondary">{{ search }}</span>
|
||||||
|
{% endfor %}
|
||||||
|
{% else %}
|
||||||
|
<span class="text-muted">No searches</span>
|
||||||
|
{% endif %}
|
||||||
|
{% endwith %}
|
||||||
|
</td>
|
||||||
<td>
|
<td>
|
||||||
{% if item.status == 'raw' %}
|
{% if item.status == 'raw' %}
|
||||||
<span class="badge bg-secondary">{{ item.status|capfirst }}</span>
|
<span class="badge bg-secondary">{{ item.status|capfirst }}</span>
|
||||||
@@ -43,11 +57,7 @@
|
|||||||
{% else %}
|
{% else %}
|
||||||
<span class="badge bg-light">Unknown</span>
|
<span class="badge bg-light">Unknown</span>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
</td>
|
</td>
|
||||||
<td>
|
|
||||||
<a href="url/{{ item.id }}" class="btn btn-primary btn-sm" target="_blank">Details</a>
|
|
||||||
</td>
|
|
||||||
|
|
||||||
</tr>
|
</tr>
|
||||||
{% empty %}
|
{% empty %}
|
||||||
<tr>
|
<tr>
|
||||||
|
|||||||
@@ -54,7 +54,7 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Fetch URL
|
// Fetch URL
|
||||||
let fetchUrl = `/news/url/${urlId}/fetch/?url=${encodeURIComponent(url)}&model=${encodeURIComponent(selectedModel)}&text=${encodeURIComponent(inputText)}`;
|
let fetchUrl = `/api/url/${urlId}/fetch/?url=${encodeURIComponent(url)}&model=${encodeURIComponent(selectedModel)}&text=${encodeURIComponent(inputText)}`;
|
||||||
|
|
||||||
let resultContainer = $("#chat-output");
|
let resultContainer = $("#chat-output");
|
||||||
resultContainer.html(""); // Clear previous content before fetching
|
resultContainer.html(""); // Clear previous content before fetching
|
||||||
@@ -99,12 +99,6 @@
|
|||||||
// Render Markdown progressively (but safely)
|
// Render Markdown progressively (but safely)
|
||||||
messageContainer.html(marked.parse(accumulatedText));
|
messageContainer.html(marked.parse(accumulatedText));
|
||||||
//////////////////////////////////////
|
//////////////////////////////////////
|
||||||
|
|
||||||
//////////////////////////////////////
|
|
||||||
// ORIGINAL:
|
|
||||||
//let text = decoder.decode(value).replace(/\n/g, "<br>");
|
|
||||||
//resultContainer.append(text); // Append streamed text
|
|
||||||
//////////////////////////////////////
|
|
||||||
|
|
||||||
resultContainer.scrollTop(resultContainer[0].scrollHeight); // Auto-scroll to bottom
|
resultContainer.scrollTop(resultContainer[0].scrollHeight); // Auto-scroll to bottom
|
||||||
return read();
|
return read();
|
||||||
@@ -135,12 +129,16 @@
|
|||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<th>Fetch Date</th>
|
<th>Fetch Date</th>
|
||||||
<td>{{ url_item.ts_fetch }}</td>
|
<td>{{ url_item.ts_fetch }} UTC</td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<th>Sources</th>
|
<th>Source</th>
|
||||||
<td>{{ sources|join:", " }}</td>
|
<td>{{ sources|join:", " }}</td>
|
||||||
</tr>
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Search</th>
|
||||||
|
<td>{{ searches|join:", " }}</td>
|
||||||
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<th>Status</th>
|
<th>Status</th>
|
||||||
<td>{{ url_item.status }}</td>
|
<td>{{ url_item.status }}</td>
|
||||||
@@ -175,7 +173,6 @@
|
|||||||
<form onsubmit="fetchDetailsWithSelection(event, {{ url_item.id }}, '{{ url_item.url }}')">
|
<form onsubmit="fetchDetailsWithSelection(event, {{ url_item.id }}, '{{ url_item.url }}')">
|
||||||
<label for="options-{{ url_item.id }}">Model:</label>
|
<label for="options-{{ url_item.id }}">Model:</label>
|
||||||
<select id="options-{{ url_item.id }}" class="form-control mb-2">
|
<select id="options-{{ url_item.id }}" class="form-control mb-2">
|
||||||
<!-- <option value="">-- Select an option --</option> -->
|
|
||||||
{% for model in models %}
|
{% for model in models %}
|
||||||
<option value="{{ model }}">{{ model }}</option>
|
<option value="{{ model }}">{{ model }}</option>
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
@@ -185,21 +182,23 @@
|
|||||||
<!-- Input field with a default value -->
|
<!-- Input field with a default value -->
|
||||||
<label for="custom-input-{{ url_item.id }}">Prompt:</label>
|
<label for="custom-input-{{ url_item.id }}">Prompt:</label>
|
||||||
<textarea id="custom-input-{{ url_item.id }}" class="form-control mb-2" rows="3">{{ prompt }} {{ url_item.url }}</textarea>
|
<textarea id="custom-input-{{ url_item.id }}" class="form-control mb-2" rows="3">{{ prompt }} {{ url_item.url }}</textarea>
|
||||||
|
|
||||||
<!-- Fetch details button -->
|
<div class="d-flex align-items-center">
|
||||||
<button class="btn btn-primary" onclick="fetchDetails({{ url_item.id }}, '{{ url_item.url }}')">
|
<!-- Fetch details button -->
|
||||||
Fetch Details
|
<button class="btn btn-primary" onclick="fetchDetails({{ url_item.id }}, '{{ url_item.url }}')">
|
||||||
</button>
|
Fetch Details
|
||||||
|
</button>
|
||||||
|
|
||||||
|
<!-- Loading Spinner (Hidden by Default) -->
|
||||||
|
<div id="loading-spinner" class="spinner-border text-primary ms-2" role="status" style="display: none;">
|
||||||
|
<span class="visually-hidden">Loading...</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
<!-- Chatbot-style response box -->
|
<!-- Chatbot-style response box -->
|
||||||
<div class="chat-box mt-3 p-3 border rounded">
|
<div class="chat-box mt-3 p-3 border rounded">
|
||||||
<div id="chat-output"></div>
|
<div id="chat-output"></div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<!-- Loading Spinner (Hidden by Default) -->
|
|
||||||
<div id="loading-spinner" class="spinner-border text-primary mt-3" role="status" style="display: none;">
|
|
||||||
<span class="visually-hidden">Loading...</span>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ from . import views
|
|||||||
|
|
||||||
urlpatterns = [
|
urlpatterns = [
|
||||||
path('', views.link_list, name='link_list'),
|
path('', views.link_list, name='link_list'),
|
||||||
path('url/', views.news, name='url_detail'),
|
path('url/', views.urls, name='url_detail'),
|
||||||
path('url/<int:id>/', views.url_detail_view, name='url_detail'),
|
path('url/<int:id>/', views.url_detail_view, name='url_detail'),
|
||||||
path('url/<int:id>/fetch/', views.fetch_details, name='fetch_details'),
|
path('url/<int:id>/fetch/', views.fetch_details, name='fetch_details'),
|
||||||
path('task/<str:task>', views.trigger_task, name='trigger_task'),
|
path('task/<str:task>', views.trigger_task, name='trigger_task'),
|
||||||
|
|||||||
@@ -18,64 +18,80 @@ def link_list(request):
|
|||||||
prefix = "http://localhost:8000/api/task"
|
prefix = "http://localhost:8000/api/task"
|
||||||
links = ["fetch_feeds", "fetch_parser", "fetch_search", "process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_500000"]
|
links = ["fetch_feeds", "fetch_parser", "fetch_search", "process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_500000"]
|
||||||
|
|
||||||
db_links = ["http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id&limit=500"]
|
list_links = [
|
||||||
return JsonResponse({"links": ["http://localhost:8000/api/url"] + db_links + [os.path.join(prefix, l) for l in links]})
|
# DB
|
||||||
|
"http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id&limit=500",
|
||||||
|
# Admin panel
|
||||||
|
"http://localhost:8000/admin",
|
||||||
|
# URLs
|
||||||
|
"http://localhost:8000/api/url",
|
||||||
|
# API tasks
|
||||||
|
] + [os.path.join(prefix, l) for l in links]
|
||||||
|
# Json
|
||||||
|
return JsonResponse({"links": list_links })
|
||||||
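A minimal usage sketch, assuming the development server is running locally: link_list is routed at the root of the api/ include, so the reorganized list above is returned as one flat JSON array.

import requests  # sketch only; not part of this commit

links = requests.get("http://localhost:8000/api/").json()["links"]
for link in links:
    print(link)  # adminer, admin panel, /api/url, then the /api/task/... triggers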
|
|
||||||
|
|
||||||
from django.http import StreamingHttpResponse, HttpResponse, JsonResponse
|
from django.http import StreamingHttpResponse, JsonResponse
|
||||||
from django.shortcuts import render, get_object_or_404
|
from django.shortcuts import render, get_object_or_404
|
||||||
from django.core.paginator import Paginator
|
from django.core.paginator import Paginator
|
||||||
import requests
|
|
||||||
from django.http import StreamingHttpResponse
|
|
||||||
import json
|
|
||||||
import time
|
|
||||||
import ollama
|
import ollama
|
||||||
|
|
||||||
from .models import Urls, Source, Search, UrlsSourceSearch, UrlContent
|
from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch
|
||||||
|
|
||||||
# Create your views here.
|
# Create your views here.
|
||||||
def news(request):
|
def urls(request):
|
||||||
# URLs
|
# URLs
|
||||||
urls = Urls.objects.all()
|
urls = Urls.objects.all()
|
||||||
# Sources
|
# Sources
|
||||||
sources = Source.objects.all()
|
sources = Source.objects.all()
|
||||||
seaerches = Search.objects.all()
|
searches = Search.objects.all()
|
||||||
|
|
||||||
# Parameters
|
# Parameters
|
||||||
page_number = request.GET.get("page", 1)
|
page_number = request.GET.get("page", 1)
|
||||||
num_items = request.GET.get("items", 15)
|
num_items = request.GET.get("items", 15)
|
||||||
source_ids = request.GET.get("sources", ','.join([str(s.id) for s in sources]))
|
source_ids = request.GET.get("sources", ','.join([str(s.id) for s in sources]))
|
||||||
|
search_ids = request.GET.get("searches", ','.join([str(s.id) for s in searches]))
|
||||||
status_filters = request.GET.get("status", None)
|
status_filters = request.GET.get("status", None)
|
||||||
|
|
||||||
# Filters
|
# Filters
|
||||||
if (status_filters) and (status_filters != "all"):
|
if (status_filters) and (status_filters != "all"):
|
||||||
urls = urls.filter(status__in=status_filters.split(","))
|
if (status_filters == "none"):
|
||||||
|
urls = []
|
||||||
|
else:
|
||||||
|
urls = urls.filter(status__in=status_filters.split(","))
|
||||||
if (source_ids) and (source_ids != "all"):
|
if (source_ids) and (source_ids != "all"):
|
||||||
# TODO: Distinct needed?
|
if (source_ids == "none"):
|
||||||
# urls = urls.filter(urlssource__id_source__in=source_ids.split(",")).distinct()
|
urls = []
|
||||||
pass
|
else:
|
||||||
|
urls = urls.filter(urlssourcesearch__id_source__in=source_ids.split(",")) # .distinct()
|
||||||
|
if (search_ids) and (search_ids != "all"):
|
||||||
|
if (search_ids == "none"):
|
||||||
|
urls = []
|
||||||
|
else:
|
||||||
|
urls = urls.filter(urlssourcesearch__id_search__in=search_ids.split(",")) # .distinct()
|
||||||
|
|
||||||
# Pagination
|
# Pagination
|
||||||
paginator = Paginator(urls, num_items)
|
paginator = Paginator(urls, num_items)
|
||||||
page_obj = paginator.get_page(page_number)
|
page_obj = paginator.get_page(page_number)
|
||||||
|
|
||||||
# Map URL IDs to their sources, only for subset of URLs (page of interest)
|
# Map URL IDs to their sources & searches, restricted to the URLs on the current page
|
||||||
sources_map= {}
|
|
||||||
"""
|
|
||||||
sources_map = {
|
sources_map = {
|
||||||
url.id: list(Source.objects.filter(urlssource__id_url=url).values_list('source', flat=True))
|
url.id: list(Source.objects.filter(urlssourcesearch__id_url=url).distinct()) for url in page_obj.object_list
|
||||||
for url in page_obj.object_list
|
}
|
||||||
|
searches_map = {
|
||||||
|
url.id: list(Search.objects.filter(urlssourcesearch__id_url=url).distinct()) for url in page_obj.object_list
|
||||||
}
|
}
|
||||||
"""
|
|
||||||
|
|
||||||
context = {
|
context = {
|
||||||
"page_obj": page_obj,
|
"page_obj": page_obj,
|
||||||
"sources": sources,
|
"sources": sources,
|
||||||
|
"searches": searches,
|
||||||
"sources_map": sources_map,
|
"sources_map": sources_map,
|
||||||
|
"searches_map": searches_map,
|
||||||
"list_status": Urls.STATUS_ENUM.values,
|
"list_status": Urls.STATUS_ENUM.values,
|
||||||
"list_urls_per_page": [15, 50, 100],
|
"list_urls_per_page": [15, 100, 500],
|
||||||
}
|
}
|
||||||
|
|
||||||
# If request is AJAX, return JSON response
|
# If request is AJAX, return JSON response
|
||||||
if request.headers.get("X-Requested-With") == "XMLHttpRequest":
|
if request.headers.get("X-Requested-With") == "XMLHttpRequest":
|
||||||
return JsonResponse({'items_html': render(request, 'item_list_partial.html', context).content.decode('utf-8')})
|
return JsonResponse({'items_html': render(request, 'item_list_partial.html', context).content.decode('utf-8')})
|
||||||
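The three filter parameters above share one convention: "all" (or an absent value) means no filtering, "none" means an empty result, and anything else is a comma-separated list of ids. A compact sketch of that rule as a helper; apply_id_filter is hypothetical and not part of this commit. Using queryset.none() keeps the result chainable, whereas the code above assigns a plain [] (which the Paginator also accepts).

def apply_id_filter(queryset, raw_value, field_lookup):
    # "all" / missing -> unchanged, "none" -> empty, otherwise a CSV of ids
    if not raw_value or raw_value == "all":
        return queryset
    if raw_value == "none":
        return queryset.none()
    return queryset.filter(**{field_lookup + "__in": raw_value.split(",")})

# e.g. urls = apply_id_filter(urls, request.GET.get("sources"), "urlssourcesearch__id_source")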
@@ -83,32 +99,54 @@ def news(request):
|
|||||||
return render(request, "item_list.html", context)
|
return render(request, "item_list.html", context)
|
||||||
|
|
||||||
|
|
||||||
|
class OllamaClient():
|
||||||
|
def __init__(self):
|
||||||
|
self.client = ollama.Client(host=os.getenv("ENDPOINT_OLLAMA", "https://ollamamodel.matitos.org"))
|
||||||
|
|
||||||
|
def _get_default_model(self):
|
||||||
|
return "gemma3:1b"
|
||||||
|
|
||||||
|
def get_models(self):
|
||||||
|
models = sorted([m.model for m in self.client.list().models])
|
||||||
|
if (self._get_default_model() in models):
|
||||||
|
return [self._get_default_model()] + [m for m in models if m != self._get_default_model()]
|
||||||
|
else:
|
||||||
|
return models
|
||||||
|
|
||||||
|
def get_prompt(self):
|
||||||
|
return "Provide a summary of the content below, avoid mentioning the source of information, and only answer with the summary. The summary needs to be brief and compact, consisting of one paragraph."
|
||||||
|
#return "Explain in a single and compact paragraph the what, why, when, where, who, and how of the content below. Also provide a single paragraph summary of the content:"
|
||||||
|
#return "Provide in one paragraph the what, why, when, where, who, and how of the content below. Also provide a one paragraph summary of the content:"
|
||||||
|
#return "Provide two summaries of the content below, and avoid mentioning the source of information. First, provide a very brief and compact paragraph summary. Second, provide a larger and more detailed summary, which describe the what, why, when, where, who, and how of the content:"
|
||||||
|
# return "Imagine you are a journalist, TLDR in a paragraph. Only answer with the summary:"
|
||||||
|
#return "Below you will find the whole content of a news article:\n{}\nProvide a concise summary of one paragraph maximum of the content.".format(content)
|
||||||
|
|
||||||
|
|
||||||
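A minimal sketch of how the OllamaClient wrapper above can be exercised on its own; the host and the default model name come from the diff and are not verified here.

client = OllamaClient()
models = client.get_models()    # default model ("gemma3:1b") listed first when available
print(models)
print(client.get_prompt())      # the shared one-paragraph summary prompt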
def url_detail_view(request, id):
|
def url_detail_view(request, id):
|
||||||
url_item = get_object_or_404(Urls, id=id)
|
url_item = get_object_or_404(Urls, id=id)
|
||||||
url_sources = list(Source.objects.filter(urlssource__id_url=url_item).values_list('source', flat=True))
|
url_sources = list(Source.objects.filter(urlssourcesearch__id_url=url_item).distinct())
|
||||||
|
url_searches = list(Search.objects.filter(urlssourcesearch__id_url=url_item).distinct())
|
||||||
|
# url_source_search = UrlsSourceSearch.objects.filter(id_url=url_item)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
url_content = UrlContent.objects.get(pk=id)
|
url_content = UrlContent.objects.get(pk=id)
|
||||||
except UrlContent.DoesNotExist:
|
except UrlContent.DoesNotExist:
|
||||||
url_content = {}
|
url_content = {}
|
||||||
|
|
||||||
# TODO: https://github.com/ollama/ollama-python?tab=readme-ov-file#async-client
|
# TODO: https://github.com/ollama/ollama-python?tab=readme-ov-file#async-client
|
||||||
# LLM models available
|
ollama = OllamaClient()
|
||||||
client = ollama.Client(host = 'https://ollamamodel.matitos.org')
|
|
||||||
models = sorted([m.model for m in client.list().models])
|
|
||||||
# default_model = "llama3.2:3b"
|
|
||||||
|
|
||||||
context = {
|
context = {
|
||||||
'url_item': url_item,
|
'url_item': url_item,
|
||||||
'sources': url_sources,
|
'sources': url_sources,
|
||||||
'models': models,
|
'searches': url_searches,
|
||||||
#'default_model': default_model,
|
'models': ollama.get_models(),
|
||||||
'prompt': "Provide in one paragraph the what, why, when, where, who, and how of the content below. Also provide a one paragraph summary of the content:",
|
'prompt': ollama.get_prompt(),
|
||||||
#"prompt": "Image you are a journalist, TLDR in a paragraph:",
|
|
||||||
#"prompt": "Below you will find the whole content of a news article:\n{}\nProvide a concise summary of one paragraph maximum of the content.".format(content)
|
|
||||||
'url_content': url_content,
|
'url_content': url_content,
|
||||||
}
|
}
|
||||||
return render(request, 'url_detail.html', context)
|
return render(request, 'url_detail.html', context)
|
||||||
|
|
||||||
|
# TODO: move to ollamajs...
|
||||||
def fetch_details(request, id):
|
def fetch_details(request, id):
|
||||||
url_item = get_object_or_404(Urls, id=id)
|
url_item = get_object_or_404(Urls, id=id)
|
||||||
url_param = request.GET.get("url", "") # Get URL
|
url_param = request.GET.get("url", "") # Get URL
|
||||||
@@ -116,14 +154,14 @@ def fetch_details(request, id):
|
|||||||
text = request.GET.get("text", "") # Get LLM prompt
|
text = request.GET.get("text", "") # Get LLM prompt
|
||||||
|
|
||||||
# LLM
|
# LLM
|
||||||
client = ollama.Client(host = 'https://ollamamodel.matitos.org')
|
ollama = OllamaClient()
|
||||||
|
|
||||||
def stream_response():
|
def stream_response():
|
||||||
msg_content = {
|
msg_content = {
|
||||||
"role": "user",
|
"role": "user",
|
||||||
"content": text,
|
"content": text,
|
||||||
}
|
}
|
||||||
response = client.chat(model=model, messages=[msg_content], stream=True)
|
response = ollama.client.chat(model=model, messages=[msg_content], stream=True)
|
||||||
for chunk in response:
|
for chunk in response:
|
||||||
yield chunk["message"]["content"] # Stream each chunk of text
|
yield chunk["message"]["content"] # Stream each chunk of text
|
||||||
|
|
||||||
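The return statement of fetch_details falls outside this hunk; a sketch of how a chunk generator like stream_response is typically wrapped, using the StreamingHttpResponse already imported in views.py.

from django.http import StreamingHttpResponse

def as_streaming_response(stream_response):
    # sketch only: stream the generator's chunks to the client as plain text
    return StreamingHttpResponse(stream_response(), content_type="text/plain")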
|
|||||||
@@ -124,9 +124,6 @@ SCHEDULER_QUEUES = {
|
|||||||
'PORT': os.environ.get("REDIS_PORT", 6379),
|
'PORT': os.environ.get("REDIS_PORT", 6379),
|
||||||
'DB': os.environ.get("REDIS_DB", 0),
|
'DB': os.environ.get("REDIS_DB", 0),
|
||||||
'DEFAULT_TIMEOUT': os.environ.get("RQ_DEFAULT_TIMEOUT", 60*15),
|
'DEFAULT_TIMEOUT': os.environ.get("RQ_DEFAULT_TIMEOUT", 60*15),
|
||||||
#'USERNAME': 'some-user',
|
|
||||||
#'PASSWORD': 'some-password',
|
|
||||||
#'DEFAULT_TIMEOUT': 360,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
SCHEDULER_CONFIG = {
|
SCHEDULER_CONFIG = {
|
||||||
|
|||||||
@@ -20,6 +20,5 @@ from django.urls import path, include
|
|||||||
urlpatterns = [
|
urlpatterns = [
|
||||||
path('admin/', admin.site.urls),
|
path('admin/', admin.site.urls),
|
||||||
path('api/', include('api.urls')),
|
path('api/', include('api.urls')),
|
||||||
#path('scheduler/', include('django_rq.urls')),
|
|
||||||
path('scheduler/', include('scheduler.urls')),
|
path('scheduler/', include('scheduler.urls')),
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -1,22 +0,0 @@
|
|||||||
#!/usr/bin/env python
|
|
||||||
"""Django's command-line utility for administrative tasks."""
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
"""Run administrative tasks."""
|
|
||||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mysite.settings')
|
|
||||||
try:
|
|
||||||
from django.core.management import execute_from_command_line
|
|
||||||
except ImportError as exc:
|
|
||||||
raise ImportError(
|
|
||||||
"Couldn't import Django. Are you sure it's installed and "
|
|
||||||
"available on your PYTHONPATH environment variable? Did you "
|
|
||||||
"forget to activate a virtual environment?"
|
|
||||||
) from exc
|
|
||||||
execute_from_command_line(sys.argv)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
main()
|
|
||||||
@@ -1,16 +0,0 @@
|
|||||||
"""
|
|
||||||
ASGI config for mysite project.
|
|
||||||
|
|
||||||
It exposes the ASGI callable as a module-level variable named ``application``.
|
|
||||||
|
|
||||||
For more information on this file, see
|
|
||||||
https://docs.djangoproject.com/en/5.1/howto/deployment/asgi/
|
|
||||||
"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
|
|
||||||
from django.core.asgi import get_asgi_application
|
|
||||||
|
|
||||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mysite.settings')
|
|
||||||
|
|
||||||
application = get_asgi_application()
|
|
||||||
@@ -1,132 +0,0 @@
|
|||||||
"""
|
|
||||||
Django settings for mysite project.
|
|
||||||
|
|
||||||
Generated by 'django-admin startproject' using Django 5.1.6.
|
|
||||||
|
|
||||||
For more information on this file, see
|
|
||||||
https://docs.djangoproject.com/en/5.1/topics/settings/
|
|
||||||
|
|
||||||
For the full list of settings and their values, see
|
|
||||||
https://docs.djangoproject.com/en/5.1/ref/settings/
|
|
||||||
"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
# Build paths inside the project like this: BASE_DIR / 'subdir'.
|
|
||||||
BASE_DIR = Path(__file__).resolve().parent.parent
|
|
||||||
|
|
||||||
|
|
||||||
# Quick-start development settings - unsuitable for production
|
|
||||||
# See https://docs.djangoproject.com/en/5.1/howto/deployment/checklist/
|
|
||||||
|
|
||||||
# SECURITY WARNING: keep the secret key used in production secret!
|
|
||||||
SECRET_KEY = 'django-insecure-0+jg0u+%s@sj759i7@jn*%-#jl)8&#=siclb5908pwe!7=*$qb'
|
|
||||||
|
|
||||||
# SECURITY WARNING: don't run with debug turned on in production!
|
|
||||||
DEBUG = True
|
|
||||||
|
|
||||||
ALLOWED_HOSTS = []
|
|
||||||
|
|
||||||
|
|
||||||
# Application definition
|
|
||||||
|
|
||||||
INSTALLED_APPS = [
|
|
||||||
'news.apps.NewsConfig',
|
|
||||||
'django.contrib.admin',
|
|
||||||
'django.contrib.auth',
|
|
||||||
'django.contrib.contenttypes',
|
|
||||||
'django.contrib.sessions',
|
|
||||||
'django.contrib.messages',
|
|
||||||
'django.contrib.staticfiles',
|
|
||||||
]
|
|
||||||
|
|
||||||
MIDDLEWARE = [
|
|
||||||
'django.middleware.security.SecurityMiddleware',
|
|
||||||
'django.contrib.sessions.middleware.SessionMiddleware',
|
|
||||||
'django.middleware.common.CommonMiddleware',
|
|
||||||
'django.middleware.csrf.CsrfViewMiddleware',
|
|
||||||
'django.contrib.auth.middleware.AuthenticationMiddleware',
|
|
||||||
'django.contrib.messages.middleware.MessageMiddleware',
|
|
||||||
'django.middleware.clickjacking.XFrameOptionsMiddleware',
|
|
||||||
]
|
|
||||||
|
|
||||||
ROOT_URLCONF = 'mysite.urls'
|
|
||||||
|
|
||||||
TEMPLATES = [
|
|
||||||
{
|
|
||||||
'BACKEND': 'django.template.backends.django.DjangoTemplates',
|
|
||||||
'DIRS': [],
|
|
||||||
'APP_DIRS': True,
|
|
||||||
'OPTIONS': {
|
|
||||||
'context_processors': [
|
|
||||||
'django.template.context_processors.debug',
|
|
||||||
'django.template.context_processors.request',
|
|
||||||
'django.contrib.auth.context_processors.auth',
|
|
||||||
'django.contrib.messages.context_processors.messages',
|
|
||||||
],
|
|
||||||
},
|
|
||||||
},
|
|
||||||
]
|
|
||||||
|
|
||||||
WSGI_APPLICATION = 'mysite.wsgi.application'
|
|
||||||
|
|
||||||
|
|
||||||
# Database
|
|
||||||
# https://docs.djangoproject.com/en/5.1/ref/settings/#databases
|
|
||||||
|
|
||||||
DATABASES = {
|
|
||||||
'default': {
|
|
||||||
'ENGINE': 'django.db.backends.postgresql',
|
|
||||||
'NAME': os.environ.get("DJANGO_DB_NAME", "matitos"),
|
|
||||||
'USER': os.environ.get("DJANGO_DB_USER", "supermatitos"),
|
|
||||||
'PASSWORD': os.environ.get("DJANGO_DB_PASSWORD", "supermatitos"),
|
|
||||||
'HOST': os.environ.get("DJANGO_DB_HOST", "localhost"),
|
|
||||||
'PORT': os.environ.get("DJANGO_DB_PORT", "5432"),
|
|
||||||
#'OPTIONS': {
|
|
||||||
# 'options': '-c default_transaction_read_only=on'
|
|
||||||
#}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
# Password validation
|
|
||||||
# https://docs.djangoproject.com/en/5.1/ref/settings/#auth-password-validators
|
|
||||||
|
|
||||||
AUTH_PASSWORD_VALIDATORS = [
|
|
||||||
{
|
|
||||||
'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
|
|
||||||
},
|
|
||||||
{
|
|
||||||
'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
|
|
||||||
},
|
|
||||||
{
|
|
||||||
'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
|
|
||||||
},
|
|
||||||
{
|
|
||||||
'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
|
|
||||||
},
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
# Internationalization
|
|
||||||
# https://docs.djangoproject.com/en/5.1/topics/i18n/
|
|
||||||
|
|
||||||
LANGUAGE_CODE = 'en-us'
|
|
||||||
|
|
||||||
TIME_ZONE = 'UTC'
|
|
||||||
|
|
||||||
USE_I18N = True
|
|
||||||
|
|
||||||
USE_TZ = True
|
|
||||||
|
|
||||||
|
|
||||||
# Static files (CSS, JavaScript, Images)
|
|
||||||
# https://docs.djangoproject.com/en/5.1/howto/static-files/
|
|
||||||
|
|
||||||
STATIC_URL = 'static/'
|
|
||||||
|
|
||||||
# Default primary key field type
|
|
||||||
# https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field
|
|
||||||
|
|
||||||
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
|
|
||||||
@@ -1,26 +0,0 @@
|
|||||||
"""
|
|
||||||
URL configuration for mysite project.
|
|
||||||
|
|
||||||
The `urlpatterns` list routes URLs to views. For more information please see:
|
|
||||||
https://docs.djangoproject.com/en/5.1/topics/http/urls/
|
|
||||||
Examples:
|
|
||||||
Function views
|
|
||||||
1. Add an import: from my_app import views
|
|
||||||
2. Add a URL to urlpatterns: path('', views.home, name='home')
|
|
||||||
Class-based views
|
|
||||||
1. Add an import: from other_app.views import Home
|
|
||||||
2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
|
|
||||||
Including another URLconf
|
|
||||||
1. Import the include() function: from django.urls import include, path
|
|
||||||
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
|
|
||||||
"""
|
|
||||||
from django.contrib import admin
|
|
||||||
from django.urls import include, path
|
|
||||||
from django.views.generic.base import RedirectView
|
|
||||||
|
|
||||||
urlpatterns = [
|
|
||||||
path("", RedirectView.as_view(url='news/', permanent=False)),
|
|
||||||
path("news/", include("news.urls")),
|
|
||||||
path('admin/', admin.site.urls),
|
|
||||||
# path("facerecognition", include("facerecognition.urls")),
|
|
||||||
]
|
|
||||||
@@ -1,16 +0,0 @@
|
|||||||
"""
|
|
||||||
WSGI config for mysite project.
|
|
||||||
|
|
||||||
It exposes the WSGI callable as a module-level variable named ``application``.
|
|
||||||
|
|
||||||
For more information on this file, see
|
|
||||||
https://docs.djangoproject.com/en/5.1/howto/deployment/wsgi/
|
|
||||||
"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
|
|
||||||
from django.core.wsgi import get_wsgi_application
|
|
||||||
|
|
||||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mysite.settings')
|
|
||||||
|
|
||||||
application = get_wsgi_application()
|
|
||||||
@@ -1,9 +0,0 @@
|
|||||||
from django.contrib import admin
|
|
||||||
|
|
||||||
# Register your models here.
|
|
||||||
|
|
||||||
from .models import Urls, UrlsSource, Source
|
|
||||||
|
|
||||||
admin.site.register(Urls)
|
|
||||||
admin.site.register(UrlsSource)
|
|
||||||
admin.site.register(Source)
|
|
||||||
@@ -1,6 +0,0 @@
|
|||||||
from django.apps import AppConfig
|
|
||||||
|
|
||||||
|
|
||||||
class NewsConfig(AppConfig):
|
|
||||||
default_auto_field = 'django.db.models.BigAutoField'
|
|
||||||
name = 'news'
|
|
||||||
@@ -1,38 +0,0 @@
|
|||||||
# Generated by Django 5.1.6 on 2025-02-20 15:36
|
|
||||||
|
|
||||||
import django.db.models.deletion
|
|
||||||
from django.db import migrations, models
|
|
||||||
|
|
||||||
|
|
||||||
class Migration(migrations.Migration):
|
|
||||||
|
|
||||||
initial = True
|
|
||||||
|
|
||||||
dependencies = [
|
|
||||||
]
|
|
||||||
|
|
||||||
operations = [
|
|
||||||
migrations.CreateModel(
|
|
||||||
name='SOURCE',
|
|
||||||
fields=[
|
|
||||||
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
|
||||||
('source', models.TextField()),
|
|
||||||
],
|
|
||||||
),
|
|
||||||
migrations.CreateModel(
|
|
||||||
name='URL',
|
|
||||||
fields=[
|
|
||||||
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
|
||||||
('url', models.TextField()),
|
|
||||||
('pub_date', models.DateTimeField(verbose_name='date published')),
|
|
||||||
],
|
|
||||||
),
|
|
||||||
migrations.CreateModel(
|
|
||||||
name='URL_SOURCE',
|
|
||||||
fields=[
|
|
||||||
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
|
||||||
('source', models.ForeignKey(on_delete=django.db.models.deletion.RESTRICT, to='news.source')),
|
|
||||||
('url', models.ForeignKey(on_delete=django.db.models.deletion.RESTRICT, to='news.url')),
|
|
||||||
],
|
|
||||||
),
|
|
||||||
]
|
|
||||||
@@ -1,25 +0,0 @@
|
|||||||
# Generated by Django 5.1.6 on 2025-02-20 16:11
|
|
||||||
|
|
||||||
from django.db import migrations
|
|
||||||
|
|
||||||
|
|
||||||
class Migration(migrations.Migration):
|
|
||||||
|
|
||||||
dependencies = [
|
|
||||||
('news', '0001_initial'),
|
|
||||||
]
|
|
||||||
|
|
||||||
operations = [
|
|
||||||
migrations.AlterModelTable(
|
|
||||||
name='source',
|
|
||||||
table='source',
|
|
||||||
),
|
|
||||||
migrations.AlterModelTable(
|
|
||||||
name='url',
|
|
||||||
table='urls',
|
|
||||||
),
|
|
||||||
migrations.AlterModelTable(
|
|
||||||
name='url_source',
|
|
||||||
table='urls_source',
|
|
||||||
),
|
|
||||||
]
|
|
||||||
@@ -1,33 +0,0 @@
|
|||||||
# Generated by Django 5.1.6 on 2025-02-20 16:18
|
|
||||||
|
|
||||||
import django.db.models.functions.datetime
|
|
||||||
from django.db import migrations, models
|
|
||||||
|
|
||||||
|
|
||||||
class Migration(migrations.Migration):
|
|
||||||
|
|
||||||
dependencies = [
|
|
||||||
('news', '0002_alter_source_table_alter_url_table_and_more'),
|
|
||||||
]
|
|
||||||
|
|
||||||
operations = [
|
|
||||||
migrations.RemoveField(
|
|
||||||
model_name='url',
|
|
||||||
name='pub_date',
|
|
||||||
),
|
|
||||||
migrations.AddField(
|
|
||||||
model_name='url',
|
|
||||||
name='status',
|
|
||||||
field=models.CharField(choices=[('raw', 'Raw'), ('error', 'Error'), ('valid', 'Valid'), ('unknown', 'Unknown'), ('invalid', 'Invalid'), ('duplicate', 'Duplicate')], default='raw'),
|
|
||||||
),
|
|
||||||
migrations.AddField(
|
|
||||||
model_name='url',
|
|
||||||
name='ts_fetch',
|
|
||||||
field=models.DateTimeField(db_default=django.db.models.functions.datetime.Now(), verbose_name='Date fetched'),
|
|
||||||
),
|
|
||||||
migrations.AlterField(
|
|
||||||
model_name='url',
|
|
||||||
name='url',
|
|
||||||
field=models.TextField(verbose_name='URL'),
|
|
||||||
),
|
|
||||||
]
|
|
||||||
@@ -1,17 +0,0 @@
|
|||||||
# Generated by Django 5.1.6 on 2025-02-20 16:32
|
|
||||||
|
|
||||||
from django.db import migrations
|
|
||||||
|
|
||||||
|
|
||||||
class Migration(migrations.Migration):
|
|
||||||
|
|
||||||
dependencies = [
|
|
||||||
('news', '0003_remove_url_pub_date_url_status_url_ts_fetch_and_more'),
|
|
||||||
]
|
|
||||||
|
|
||||||
operations = [
|
|
||||||
migrations.AlterUniqueTogether(
|
|
||||||
name='url_source',
|
|
||||||
unique_together={('url', 'source')},
|
|
||||||
),
|
|
||||||
]
|
|
||||||
@@ -1,59 +0,0 @@
|
|||||||
# Generated by Django 5.1.6 on 2025-02-20 16:53
|
|
||||||
|
|
||||||
import django.db.models.deletion
|
|
||||||
from django.db import migrations, models
|
|
||||||
|
|
||||||
|
|
||||||
class Migration(migrations.Migration):
|
|
||||||
|
|
||||||
dependencies = [
|
|
||||||
('news', '0004_alter_url_source_unique_together'),
|
|
||||||
]
|
|
||||||
|
|
||||||
operations = [
|
|
||||||
migrations.CreateModel(
|
|
||||||
name='Urls',
|
|
||||||
fields=[
|
|
||||||
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
|
||||||
('url', models.TextField(unique=True)),
|
|
||||||
('ts_fetch', models.DateTimeField()),
|
|
||||||
('status', models.TextField(choices=[('raw', 'Raw'), ('error', 'Error'), ('valid', 'Valid'), ('unknown', 'Unknown'), ('invalid', 'Invalid'), ('duplicate', 'Duplicate')], default='raw')),
|
|
||||||
],
|
|
||||||
options={
|
|
||||||
'db_table': 'urls',
|
|
||||||
'managed': False,
|
|
||||||
},
|
|
||||||
),
|
|
||||||
migrations.RemoveField(
|
|
||||||
model_name='url_source',
|
|
||||||
name='url',
|
|
||||||
),
|
|
||||||
migrations.AlterUniqueTogether(
|
|
||||||
name='url_source',
|
|
||||||
unique_together=None,
|
|
||||||
),
|
|
||||||
migrations.RemoveField(
|
|
||||||
model_name='url_source',
|
|
||||||
name='source',
|
|
||||||
),
|
|
||||||
migrations.AlterModelOptions(
|
|
||||||
name='source',
|
|
||||||
options={'managed': False},
|
|
||||||
),
|
|
||||||
migrations.CreateModel(
|
|
||||||
name='UrlsSource',
|
|
||||||
fields=[
|
|
||||||
('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='news.urls')),
|
|
||||||
],
|
|
||||||
options={
|
|
||||||
'db_table': 'urls_source',
|
|
||||||
'managed': False,
|
|
||||||
},
|
|
||||||
),
|
|
||||||
migrations.DeleteModel(
|
|
||||||
name='URL',
|
|
||||||
),
|
|
||||||
migrations.DeleteModel(
|
|
||||||
name='URL_SOURCE',
|
|
||||||
),
|
|
||||||
]
|
|
||||||
@@ -1,17 +0,0 @@
|
|||||||
# Generated by Django 5.1.6 on 2025-03-06 09:36
|
|
||||||
|
|
||||||
from django.db import migrations
|
|
||||||
|
|
||||||
|
|
||||||
class Migration(migrations.Migration):
|
|
||||||
|
|
||||||
dependencies = [
|
|
||||||
('news', '0005_urls_remove_url_source_url_and_more'),
|
|
||||||
]
|
|
||||||
|
|
||||||
operations = [
|
|
||||||
migrations.AlterModelOptions(
|
|
||||||
name='urls',
|
|
||||||
options={'managed': False, 'ordering': ['-ts_fetch']},
|
|
||||||
),
|
|
||||||
]
|
|
||||||
@@ -1,61 +0,0 @@
|
|||||||
from django.db import models
|
|
||||||
from django.contrib.postgres.fields import ArrayField
|
|
||||||
|
|
||||||
# Create your models here.
|
|
||||||
class Urls(models.Model):
|
|
||||||
class STATUS_ENUM(models.TextChoices):
|
|
||||||
RAW = "raw"
|
|
||||||
ERROR = "error"
|
|
||||||
VALID = "valid"
|
|
||||||
UNKNOWN = "unknown"
|
|
||||||
INVALID = "invalid"
|
|
||||||
DUPLICATE = "duplicate"
|
|
||||||
|
|
||||||
url = models.TextField(unique=True)
|
|
||||||
ts_fetch = models.DateTimeField()
|
|
||||||
status = models.TextField(choices=STATUS_ENUM, default=STATUS_ENUM.RAW) # This field type is a guess.
|
|
||||||
|
|
||||||
def __str__(self):
|
|
||||||
return self.url
|
|
||||||
|
|
||||||
class Meta:
|
|
||||||
managed = False
|
|
||||||
db_table = 'urls' # db_table = '{}_urls'.format(project_name)
|
|
||||||
ordering = ["-ts_fetch"]
|
|
||||||
|
|
||||||
class Source(models.Model):
|
|
||||||
id = models.SmallAutoField(primary_key=True)
|
|
||||||
source = models.TextField(unique=True)
|
|
||||||
|
|
||||||
def __str__(self):
|
|
||||||
return self.source
|
|
||||||
|
|
||||||
class Meta:
|
|
||||||
managed = False
|
|
||||||
db_table = 'source'
|
|
||||||
|
|
||||||
class UrlsSource(models.Model):
|
|
||||||
id_url = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url', primary_key=True) # The composite primary key (id_url, id_source) found, that is not supported. The first column is selected.
|
|
||||||
id_source = models.ForeignKey(Source, models.DO_NOTHING, db_column='id_source')
|
|
||||||
|
|
||||||
def __str__(self):
|
|
||||||
return "Source: {}, URL: {}".format(self.id_source, self.id_url)
|
|
||||||
|
|
||||||
class Meta:
|
|
||||||
managed = False
|
|
||||||
db_table = 'urls_source'
|
|
||||||
unique_together = (('id_url', 'id_source'),)
|
|
||||||
|
|
||||||
class UrlContent(models.Model):
|
|
||||||
id_url = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url', primary_key=True)
|
|
||||||
date_published = models.DateTimeField(blank=True, null=True)
|
|
||||||
title = models.TextField(blank=True, null=True)
|
|
||||||
description = models.TextField(blank=True, null=True)
|
|
||||||
content = models.TextField(blank=True, null=True)
|
|
||||||
tags = ArrayField(models.TextField(blank=True, null=True))
|
|
||||||
authors = ArrayField(models.TextField(blank=True, null=True))
|
|
||||||
image_urls = ArrayField(models.TextField(blank=True, null=True))
|
|
||||||
|
|
||||||
class Meta:
|
|
||||||
managed = False
|
|
||||||
db_table = 'url_content'
|
|
||||||
@@ -1,508 +0,0 @@
|
|||||||
<!DOCTYPE html>
|
|
||||||
<html lang="en">
|
|
||||||
<head>
|
|
||||||
<meta charset="UTF-8">
|
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
||||||
<title>News</title>
|
|
||||||
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
|
|
||||||
<script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
|
|
||||||
|
|
||||||
<script>
|
|
||||||
|
|
||||||
function getQueryString(pageNumber, itemsNumber, sources, statuses){
|
|
||||||
// Query parameters. If input is null, get most recent value
|
|
||||||
let queryParams = new URLSearchParams(window.location.search);
|
|
||||||
// page
|
|
||||||
if (pageNumber == null) pageNumber = queryParams.get("page") ?? 1;
|
|
||||||
queryParams.set("page", pageNumber);
|
|
||||||
// items
|
|
||||||
if (itemsNumber == null) itemsNumber = queryParams.get("items") ?? 15;
|
|
||||||
queryParams.set("items", itemsNumber);
|
|
||||||
// sources
|
|
||||||
if (sources == null) sources = queryParams.get("sources") ?? "all";
|
|
||||||
queryParams.set("sources", sources);
|
|
||||||
// status
|
|
||||||
if (statuses == null) statuses = queryParams.get("status") ?? "all";
|
|
||||||
queryParams.set("status", statuses);
|
|
||||||
|
|
||||||
// Encoding fix: %2C -> ,
|
|
||||||
let queryParamsString = queryParams.toString();
|
|
||||||
while (queryParamsString.includes("%2C")) {
|
|
||||||
queryParamsString = queryParamsString.replace("%2C", ",");
|
|
||||||
}
|
|
||||||
return queryParamsString;
|
|
||||||
}
|
|
||||||
|
|
||||||
function loadPage(pageNumber, itemsNumber, sources, statuses) {
|
|
||||||
$("#item-list").fadeTo(100, 0.5); // Smooth fade effect
|
|
||||||
$("#loading").show();
|
|
||||||
|
|
||||||
queryParamsString = getQueryString(pageNumber, itemsNumber, sources, statuses);
|
|
||||||
|
|
||||||
$.ajax({
|
|
||||||
url: "?" + queryParamsString,
|
|
||||||
type: "GET",
|
|
||||||
headers: { "X-Requested-With": "XMLHttpRequest" },
|
|
||||||
success: function (data) {
|
|
||||||
$("#item-list").fadeTo(0, 1).html(data.items_html); // Restore opacity smoothly
|
|
||||||
$("#loading").hide();
|
|
||||||
// Update URL without reloading
|
|
||||||
window.history.pushState({}, "", "?" + queryParamsString);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Pagination
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
|
||||||
$(document).on("click", ".pagination a", function (event) {
|
|
||||||
event.preventDefault();
|
|
||||||
let page = $(this).attr("data-page");
|
|
||||||
loadPage(pageNumber=page, itemsNumber=null, sources=null, statuses=null);
|
|
||||||
});
|
|
||||||
|
|
||||||
$(document).ready(function () {
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Filter updates
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
|
||||||
const sourcesToggleAll = $("#toggle-all-sources");
|
|
||||||
const sourcesCheckboxes = $(".source-checkbox");
|
|
||||||
const statusesToggleAll = $("#toggle-all-status");
|
|
||||||
const statusCheckboxes = $(".status-checkbox");
|
|
||||||
|
|
||||||
function updateFilters() {
|
|
||||||
// Get selected sources
|
|
||||||
let selectedSources = sourcesCheckboxes.filter(":checked").map(function () {
|
|
||||||
return $(this).val();
|
|
||||||
}).get().join(",");
|
|
||||||
|
|
||||||
// Get selected URL statuses
|
|
||||||
let selectedStatuses = statusCheckboxes.filter(":checked").map(function () {
|
|
||||||
return $(this).val();
|
|
||||||
}).get().join(",");
|
|
||||||
|
|
||||||
// Get selected items per page
|
|
||||||
let selectedItems = $("input[name='items']:checked").val();
|
|
||||||
|
|
||||||
// Update pagination and reload data
|
|
||||||
loadPage(1, selectedItems, selectedSources, selectedStatuses);
|
|
||||||
}
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Change triggers
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Sources
|
|
||||||
sourcesToggleAll.on("change", function () {
|
|
||||||
sourcesCheckboxes.prop("checked", sourcesToggleAll.prop("checked"));
|
|
||||||
updateFilters();
|
|
||||||
});
|
|
||||||
sourcesCheckboxes.on("change", function () {
|
|
||||||
sourcesToggleAll.prop("checked", sourcesCheckboxes.length === sourcesCheckboxes.filter(":checked").length);
|
|
||||||
updateFilters();
|
|
||||||
});
|
|
||||||
// Status
|
|
||||||
statusesToggleAll.on("change", function () {
|
|
||||||
statusCheckboxes.prop("checked", statusesToggleAll.prop("checked"));
|
|
||||||
updateFilters();
|
|
||||||
});
|
|
||||||
statusCheckboxes.on("change", function () {
|
|
||||||
// If all checkboxes are checked, mark "Toggle All" as checked
|
|
||||||
statusesToggleAll.prop("checked", statusCheckboxes.length === statusCheckboxes.filter(":checked").length);
|
|
||||||
updateFilters();
|
|
||||||
});
|
|
||||||
|
|
||||||
// Items change trigger update
|
|
||||||
$(".items").on("change", updateFilters);
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Default values
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Sources
|
|
||||||
sourcesCheckboxes.each(function () { $(this).prop("checked", true); });
|
|
||||||
sourcesToggleAll.prop("checked", true);
|
|
||||||
// Statuses
|
|
||||||
statusCheckboxes.each(function () { $(this).prop("checked", true); });
|
|
||||||
statusesToggleAll.prop("checked", true);
|
|
||||||
// Items
|
|
||||||
$("input[name='items'][value='" + 15 + "']").prop("checked", true);
|
|
||||||
});
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Theme logic
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
|
||||||
function setTheme(mode) {
|
|
||||||
document.documentElement.setAttribute("data-theme", mode);
|
|
||||||
document.documentElement.setAttribute("data-bs-theme", mode);
|
|
||||||
localStorage.setItem("theme", mode);
|
|
||||||
document.getElementById("theme-icon").innerHTML = mode === "dark" ? "🌞" : "🌙";
|
|
||||||
document.body.classList.toggle("dark-mode", mode === "dark");
|
|
||||||
}
|
|
||||||
|
|
||||||
function toggleTheme() {
|
|
||||||
let currentTheme = document.documentElement.getAttribute("data-theme");
|
|
||||||
setTheme(currentTheme === "dark" ? "light" : "dark");
|
|
||||||
}
|
|
||||||
|
|
||||||
document.addEventListener("DOMContentLoaded", function () {
|
|
||||||
let savedTheme = localStorage.getItem("theme") ||
|
|
||||||
(window.matchMedia("(prefers-color-scheme: dark)").matches ? "dark" : "light");
|
|
||||||
setTheme(savedTheme);
|
|
||||||
});
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
|
||||||
</script>
|
|
||||||
|
|
||||||
<style>
|
|
||||||
/* Content Area */
|
|
||||||
#content {
|
|
||||||
margin-left: 170px; /* Match sidebar width */
|
|
||||||
min-width: calc(100vw - 170px); /* Ensure it doesn't shrink into the sidebar */
|
|
||||||
width: calc(100vw - 170px); /* Expands based on screen size */
|
|
||||||
padding: 20px;
|
|
||||||
overflow-x: auto; /* Prevent content from being squeezed */
|
|
||||||
transition: margin-left 0.3s ease;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Sidebar Styles */
|
|
||||||
#sidebar {
|
|
||||||
height: 100vh;
|
|
||||||
position: fixed;
|
|
||||||
top: 0;
|
|
||||||
left: 0;
|
|
||||||
width: 170px; /* Default width */
|
|
||||||
background-color: var(--bg-color);
|
|
||||||
box-shadow: 2px 0 5px rgba(0, 0, 0, 0.1);
|
|
||||||
padding: 15px;
|
|
||||||
transition: width 0.3s ease;
|
|
||||||
}
|
|
||||||
|
|
||||||
#sidebar .nav-link {
|
|
||||||
color: var(--text-color);
|
|
||||||
}
|
|
||||||
|
|
||||||
#sidebar .nav-link:hover {
|
|
||||||
background-color: var(--pagination-hover-bg);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* ============================= */
|
|
||||||
/* Responsive Enhancements */
|
|
||||||
/* ============================= */
|
|
||||||
@media (min-width: 1200px) {
|
|
||||||
.table {
|
|
||||||
width: 95%; /* Allows table to take more space */
|
|
||||||
margin: 0 auto; /* Centers the table */
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@media (max-width: 768px) {
|
|
||||||
#sidebar {
|
|
||||||
width: 70px; /* Collapse sidebar to smaller width */
|
|
||||||
/*padding: 10px;*/
|
|
||||||
}
|
|
||||||
|
|
||||||
#content {
|
|
||||||
margin-left: 70px; /* Adjust margin to match collapsed sidebar */
|
|
||||||
min-width: calc(100vw - 70px); /* Prevent overlap */
|
|
||||||
/*padding: 10px;*/
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Adjust table for small screens */
|
|
||||||
.table-responsive {
|
|
||||||
overflow-x: auto;
|
|
||||||
}
|
|
||||||
|
|
||||||
.table th,
|
|
||||||
.table td {
|
|
||||||
white-space: nowrap; /* Prevent text wrapping in cells */
|
|
||||||
}
|
|
||||||
|
|
||||||
.table a {
|
|
||||||
word-break: break-word; /* Ensure long URLs break properly */
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* ============================= */
|
|
||||||
/* Global Styles */
|
|
||||||
/* ============================= */
|
|
||||||
body {
|
|
||||||
background-color: var(--bg-color);
|
|
||||||
color: var(--text-color);
|
|
||||||
transition: background-color 0.3s, color 0.3s;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* ============================= */
|
|
||||||
/* Light & Dark Mode Variables */
|
|
||||||
/* ============================= */
|
|
||||||
:root {
|
|
||||||
--bg-color: #ffffff;
|
|
||||||
--text-color: #212529;
|
|
||||||
--table-bg: #ffffff;
|
|
||||||
--table-text: #000000;
|
|
||||||
--table-border: #dee2e6;
|
|
||||||
--link-color: #007bff;
|
|
||||||
--pagination-bg: #ffffff;
|
|
||||||
--pagination-border: #dee2e6;
|
|
||||||
--pagination-hover-bg: #f8f9fa;
|
|
||||||
--pagination-active-bg: #007bff;
|
|
||||||
--pagination-active-text: #ffffff;
|
|
||||||
--button-bg: #f8f9fa;
|
|
||||||
--button-border: #ced4da;
|
|
||||||
--button-text: #212529;
|
|
||||||
}
|
|
||||||
|
|
||||||
[data-theme="dark"] {
|
|
||||||
--bg-color: #121212;
|
|
||||||
--text-color: #e0e0e0;
|
|
||||||
--table-bg: #1e1e1e;
|
|
||||||
--table-text: #ffffff;
|
|
||||||
--table-border: #2c2c2c;
|
|
||||||
--link-color: #9ec5fe;
|
|
||||||
--pagination-bg: #1e1e1e;
|
|
||||||
--pagination-border: #444;
|
|
||||||
--pagination-hover-bg: #333;
|
|
||||||
--pagination-active-bg: #007bff;
|
|
||||||
--pagination-active-text: #ffffff;
|
|
||||||
--button-bg: #1e1e1e;
|
|
||||||
--button-border: #444;
|
|
||||||
--button-text: #e0e0e0;
}

/* ============================= */
/* Table Styling */
/* ============================= */
.table-responsive {
    width: 100%; /* Ensure it spans the full width of its container */
    max-width: 100%;
    overflow-x: auto;
}

.table {
    background-color: var(--table-bg);
    color: var(--table-text);
    border: 1px solid var(--table-border);
    transition: background-color 0.3s, color 0.3s;

    width: 100%; /* Ensures it takes full width of its container */
    table-layout: auto; /* Allows columns to adjust dynamically */
    /*white-space: nowrap;*/ /* Prevents text wrapping in cells */
}

.table th,
.table td {
    border-color: var(--table-border);
}

.table thead {
    background-color: var(--pagination-active-bg);
    color: var(--pagination-active-text);
}

[data-theme="dark"] .table {
    background-color: var(--table-bg);
    color: var(--table-text);
}

[data-theme="dark"] .table th,
[data-theme="dark"] .table td {
    border-color: var(--table-border);
}

[data-theme="dark"] .table thead {
    background-color: #333;
    color: #fff;
}

th:nth-child(1), td:nth-child(1) { width: 50%; } /* URL column */
th:nth-child(2), td:nth-child(2) { width: 20%; } /* Fetch Date */
th:nth-child(3), td:nth-child(3) { width: 20%; } /* Sources */
th:nth-child(4), td:nth-child(4) { width: 5%; } /* Status */
th:nth-child(5), td:nth-child(5) { width: 5%; } /* Action */

/* ============================= */
/* Pagination Styling */
/* ============================= */
.pagination {
    display: flex;
    justify-content: center;
    padding: 10px 0;
}

.pagination .page-link {
    background-color: var(--pagination-bg);
    border-color: var(--pagination-border);
    color: var(--text-color);
    padding: 10px 14px;
    margin: 0 5px;
    border-radius: 8px;
    transition: background-color 0.3s, color 0.3s, transform 0.2s;
}

.pagination .page-link:hover {
    background-color: var(--pagination-hover-bg);
    transform: scale(1.05);
}

.pagination .active .page-link {
    background-color: var(--pagination-active-bg);
    color: var(--pagination-active-text);
    border-color: var(--pagination-active-bg);
}

/* ============================= */
/* Theme Toggle Button */
/* ============================= */
.theme-toggle-btn {
    background-color: var(--button-bg);
    border: 1px solid var(--button-border);
    color: var(--button-text);
    border-radius: 50%;
    width: 40px;
    height: 40px;
    font-size: 20px;
    display: flex;
    align-items: center;
    justify-content: center;
    transition: background-color 0.3s, color 0.3s, transform 0.2s;
    cursor: pointer;
}

.theme-toggle-btn:hover {
    background-color: var(--pagination-hover-bg);
    transform: rotate(20deg);
}

.theme-toggle-btn:active {
    transform: scale(0.95);
}

/* ============================= */
/* Loading Spinner Styling */
/* ============================= */
#loading {
    position: fixed;
    left: 50%;
    top: 50%;
    transform: translate(-50%, -50%);
    z-index: 1050;
    display: none;
}

.spinner-border {
    width: 4rem;
    height: 4rem;
}

</style>

</head>

<body>

    <!-- Left Sidebar -->
    <div id="sidebar" class="d-flex flex-column">
        <ul class="nav flex-column">

            <!-- Theme Toggle Button -->
            <div class="nav-item">
                <button onclick="toggleTheme()" class="theme-toggle-btn">
                    <span id="theme-icon">🌙</span>
                </button>
            </div>

            <!-- Sources -->
            <div class="nav-item mt-3">
                <strong>Select sources</strong>
                <form id="source-filter-form">
                    <!-- Toggle All Checkbox -->
                    <div class="form-check">
                        <input class="form-check-input" type="checkbox" id="toggle-all-sources">
                        <label class="form-check-label fw-bold" for="toggle-all-sources">
                            Toggle all
                        </label>
                    </div>

                    <!-- Individual Source Checkboxes -->
                    {% for source in sources %}
                    <div class="form-check">
                        <input class="form-check-input source-checkbox" type="checkbox" value="{{ source.id }}" id="source-{{ source.id }}">
                        <label class="form-check-label" for="source-{{ source.id }}">
                            {{ source.source }}
                        </label>
                    </div>
                    {% empty %}
                    <div class="text-muted">No sources available.</div>
                    {% endfor %}
                </form>
            </div>

            <!-- Status -->
            <div class="nav-item mt-3">
                <strong>Select status</strong>
                <form id="status-filter-form">
                    <!-- Toggle All Checkbox -->
                    <div class="status-form-check">
                        <input class="form-check-input" type="checkbox" id="toggle-all-status">
                        <label class="form-check-label fw-bold" for="toggle-all-status">
                            Toggle all
                        </label>
                    </div>

                    <!-- Individual Status Checkboxes -->
                    {% for status in list_status %}
                    <div class="status-form-check">
                        <input class="form-check-input status-checkbox" type="checkbox" value="{{ status }}" id="status-{{ status }}">
                        <label class="form-check-label" for="status-{{ status }}">
                            {{ status }}
                        </label>
                    </div>
                    {% empty %}
                    <div class="text-muted">No statuses available.</div>
                    {% endfor %}
                </form>
            </div>

            <!-- URLs per page -->
            <div class="nav-item mt-3">
                <strong>URLs per page</strong>
                <div class="card-body">
                    <!-- Page size options -->
                    {% for url_per_page in list_urls_per_page %}
                    <div class="items-form-check">
                        <input class="form-check-input items" type="radio" name="items" id="value-{{ url_per_page }}" value="{{ url_per_page }}">
                        <label class="form-check-label" for="value-{{ url_per_page }}">{{ url_per_page }}</label>
                    </div>
                    {% empty %}
                    <div class="text-muted">No options available.</div>
                    {% endfor %}
                </div>
            </div>

        </ul>
    </div>

    <!-- Main Content Area -->
    <div id="content" class="main-content">
        <div class="container mt-4">

            <!-- Table -->
            <div id="item-list">
                {% include 'item_list_partial.html' %}
            </div>
            <!-- Loading... -->
            <div id="loading" class="text-center mt-3" style="display:none;">
                <div class="spinner-border text-primary" role="status">
                    <span class="visually-hidden">Loading...</span>
                </div>
            </div>
        </div>
    </div>

</body>
</html>
@@ -1,87 +0,0 @@
{% load custom_filters %}

<div class="table-responsive">
    <table class="table table-hover">
        <thead>
            <tr>
                <th scope="col"><strong>URL</strong></th>
                <th scope="col"><strong>Fetch date</strong></th>
                <th scope="col"><strong>Sources</strong></th>
                <th scope="col"><strong>Status</strong></th>
                <th scope="col"><strong>Action</strong></th>
            </tr>
        </thead>
        <tbody>
            {% for item in page_obj %}
            <tr>
                <td><a href="{{ item.url }}/" target="_blank">{{ item.url }}</a></td>
                <td>{{ item.ts_fetch }}</td>
                <td>
                    {% with sources_map|dict_get:item.id as sources %}
                    {% if sources %}
                        {% for source in sources %}
                        <span class="badge bg-secondary">{{ source }}</span>
                        {% endfor %}
                    {% else %}
                        <span class="text-muted">No sources</span>
                    {% endif %}
                    {% endwith %}
                </td>
                <td>
                    {% if item.status == 'raw' %}
                        <span class="badge bg-secondary">{{ item.status|capfirst }}</span>
                    {% elif item.status == 'error' %}
                        <span class="badge bg-danger">{{ item.status|capfirst }}</span>
                    {% elif item.status == 'valid' %}
                        <span class="badge bg-success">{{ item.status|capfirst }}</span>
                    {% elif item.status == 'unknown' %}
                        <span class="badge bg-warning">{{ item.status|capfirst }}</span>
                    {% elif item.status == 'invalid' %}
                        <span class="badge bg-danger">{{ item.status|capfirst }}</span>
                    {% elif item.status == 'duplicate' %}
                        <span class="badge bg-info">{{ item.status|capfirst }}</span>
                    {% else %}
                        <span class="badge bg-light">Unknown</span>
                    {% endif %}
                </td>
                <td>
                    <a href="url/{{ item.id }}" class="btn btn-primary btn-sm" target="_blank">Details</a>
                </td>

            </tr>
            {% empty %}
            <tr>
                <td colspan="5" class="text-center">No items available.</td>
            </tr>
            {% endfor %}
        </tbody>
    </table>
</div>

<div class="d-flex justify-content-center mt-3">
    <nav>
        <ul class="pagination">
            {% if page_obj.has_previous %}
            <li class="page-item">
                <a class="page-link" href="#" data-page="1">First</a>
            </li>
            <li class="page-item">
                <a class="page-link" href="#" data-page="{{ page_obj.previous_page_number }}">Previous</a>
            </li>
            {% endif %}

            <li class="page-item active">
                <span class="page-link">Page {{ page_obj.number }} of {{ page_obj.paginator.num_pages }}</span>
            </li>

            {% if page_obj.has_next %}
            <li class="page-item">
                <a class="page-link" href="#" data-page="{{ page_obj.next_page_number }}">Next</a>
            </li>
            <li class="page-item">
                <a class="page-link" href="#" data-page="{{ page_obj.paginator.num_pages }}">Last</a>
            </li>
            {% endif %}
        </ul>
    </nav>
</div>
@@ -1,211 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{% block title %}News{% endblock %}</title>

    <!-- Bootstrap CSS -->
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
    <!-- Add jQuery from CDN (before other scripts) -->
    <script src="https://code.jquery.com/jquery-3.6.4.min.js"></script>
    <!-- Markdown -->
    <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>

    <!-- Custom Styles -->
    <style>
        body {
            background-color: #f4f4f4;
        }
        .navbar-dark .navbar-nav .nav-link {
            color: rgba(255,255,255,0.75);
        }
        .chat-box {
            background-color: #fff;
            border: 1px solid #ddd;
            padding: 15px;
            border-radius: 8px;
            overflow-y: auto; /* Enable vertical scrolling */
            max-width: 100%;
            min-height: 150px;
            max-height: 450px;
            white-space: normal;
            word-wrap: break-word;
            word-break: break-word;
        }

    </style>

</head>
<script>

function fetchDetails(urlId, url) {
    // Show the loading spinner
    document.getElementById("loading-spinner").style.display = "block";

    // Get the input value
    let inputText = document.getElementById(`custom-input-${urlId}`).value;
    // Get the input model
    let selectedModel = document.getElementById(`options-${urlId}`).value;
    // Check if a model is selected
    if (!selectedModel) {
        alert("Please select a model before fetching details.");
        return;
    }

    // Fetch URL
    let fetchUrl = `/news/url/${urlId}/fetch/?url=${encodeURIComponent(url)}&model=${encodeURIComponent(selectedModel)}&text=${encodeURIComponent(inputText)}`;

    let resultContainer = $("#chat-output");
    resultContainer.html(""); // Clear previous content before fetching

    let fetchButton = $("button[onclick^='fetchDetails']"); // Select the button
    fetchButton.prop("disabled", true); // Disable button

    fetch(fetchUrl)
        .then(response => {
            if (!response.ok) {
                throw new Error("Error on network response");
            }
            const reader = response.body.getReader();
            const decoder = new TextDecoder();

            //////////////////////////////////////
            let accumulatedText = ""; // Store streamed text before rendering Markdown
            // Create a temporary container for streaming response
            let messageContainer = $('<div class="chat-message"></div>');
            //let messageContainer = $('');
            resultContainer.append(messageContainer);
            //////////////////////////////////////

            function read() {
                return reader.read().then(({ done, value }) => {
                    if (done) {
                        //////////////////////////////////////
                        messageContainer.html(marked.parse(accumulatedText));
                        //////////////////////////////////////
                        fetchButton.prop("disabled", false); // Re-enable button when done
                        return;
                    }

                    //////////////////////////////////////
                    // Decode the streamed chunk
                    let chunk = decoder.decode(value);
                    // Append to the accumulated text
                    accumulatedText += chunk;
                    // Render Markdown progressively (but safely)
                    messageContainer.html(marked.parse(accumulatedText));
                    //////////////////////////////////////

                    //////////////////////////////////////
                    // ORIGINAL:
                    //let text = decoder.decode(value).replace(/\n/g, "<br>");
                    //resultContainer.append(text); // Append streamed text
                    //////////////////////////////////////

                    resultContainer.scrollTop(resultContainer[0].scrollHeight); // Auto-scroll to bottom
                    return read();
                });
            }
            return read();
        })
        .catch(error => {
            resultContainer.html(`<p class="text-danger">Error fetching details: ${error.message}</p>`);
            fetchButton.prop("disabled", false); // Re-enable button on error
        })
        .finally(() => {
            // Hide the loading spinner after request is complete
            document.getElementById("loading-spinner").style.display = "none";
        });
}
</script>
<body>

    <!-- Main Content -->
    <div class="container mt-4">
        <h2>URL Details</h2>
        <table class="table table-bordered">
            <tr>
                <th>URL</th>
                <td><a href="{{ url_item.url }}" target="_blank">{{ url_item.url }}</a></td>
            </tr>
            <tr>
                <th>Fetch Date</th>
                <td>{{ url_item.ts_fetch }}</td>
            </tr>
            <tr>
                <th>Sources</th>
                <td>{{ sources|join:", " }}</td>
            </tr>
            <tr>
                <th>Status</th>
                <td>{{ url_item.status }}</td>
            </tr>
            <tr>
                <th>Title</th>
                <td>{{ url_content.title }}</td>
            </tr>
            <tr>
                <th>Description</th>
                <td>{{ url_content.description }}</td>
            </tr>
            <tr>
                <th>Content</th>
                <td>{{ url_content.content }}</td>
            </tr>
            <tr>
                <th>Tags</th>
                <td>{{ url_content.tags }}</td>
            </tr>
            <tr>
                <th>Authors</th>
                <td>{{ url_content.authors }}</td>
            </tr>
            <tr>
                <th>Image URLs</th>
                <td>{{ url_content.image_urls }}</td>
            </tr>
        </table>

        <!-- Independent form for optional values -->
        <form onsubmit="fetchDetailsWithSelection(event, {{ url_item.id }}, '{{ url_item.url }}')">
            <label for="options-{{ url_item.id }}">Model:</label>
            <select id="options-{{ url_item.id }}" class="form-control mb-2">
                <!-- <option value="">-- Select an option --</option> -->
                {% for model in models %}
                <option value="{{ model }}">{{ model }}</option>
                {% endfor %}
            </select>
        </form>

        <!-- Input field with a default value -->
        <label for="custom-input-{{ url_item.id }}">Prompt:</label>
        <textarea id="custom-input-{{ url_item.id }}" class="form-control mb-2" rows="3">{{ prompt }} {{ url_item.url }}</textarea>

        <!-- Fetch details button -->
        <button class="btn btn-primary" onclick="fetchDetails({{ url_item.id }}, '{{ url_item.url }}')">
            Fetch Details
        </button>

        <!-- Chatbot-style response box -->
        <div class="chat-box mt-3 p-3 border rounded">
            <div id="chat-output"></div>
        </div>

        <!-- Loading Spinner (Hidden by Default) -->
        <div id="loading-spinner" class="spinner-border text-primary mt-3" role="status" style="display: none;">
            <span class="visually-hidden">Loading...</span>
        </div>

    </div>

    <!-- Bootstrap JS -->
    <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>

    {% block extra_js %}{% endblock %}
</body>
</html>
@@ -1,8 +0,0 @@
from django import template

register = template.Library()

@register.filter
def dict_get(dictionary, key):
    """Custom filter to get a value from a dictionary in Django templates."""
    return dictionary.get(key, [])
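For reference, dict_get is just dict.get with an empty-list default, so templates can write {{ sources_map|dict_get:item.id }} after {% load custom_filters %} and get [] for unknown ids. A minimal sketch of that behavior outside a template (the mapping values below are hypothetical, not from this commit):

    sources_map = {1: ["rss", "crawler"]}   # hypothetical id -> sources mapping
    dict_get(sources_map, 1)                # ["rss", "crawler"]
    dict_get(sources_map, 99)               # [] - missing keys fall back to an empty list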
@@ -1,3 +0,0 @@
from django.test import TestCase

# Create your tests here.
@@ -1,8 +0,0 @@
from django.urls import path

from . import views

urlpatterns = [
    path("", views.news, name="home"),
    path('url/<int:id>/', views.url_detail_view, name='url_detail'),
    path('url/<int:id>/fetch/', views.fetch_details, name='fetch_details'),
]
@@ -1,104 +0,0 @@
from django.http import StreamingHttpResponse, HttpResponse, JsonResponse
from django.shortcuts import render, get_object_or_404
from django.core.paginator import Paginator
import requests
from django.http import StreamingHttpResponse
import json
import time
import ollama

from .models import Urls, Source, UrlsSource, UrlContent

# Create your views here.
def index(request):
    return HttpResponse("Hello, world. You're at the news index.")

def news(request):
    # URLs
    urls = Urls.objects.all()
    # Sources
    sources = Source.objects.all()

    # Parameters
    page_number = request.GET.get("page", 1)
    num_items = request.GET.get("items", 15)
    source_ids = request.GET.get("sources", ','.join([str(s.id) for s in sources]))
    status_filters = request.GET.get("status", None)

    # Filters
    if (status_filters) and (status_filters != "all"):
        urls = urls.filter(status__in=status_filters.split(","))
    if (source_ids) and (source_ids != "all"):
        # TODO: Distinct needed?
        urls = urls.filter(urlssource__id_source__in=source_ids.split(",")).distinct()

    # Pagination
    paginator = Paginator(urls, num_items)
    page_obj = paginator.get_page(page_number)

    # Map URL IDs to their sources, only for subset of URLs (page of interest)
    sources_map = {
        url.id: list(Source.objects.filter(urlssource__id_url=url).values_list('source', flat=True))
        for url in page_obj.object_list
    }

    context = {
        "page_obj": page_obj,
        "sources": sources,
        "sources_map": sources_map,
        "list_status": Urls.STATUS_ENUM.values,
        "list_urls_per_page": [15, 50, 100],
    }

    # If request is AJAX, return JSON response
    if request.headers.get("X-Requested-With") == "XMLHttpRequest":
        return JsonResponse({'items_html': render(request, 'item_list_partial.html', context).content.decode('utf-8')})

    return render(request, "item_list.html", context)
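The news() view above is driven entirely by query parameters, and its AJAX branch returns the rendered partial as JSON. A minimal sketch of exercising it outside the browser; the host, port, and parameter values are assumptions, not part of this commit:

    import requests

    params = {
        "page": 2,               # page number for the paginator
        "items": 50,             # URLs per page
        "sources": "1,3",        # comma-separated Source ids, or "all"
        "status": "valid,error", # comma-separated status values, or "all"
    }
    headers = {"X-Requested-With": "XMLHttpRequest"}  # triggers the JsonResponse branch
    resp = requests.get("http://localhost:8000/news/", params=params, headers=headers)
    items_html = resp.json()["items_html"]  # HTML fragment rendered from item_list_partial.html

Without the X-Requested-With header the same request returns the full item_list.html page.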

def url_detail_view(request, id):
    url_item = get_object_or_404(Urls, id=id)
    url_sources = list(Source.objects.filter(urlssource__id_url=url_item).values_list('source', flat=True))
    try:
        url_content = UrlContent.objects.get(pk=id)
    except UrlContent.DoesNotExist:
        url_content = {}

    # TODO: https://github.com/ollama/ollama-python?tab=readme-ov-file#async-client
    # LLM models available
    client = ollama.Client(host='https://ollamamodel.matitos.org')
    models = sorted([m.model for m in client.list().models])
    # default_model = "llama3.2:3b"

    context = {
        'url_item': url_item,
        'sources': url_sources,
        'models': models,
        #'default_model': default_model,
        'prompt': "Provide in one paragraph the what, why, when, where, who, and how of the content below. Also provide a one paragraph summary of the content:",
        #"prompt": "Imagine you are a journalist, TLDR in a paragraph:",
        #"prompt": "Below you will find the whole content of a news article:\n{}\nProvide a concise summary of one paragraph maximum of the content.".format(content)
        'url_content': url_content,
    }
    return render(request, 'url_detail.html', context)

def fetch_details(request, id):
    url_item = get_object_or_404(Urls, id=id)
    url_param = request.GET.get("url", "")  # Get URL
    model = request.GET.get("model", "")    # Get LLM model
    text = request.GET.get("text", "")      # Get LLM prompt

    # LLM
    client = ollama.Client(host='https://ollamamodel.matitos.org')

    def stream_response():
        msg_content = {
            "role": "user",
            "content": text,
        }
        response = client.chat(model=model, messages=[msg_content], stream=True)
        for chunk in response:
            yield chunk["message"]["content"]  # Stream each chunk of text

    return StreamingHttpResponse(stream_response(), content_type="text/plain")
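fetch_details streams the Ollama reply as plain text, so a caller simply reads the response incrementally. A minimal consumption sketch assuming a local dev server; the URL id, article URL, and model name are illustrative, not taken from this commit:

    import requests

    fetch_url = "http://localhost:8000/news/url/42/fetch/"  # hypothetical URL id
    params = {
        "url": "https://example.com/article",               # echoed into the prompt by the detail page
        "model": "llama3.2:3b",                             # any model reported by client.list()
        "text": "Summarize the article in one paragraph:",
    }
    with requests.get(fetch_url, params=params, stream=True) as resp:
        resp.raise_for_status()
        for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
            print(chunk, end="", flush=True)  # chunks arrive as stream_response() yields them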