Urls web visualization, cleaning obsolete code

This commit is contained in:
Luciano Gervasoni
2025-03-25 02:51:16 +01:00
parent 0c6b5f1ea4
commit 24b4614049
52 changed files with 371 additions and 3293 deletions

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -11,7 +11,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 2,
"metadata": {},
"outputs": [
{
@@ -20,17 +20,108 @@
"text": [
"db_postgres\n",
"db_redis\n",
"\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 1/0\n",
" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 0/0\n",
" ⠙ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.1s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠹ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.2s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠸ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.3s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠼ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠴ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.5s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠦ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.6s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠧ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.7s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠇ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.8s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠏ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.9s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠋ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠙ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.1s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠹ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.2s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠸ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.3s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠼ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.4s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠴ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.5s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠦ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.6s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠧ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m1.7s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/1\n",
" ⠇ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m\u001b[0m] 0B/0B Pulling \u001b[39m\u001b[0m \u001b[34m1.8s \u001b[0m\n",
" ⠋ b5b68a794063 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" ⠋ 764914624645 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" ⠋ 82780b9b6d69 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/4\n",
" ⠏ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m\u001b[0m] 0B/0B Pulling \u001b[39m\u001b[0m \u001b[34m1.9s \u001b[0m\n",
" ⠙ b5b68a794063 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" ⠙ 764914624645 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" ⠙ 82780b9b6d69 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.1s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/4\n",
" ⠋ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m\u001b[0m] 0B/0B Pulling \u001b[39m\u001b[0m \u001b[34m2.0s \u001b[0m\n",
" ⠹ b5b68a794063 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" ⠹ 764914624645 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" ⠹ 82780b9b6d69 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.2s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 0/4\n",
" ⠙ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m\u001b[0m] 0B/0B Pulling \u001b[39m\u001b[0m \u001b[34m2.1s \u001b[0m\n",
" ⠸ b5b68a794063 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.3s \u001b[0m\n",
" ⠸ 764914624645 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.3s \u001b[0m\n",
" ⠸ 82780b9b6d69 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.3s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 1/4\n",
" ⠹ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m⣿\u001b[0m] 0B/0B Pulling \u001b[39m\u001b[0m \u001b[34m2.2s \u001b[0m\n",
" \u001b[32m✔\u001b[0m b5b68a794063 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" ⠼ 764914624645 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" ⠼ 82780b9b6d69 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 2/4\n",
" ⠸ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m⣿⣿\u001b[0m] 166.8kB/16.38MB Pulling \u001b[39m\u001b[0m \u001b[34m2.3s \u001b[0m\n",
" \u001b[32m✔\u001b[0m b5b68a794063 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" \u001b[32m✔\u001b[0m 764914624645 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" ⠴ 82780b9b6d69 Downloading \u001b[39m 166.8kB/16.38MB\u001b[0m \u001b[34m0.5s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 2/4\n",
" ⠼ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m⣿⣿⣤\u001b[0m] 9.833MB/16.38MB Pulling \u001b[39m\u001b[0m \u001b[34m2.4s \u001b[0m\n",
" \u001b[32m✔\u001b[0m b5b68a794063 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" \u001b[32m✔\u001b[0m 764914624645 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" ⠦ 82780b9b6d69 Downloading \u001b[39m 9.833MB/16.38MB\u001b[0m \u001b[34m0.6s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 2/4\n",
" ⠴ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m⣿⣿\u001b[0m] 163.8kB/16.38MB Pulling \u001b[39m\u001b[0m \u001b[34m2.5s \u001b[0m\n",
" \u001b[32m✔\u001b[0m b5b68a794063 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" \u001b[32m✔\u001b[0m 764914624645 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" ⠿ 82780b9b6d69 Extracting \u001b[39m 163.8kB/16.38MB\u001b[0m \u001b[34m0.7s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 2/4\n",
" ⠦ matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m⣿⣿⣤\u001b[0m] 9.667MB/16.38MB Pulling \u001b[39m\u001b[0m \u001b[34m2.6s \u001b[0m\n",
" \u001b[32m✔\u001b[0m b5b68a794063 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" \u001b[32m✔\u001b[0m 764914624645 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" ⠿ 82780b9b6d69 Extracting \u001b[39m 9.667MB/16.38MB\u001b[0m \u001b[34m0.8s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 4/4\u001b[0m\n",
" \u001b[32m✔\u001b[0m matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m⣿⣿⣿\u001b[0m] 0B/0B Pulled \u001b[32m\u001b[0m \u001b[34m2.7s \u001b[0m\n",
" \u001b[32m✔\u001b[0m b5b68a794063 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" \u001b[32m✔\u001b[0m 764914624645 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n",
" \u001b[32m✔\u001b[0m 82780b9b6d69 Pull complete \u001b[32m\u001b[0m \u001b[34m0.9s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 0/0\n",
" ⠋ Container db_redis \u001b[39mCreating\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" ⠋ Container db_postgres \u001b[39mCreating\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" ⠋ Container dozzle \u001b[39mCreating\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 1/3\n",
" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" ⠿ Container dozzle \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 3/3\u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_redis \u001b[32mStarted\u001b[0m \u001b[34m0.2s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 1/4\n",
" Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" ⠿ Container dozzle \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 4/4\u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_redis \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container dozzle \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h"
]
@@ -42,7 +133,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -143,6 +234,7 @@
" # Feeds\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC', 'rss_feed');\" )\n",
" # Websites of interest\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('www.missingkids.org/poster', 'url_host');\" )\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('www.breitbart.com', 'url_host');\" )\n",
" # Search keywords\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search');\" )\n",
@@ -159,7 +251,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -211,7 +303,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -260,7 +352,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 6,
"metadata": {},
"outputs": [
{
@@ -285,7 +377,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 7,
"metadata": {},
"outputs": [
{

View File

@@ -1,46 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"conda create -n matitos_fetcher python=3.12\n",
"conda activate matitos_fetcher\n",
"conda install -c conda-forge curl\n",
"pip install ipykernel \"psycopg[binary]\" git+https://github.com/ranahaani/GNews.git GoogleNews duckduckgo_search newspaper4k numpy beautifulsoup4 requests feedparser pytz redis fastapi uvicorn fastapi-utils lxml[html_clean]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!uvicorn app:app --host 0.0.0.0 --port 5000 --reload"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "matitos_fetcher",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,17 +0,0 @@
FROM continuumio/miniconda3:25.1.1-2
# App repository
COPY . /opt/app/
RUN conda install -c conda-forge curl
RUN pip install --no-cache-dir --upgrade "psycopg[binary]" git+https://github.com/ranahaani/GNews.git GoogleNews duckduckgo_search newspaper4k numpy beautifulsoup4 requests feedparser pytz redis fastapi uvicorn fastapi-utils lxml[html_clean]
RUN pip freeze
# GoogleNews-1.6.10 Pillow-10.1.0 PyYAML-6.0.1 aiofiles-23.2.1 anyio-3.7.1 beautifulsoup4-4.9.3 bs4-0.0.1 click-8.1.7 cssselect-1.2.0 dateparser-1.2.0 dnspython-1.16.0 duckduckgo_search-3.9.8 fastapi-0.104.1 fastapi-utils-0.2.1 feedfinder2-0.0.4 feedparser-6.0.10 filelock-3.13.1 gnews-0.3.6 greenlet-3.0.1 h11-0.14.0 h2-4.1.0 hpack-4.0.0 httpcore-1.0.2 httpx-0.25.2 hyperframe-6.0.1 jieba3k-0.35.1 joblib-1.3.2 lxml-4.9.3 newspaper3k-0.2.8 nltk-3.8.1 numpy-1.26.2 psycopg-3.1.13 psycopg-binary-3.1.13 pydantic-1.10.13 pymongo-3.12.3 python-dateutil-2.8.2 python-dotenv-0.19.2 pytz-2023.3.post1 redis-5.0.1 regex-2023.10.3 requests-2.26.0 requests-file-1.5.1 sgmllib3k-1.0.0 six-1.16.0 sniffio-1.3.0 socksio-1.0.0 soupsieve-2.5 sqlalchemy-1.4.50 starlette-0.27.0 tinysegmenter-0.3 tldextract-5.1.1 typing-extensions-4.8.0 tzlocal-5.2 uvicorn-0.24.0.post1
WORKDIR /opt/app
# https://www.uvicorn.org/settings/#resource-limits
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "80"]
# docker build -t fetch_app .
# docker run --rm --name container_fetch_app fetch_app

View File

@@ -1,20 +0,0 @@
# Fetcher
```
conda create -n matitos_fetcher python=3.12
conda activate matitos_fetcher
conda install -c conda-forge curl
pip install ipykernel "psycopg[binary]" git+https://github.com/ranahaani/GNews.git GoogleNews duckduckgo_search newspaper4k numpy beautifulsoup4 requests feedparser pytz redis fastapi uvicorn fastapi-utils lxml[html_clean]
```
* Fetcher app
  - Exposes several endpoints, each triggering a specific type of fetch task (see the example request below)
  - For details, check the `/{fetch_type}` endpoints in [app.py](app.py)
* Build and run
  - Important: deploy alongside the other micro-services via [docker-compose.yml](../docker-compose.yml)
```
docker build -t fetch_app .
docker run --rm --name container_fetch_app fetch_app
```
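
A minimal sketch of how one of these endpoints could be triggered from Python. The base URL and port are assumptions (adjust to your deployment); app.py exposes GET `/{process_type}` routes such as `fetch_feeds` or `search_reduced`.
```
import requests

# Hypothetical base URL; adjust to wherever the fetcher container is reachable.
FETCHER_BASE_URL = "http://localhost:5000"

def trigger(process_type):
    """Trigger one of the fetcher's background tasks, e.g. 'fetch_feeds' or 'search_reduced'."""
    response = requests.get("{}/{}".format(FETCHER_BASE_URL, process_type), timeout=30)
    response.raise_for_status()
    return response.json()  # e.g. {"message": "Started fetch_feeds: Ok"}

print(trigger("fetch_feeds"))
```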

View File

@@ -1,79 +0,0 @@
from src.fetch_feed import FetchFeed
from src.fetch_parser import FetchParser
from src.fetch_search import FetchSearch
from src.missing_kids_fetch import MissingKidsFetch
from src.missing_kids_status import MissingKidsStatus
from src.url_status import UpdateErrorURLs
from src.db_utils import DB_Handler
import src.credentials as cred
from logging_ import get_logger
from fastapi import FastAPI, BackgroundTasks
##################################################################################################
logger = get_logger()
logger.info("Environment: {}".format(cred.ENVIRONMENT))
db_handler = DB_Handler(cred.db_connect_info, cred.redis_connect_info)
app = FastAPI()
@app.get("/")
def hello_world():
return {"message": "Ok"}
@app.get("/{process_type}")
async def process(background_tasks: BackgroundTasks, process_type: str):
# Concurrent job running
logger.info("Triggered: {}".format(process_type))
if (process_type == "fetch_feeds"):
task_run = FetchFeed(db_handler).run
elif (process_type == "fetch_parser"):
task_run = FetchParser(db_handler).run
elif (process_type == "search") or (process_type == "search_full"):
task_run = FetchSearch(cred.db_connect_info, cred.redis_connect_info, full=True).run
elif (process_type == "search_reduced"):
task_run = FetchSearch(cred.db_connect_info, cred.redis_connect_info, full=False).run
# Selenium based
elif (process_type == "fetch_missing_kids_reduced"):
task_run = MissingKidsFetch(db_handler, num_pages=4).run
elif (process_type == "fetch_missing_kids_full"):
task_run = MissingKidsFetch(db_handler, num_pages=100000).run
elif (process_type == "update_missing_kids_status_reduced"):
task_run = MissingKidsStatus(cred.db_connect_info, cred.redis_connect_info, num_urls=50).update_missing_kids_status
elif (process_type == "update_missing_kids_status_full"):
task_run = MissingKidsStatus(cred.db_connect_info, cred.redis_connect_info, num_urls=None).update_missing_kids_status
elif (process_type == "update_error_urls"):
task_run = UpdateErrorURLs(cred.db_connect_info, cred.redis_connect_info, num_urls=100).update_error_urls_status
else:
return {"message": "ERROR. Unknown fetcher type!"}
# Run task
background_tasks.add_task(task_run)
# Return message
return {"message": "Started {}: Ok".format(process_type)}
"""
# TODO: Instead of background tasks!
import rq
import redis
# Redis connection
redis_conn = redis.Redis(host='localhost', port=6379, db=0)
queue = rq.Queue(connection=redis_conn)
# ...
# Queue the processing task
dict_args= {"db_handler": db_handler, }
queue.enqueue(task_run, **dict_args)
# https://python-rq.org/
"""

View File

@@ -1,502 +0,0 @@
import psycopg
import redis
import traceback
import random
import requests
import json
import os
from .url_utils import process_article
from .logger import get_logger
logger = get_logger()
# TODO: URL_DB_HANDLER, _get_search_list, _get_url_host, _get_url_host_list, ...
# The rest, elsewhere
class DB_Handler():
def __init__(self, db_connect_info, redis_connect_info):
logger.debug("Initializing URL DB writer")
self.db_connect_info = db_connect_info
self.redis_instance = redis.Redis(host=redis_connect_info.get("host"), port=redis_connect_info.get("port"))
self.redis_expiry_seconds = redis_connect_info.get("expiry_seconds", 172800) # Default: 48 hours
try:
self.redis_instance.ping()
logger.debug("Succesfully pinged Redis")
except Exception as e:
logger.warning("Error trying to ping Redis: {}".format(str(e)))
def get_urls_count(self, last_minutes_check):
#####################
### Get number of URLs within last X minutes
#####################
try:
# Update
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
num_urls = cursor.execute("SELECT COUNT(*) FROM URLS WHERE ts_fetch >= current_timestamp - interval '{} minutes';".format(last_minutes_check)).fetchone()[0]
except Exception as e:
logger.warning("Error updating URLs status: {}".format(str(e)))
num_urls = None
return num_urls
def _get_url_host_list(self):
try:
with psycopg.connect(self.db_connect_info) as conn:
# List of URL host
list_url_host = [l[0] for l in conn.execute("SELECT url_host FROM WEBSITE_OF_INTEREST;").fetchall()]
# Clean http / https from URLs
list_url_host = [l.replace("https://", "").replace("http://", "") for l in list_url_host]
# Clean last slash if exists
list_url_host = [ l if not l.endswith("/") else l[:-1] for l in list_url_host]
except Exception as e:
logger.warning("Exception fetching URL host list: " + str(e))
list_url_host = []
return list_url_host
def _get_search_list(self):
try:
with psycopg.connect(self.db_connect_info) as conn:
# List of keyword searches
list_search_text = [l[0] for l in conn.execute("SELECT keyword_search FROM SEARCH;").fetchall()]
except Exception as e:
logger.warning("Exception fetching searches list: " + str(e))
list_search_text = []
return list_search_text
def _get_feed_urls(self):
try:
with psycopg.connect(self.db_connect_info) as conn:
list_url_feeds = conn.execute("SELECT rss_feed FROM FEED;").fetchall()
# Decode (tuple with 1 element)
list_url_feeds = [l[0] for l in list_url_feeds]
except Exception as e:
logger.warning("Exception fetching RSS sites: " + str(e))
list_url_feeds = []
return list_url_feeds
def _get_url_hosts(self):
try:
with psycopg.connect(self.db_connect_info) as conn:
list_url_hosts = conn.execute("SELECT url_host FROM WEBSITE_OF_INTEREST;").fetchall()
# Decode (tuple with 1 element)
list_url_hosts = [l[0] for l in list_url_hosts]
except Exception as e:
logger.warning("Exception fetching RSS sites: " + str(e))
list_url_hosts = []
return list_url_hosts
def _format(self, values):
# Replace single quote ' with ''. Based on https://stackoverflow.com/a/12320729
# String -> 'string', Int -> '1' (string-based), None -> NULL (no quotes for pgSQL to interpret Null value)
if (type(values) == list) or (type(values) == tuple):
insert_args = "(" + ", ".join([ "NULL" if v is None else "'" + str(v).replace("'", "''") + "'" for v in values]) + ")"
elif (type(values) == str):
insert_args = "({})".format( "NULL" if values is None else "'" + values.replace("'", "''") + "'" )
else:
logger.warning("Error formatting input values: {}".format(values))
assert False
return insert_args
def _get_cached_canonical_url(self, url):
### Redis: URL processed recently? -> Avoid increasing SERIAL counter & efficiency of DB
try:
filter_url = self.redis_instance.get(url)
if (filter_url is not None):
filter_url = filter_url.decode("utf-8")
except Exception as e:
logger.warning("Exception querying Redis: {}".format(str(e)))
filter_url = None
return filter_url
def _update_urls_status(self, dict_status_ids):
#####################
### Update status to array of URL IDs
#####################
try:
# Update
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
# Autocommit at end of transaction (Atomic insert of URLs and sources)
with conn.transaction() as tx:
for key_status, value_ids in dict_status_ids.items():
cursor.execute("UPDATE URLS SET status='{}' WHERE id IN ({});".format(key_status, ",".join([str(v) for v in value_ids])))
except Exception as e:
logger.warning("Error updating URLs status: {}".format(str(e)))
def _get_missing_kids_urls(self, num_urls=None):
#####################
### Get list of Missing Kids URLs
#####################
try:
missing_kids_ids_and_urls = []
if (num_urls is None):
limit = 500
else:
limit = num_urls
offset = 0
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
while True:
# Query
missing_kids_ids_and_urls_query = cursor.execute("SELECT id, url, status FROM URLS WHERE url LIKE '%missingkids.org/poster%' ORDER BY ts_fetch DESC LIMIT {} OFFSET {};".format(limit, offset)).fetchall()
# Finished?
if (len(missing_kids_ids_and_urls_query) == 0):
break
# Extend
missing_kids_ids_and_urls = missing_kids_ids_and_urls + missing_kids_ids_and_urls_query
# Offset
offset += len(missing_kids_ids_and_urls_query)
# Stop?
if (num_urls is not None) and (len(missing_kids_ids_and_urls) >= num_urls):
break
except Exception as e:
logger.warning("Error getting Missing Kids URLs: {}".format(str(e)))
missing_kids_ids_and_urls = []
return missing_kids_ids_and_urls
def _get_error_urls(self, num_urls=None):
#####################
### Get list of error-status URLs
#####################
try:
error_urls = []
if (num_urls is None):
limit = 500
else:
limit = num_urls
offset = 0
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
while True:
# Query
error_urls_query = cursor.execute("SELECT id, url FROM URLS WHERE status='error' ORDER BY ts_fetch DESC LIMIT {} OFFSET {};".format(limit, offset)).fetchall()
# Finished?
if (len(error_urls_query) == 0):
break
# Extend
error_urls = error_urls + error_urls_query
# Offset
offset += len(error_urls_query)
# Stop?
if (num_urls is not None) and (len(error_urls) >= num_urls):
break
except Exception as e:
logger.warning("Error getting Error URLs: {}".format(str(e)))
error_urls = []
return error_urls
def _decode_urls(self, urls_fetched, list_domains_to_filter, list_pattern_status_tuple): # TODO: language for urls_fetched...
"""
# TODO: REFACTOR
For each input url
Already processed?
-> Update on Redis expire time
-> Associate to source
Not processed? Get main URL:
-> URL Canonical valid?
-> Rely on this as main URL
-> URL Canonical not valid?
-> Use input url, unless it's a news.google.com link
-> If news.google.com link, filter out. REDIS?
Main URL processing:
-> Update in REDIS, association url -> url_canonical
-> url != url_canonical: Add in duplicate table
If both != news.google.com
"""
# URLs to insert, URLs duplicated association, URL to Canonical form
list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical = [], [], {}
# URL VS CANONICAL:
# News URL returned: https://news.google.com/articles/CBMifmh0dHBzOi8vd3d3LmJyZWl0YmFydC5jb20vMm5kLWFtZW5kbWVudC8yMDIzLzA0LzAzL2dvdi1kZXNhbnRpcy1zaWducy1iaWxsLW1ha2luZy1mbG9yaWRhLXRoZS0yNnRoLWNvbnN0aXR1dGlvbmFsLWNhcnJ5LXN0YXRlL9IBAA?hl=en-US&gl=US&ceid=US%3Aen
# Corresponds to canonical URL: https://www.breitbart.com/2nd-amendment/2023/04/03/gov-desantis-signs-bill-making-florida-the-26th-constitutional-carry-state/
for url in urls_fetched:
# Domain to filter? Input url
filter_due_to_domain = False
for domain_to_filter in list_domains_to_filter:
if (domain_to_filter in url):
logger.debug("Domain filter applied based on {} for input URL: {}".format(domain_to_filter, url))
filter_due_to_domain = True
if (filter_due_to_domain):
continue
# URL processed recently? -> Filter and avoid increasing SERIAL counter & efficiency of DB
cached_canonical_url = self._get_cached_canonical_url(url)
if (cached_canonical_url is not None):
# Even if url processed, need to add url_canonical to list_filtered_urls, so as to associate search source to canonical URL (canonical is the main URL entry)
dict_full_urls_to_canonical[url] = cached_canonical_url # X -> Y
# If url has been processed, so was its canonical form
logger.debug("Filtering out already inserted (processed) URL and its canonical form: {} {}".format(url, cached_canonical_url))
continue
# Process TODO: Add language...
url_canonical, article_elements, article_status = process_article(url, list_pattern_status_tuple)
# TODO: Store article_elements information to insert into OS after inserted into DB (and therefore having associated url_id)
# Could not retrieve redirection for news.google.com based URL? Continue (avoid inserting in DB)
if (url_canonical is None) and ("news.google.com" in url):
logger.debug("Filtering empty canonical link for base URL based on news.google.com: {}".format(url))
continue
# Canonical URL still news.google.com? Continue (avoid inserting in DB)
if (url_canonical is not None) and ("news.google.com" in url_canonical):
logger.debug("Filtering canonical news.google.com based URL: {}".format(url_canonical))
continue
# Domain to filter? Input canonical_url
filter_due_to_domain = False
for domain_to_filter in list_domains_to_filter:
if (url_canonical is not None) and (domain_to_filter in url_canonical):
filter_due_to_domain = True
if (filter_due_to_domain):
logger.info("Filtering due to domain input URL, Canonical_URL: {} {}".format(url, url_canonical))
continue
if (url_canonical is None) or (article_status == "error"):
logger.debug("Processing failed for URL: {}".format(url))
# Still insert URL with "error"? -> If processed later, might have inconsistent sources (url vs url_canonical). Only store if not news.google.com based
if ("news.google.com" in url) or ("consent.google.com" in url):
logging.debug("Not able to process Google News link, skipping: {}".format(url))
else:
dict_full_urls_to_canonical[url] = url # X -> X
list_insert_url_tuple_args.append( (url, article_status) )
continue
# URL was not processed (not sure canonical yet). Generate URL_CANONICAL <-> URL_ORIGINAL association if they're different
if (url_canonical != url):
list_tuple_canonical_duplicate_urls.append( (url_canonical, url) )
# Dict: url -> canonical (update association)
dict_full_urls_to_canonical[url] = url_canonical # X -> Y or X
# Canonical URL processed recently? -> Filter and avoid increasing SERIAL counter & efficiency of DB
if (self._get_cached_canonical_url(url_canonical) is not None):
# Canonical URL was already processed
logger.debug("Filtering out already inserted (processed) URL canonical: {}".format(url_canonical))
else:
# Insert url_canonical to DB formatted
list_insert_url_tuple_args.append( (url_canonical, article_status) )
# Canonical URL different? Process
if (url_canonical != url):
if ("news.google.com" in url) or ("consent.google.com" in url):
logging.debug("Not adding google.news.com based link, skipping: {}".format(url))
else:
# Fetched url -> duplicate (using canonical as main link)
article_status = "duplicate"
# Insert url (non-canonical) to DB formatted
list_insert_url_tuple_args.append( (url, article_status) )
return list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical
def _insert_urls(self, cursor, list_insert_url_tuple_args):
#####################
### Insert URLs with status
#####################
if (len(list_insert_url_tuple_args) > 0):
insert_args = ', '.join( [ self._format(t) for t in list_insert_url_tuple_args] )
# Insert. (url_1, status_1), (url_2, status_2), ...
sql_code = "INSERT INTO URLS {} VALUES {} ON CONFLICT (url) DO NOTHING;".format("(url, status)", insert_args)
# logger.debug("SQL CODE: {}".format(sql_code))
c = cursor.execute(sql_code)
# NOTE: Not using "RETURNING id" since previously inserted URLs are not returned (ON CONFLICT)
# https://stackoverflow.com/questions/35949877/how-to-include-excluded-rows-in-returning-from-insert-on-conflict/35953488#35953488
def _insert_urls_duplicated(self, cursor, list_tuple_canonical_duplicate_urls):
#####################
### Insert duplicated URLs
#####################
if (len(list_tuple_canonical_duplicate_urls) > 0):
# Flatten, format, set to remove duplicates
args_duplicated_urls_set = "(" + ', '.join( set( [ "'" + str(y).replace("'", "''") + "'" for x in list_tuple_canonical_duplicate_urls for y in x] ) ) + ")"
# Dict: url -> id
dict_url_to_id = {}
# Get url -> id association to populate duplicated URLs
for (id_, url_) in cursor.execute("SELECT id, url FROM URLS WHERE url IN {};".format(args_duplicated_urls_set)).fetchall():
dict_url_to_id[url_] = id_
# Convert tuples (url_canonical, url) -> (id_url_canonical, id_url) to insert in DB
# ORIGINAL CODE. Issue, might not have found association to all urls
### list_tuple_canonical_duplicate_urls_ids = [ (dict_url_to_id[t[0]], dict_url_to_id[t[1]]) for t in list_tuple_canonical_duplicate_urls]
list_tuple_canonical_duplicate_urls_ids = []
for (url_1, url_2) in list_tuple_canonical_duplicate_urls:
id_url_1, id_url_2 = dict_url_to_id.get(url_1), dict_url_to_id.get(url_2)
if (id_url_1 is None) or (id_url_2 is None):
logger.debug("Skipping duplicate association due to no url -> id_url mapping available for tuple: {} {}".format(url_1, url_2))
else:
list_tuple_canonical_duplicate_urls_ids.append( (id_url_1, id_url_2) )
if (len(list_tuple_canonical_duplicate_urls_ids) > 0):
insert_args = ', '.join( [ self._format(t) for t in list_tuple_canonical_duplicate_urls_ids] )
# Insert. (id_url_canonical_1, id_url_1), ...
sql_code = "INSERT INTO URLS_DUPLICATE {} VALUES {} ON CONFLICT DO NOTHING;".format("(id_url_canonical, id_url_duplicated)", insert_args)
# logger.debug("SQL CODE: {}".format(sql_code))
c = cursor.execute(sql_code)
def _get_pattern_status_list(self):
#####################
### Get list of domains to filter
#####################
# TODO: Cache on redis and query once every N hours? ...
try:
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
# TODO: Cache on Redis
list_pattern_status = cursor.execute("SELECT pattern, priority, status FROM STATUS_PATTERN_MATCHING;").fetchall()
except Exception as e:
logger.warning("Error getting pattern status list: {}".format(str(e)))
list_pattern_status = []
return list_pattern_status
def _get_domains_to_filter(self):
#####################
### Get list of domains to filter
#####################
# TODO: Cache on redis and query once every N hours? ...
try:
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
# TODO: Cache on Redis
sites_to_filter = [e[0] for e in cursor.execute("SELECT url_host FROM WEBSITE_TO_FILTER;").fetchall() ]
except Exception as e:
logger.warning("Error getting domains to filter: {}".format(str(e)))
sites_to_filter = []
return sites_to_filter
def _get_cached_source_id(self, source):
### Redis: URL processed recently? -> Avoid increasing SERIAL counter & efficiency of DB
try:
source_id = self.redis_instance.get(source)
if (source_id is not None):
source_id = source_id.decode("utf-8")
except Exception as e:
logger.warning("Exception querying Redis: {}".format(str(e)))
source_id = None
return source_id
def _get_source_id(self, cursor, source):
#####################
### Get source corresponding id
#####################
# Cached?
id_source = self._get_cached_source_id(source)
if (id_source is None):
c = cursor.execute("SELECT id FROM SOURCE WHERE source='{}'".format(source.replace("'", "''"))).fetchone()
if (c is None) or (len(c) == 0):
# Source does not exist, insert and get id
c = cursor.execute("INSERT INTO SOURCE (source) VALUES ('{}') RETURNING id;".format(source.replace("'", "''"))).fetchone()
# Decode source id
id_source = c[0]
# Cache
print("*"*10, source, id_source)
self.redis_instance.set(source, id_source, ex=self.redis_expiry_seconds)
return id_source
def _get_urls_id(self, cursor, urls_full):
#####################
### Get id of inserted and filtered URLs
#####################
# TODO: Cache url -> url_id, url_canonical
if (len(urls_full) == 0):
return []
# Get inserted and filtered URL ids (unnested). Filtered URLs are also retrieved since they might have been fetched from a new source
in_inserted_filtered_urls = "(" + ', '.join(["'" + u.replace("'", "''") + "'" for u in urls_full]) + ")"
id_urls_related = [ i[0] for i in cursor.execute("SELECT id FROM URLS WHERE url IN {};".format(in_inserted_filtered_urls)).fetchall() ]
return id_urls_related
def _insert_urls_source(self, cursor, id_urls_related, id_source):
#####################
### Insert URL sources: (id_url_1, id_source), (id_url_2, id_source), ...
#####################
if (len(id_urls_related) == 0) or (id_source is None):
return
columns = "(id_url, id_source)"
insert_args = ', '.join( [ self._format([id_url, id_source]) for id_url in id_urls_related ] )
# Insert
sql_code = "INSERT INTO URLS_SOURCE {} VALUES {} ON CONFLICT DO NOTHING;".format(columns, insert_args)
# logger.debug("SQL CODE: {}".format(sql_code))
c = cursor.execute(sql_code)
def write_batch(self, urls_fetched, source):
# Chunks of 50 elements
n = 50
# Divide in small chunks
urls_fetched_chunks = [urls_fetched[i:i + n] for i in range(0, len(urls_fetched), n)]
# Process
for urls_fetched_chunk_i in urls_fetched_chunks:
self._write_small_batch(urls_fetched_chunk_i, source)
def _write_small_batch(self, urls_fetched, source):
try:
logger.info("Fetched #{} URLs, source: {}".format(len(urls_fetched), source))
if (len(urls_fetched) == 0):
logger.debug("Empty batch of urls (not writing to DB) for source: {}".format(source))
return
# Shuffle URLs to reduce continuous URLs of same URL host (minimize chance of being blocked for too many continuous requests)
random.shuffle(urls_fetched)
# Get list of domains to filter
list_domains_to_filter = self._get_domains_to_filter()
# Get list of (pattern, priority, status) tuples to override status if required
list_pattern_status_tuple = self._get_pattern_status_list()
# Sort pattern tuples by priority
list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True)
# Process URLs to update DB
list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical = self._decode_urls(urls_fetched, list_domains_to_filter, list_pattern_status_tuple)
# Full set of URL and its canonical form (to associate them to a search), both to insert and filter
urls_full = set(dict_full_urls_to_canonical.keys()).union( set(dict_full_urls_to_canonical.values()) )
# Insert
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
# Autocommit at end of transaction (Atomic insert of URLs and sources)
with conn.transaction() as tx:
# Insert processed URLs
self._insert_urls(cursor, list_insert_url_tuple_args)
# Insert URLs duplicated (canonical != fetched url)
self._insert_urls_duplicated(cursor, list_tuple_canonical_duplicate_urls)
# Get source id in DB
id_source = self._get_source_id(cursor, source)
# Get IDs of all related URLs
id_urls_related = self._get_urls_id(cursor, urls_full)
# Insert search source associated to URLs
self._insert_urls_source(cursor, id_urls_related, id_source)
# Update Redis status of inserted and filtered URLs after writing to DB
for url, url_canonical in dict_full_urls_to_canonical.items():
try:
# Set with updated expiry time
self.redis_instance.set(url, url_canonical, ex=self.redis_expiry_seconds)
if (url != url_canonical):
self.redis_instance.set(url_canonical, url_canonical, ex=self.redis_expiry_seconds)
except Exception as e:
logger.warning("Exception running set in Redis: {}".format(str(e)))
if (len(list_insert_url_tuple_args) > 0):
try:
webhook_token = os.environ.get("CLIQ_WEBHOOK_TOKEN")
endpoint_message = "https://cliq.zoho.com/api/v2/channelsbyname/urlretrievalbot/message?zapikey={}".format(webhook_token)
payload = json.dumps({"text": "Fetched #{} new URLs, source: {}".format(len(list_insert_url_tuple_args), source) })
r = requests.post(endpoint_message, data=payload)
except Exception as e:
logger.warning("Webhook failed: {}".format(str(e)))
logger.debug("URL DB write finished")
except Exception as e:
logger.warning( "Exception writing to URL_DB:\n{}".format(traceback.format_exc()) )
logger.debug( "Exception --- List of URLs: {}".format(str(urls_fetched)) )

View File

@@ -1,48 +0,0 @@
from .db_utils import DB_Handler
import feedparser
import dateutil.parser
from .logger import get_logger
logger = get_logger()
class FetchFeed():
def __init__(self, db_handler: DB_Handler) -> None:
logger.debug("Initializing News feed")
self.db_handler = db_handler
def run(self):
try:
logger.debug("Starting NewsFeed.run()")
# Get feeds
list_url_feeds = self.db_handler._get_feed_urls()
logger.debug("Fetching news from feeds: {}".format(str(list_url_feeds)))
# Process via RSS feeds
for url_feed in list_url_feeds:
# Initialize
urls_fetched, urls_publish_date = [], []
# Fetch feeds
feeds = feedparser.parse(url_feed)
# Parse
for f in feeds.get("entries", []):
# Get URL
url = f.get("link", None)
# Process?
if (url is not None):
# Available publish date?
publish_date_parsed = f.get("published_parsed")
if (publish_date_parsed is None):
publish_date = f.get("published", None)
if (publish_date is not None):
publish_date_parsed = dateutil.parser.parse(publish_date)
# Published date
urls_publish_date.append(publish_date_parsed)
# URL
urls_fetched.append(url)
# URL fetching source
source = "feed {}".format(url_feed)
# Write to DB
self.db_handler.write_batch(urls_fetched, source)
except Exception as e:
logger.warning("Exception in NewsFeed.run(): {}".format(str(e)))

View File

@@ -1,45 +0,0 @@
from .db_utils import DB_Handler
import newspaper
from .logger import get_logger
logger = get_logger()
class FetchParser():
def __init__(self, db_handler: DB_Handler) -> None:
logger.debug("Initializing News SiteParsing newspaper4k")
self.db_handler = db_handler
# TODO: MOVE LOGIC ELSEWHERE!
def _postprocess(self, article_urls):
return [url.replace("#comment-stream", "") for url in article_urls]
def run(self):
try:
logger.debug("Starting NewsSiteParsing.run() for {}")
# Get URL hosts
list_url_hosts = self.db_handler._get_url_hosts()
logger.info("Fetching news by parsing URL hosts: {}".format(str(list_url_hosts)))
# Process newspaper4k build method
for url_host_feed in list_url_hosts:
# Protocol
if not (url_host_feed.startswith("http")):
url_host_feed_formatted = "https://" + url_host_feed
else:
url_host_feed_formatted = url_host_feed
logger.debug("Fetching newspaper4k parsing based on URL: {}".format(url_host_feed_formatted))
# Source object
url_host_built = newspaper.build(url_host_feed_formatted)
# Get articles URL list
urls_fetched = url_host_built.article_urls()
# TODO: MOVE!
# Post-processing
urls_fetched = self._postprocess(urls_fetched)
# URL fetching source
source = "newspaper4k {}".format(url_host_feed)
# Write to DB
self.db_handler.write_batch(urls_fetched, source)
except Exception as e:
logger.warning("Exception in NewsSiteParsing.run(): {}".format(str(e)))

View File

@@ -1,73 +0,0 @@
from .db_utils import DB_Handler
from .utils import get_searxng_instances
from .fetch_search_sources import FetcherDuckDuckGo, FetcherGNews, FetcherGoogleNews, FetcherSearxNews, FetcherPreSearch
from .logger import get_logger
logger = get_logger()
class FetchSearch():
def __init__(self, db_handler: DB_Handler, full=True) -> None:
logger.debug("Initializing News feed")
self.db_handler = db_handler
self.full_search = full
def _run_fetching(self, search_text):
logger.debug("Starting _run_fetching() for {}".format(search_text))
# Common parameters
lang, region = "en", "US"
### PreSearch
dict_params_news = {"search": search_text}
FetcherPreSearch(**dict_params_news).fetch_articles(self.db_handler)
### DuckDuckGo
period = "d"
dict_params_news = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "news", "period": period}
FetcherDuckDuckGo(**dict_params_news).fetch_articles(self.db_handler)
dict_params_general = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "general", "period": period}
FetcherDuckDuckGo(**dict_params_general).fetch_articles(self.db_handler)
if (self.full_search):
# Avoid site:{} search due to G-Bypass required time
if ("site:" not in search_text):
### GNews
dict_params = {"search": search_text, "lang": "wt", "region": "wt", "period": period}
FetcherGNews(**dict_params).fetch_articles(self.db_handler)
### GoogleNews
dict_params_news = {"search": search_text, "lang": lang, "region": region, "search_category": "news", "period": period}
FetcherGoogleNews(**dict_params_news).fetch_articles(self.db_handler)
# dict_params_general = {"search": search_text, "lang": lang, "region": region, "search_category": "general", "period": period}
if False:
### SearxNG
period = "day"
for searx_instance in get_searxng_instances():
dict_params_news = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "news", "period": period}
dict_params_general = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "general", "period": period}
# Append thread
FetcherSearxNews(**dict_params_news).fetch_articles(self.db_handler)
FetcherSearxNews(**dict_params_general).fetch_articles(self.db_handler)
logger.debug("Finished _run_fetching()")
def run(self):
try:
logger.info("Fetching text searches & URL hosts of interest")
# Get text searches of interest
list_search_text_of_interest = self.db_handler._get_search_list()
# Get URL host of interest
list_url_host = self.db_handler._get_url_host_list()
# Get text searches for URL hosts
list_search_text_url_host = ["site:{}".format(l) for l in list_url_host]
for search_text in list_search_text_of_interest + list_search_text_url_host:
logger.debug("Fetching news for search: {}".format(search_text))
self._run_fetching(search_text)
logger.info("Finished fetching text searches & URL hosts of interest")
except Exception as e:
logger.warning("Exception in NewsSearch.run(): {}".format(str(e)))

View File

@@ -1,384 +0,0 @@
from duckduckgo_search import DDGS
from gnews import GNews
from GoogleNews import GoogleNews
import requests
from bs4 import BeautifulSoup
import os
import time
import json
import numpy as np
import random
from .google_bypass import GoogleByPass
from abc import ABC, abstractmethod
from .logger import get_logger
logger = get_logger()
# Generic fetcher (fetches articles, writes to DB)
class FetcherAbstract(ABC):
@abstractmethod
def _fetch(self):
pass
def fetch_articles(self, db_writer):
logger.debug("Starting fetch() for {}".format(self.name))
# Fetch articles
list_news = self._fetch()
logger.info("Found #{} articles for search: {}".format(len(list_news), self.name))
# Write to DB
db_writer.write_batch(list_news, self.name)
# https://techblog.willshouse.com/2012/01/03/most-common-user-agents/
user_agents_list = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/111.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; rv:111.0) Gecko/20100101 Firefox/111.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0",
"Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.44",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 OPR/96.0.0.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 OPR/97.0.0.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48",
"Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.34",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.39",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.2 Safari/605.1.15",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; rv:112.0) Gecko/20100101 Firefox/112.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.51",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/112.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/110.0",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0",
"Mozilla/5.0 (X11; CrOS x86_64 14541.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.0.2246 Yowser/2.5 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.6.1 Safari/605.1.15",
"Mozilla/5.0 (Windows NT 6.1; rv:102.0) Gecko/20100101 Goanna/6.0 Firefox/102.0 PaleMoon/32.0.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.41",
"Mozilla/5.0 (Windows NT 10.0; rv:110.0) Gecko/20100101 Firefox/110.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 YaBrowser/23.1.5.708 Yowser/2.5 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
]
class FetcherPreSearch(FetcherAbstract):
def __init__(self, search):
"""
# period ->
- h = hours (eg: 12h)
- d = days (eg: 7d)
- m = months (eg: 6m)
- y = years (eg: 1y)
"""
self.search = search
self.period = "1d" # TODO Fixed for the moment
# self.lang = lang
# self.region = region
search_category = "news"
self.name = "presearch {} {} {}".format(search, search_category, self.period)
def _fetch(self):
try:
# PreSearch fetching endpoint, parameter search keyword
presearch_fetch_endpoint = "http://selenium_app:80/fetch_presearch/?search_keyword={}".format(self.search)
# Timeout: 15 minutes
r = requests.get(presearch_fetch_endpoint, timeout=900)
# Decode
list_news = json.loads(r.text).get("list_urls", [])
except Exception as e:
logger.warning("Timeout on request: {}. {}".format(presearch_fetch_endpoint, str(e)))
list_news = []
return list_news
class FetcherGNews(FetcherAbstract):
def __init__(self, search, period, lang="en", region="US"):
"""
# period ->
- h = hours (eg: 12h)
- d = days (eg: 7d)
- m = months (eg: 6m)
- y = years (eg: 1y)
"""
self.search = search
self.period = period
self.lang = lang
self.region = region
search_category = "news"
self.name = "gnews {} {} {} {}".format(search, search_category, period, "{}-{}".format(lang, region))
def _fetch(self):
try:
list_dict_news = GNews(self.lang, self.region, period=self.period).get_news(self.search)
# Decode
list_news = []
for l in list_dict_news:
list_news.append(l.get("url"))
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
list_news = []
# Bypass Google links
list_news_redirections = GoogleByPass().bypass_google_urls(list_news)
return list_news_redirections
class FetcherGoogleNews(FetcherAbstract):
def __init__(self, search, search_category="news", period="1d", lang="en", region="US"):
assert(search_category in ["news", "general"])
self.lang = lang
self.region = region
self.period = period
self.search_category = search_category
self.search = search
self.name = "googlenews {} {} {} {}".format(search, search_category, period, "{}-{}".format(lang, region))
def _fetch(self):
try:
# Initialize
g = GoogleNews(encode="utf-8", period=self.period, lang=self.lang, region=self.region)
g.enableException(True)
if (self.search_category == "general"):
set_links = set()
# Search
g.search(self.search)
# Iterate pages
MAX_ITER_PAGES = 15
for i in range(MAX_ITER_PAGES):
time.sleep(random.uniform(1, 1.5))
num_before = len(set_links)
# Get page
try:
links = g.page_at(i)
except Exception as e:
logger.warning("Exception fetching page in GoogleNews {}: {}".format(self.name, str(e)))
break
# Links
for l in links:
# '/url?esrc=s&q=&rct=j&sa=U&url=https://www.breitbart.com/news/scent-of-luxury-indias-jasmine-infuses-global-perfume/&ved=2ahUKEwjOybGSiN-AAxX1gv0HHfqSBpMQxfQBegQICBAC&usg=AOvVaw06GdoHyzPbIopUaEuUSQPQ'
url = l.get("link").split("url=")[-1]
set_links.add(url)
num_after = len(set_links)
# Finished?
if (num_before == num_after):
logger.debug("Iterated {} pages on GoogleNews general search".format(i))
break
# To list
list_news = list(set_links)
elif (self.search_category == "news"):
# Search
g.get_news(self.search)
# Fetch
list_news = g.get_links()
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
list_news = []
# Bypass Google links
list_news_redirections = GoogleByPass().bypass_google_urls(list_news)
return list_news_redirections
class FetcherDuckDuckGo(FetcherAbstract):
def __init__(self, search, search_category, period, lang="wt", region="wt"):
assert(search_category in ["news", "general"])
assert(period in ["d", "w", "m", "y"])
self.search = search
self.search_category = search_category
self.period = period
self.lang_region = "{}-{}".format(lang, region)
self.name = "duckduckgo {} {} {} {}".format(search, search_category, "1{}".format(period), region)
def _fetch(self):
try:
list_news = []
with DDGS(timeout=10) as ddgs:
if (self.search_category == "general"):
generator_links = ddgs.text(keywords=self.search, timelimit=self.period, region=self.lang_region)
elif (self.search_category == "news"):
generator_links = ddgs.news(keywords=self.search, timelimit=self.period, region=self.lang_region)
for l in generator_links:
list_news.append( l.get("url", l.get("href")) )
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
list_news = []
return list_news
class FetcherSearxNews(FetcherAbstract):
def __init__(self, search="child abuse", searx_instance="https://serx.ml/", lang="en", region="US", search_category="news", period="day"):
assert(search_category in ["news", "general"])
assert(period in [None, "day", "week", "month", "year"])
# Random header (minimize probability of web-scraping detection)
self.headers = {
'User-agent': str(np.random.choice(user_agents_list)),
'Accept-Encoding': 'gzip, deflate',
'Accept': '*/*',
'Connection': 'keep-alive',
}
""" # Optional header
self.headers = {
'User-agent': str(np.random.choice(user_agents_list)),
'Accept-Encoding': 'gzip, deflate, br',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'TE': 'trailers',
'Sec-Fetch-Site': 'cross-site',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Dest': 'document',
}
"""
self.search = search
self.searx_instance = searx_instance
self.lang_region = "{}-{}".format(lang, region)
self.search_category = search_category
self.period = period
self.t_sleep_lower, self.t_sleep_higher = 0.5, 1.5
self.request_timeout = 240
period_name_mapping = {
None: "no_date_range",
"day": "1d",
"week": "1w",
"month": "1m",
"year": "1y",
}
self.name = "searxng {} {} {} {} {}".format(searx_instance.replace("https://", "").replace("/", ""), search, search_category, period_name_mapping[period], self.lang_region)
logger.info("SearX - Initialized SearX fetcher: {}".format(self.name))
def _request_and_decode(self, url_search):
# Initial random time sleep (minimize chance of getting blocked)
time.sleep(random.uniform(self.t_sleep_lower, self.t_sleep_higher))
# Request
logger.debug("SearX - Searching: {}".format(url_search))
try:
r = requests.get(url_search, headers=self.headers, timeout=self.request_timeout)
except Exception as e:
logger.warning("SearX - Exception in request: {}".format(url_search), "\n", str(e))
return []
if (r.status_code == 200):
# Status code OK
logger.debug("SearX - Status code: {}".format(r.status_code))
elif (r.status_code == 429):
# TooManyRequests, "Rate limit exceeded"
logger.warning("SearX {} - Too many requests while running: {}. Request output: {}".format(self.name, r.url, r.text))
return []
else:
# Any other status code: log and give up on this search
logger.warning("SearX {} - Status code: {}. Request output: {}".format(self.name, r.status_code, r.text))
return []
# Decode request
soup = BeautifulSoup(r.text, 'html.parser')
page_url_set = set()
# h3 links
for elem in soup.find_all('h3'):
# Get url
url = elem.find('a').get('href')
page_url_set.add(url)
return page_url_set
def _get_news_list(self):
############################################################
# Domain & search parameter
search_domain = os.path.join(self.searx_instance, "search?q=")
# Search keywords
search_formatted = self.search.replace(" ", "+").replace(":", "%3A")
# Period formatted
period_formatted = "&time_range={}".format(self.period) if self.period is not None else ""
# Search parameters
search_parameters = "&category_{}=on&language={}{}".format(self.search_category, self.lang_region, period_formatted)
# Combined url search
url_search_nopage = "{}{}{}".format(search_domain, search_formatted, search_parameters)
############################################################
# Request and decode on page=1
url_set = self._request_and_decode(url_search_nopage)
# No results?
if (len(url_set) == 0):
logger.warning("SearX {} - Empty results on search: {}".format(self.name, url_search_nopage))
return []
# Iterate pages
search_numpage = 2
while True:
# Combine url search with page number
url_search_with_page = "{}&pageno={}".format(url_search_nopage, search_numpage)
# Request and decode on page=X
url_set_i = self._request_and_decode(url_search_with_page)
# Length before merging
length_current = len(url_set)
# Merge
url_set = url_set.union(url_set_i)
# Length after merging
length_merged = len(url_set)
# No new elements?
if (length_current == length_merged):
logger.debug("SearX {} - Finished processing search, #pages: {}".format(self.name, search_numpage))
break
# Next page
search_numpage += 1
return list(url_set)
def _fetch(self):
try:
# Fetch news
list_news = self._get_news_list()
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
list_news = []
return list_news
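For reference, a minimal sketch (not part of the original module) of the search URL that FetcherSearxNews._get_news_list() assembles; the instance and keywords are illustrative placeholders:
# Illustrative only: mirrors the string concatenation in _get_news_list()
searx_instance = "https://searx.be/"   # placeholder instance
search = "child abuse"                 # placeholder keywords
lang_region, period = "en-US", "day"
url_page_1 = "{}search?q={}&category_news=on&language={}&time_range={}".format(
    searx_instance, search.replace(" ", "+").replace(":", "%3A"), lang_region, period)
url_page_2 = "{}&pageno=2".format(url_page_1)
# -> https://searx.be/search?q=child+abuse&category_news=on&language=en-US&time_range=day&pageno=2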

View File

@@ -1,26 +0,0 @@
import requests
import json
from .logger import get_logger
logger = get_logger()
class GoogleByPass():
def __init__(self) -> None:
pass
def bypass_google_urls(self, list_urls):
if (len(list_urls) == 0):
return []
try:
# Endpoint
gbypass_endpoint = "http://selenium_app:80/get_redirection"
# Timeout: 20 minutes
timeout = 60*20
r = requests.post(gbypass_endpoint, json={"list_urls": list_urls}, timeout=timeout)
# Decode
list_urls_redirections = json.loads(r.text).get("list_urls_redirections", [])
except Exception as e:
logger.warning("Exception on request: {}. {}".format(gbypass_endpoint, str(e)))
list_urls_redirections = []
return list_urls_redirections
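A hedged usage sketch of the class above; the input URL is a placeholder, and the output depends entirely on what the selenium_app service returns:
# Hypothetical usage of GoogleByPass (endpoint and response key come from the code above)
resolver = GoogleByPass()
redirected = resolver.bypass_google_urls([
    "https://news.google.com/articles/<encoded-article-id>",  # placeholder Google News URL
])
# On success: a list of resolved publisher URLs; on any request/decoding error: []
print(redirected)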

View File

@@ -1,22 +0,0 @@
import logging
import logging.handlers
import os
os.makedirs("logs", exist_ok=True)
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
logger.setLevel(logging.INFO)
# To file log: INFO / WARNING / ERROR
fh = logging.handlers.RotatingFileHandler(filename="logs/log_app_fetcher.log", mode="a", maxBytes=10000000, backupCount=4)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
logger.addHandler(fh)
# To file log: WARNING / ERROR
fh_ = logging.handlers.RotatingFileHandler(filename="logs/log_app_fetcher_error.log", mode="a", maxBytes=10000000, backupCount=1)
fh_.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh_.setLevel(logging.WARNING)
logger.addHandler(fh_)
def get_logger():
return logger

View File

@@ -1,36 +0,0 @@
from .db_utils import DB_Handler
import requests
import json
from .logger import get_logger
logger = get_logger()
class MissingKidsFetch():
def __init__(self, db_handler: DB_Handler, num_pages) -> None:
logger.debug("Initializing News MissingKids")
self.db_handler = db_handler
self.num_pages = num_pages
self.missingkids_fetch_endpoint = "http://selenium_app:80/get_missing_kids/?pages={}"
def run(self):
try:
logger.debug("Starting NewsMissingKids.run()")
try:
# Timeout
if (self.num_pages > 15):
timeout = 60*90 # 1.5h
else:
timeout = 60*5 # 5 min
# Request
r = requests.get(self.missingkids_fetch_endpoint.format(self.num_pages), timeout=timeout)
# Decode
urls_fetched = json.loads(r.text).get("list_urls", [])
except Exception as e:
logger.warning("Timeout on request: {}. {}".format(missingkids_fetch_endpoint, str(e)))
urls_fetched = []
# URL fetching source
source = "missingkids fetcher"
# Write to DB
self.db_handler.write_batch(urls_fetched, source)
except Exception as e:
logger.warning("Exception in NewsMissingKids.run(): {}".format(str(e)))

View File

@@ -1,98 +0,0 @@
from .db_utils import URL_DB_Writer
from .logger import get_logger
logger = get_logger()
def get_missing_kid_status(url, return_canonical_url=False):
import time
import requests
# Sleep
time.sleep(0.75)
try:
# Request
r = requests.get(url, timeout=300)
# Decode
status_code = r.status_code
# Canonical URL removing parameters
url_canonical = r.url
except Exception as e:
logger.warning("Exception on get URL status request: {}. {}".format(url, str(e)))
status_code = None
url_canonical = url
if (status_code == 200):
status = "valid"
elif (status_code == 404):
status = "invalid"
else:
status = "unknown"
logger.debug("Missing Kid URL {} status: {}".format(url, status))
if (return_canonical_url):
return status, url_canonical
else:
return status
class MissingKidsStatus():
def __init__(self, db_connect_info, redis_connect_info, num_urls) -> None:
self.num_urls = num_urls
self.db_connect_info = db_connect_info
self.redis_connect_info = redis_connect_info
self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info)
def update_missing_kids_status(self):
try:
logger.info("Starting updating status to Missing Kids URLs, limit #URLs: {}".format(self.num_urls))
# List of URLs
list_ids_and_urls = self.db_writer._get_missing_kids_urls(self.num_urls)
# Dict: status -> IDs to update to new status
dict_status_ids, dict_status_urls = {}, {}
# Check URLs with invalid status?
skip_invalid_check = False
flush_every, flush_current = 20, 0
# Iterate URLs
for (id, url, current_status) in list_ids_and_urls:
# Skip duplicate URLs
if (current_status == "duplicate"):
continue
# Skip invalid URLs?
if (skip_invalid_check):
if (current_status == "invalid"):
continue
# Get status
new_status = get_missing_kid_status(url)
# Different? Update
if (current_status != new_status):
# Extend array
dict_status_ids[new_status] = dict_status_ids.get(new_status, []) + [id]
# Debugging dict
dict_status_urls[new_status] = dict_status_urls.get(new_status, []) + [url]
# +1 processed
flush_current += 1
# Flush batch?
if (flush_every == flush_current):
logger.info("Updating status to Missing Kids URLs: {}".format(dict_status_urls))
# Update DB
self.db_writer._update_urls_status(dict_status_ids)
# Reset
flush_current = 0
dict_status_ids, dict_status_urls = {}, {}
# Flush remaining batch
if (flush_current > 0):
logger.info("Updating status to Missing Kids URLs: {}".format(dict_status_urls))
# Update DB
self.db_writer._update_urls_status(dict_status_ids)
# Reset
flush_current = 0
dict_status_ids, dict_status_urls = {}, {}
logger.info("Finished updating status to Missing Kids URLs")
except Exception as e:
logger.warning("Exception in MissingKidsStatus.run(): {}".format(str(e)))

View File

@@ -1,62 +0,0 @@
from .db_utils import URL_DB_Writer
from .url_utils import process_article
from .logger import get_logger
logger = get_logger()
class UpdateErrorURLs():
def __init__(self, db_connect_info, redis_connect_info, num_urls) -> None:
self.num_urls = num_urls
self.db_connect_info = db_connect_info
self.redis_connect_info = redis_connect_info
self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info)
def update_error_urls_status(self):
try:
logger.info("Starting updating status to URLs with error, limit #URLs: {}".format(self.num_urls))
# List of URLs with status 'error'
list_ids_and_urls = self.db_writer._get_error_urls(self.num_urls)
# Current status
current_status = "error"
# Dict: status -> IDs to update to new status
dict_status_ids, dict_status_urls = {}, {}
# Get list of (pattern, priority, status) tuples to override status if required
list_pattern_status_tuple = self.db_writer._get_pattern_status_list()
# Sort pattern tuples by priority
list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True)
flush_every, flush_current = 20, 0
# Iterate URLs
for (id, url) in list_ids_and_urls:
# Get status
url_canonical, article_elements, new_status = process_article(url, list_pattern_status_tuple)
# Different? Update
if (current_status != new_status):
# Extend array
dict_status_ids[new_status] = dict_status_ids.get(new_status, []) + [id]
# Debugging dict
dict_status_urls[new_status] = dict_status_urls.get(new_status, []) + [url]
# +1 processed
flush_current += 1
# Flush batch?
if (flush_every == flush_current):
logger.info("Updating status to URLs with error: {}".format(dict_status_urls))
# Update DB
self.db_writer._update_urls_status(dict_status_ids)
# Reset
flush_current = 0
dict_status_ids, dict_status_urls = {}, {}
# Flush remaining batch
if (flush_current > 0):
logger.info("Updating status to URLs with error: {}".format(dict_status_urls))
# Update DB
self.db_writer._update_urls_status(dict_status_ids)
# Reset
flush_current = 0
dict_status_ids, dict_status_urls = {}, {}
logger.info("Finished updating status to URLs with error")
except Exception as e:
logger.warning("Exception in UpdateErrorURLs.run(): {}".format(str(e)))

View File

@@ -1,262 +0,0 @@
from gnews import GNews
import dateutil.parser
from datetime import datetime, timedelta
from .utils import remove_http_s
import time
import random
import traceback
import requests
import json
import re
from bs4 import BeautifulSoup
from .logger import get_logger
logger = get_logger()
def get_published_date(article):
try:
"""
# Already fetched publish date information?
if (publish_date_ is not None):
return publish_date_
"""
# List of potential publish dates
potential_dates = []
# Publish date is the best match
potential_dates.append(article.publish_date)
# Publish date metadata is the following best match
potential_dates.append(article.meta_data.get('article', {}).get("published_time", None))
# Iterate remaining keys
for key in article.meta_data.keys():
if ("date" in key):
potential_dates.append(article.meta_data[key])
def invalid_date(p_date):
# Today + 2 days, article from the future?
today_plus_two = datetime.utcnow() + timedelta(days=2)
# Article from the future?
return p_date.timestamp() > today_plus_two.timestamp()
for date_ in potential_dates:
# String date? parse
if (type(date_) == str):
try:
date_ = dateutil.parser.parse(date_)
except Exception as e:
logger.info("Invalid date found while parsing potential date: {} for URL: {}".format(date_, article.url))
date_ = None
# Valid?
if (date_ is not None) and (not invalid_date(date_)):
return date_
logger.debug("Article with no published date: {}".format(article.url))
return None
except Exception as e:
logger.info("Error while retrieving published date for URL: {}".format(article.url))
return None
def get_url_host(article_source_url, url):
# https://www.blabla.com/blabla -> www.blabla.com
if (article_source_url != ""):
# Article source URL already extracted, save path if any
return remove_http_s(article_source_url) # .split("/")[0]
else:
return remove_http_s(url).split("/")[0]
def get_status_pattern_matching(url, article_status, list_pattern_status_tuple):
# Regex pattern to update status on "valid", "invalid", and "unknown" status only
# Status "raw", "duplicated" and "error" should remain the way they are
# Assumption: List of patterns sorted by importance
if (article_status in ["valid", "invalid", "unknown"]):
# Regular expression pattern matching: https://regexr.com/
for regex_pattern, regex_priority, status_if_match in list_pattern_status_tuple:
# Matching?
matching = bool(re.match(regex_pattern, url))
# Update article status
if (matching):
if (status_if_match != article_status):
logger.debug("Regex pattern found, updating status from '{}' to '{}' for URL: {}".format(article_status, status_if_match, url))
return status_if_match
# Pattern matching not required or not found, original article status
return article_status
def bypass_google_link(article_url):
def bypass_google_consent(article_url):
# Sample URL: https://consent.google.com/m?continue=https://news.google.com/rss/articles/CBMiMGh0dHBzOi8vd3d3Lm1pc3NpbmdraWRzLm9yZy9wb3N0ZXIvbmNtYy84NjAxMTkvMdIBAA?oc%3D5&gl=NL&m=0&pc=n&cm=2&hl=en-US&src=1
article_url_no_consent = article_url.replace("https://consent.google.com/m?continue=", "")
# https://stackoverflow.com/questions/76063646/how-can-i-have-redirection-link-from-google-news-link-using-requests
headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}
cookies = {'CONSENT': 'YES+cb.20220419-08-p0.cs+FX+111'}
try:
# Request
r = requests.get(article_url_no_consent, headers=headers, cookies=cookies, timeout=300)
# Decode
soup = BeautifulSoup(r.text, 'html.parser')
url_of_interest = soup.a['href']
except Exception as e:
logger.warning("Exception on request trying to G_bypass with headers: {}. {}".format(article_url_no_consent, str(e)))
url_of_interest = None
# Not able to bypass?
if (url_of_interest == "") or ("support.google.com" in url_of_interest) or ("news.google.com" in url_of_interest):
url_of_interest = None
return url_of_interest
def bypass_google_using_service(article_url):
try:
# e.g.: url = "https://news.google.com/articles/CBMiX2h0dHBzOi8vd3d3LmZveGJ1c2luZXNzLmNvbS9wb2xpdGljcy9kaXNuZXktc3Vlcy1mbG9yaWRhLWdvdi1yb24tZGVzYW50aXMtbG9zcy1zcGVjaWFsLWRpc3RyaWN00gEA?hl=en-US&gl=US&ceid=US%3Aen"
gbypass_endpoint = "http://selenium_app:80/get_redirection"
# Timeout: 5 minutes
r = requests.post(gbypass_endpoint, json={"url": article_url}, timeout=300)
# Decode
redirect_url = json.loads(r.text).get("redirect_url", "")
except Exception as e:
logger.warning("Exception on request: {}. {}".format(gbypass_endpoint, str(e)))
redirect_url = ""
return redirect_url
logger.debug("Starting gbypass_endpoint()")
article_url_bypassed = None
# Bypass using request
if ("consent.google.com" in article_url):
article_url_bypassed = bypass_google_consent(article_url)
# Not bypassed yet? Bypass using service
if (article_url_bypassed is None):
article_url_bypassed = bypass_google_using_service(article_url)
# if (article_url_bypassed is None) or (article_url_bypassed == "") or ("news.google.com" in article_url_bypassed):
if (article_url_bypassed == "") or (article_url_bypassed is None):
# Empty URL returned by Gbypass
logger.warning("Error while bypassing Gnews for URL: {}".format(article_url))
return None
else:
logger.debug("Correctly bypassed GNews to URL_redirect, from URL: {} {}".format(article_url_bypassed, article_url))
return article_url_bypassed
def process_article(article_url, list_pattern_status_tuple, language="en"):
# TODO:
"""
https://github.com/fhamborg/news-please
https://github.com/fhamborg/Giveme5W1H
https://github.com/santhoshse7en/news-fetch
"""
try:
logger.debug("Starting process_article()")
if ("news.google.com" in article_url) or ("consent.google.com" in article_url):
# Bypass to get redirection
article_url = bypass_google_link(article_url)
# Error?
if (article_url is None):
return None, {}, "error"
elif ("missingkids.org/poster" in article_url):
# Get status
article_status, url_canonical = get_missing_kid_status(article_url, return_canonical_url=True)
article_elements = {
"url_full": article_url,
"url_canonical": url_canonical
}
return url_canonical, article_elements, article_status
else:
# Avoid Too many requests (feeds, ...)
time.sleep(0.75)
logger.debug("Processing: {}".format(article_url))
# Default status unless something happens
article_status = "valid"
# Parse article
# TODO: :param proxy: The proxy parameter is a dictionary with a single key-value pair. self._proxy = {'http': proxy, 'https': proxy} if proxy else None
# TODO: Language per config
article = GNews(language).get_full_article(url=article_url)
# Article parsed?
if (article is None) or (not article.is_parsed):
logger.debug("Article not parsed: {}".format(article_url))
return article_url, {}, "error"
# Canonical link as main URL
url_canonical = article.canonical_link
# Empty canonical URL?
if (article.canonical_link is None) or (article.canonical_link == ""):
# URL with parameters? e.g. some zerohedge news fetched from newspaper3k end with #comment-stream -> Remove extra parameter in link
if ("?" in article.url) or (article.url.endswith("#comment-stream")) or (article.url.endswith("#disqus_thread")):
logger.debug("Article URL contains parameters, trying to clean URL: {}".format(article.url))
try:
# Remove text after parameter call
url = article.url.split("?")[0]
# Remove comment-stream
url = url.replace("#comment-stream", "").replace("#disqus_thread", "")
# Article
article_attempt = GNews(language).get_full_article(url=url)
# Retrieving same title? Update article based on clean URL
if (article_attempt is not None) and (article_attempt.title == article.title):
article = article_attempt
except Exception as e:
logger.info("Article parsing of URL without parameters failed: {}".format(article.url))
else: # Default behaviour
logger.debug("Article canonical link is empty, assuming URL=URL_CANONICAL: {}".format(article.url))
# By default, URL same as canonical
url_canonical = article.url
elif (article.url != article.canonical_link):
# If different, stick to canonical URL
logger.debug("Article URL and canonical link are different: {} {}".format(article.url, article.canonical_link))
else:
# If same, continue...
pass
# Update config to determine if content is valid
article.config.MIN_WORD_COUNT = 150
article.config.MIN_SENT_COUNT = 6
# Valid URL?
if (not article.is_valid_url()):
logger.debug("Not a valid news article: {}".format(url_canonical))
article_status = "invalid"
# Is the article's body text long enough to meet standard article requirements?
if (not article.is_valid_body()):
logger.debug("Article body not valid: {}".format(url_canonical))
article_status = "unknown"
if (article.images != article.imgs):
logger.debug("Article images and imgs are different: {} {}".format(article.images, article.imgs))
# article.keywords, article.meta_keywords, article.summary
# article.movies
# article.top_image
# Check if article status needs to be updated
article_status = get_status_pattern_matching(url_canonical, article_status, list_pattern_status_tuple)
article_elements = {
'url_full': article.url, # https://www.breitbart.com/tech/2022/10/03/report-election-integrity-project-worked-with-feds-to-censor-news-sites-in-2020/
'url_host': get_url_host(article.source_url, url_canonical), # www.breitbart.com
'title': article.title, # Report: Election Integrity Partnership Worked with Feds to Censor News Sites in 2020
'description': article.meta_description, # Coalition committed to respond in early 2022 but failed to do so, while Labor has not issued a full response since taking office
'text': article.text, # ${Article content}
'published_date': get_published_date(article), # python.datetime format, obtained from "YYYY-MM-DD" or '2022-10-03T20:54:17+00:00'
'authors': article.authors, # ['Christopher Knaus']
'language': article.meta_lang, # en
'tags': list(article.tags), # ['Wide Open Border', 'My Son Hunter Movie', ...]
'images': list(article.images), # [URL_IMAGE_1, URL_IMAGE_2, ...]
'url_canonical': url_canonical, # Canonical URL (redirection)
# 'html': article.html, # HTML article
}
logger.debug("Processing OK: {}".format(url_canonical))
return url_canonical, article_elements, article_status
except Exception as e:
logger.warning("Exception processing url: {}\n{}".format(article_url, traceback.format_exc()))
return None, {}, "error"

View File

@@ -1,33 +0,0 @@
def remove_http_s(url):
url = url.replace("https://", "") if url.startswith("https://") else url
url = url.replace("http://", "") if url.startswith("http://") else url
return url
def is_valid_url(url):
return url.startswith("https://")
def get_searxng_instances():
# SearxNG instances: https://searx.space/
searx_instances = set()
searx_instances.add("https://searx.work/")
searx_instances.add("https://search.ononoki.org/")
searx_instances.add("https://searxng.nicfab.eu/")
searx_instances.add("https://searx.be/")
# searx_instances.add("https://searx.fmac.xyz/")
# searx_instances.add("https://northboot.xyz/") # FIX
# searx_instances.add("https://serx.ml/") # Offline
# searx_instances.add("https://searx.ru/")
# searx_instances.add("https://searx.sp-codes.de/")
# searx_instances.add("https://searxng.nicfab.eu/")
# searx_instances.add("https://s.frlt.one/")
# searx_instances.add("https://search.sapti.me/")
# To list
list_searx_instances = list(searx_instances)
return list_searx_instances

app_selenium/README.md Normal file
View File

@@ -0,0 +1,3 @@
* Missing kids posters fetch (num_pages=X)
* ...

View File

@@ -17,7 +17,7 @@ class Search(models.Model):
db_table = 'search'
def __str__(self):
return "[{}]->{}".format(self.type, self.search)
return "[{}: {}]".format(self.type, self.search)
class Source(models.Model):
id = models.SmallAutoField(primary_key=True)

View File

@@ -130,7 +130,7 @@ class DB_Handler():
# Get or create URL with canonical form
obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
# Get the source-search IDs associated to obj_url.id
list_url_source_search = UrlsSourceSearch.objects.fiter(id_url=obj_url)
list_url_source_search = UrlsSourceSearch.objects.filter(id_url=obj_url)
for obj_url_source_search in list_url_source_search:
# Associate same sources to url_canonical (it might already exist)
UrlsSourceSearch.objects.get_or_create(id_url=obj_url_canonical, id_source=obj_url_source_search.id_source, id_search=obj_url_source_search.id_search)

View File

@@ -9,7 +9,7 @@
<script>
function getQueryString(pageNumber, itemsNumber, sources, statuses){
function getQueryString(pageNumber, itemsNumber, sources, searches, statuses){
// Query parameters. If input is null, get most recent value
let queryParams = new URLSearchParams(window.location.search);
// page
@@ -21,6 +21,9 @@
// sources
if (sources == null) sources = queryParams.get("sources") ?? "all";
queryParams.set("sources", sources);
// searches
if (searches == null) searches = queryParams.get("searches") ?? "all";
queryParams.set("searches", searches);
// status
if (statuses == null) statuses = queryParams.get("status") ?? "all";
queryParams.set("status", statuses);
@@ -33,11 +36,11 @@
return queryParamsString;
}
function loadPage(pageNumber, itemsNumber, sources, statuses) {
function loadPage(pageNumber, itemsNumber, sources, searches, statuses) {
$("#item-list").fadeTo(100, 0.5); // Smooth fade effect
$("#loading").show();
queryParamsString = getQueryString(pageNumber, itemsNumber, sources, statuses);
queryParamsString = getQueryString(pageNumber, itemsNumber, sources, searches, statuses);
$.ajax({
url: "?" + queryParamsString,
@@ -58,7 +61,7 @@
$(document).on("click", ".pagination a", function (event) {
event.preventDefault();
let page = $(this).attr("data-page");
loadPage(pageNumber=page, itemsNumber=null, sources=null, statuses=null);
loadPage(pageNumber=page, itemsNumber=null, sources=null, searches=null, statuses=null);
});
$(document).ready(function () {
@@ -68,25 +71,63 @@
////////////////////////////////////////////////////////////////////////////
const sourcesToggleAll = $("#toggle-all-sources");
const sourcesCheckboxes = $(".source-checkbox");
const searchesToggleAll = $("#toggle-all-searches");
const searchesCheckboxes = $(".search-checkbox");
const statusesToggleAll = $("#toggle-all-status");
const statusCheckboxes = $(".status-checkbox");
function updateFilters() {
// Get selected sources
let selectedSources = sourcesCheckboxes.filter(":checked").map(function () {
if (sourcesToggleAll.prop("checked")) {
selectedSources = "all";
}
else {
if (sourcesCheckboxes.filter(":checked").length > 0 ){
selectedSources = sourcesCheckboxes.filter(":checked").map(function () {
return $(this).val();
}).get().join(",");
}
else {
selectedSources = "none";
}
}
// Get selected searches
if (searchesToggleAll.prop("checked")) {
selectedSearches = "all";
}
else {
if (searchesCheckboxes.filter(":checked").length > 0 ){
selectedSearches = searchesCheckboxes.filter(":checked").map(function () {
return $(this).val();
}).get().join(",");
}
else {
selectedSearches = "none";
}
}
// Get selected URL statuses
let selectedStatuses = statusCheckboxes.filter(":checked").map(function () {
if (statusesToggleAll.prop("checked")) {
selectedStatuses = "all";
}
else {
if (statusCheckboxes.filter(":checked").length > 0 ){
selectedStatuses = statusCheckboxes.filter(":checked").map(function () {
return $(this).val();
}).get().join(",");
}
else {
selectedStatuses = "none";
}
}
// Get selected items per page
let selectedItems = $("input[name='items']:checked").val();
// Update pagination and reload data
loadPage(1, selectedItems, selectedSources, selectedStatuses);
loadPage(1, selectedItems, selectedSources, selectedSearches, selectedStatuses);
}
////////////////////////////////////////////////////////////////////////////
@@ -101,6 +142,15 @@
sourcesToggleAll.prop("checked", sourcesCheckboxes.length === sourcesCheckboxes.filter(":checked").length);
updateFilters();
});
// Searches
searchesToggleAll.on("change", function () {
searchesCheckboxes.prop("checked", searchesToggleAll.prop("checked"));
updateFilters();
});
searchesCheckboxes.on("change", function () {
searchesToggleAll.prop("checked", searchesCheckboxes.length === searchesCheckboxes.filter(":checked").length);
updateFilters();
});
// Status
statusesToggleAll.on("change", function () {
statusCheckboxes.prop("checked", statusesToggleAll.prop("checked"));
@@ -121,11 +171,15 @@
// Sources
sourcesCheckboxes.each(function () { $(this).prop("checked", true); });
sourcesToggleAll.prop("checked", true);
// Searches
searchesCheckboxes.each(function () { $(this).prop("checked", true); });
searchesToggleAll.prop("checked", true);
// Statuses
statusCheckboxes.each(function () { $(this).prop("checked", true); });
statusesToggleAll.prop("checked", true);
// Items
$("input[name='items'][value='" + 15 + "']").prop("checked", true);
// $("input[name='items'][value='" + 15 + "']").prop("checked", true);
// loadPage(pageNumber=page, itemsNumber=null, sources=null, searches=null, statuses=null);
});
////////////////////////////////////////////////////////////////////////////
@@ -148,6 +202,23 @@
let savedTheme = localStorage.getItem("theme") ||
(window.matchMedia("(prefers-color-scheme: dark)").matches ? "dark" : "light");
setTheme(savedTheme);
// Local browser timestamp aware for ts_fetch print
document.querySelectorAll(".timestamp").forEach(function (el) {
const ts = el.getAttribute("data-ts");
if (ts) {
const options = {
day: "2-digit",
month: "2-digit",
year: "numeric",
hour: "2-digit",
minute: "2-digit",
second: "2-digit",
hour12: false // Use 24-hour format
}; // "en-GB" for DD-MM-YYYY
const localDate = new Date(ts).toLocaleString("en-GB", options); // Adjust to browser's timezone
el.innerHTML = `${localDate}`;
}
});
});
////////////////////////////////////////////////////////////////////////////
</script>
@@ -174,6 +245,9 @@
box-shadow: 2px 0 5px rgba(0, 0, 0, 0.1);
padding: 15px;
transition: width 0.3s ease;
/* Enable scrolling */
overflow-y: auto;
max-height: 100vh;
}
#sidebar .nav-link {
@@ -313,10 +387,10 @@
}
th:nth-child(1), td:nth-child(1) { width: 50%; } /* URL column */
th:nth-child(2), td:nth-child(2) { width: 20%; } /* Fetch Date */
th:nth-child(3), td:nth-child(3) { width: 20%; } /* Sources */
th:nth-child(4), td:nth-child(4) { width: 5%; } /* Status */
th:nth-child(5), td:nth-child(5) { width: 5%; } /* Action */
th:nth-child(2), td:nth-child(2) { width: 27.5%; } /* Fetch Date */
th:nth-child(3), td:nth-child(3) { width: 10%; } /* Sources */
th:nth-child(4), td:nth-child(4) { width: 10%; } /* Searches */
th:nth-child(5), td:nth-child(5) { width: 2.5%; } /* Status */
/* ============================= */
/* Pagination Styling */
@@ -408,32 +482,22 @@
</button>
</div>
<!-- Sources -->
<!-- URLs per page -->
<div class="nav-item mt-3">
<strong>Select sources</strong>
<form id="source-filter-form">
<!-- Toggle All Checkbox -->
<div class="form-check">
<input class="form-check-input" type="checkbox" id="toggle-all-sources">
<label class="form-check-label fw-bold" for="toggle-all-sources">
Toggle all
</label>
</div>
<!-- Individual Source Checkboxes -->
{% for source in sources %}
<div class="form-check">
<input class="form-check-input source-checkbox" type="checkbox" value="{{ source.id }}" id="source-{{ source.id }}">
<label class="form-check-label" for="source-{{ source.id }}">
{{ source.source }}
</label>
<strong>URLs per page</strong>
<div class="card-body">
<!-- Individual Status Checkboxes -->
{% for url_per_page in list_urls_per_page %}
<div class="items-form-check">
<input class="form-check-input items" type="radio" name="items" id="value-{{ url_per_page }}" value="{{ url_per_page }}">
<label class="form-check-label" for="value-{{ url_per_page }}">{{ url_per_page }}</label>
</div>
{% empty %}
<tr>
<td colspan="2" class="text-center">No sources available.</td>
<td colspan="2" class="text-center">No options available.</td>
</tr>
{% endfor %}
</form>
</div>
</div>
<!-- Status -->
@@ -457,6 +521,33 @@
</label>
</div>
{% empty %}
<tr>
<td colspan="2" class="text-center">No statuses available.</td>
</tr>
{% endfor %}
</form>
</div>
<!-- Sources -->
<div class="nav-item mt-3">
<strong>Select sources</strong>
<form id="source-filter-form">
<!-- Toggle All Checkbox -->
<div class="form-check">
<input class="form-check-input" type="checkbox" id="toggle-all-sources">
<label class="form-check-label fw-bold" for="toggle-all-sources">
Toggle all
</label>
</div>
<!-- Individual Source Checkboxes -->
{% for source in sources %}
<div class="form-check">
<input class="form-check-input source-checkbox" type="checkbox" value="{{ source.id }}" id="source-{{ source.id }}">
<label class="form-check-label" for="source-{{ source.id }}">
{{ source.source }}
</label>
</div>
{% empty %}
<tr>
<td colspan="2" class="text-center">No sources available.</td>
</tr>
@@ -464,24 +555,32 @@
</form>
</div>
<!-- URLs per page -->
<!-- Searches -->
<div class="nav-item mt-3">
<strong>URLs per page</strong>
<div class="card-body">
<!-- Individual Status Checkboxes -->
{% for url_per_page in list_urls_per_page %}
<div class="items-form-check">
<input class="form-check-input items" type="radio" name="items" id="value-{{ url_per_page }}" value="{{ url_per_page }}">
<label class="form-check-label" for="value-{{ url_per_page }}">{{ url_per_page }}</label>
<strong>Select searches</strong>
<form id="search-filter-form">
<!-- Toggle All Checkbox -->
<div class="form-check">
<input class="form-check-input" type="checkbox" id="toggle-all-searches">
<label class="form-check-label fw-bold" for="toggle-all-searches">
Toggle all
</label>
</div>
<!-- Individual Search Checkboxes -->
{% for search in searches %}
<div class="form-check">
<input class="form-check-input search-checkbox" type="checkbox" value="{{ search.id }}" id="search-{{ search.id }}">
<label class="form-check-label" for="search-{{ search.id }}">
[{{ search.type }}] {{ search.search }}
</label>
</div>
{% empty %}
<tr>
<td colspan="2" class="text-center">No options available.</td>
<td colspan="2" class="text-center">No search available.</td>
</tr>
{% endfor %}
</form>
</div>
</div>
</ul>

View File

@@ -7,15 +7,18 @@
<th scope="col"><strong>URL</strong></th>
<th scope="col"><strong>Fetch date</strong></th>
<th scope="col"><strong>Sources</strong></th>
<th scope="col"><strong>Search</strong></th>
<th scope="col"><strong>Status</strong></th>
<th scope="col"><strong>Action</strong></th>
</tr>
</thead>
<tbody>
{% for item in page_obj %}
<tr>
<td><a href="{{ item.url }}/" target="_blank">{{ item.url }}</a></td>
<td>{{ item.ts_fetch }}</td>
<td>
<a href="./{{ item.id }}" class="btn btn-primary btn-sm" target="_blank"></a>
<a href="{{ item.url }}/" target="_blank">{{ item.url }}</a>
</td>
<td class="timestamp" data-ts="{{ item.ts_fetch|date:'c' }}">{{ item.ts_fetch }}</td>
<td>
{% with sources_map|dict_get:item.id as sources %}
{% if sources %}
@@ -27,6 +30,17 @@
{% endif %}
{% endwith %}
</td>
<td>
{% with searches_map|dict_get:item.id as searches %}
{% if searches %}
{% for search in searches %}
<span class="badge bg-secondary">{{ search }}</span>
{% endfor %}
{% else %}
<span class="text-muted">No searches</span>
{% endif %}
{% endwith %}
</td>
<td>
{% if item.status == 'raw' %}
<span class="badge bg-secondary">{{ item.status|capfirst }}</span>
@@ -44,10 +58,6 @@
<span class="badge bg-light">Unknown</span>
{% endif %}
</td>
<td>
<a href="url/{{ item.id }}" class="btn btn-primary btn-sm" target="_blank">Details</a>
</td>
</tr>
{% empty %}
<tr>

View File

@@ -54,7 +54,7 @@
}
// Fetch URL
let fetchUrl = `/news/url/${urlId}/fetch/?url=${encodeURIComponent(url)}&model=${encodeURIComponent(selectedModel)}&text=${encodeURIComponent(inputText)}`;
let fetchUrl = `/api/url/${urlId}/fetch/?url=${encodeURIComponent(url)}&model=${encodeURIComponent(selectedModel)}&text=${encodeURIComponent(inputText)}`;
let resultContainer = $("#chat-output");
resultContainer.html(""); // Clear previous content before fetching
@@ -100,12 +100,6 @@
messageContainer.html(marked.parse(accumulatedText));
//////////////////////////////////////
//////////////////////////////////////
// ORIGINAL:
//let text = decoder.decode(value).replace(/\n/g, "<br>");
//resultContainer.append(text); // Append streamed text
//////////////////////////////////////
resultContainer.scrollTop(resultContainer[0].scrollHeight); // Auto-scroll to bottom
return read();
});
@@ -135,12 +129,16 @@
</tr>
<tr>
<th>Fetch Date</th>
<td>{{ url_item.ts_fetch }}</td>
<td>{{ url_item.ts_fetch }} UTC</td>
</tr>
<tr>
<th>Sources</th>
<th>Source</th>
<td>{{ sources|join:", " }}</td>
</tr>
<tr>
<th>Search</th>
<td>{{ searches|join:", " }}</td>
</tr>
<tr>
<th>Status</th>
<td>{{ url_item.status }}</td>
@@ -175,7 +173,6 @@
<form onsubmit="fetchDetailsWithSelection(event, {{ url_item.id }}, '{{ url_item.url }}')">
<label for="options-{{ url_item.id }}">Model:</label>
<select id="options-{{ url_item.id }}" class="form-control mb-2">
<!-- <option value="">-- Select an option --</option> -->
{% for model in models %}
<option value="{{ model }}">{{ model }}</option>
{% endfor %}
@@ -186,21 +183,23 @@
<label for="custom-input-{{ url_item.id }}">Prompt:</label>
<textarea id="custom-input-{{ url_item.id }}" class="form-control mb-2" rows="3">{{ prompt }} {{ url_item.url }}</textarea>
<div class="d-flex align-items-center">
<!-- Fetch details button -->
<button class="btn btn-primary" onclick="fetchDetails({{ url_item.id }}, '{{ url_item.url }}')">
Fetch Details
</button>
<!-- Loading Spinner (Hidden by Default) -->
<div id="loading-spinner" class="spinner-border text-primary ms-2" role="status" style="display: none;">
<span class="visually-hidden">Loading...</span>
</div>
</div>
<!-- Chatbot-style response box -->
<div class="chat-box mt-3 p-3 border rounded">
<div id="chat-output"></div>
</div>
<!-- Loading Spinner (Hidden by Default) -->
<div id="loading-spinner" class="spinner-border text-primary mt-3" role="status" style="display: none;">
<span class="visually-hidden">Loading...</span>
</div>
</div>
<!-- Bootstrap JS -->

View File

@@ -3,7 +3,7 @@ from . import views
urlpatterns = [
path('', views.link_list, name='link_list'),
path('url/', views.news, name='url_detail'),
path('url/', views.urls, name='url_detail'),
path('url/<int:id>/', views.url_detail_view, name='url_detail'),
path('url/<int:id>/fetch/', views.fetch_details, name='fetch_details'),
path('task/<str:task>', views.trigger_task, name='trigger_task'),

View File

@@ -18,62 +18,78 @@ def link_list(request):
prefix = "http://localhost:8000/api/task"
links = ["fetch_feeds", "fetch_parser", "fetch_search", "process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_500000"]
db_links = ["http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id&limit=500"]
return JsonResponse({"links": ["http://localhost:8000/api/url"] + db_links + [os.path.join(prefix, l) for l in links]})
list_links = [
# DB
"http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id&limit=500",
# Admin panel
"http://localhost:8000/admin",
# URLs
"http://localhost:8000/api/url",
# API tasks
] + [os.path.join(prefix, l) for l in links]
# Json
return JsonResponse({"links": list_links })
from django.http import StreamingHttpResponse, HttpResponse, JsonResponse
from django.http import StreamingHttpResponse, JsonResponse
from django.shortcuts import render, get_object_or_404
from django.core.paginator import Paginator
import requests
from django.http import StreamingHttpResponse
import json
import time
import ollama
from .models import Urls, Source, Search, UrlsSourceSearch, UrlContent
from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch
# Create your views here.
def news(request):
def urls(request):
# URLs
urls = Urls.objects.all()
# Sources
sources = Source.objects.all()
seaerches = Search.objects.all()
searches = Search.objects.all()
# Parameters
page_number = request.GET.get("page", 1)
num_items = request.GET.get("items", 15)
source_ids = request.GET.get("sources", ','.join([str(s.id) for s in sources]))
search_ids = request.GET.get("searches", ','.join([str(s.id) for s in searches]))
status_filters = request.GET.get("status", None)
# Filters
if (status_filters) and (status_filters != "all"):
if (status_filters == "none"):
urls = []
else:
urls = urls.filter(status__in=status_filters.split(","))
if (source_ids) and (source_ids != "all"):
# TODO: Distinct needed?
# urls = urls.filter(urlssource__id_source__in=source_ids.split(",")).distinct()
pass
if (source_ids == "none"):
urls = []
else:
urls = urls.filter(urlssourcesearch__id_source__in=source_ids.split(",")) # .distinct()
if (search_ids) and (search_ids != "all"):
if (search_ids == "none"):
urls = []
else:
urls = urls.filter(urlssourcesearch__id_search__in=search_ids.split(",")) # .distinct()
# Pagination
paginator = Paginator(urls, num_items)
page_obj = paginator.get_page(page_number)
# Map URL IDs to their sources, only for subset of URLs (page of interest)
sources_map= {}
"""
# Map URL IDs to their sources & searches, only for subset of URLs (page of interest)
sources_map = {
url.id: list(Source.objects.filter(urlssource__id_url=url).values_list('source', flat=True))
for url in page_obj.object_list
url.id: list(Source.objects.filter(urlssourcesearch__id_url=url).distinct()) for url in page_obj.object_list
}
searches_map = {
url.id: list(Search.objects.filter(urlssourcesearch__id_url=url).distinct()) for url in page_obj.object_list
}
"""
context = {
"page_obj": page_obj,
"sources": sources,
"searches": searches,
"sources_map": sources_map,
"searches_map": searches_map,
"list_status": Urls.STATUS_ENUM.values,
"list_urls_per_page": [15, 50, 100],
"list_urls_per_page": [15, 100, 500],
}
# If request is AJAX, return JSON response
@@ -83,32 +99,54 @@ def news(request):
return render(request, "item_list.html", context)
class OllamaClient():
def __init__(self):
self.client = ollama.Client(host=os.getenv("ENDPOINT_OLLAMA", "https://ollamamodel.matitos.org"))
def _get_default_model(self):
return "gemma3:1b"
def get_models(self):
models = sorted([m.model for m in self.client.list().models])
if (self._get_default_model() in models):
return [self._get_default_model()] + [m for m in models if m != self._get_default_model()]
else:
return models
def get_prompt(self):
return "Provide a summary of the content below, avoid mentioning the source of information, and only answer with the summary. The summary needs to be brief and compact, consisting of one paragraph."
#return "Explain in a single and compact paragraph the what, why, when, where, who, and how of the content below. Also provide a single paragraph summary of the content:"
#return "Provide in one paragraph the what, why, when, where, who, and how of the content below. Also provide a one paragraph summary of the content:"
#return "Provide two summaries of the content below, and avoid mentioning the source of information. First, provide a very brief and compact paragraph summary. Second, provide a larger and more detailed summary, which describe the what, why, when, where, who, and how of the content:"
# return "Imagine you are a journalist, TLDR in a paragraph. Only answer with the summary:"
#return "Below you will find the whole content of a news article:\n{}\nProvide a concise summary of one paragraph maximum of the content.".format(content)
def url_detail_view(request, id):
url_item = get_object_or_404(Urls, id=id)
url_sources = list(Source.objects.filter(urlssource__id_url=url_item).values_list('source', flat=True))
url_sources = list(Source.objects.filter(urlssourcesearch__id_url=url_item).distinct())
url_searches = list(Search.objects.filter(urlssourcesearch__id_url=url_item).distinct())
# url_source_search = UrlsSourceSearch.objects.filter(id_url=url_item)
try:
url_content = UrlContent.objects.get(pk=id)
except UrlContent.DoesNotExist:
url_content = {}
# TODO: https://github.com/ollama/ollama-python?tab=readme-ov-file#async-client
# LLM models available
client = ollama.Client(host = 'https://ollamamodel.matitos.org')
models = sorted([m.model for m in client.list().models])
# default_model = "llama3.2:3b"
ollama = OllamaClient()
context = {
'url_item': url_item,
'sources': url_sources,
'models': models,
#'default_model': default_model,
'prompt': "Provide in one paragraph the what, why, when, where, who, and how of the content below. Also provide a one paragraph summary of the content:",
#"prompt": "Image you are a journalist, TLDR in a paragraph:",
#"prompt": "Below you will find the whole content of a news article:\n{}\nProvide a concise summary of one paragraph maximum of the content.".format(content)
'searches': url_searches,
'models': ollama.get_models(),
'prompt': ollama.get_prompt(),
'url_content': url_content,
}
return render(request, 'url_detail.html', context)
# TODO: move to ollamajs...
def fetch_details(request, id):
url_item = get_object_or_404(Urls, id=id)
url_param = request.GET.get("url", "") # Get URL
@@ -116,14 +154,14 @@ def fetch_details(request, id):
text = request.GET.get("text", "") # Get LLM prompt
# LLM
client = ollama.Client(host = 'https://ollamamodel.matitos.org')
ollama = OllamaClient()
def stream_response():
msg_content = {
"role": "user",
"content": text,
}
response = client.chat(model=model, messages=[msg_content], stream=True)
response = ollama.client.chat(model=model, messages=[msg_content], stream=True)
for chunk in response:
yield chunk["message"]["content"] # Stream each chunk of text

View File

@@ -124,9 +124,6 @@ SCHEDULER_QUEUES = {
'PORT': os.environ.get("REDIS_PORT", 6379),
'DB': os.environ.get("REDIS_DB", 0),
'DEFAULT_TIMEOUT': os.environ.get("RQ_DEFAULT_TIMEOUT", 60*15),
#'USERNAME': 'some-user',
#'PASSWORD': 'some-password',
#'DEFAULT_TIMEOUT': 360,
}
}
SCHEDULER_CONFIG = {

View File

@@ -20,6 +20,5 @@ from django.urls import path, include
urlpatterns = [
path('admin/', admin.site.urls),
path('api/', include('api.urls')),
#path('scheduler/', include('django_rq.urls')),
path('scheduler/', include('scheduler.urls')),
]

View File

@@ -1,22 +0,0 @@
#!/usr/bin/env python
"""Django's command-line utility for administrative tasks."""
import os
import sys
def main():
"""Run administrative tasks."""
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mysite.settings')
try:
from django.core.management import execute_from_command_line
except ImportError as exc:
raise ImportError(
"Couldn't import Django. Are you sure it's installed and "
"available on your PYTHONPATH environment variable? Did you "
"forget to activate a virtual environment?"
) from exc
execute_from_command_line(sys.argv)
if __name__ == '__main__':
main()

View File

@@ -1,16 +0,0 @@
"""
ASGI config for mysite project.
It exposes the ASGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/5.1/howto/deployment/asgi/
"""
import os
from django.core.asgi import get_asgi_application
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mysite.settings')
application = get_asgi_application()

View File

@@ -1,132 +0,0 @@
"""
Django settings for mysite project.
Generated by 'django-admin startproject' using Django 5.1.6.
For more information on this file, see
https://docs.djangoproject.com/en/5.1/topics/settings/
For the full list of settings and their values, see
https://docs.djangoproject.com/en/5.1/ref/settings/
"""
import os
from pathlib import Path
# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent
# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/5.1/howto/deployment/checklist/
# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = 'django-insecure-0+jg0u+%s@sj759i7@jn*%-#jl)8&#=siclb5908pwe!7=*$qb'
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True
ALLOWED_HOSTS = []
# Application definition
INSTALLED_APPS = [
'news.apps.NewsConfig',
'django.contrib.admin',
'django.contrib.auth',
'django.contrib.contenttypes',
'django.contrib.sessions',
'django.contrib.messages',
'django.contrib.staticfiles',
]
MIDDLEWARE = [
'django.middleware.security.SecurityMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',
'django.middleware.common.CommonMiddleware',
'django.middleware.csrf.CsrfViewMiddleware',
'django.contrib.auth.middleware.AuthenticationMiddleware',
'django.contrib.messages.middleware.MessageMiddleware',
'django.middleware.clickjacking.XFrameOptionsMiddleware',
]
ROOT_URLCONF = 'mysite.urls'
TEMPLATES = [
{
'BACKEND': 'django.template.backends.django.DjangoTemplates',
'DIRS': [],
'APP_DIRS': True,
'OPTIONS': {
'context_processors': [
'django.template.context_processors.debug',
'django.template.context_processors.request',
'django.contrib.auth.context_processors.auth',
'django.contrib.messages.context_processors.messages',
],
},
},
]
WSGI_APPLICATION = 'mysite.wsgi.application'
# Database
# https://docs.djangoproject.com/en/5.1/ref/settings/#databases
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.postgresql',
'NAME': os.environ.get("DJANGO_DB_NAME", "matitos"),
'USER': os.environ.get("DJANGO_DB_USER", "supermatitos"),
'PASSWORD': os.environ.get("DJANGO_DB_PASSWORD", "supermatitos"),
'HOST': os.environ.get("DJANGO_DB_HOST", "localhost"),
'PORT': os.environ.get("DJANGO_DB_PORT", "5432"),
#'OPTIONS': {
# 'options': '-c default_transaction_read_only=on'
#}
}
}
# Password validation
# https://docs.djangoproject.com/en/5.1/ref/settings/#auth-password-validators
AUTH_PASSWORD_VALIDATORS = [
{
'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
},
]
# Internationalization
# https://docs.djangoproject.com/en/5.1/topics/i18n/
LANGUAGE_CODE = 'en-us'
TIME_ZONE = 'UTC'
USE_I18N = True
USE_TZ = True
# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/5.1/howto/static-files/
STATIC_URL = 'static/'
# Default primary key field type
# https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'

View File

@@ -1,26 +0,0 @@
"""
URL configuration for mysite project.
The `urlpatterns` list routes URLs to views. For more information please see:
https://docs.djangoproject.com/en/5.1/topics/http/urls/
Examples:
Function views
1. Add an import: from my_app import views
2. Add a URL to urlpatterns: path('', views.home, name='home')
Class-based views
1. Add an import: from other_app.views import Home
2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
Including another URLconf
1. Import the include() function: from django.urls import include, path
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
"""
from django.contrib import admin
from django.urls import include, path
from django.views.generic.base import RedirectView
urlpatterns = [
path("", RedirectView.as_view(url='news/', permanent=False)),
path("news/", include("news.urls")),
path('admin/', admin.site.urls),
# path("facerecognition", include("facerecognition.urls")),
]

View File

@@ -1,16 +0,0 @@
"""
WSGI config for mysite project.
It exposes the WSGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/5.1/howto/deployment/wsgi/
"""
import os
from django.core.wsgi import get_wsgi_application
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mysite.settings')
application = get_wsgi_application()

View File

@@ -1,9 +0,0 @@
from django.contrib import admin
# Register your models here.
from .models import Urls, UrlsSource, Source
admin.site.register(Urls)
admin.site.register(UrlsSource)
admin.site.register(Source)

View File

@@ -1,6 +0,0 @@
from django.apps import AppConfig
class NewsConfig(AppConfig):
default_auto_field = 'django.db.models.BigAutoField'
name = 'news'

View File

@@ -1,38 +0,0 @@
# Generated by Django 5.1.6 on 2025-02-20 15:36
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
initial = True
dependencies = [
]
operations = [
migrations.CreateModel(
name='SOURCE',
fields=[
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('source', models.TextField()),
],
),
migrations.CreateModel(
name='URL',
fields=[
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('url', models.TextField()),
('pub_date', models.DateTimeField(verbose_name='date published')),
],
),
migrations.CreateModel(
name='URL_SOURCE',
fields=[
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('source', models.ForeignKey(on_delete=django.db.models.deletion.RESTRICT, to='news.source')),
('url', models.ForeignKey(on_delete=django.db.models.deletion.RESTRICT, to='news.url')),
],
),
]

View File

@@ -1,25 +0,0 @@
# Generated by Django 5.1.6 on 2025-02-20 16:11
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('news', '0001_initial'),
]
operations = [
migrations.AlterModelTable(
name='source',
table='source',
),
migrations.AlterModelTable(
name='url',
table='urls',
),
migrations.AlterModelTable(
name='url_source',
table='urls_source',
),
]

View File

@@ -1,33 +0,0 @@
# Generated by Django 5.1.6 on 2025-02-20 16:18
import django.db.models.functions.datetime
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('news', '0002_alter_source_table_alter_url_table_and_more'),
]
operations = [
migrations.RemoveField(
model_name='url',
name='pub_date',
),
migrations.AddField(
model_name='url',
name='status',
field=models.CharField(choices=[('raw', 'Raw'), ('error', 'Error'), ('valid', 'Valid'), ('unknown', 'Unknown'), ('invalid', 'Invalid'), ('duplicate', 'Duplicate')], default='raw'),
),
migrations.AddField(
model_name='url',
name='ts_fetch',
field=models.DateTimeField(db_default=django.db.models.functions.datetime.Now(), verbose_name='Date fetched'),
),
migrations.AlterField(
model_name='url',
name='url',
field=models.TextField(verbose_name='URL'),
),
]

View File

@@ -1,17 +0,0 @@
# Generated by Django 5.1.6 on 2025-02-20 16:32
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('news', '0003_remove_url_pub_date_url_status_url_ts_fetch_and_more'),
]
operations = [
migrations.AlterUniqueTogether(
name='url_source',
unique_together={('url', 'source')},
),
]

View File

@@ -1,59 +0,0 @@
# Generated by Django 5.1.6 on 2025-02-20 16:53
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('news', '0004_alter_url_source_unique_together'),
]
operations = [
migrations.CreateModel(
name='Urls',
fields=[
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('url', models.TextField(unique=True)),
('ts_fetch', models.DateTimeField()),
('status', models.TextField(choices=[('raw', 'Raw'), ('error', 'Error'), ('valid', 'Valid'), ('unknown', 'Unknown'), ('invalid', 'Invalid'), ('duplicate', 'Duplicate')], default='raw')),
],
options={
'db_table': 'urls',
'managed': False,
},
),
migrations.RemoveField(
model_name='url_source',
name='url',
),
migrations.AlterUniqueTogether(
name='url_source',
unique_together=None,
),
migrations.RemoveField(
model_name='url_source',
name='source',
),
migrations.AlterModelOptions(
name='source',
options={'managed': False},
),
migrations.CreateModel(
name='UrlsSource',
fields=[
('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='news.urls')),
],
options={
'db_table': 'urls_source',
'managed': False,
},
),
migrations.DeleteModel(
name='URL',
),
migrations.DeleteModel(
name='URL_SOURCE',
),
]

View File

@@ -1,17 +0,0 @@
# Generated by Django 5.1.6 on 2025-03-06 09:36
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('news', '0005_urls_remove_url_source_url_and_more'),
]
operations = [
migrations.AlterModelOptions(
name='urls',
options={'managed': False, 'ordering': ['-ts_fetch']},
),
]

View File

@@ -1,61 +0,0 @@
from django.db import models
from django.contrib.postgres.fields import ArrayField
# Create your models here.
class Urls(models.Model):
class STATUS_ENUM(models.TextChoices):
RAW = "raw"
ERROR = "error"
VALID = "valid"
UNKNOWN = "unknown"
INVALID = "invalid"
DUPLICATE = "duplicate"
url = models.TextField(unique=True)
ts_fetch = models.DateTimeField()
status = models.TextField(choices=STATUS_ENUM, default=STATUS_ENUM.RAW) # This field type is a guess.
def __str__(self):
return self.url
class Meta:
managed = False
db_table = 'urls' # db_table = '{}_urls'.format(project_name)
ordering = ["-ts_fetch"]
class Source(models.Model):
id = models.SmallAutoField(primary_key=True)
source = models.TextField(unique=True)
def __str__(self):
return self.source
class Meta:
managed = False
db_table = 'source'
class UrlsSource(models.Model):
id_url = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url', primary_key=True) # The composite primary key (id_url, id_source) found, that is not supported. The first column is selected.
id_source = models.ForeignKey(Source, models.DO_NOTHING, db_column='id_source')
def __str__(self):
return "Source: {}, URL: {}".format(self.id_source, self.id_url)
class Meta:
managed = False
db_table = 'urls_source'
unique_together = (('id_url', 'id_source'),)
class UrlContent(models.Model):
id_url = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url', primary_key=True)
date_published = models.DateTimeField(blank=True, null=True)
title = models.TextField(blank=True, null=True)
description = models.TextField(blank=True, null=True)
content = models.TextField(blank=True, null=True)
tags = ArrayField(models.TextField(blank=True, null=True))
authors = ArrayField(models.TextField(blank=True, null=True))
image_urls = ArrayField(models.TextField(blank=True, null=True))
class Meta:
managed = False
db_table = 'url_content'

View File

@@ -1,508 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>News</title>
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
<script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
<script>
function getQueryString(pageNumber, itemsNumber, sources, statuses){
// Query parameters. If input is null, get most recent value
let queryParams = new URLSearchParams(window.location.search);
// page
if (pageNumber == null) pageNumber = queryParams.get("page") ?? 1;
queryParams.set("page", pageNumber);
// items
if (itemsNumber == null) itemsNumber = queryParams.get("items") ?? 15;
queryParams.set("items", itemsNumber);
// sources
if (sources == null) sources = queryParams.get("sources") ?? "all";
queryParams.set("sources", sources);
// status
if (statuses == null) statuses = queryParams.get("status") ?? "all";
queryParams.set("status", statuses);
            // Keep commas readable in the query string: turn encoded %2C back into ","
            let queryParamsString = queryParams.toString().replace(/%2C/g, ",");
            return queryParamsString;
}
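        // loadPage: fetch the requested page via AJAX, swap the table content in place, and update the URL bar without a reload.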
function loadPage(pageNumber, itemsNumber, sources, statuses) {
$("#item-list").fadeTo(100, 0.5); // Smooth fade effect
$("#loading").show();
            const queryParamsString = getQueryString(pageNumber, itemsNumber, sources, statuses);
$.ajax({
url: "?" + queryParamsString,
type: "GET",
headers: { "X-Requested-With": "XMLHttpRequest" },
success: function (data) {
$("#item-list").fadeTo(0, 1).html(data.items_html); // Restore opacity smoothly
$("#loading").hide();
// Update URL without reloading
window.history.pushState({}, "", "?" + queryParamsString);
}
});
}
////////////////////////////////////////////////////////////////////////////
// Pagination
////////////////////////////////////////////////////////////////////////////
$(document).on("click", ".pagination a", function (event) {
event.preventDefault();
let page = $(this).attr("data-page");
            loadPage(page, null, null, null); // JavaScript has no named arguments; pass values positionally
});
$(document).ready(function () {
////////////////////////////////////////////////////////////////////////////
// Filter updates
////////////////////////////////////////////////////////////////////////////
const sourcesToggleAll = $("#toggle-all-sources");
const sourcesCheckboxes = $(".source-checkbox");
const statusesToggleAll = $("#toggle-all-status");
const statusCheckboxes = $(".status-checkbox");
function updateFilters() {
// Get selected sources
let selectedSources = sourcesCheckboxes.filter(":checked").map(function () {
return $(this).val();
}).get().join(",");
// Get selected URL statuses
let selectedStatuses = statusCheckboxes.filter(":checked").map(function () {
return $(this).val();
}).get().join(",");
// Get selected items per page
let selectedItems = $("input[name='items']:checked").val();
// Update pagination and reload data
loadPage(1, selectedItems, selectedSources, selectedStatuses);
}
////////////////////////////////////////////////////////////////////////////
// Change triggers
////////////////////////////////////////////////////////////////////////////
// Sources
sourcesToggleAll.on("change", function () {
sourcesCheckboxes.prop("checked", sourcesToggleAll.prop("checked"));
updateFilters();
});
sourcesCheckboxes.on("change", function () {
sourcesToggleAll.prop("checked", sourcesCheckboxes.length === sourcesCheckboxes.filter(":checked").length);
updateFilters();
});
// Status
statusesToggleAll.on("change", function () {
statusCheckboxes.prop("checked", statusesToggleAll.prop("checked"));
updateFilters();
});
statusCheckboxes.on("change", function () {
// If all checkboxes are checked, mark "Toggle All" as checked
statusesToggleAll.prop("checked", statusCheckboxes.length === statusCheckboxes.filter(":checked").length);
updateFilters();
});
// Items change trigger update
$(".items").on("change", updateFilters);
////////////////////////////////////////////////////////////////////////////
// Default values
////////////////////////////////////////////////////////////////////////////
// Sources
sourcesCheckboxes.each(function () { $(this).prop("checked", true); });
sourcesToggleAll.prop("checked", true);
// Statuses
statusCheckboxes.each(function () { $(this).prop("checked", true); });
statusesToggleAll.prop("checked", true);
// Items
$("input[name='items'][value='" + 15 + "']").prop("checked", true);
});
////////////////////////////////////////////////////////////////////////////
// Theme logic
////////////////////////////////////////////////////////////////////////////
function setTheme(mode) {
document.documentElement.setAttribute("data-theme", mode);
document.documentElement.setAttribute("data-bs-theme", mode);
localStorage.setItem("theme", mode);
document.getElementById("theme-icon").innerHTML = mode === "dark" ? "🌞" : "🌙";
document.body.classList.toggle("dark-mode", mode === "dark");
}
function toggleTheme() {
let currentTheme = document.documentElement.getAttribute("data-theme");
setTheme(currentTheme === "dark" ? "light" : "dark");
}
document.addEventListener("DOMContentLoaded", function () {
let savedTheme = localStorage.getItem("theme") ||
(window.matchMedia("(prefers-color-scheme: dark)").matches ? "dark" : "light");
setTheme(savedTheme);
});
////////////////////////////////////////////////////////////////////////////
</script>
<style>
/* Content Area */
#content {
margin-left: 170px; /* Match sidebar width */
min-width: calc(100vw - 170px); /* Ensure it doesn't shrink into the sidebar */
width: calc(100vw - 170px); /* Expands based on screen size */
padding: 20px;
overflow-x: auto; /* Prevent content from being squeezed */
transition: margin-left 0.3s ease;
}
/* Sidebar Styles */
#sidebar {
height: 100vh;
position: fixed;
top: 0;
left: 0;
width: 170px; /* Default width */
background-color: var(--bg-color);
box-shadow: 2px 0 5px rgba(0, 0, 0, 0.1);
padding: 15px;
transition: width 0.3s ease;
}
#sidebar .nav-link {
color: var(--text-color);
}
#sidebar .nav-link:hover {
background-color: var(--pagination-hover-bg);
}
/* ============================= */
/* Responsive Enhancements */
/* ============================= */
@media (min-width: 1200px) {
.table {
width: 95%; /* Allows table to take more space */
margin: 0 auto; /* Centers the table */
}
}
@media (max-width: 768px) {
#sidebar {
width: 70px; /* Collapse sidebar to smaller width */
/*padding: 10px;*/
}
#content {
margin-left: 70px; /* Adjust margin to match collapsed sidebar */
min-width: calc(100vw - 70px); /* Prevent overlap */
/*padding: 10px;*/
}
/* Adjust table for small screens */
.table-responsive {
overflow-x: auto;
}
.table th,
.table td {
white-space: nowrap; /* Prevent text wrapping in cells */
}
.table a {
word-break: break-word; /* Ensure long URLs break properly */
}
}
/* ============================= */
/* Global Styles */
/* ============================= */
body {
background-color: var(--bg-color);
color: var(--text-color);
transition: background-color 0.3s, color 0.3s;
}
/* ============================= */
/* Light & Dark Mode Variables */
/* ============================= */
:root {
--bg-color: #ffffff;
--text-color: #212529;
--table-bg: #ffffff;
--table-text: #000000;
--table-border: #dee2e6;
--link-color: #007bff;
--pagination-bg: #ffffff;
--pagination-border: #dee2e6;
--pagination-hover-bg: #f8f9fa;
--pagination-active-bg: #007bff;
--pagination-active-text: #ffffff;
--button-bg: #f8f9fa;
--button-border: #ced4da;
--button-text: #212529;
}
[data-theme="dark"] {
--bg-color: #121212;
--text-color: #e0e0e0;
--table-bg: #1e1e1e;
--table-text: #ffffff;
--table-border: #2c2c2c;
--link-color: #9ec5fe;
--pagination-bg: #1e1e1e;
--pagination-border: #444;
--pagination-hover-bg: #333;
--pagination-active-bg: #007bff;
--pagination-active-text: #ffffff;
--button-bg: #1e1e1e;
--button-border: #444;
--button-text: #e0e0e0;
}
/* ============================= */
/* Table Styling */
/* ============================= */
.table-responsive {
width: 100%; /* Ensure it spans the full width of its container */
max-width: 100%;
overflow-x: auto;
}
.table {
background-color: var(--table-bg);
color: var(--table-text);
border: 1px solid var(--table-border);
transition: background-color 0.3s, color 0.3s;
width: 100%; /* Ensures it takes full width of its container */
table-layout: auto; /* Allows columns to adjust dynamically */
/*white-space: nowrap;*/ /* Prevents text wrapping in cells */
}
.table th,
.table td {
border-color: var(--table-border);
}
.table thead {
background-color: var(--pagination-active-bg);
color: var(--pagination-active-text);
}
[data-theme="dark"] .table {
background-color: var(--table-bg);
color: var(--table-text);
}
[data-theme="dark"] .table th,
[data-theme="dark"] .table td {
border-color: var(--table-border);
}
[data-theme="dark"] .table thead {
background-color: #333;
color: #fff;
}
th:nth-child(1), td:nth-child(1) { width: 50%; } /* URL column */
th:nth-child(2), td:nth-child(2) { width: 20%; } /* Fetch Date */
th:nth-child(3), td:nth-child(3) { width: 20%; } /* Sources */
th:nth-child(4), td:nth-child(4) { width: 5%; } /* Status */
th:nth-child(5), td:nth-child(5) { width: 5%; } /* Action */
/* ============================= */
/* Pagination Styling */
/* ============================= */
.pagination {
display: flex;
justify-content: center;
padding: 10px 0;
}
.pagination .page-link {
background-color: var(--pagination-bg);
border-color: var(--pagination-border);
color: var(--text-color);
padding: 10px 14px;
margin: 0 5px;
border-radius: 8px;
transition: background-color 0.3s, color 0.3s, transform 0.2s;
}
.pagination .page-link:hover {
background-color: var(--pagination-hover-bg);
transform: scale(1.05);
}
.pagination .active .page-link {
background-color: var(--pagination-active-bg);
color: var(--pagination-active-text);
border-color: var(--pagination-active-bg);
}
/* ============================= */
/* Theme Toggle Button */
/* ============================= */
.theme-toggle-btn {
background-color: var(--button-bg);
border: 1px solid var(--button-border);
color: var(--button-text);
border-radius: 50%;
width: 40px;
height: 40px;
font-size: 20px;
display: flex;
align-items: center;
justify-content: center;
transition: background-color 0.3s, color 0.3s, transform 0.2s;
cursor: pointer;
}
.theme-toggle-btn:hover {
background-color: var(--pagination-hover-bg);
transform: rotate(20deg);
}
.theme-toggle-btn:active {
transform: scale(0.95);
}
/* ============================= */
/* Loading Spinner Styling */
/* ============================= */
#loading {
position: fixed;
left: 50%;
top: 50%;
transform: translate(-50%, -50%);
z-index: 1050;
display: none;
}
.spinner-border {
width: 4rem;
height: 4rem;
}
</style>
</head>
<body>
<!-- Left Sidebar -->
<div id="sidebar" class="d-flex flex-column">
<ul class="nav flex-column">
<!-- Theme Toggle Button -->
<div class="nav-item">
<button onclick="toggleTheme()" class="theme-toggle-btn">
<span id="theme-icon">🌙</span>
</button>
</div>
<!-- Sources -->
<div class="nav-item mt-3">
<strong>Select sources</strong>
<form id="source-filter-form">
<!-- Toggle All Checkbox -->
<div class="form-check">
<input class="form-check-input" type="checkbox" id="toggle-all-sources">
<label class="form-check-label fw-bold" for="toggle-all-sources">
Toggle all
</label>
</div>
<!-- Individual Source Checkboxes -->
{% for source in sources %}
<div class="form-check">
<input class="form-check-input source-checkbox" type="checkbox" value="{{ source.id }}" id="source-{{ source.id }}">
<label class="form-check-label" for="source-{{ source.id }}">
{{ source.source }}
</label>
</div>
                    {% empty %}
                        <div class="form-check text-muted">No sources available.</div>
                    {% endfor %}
</form>
</div>
<!-- Status -->
<div class="nav-item mt-3">
<strong>Select status</strong>
<form id="status-filter-form">
<!-- Toggle All Checkbox -->
<div class="status-form-check">
<input class="form-check-input" type="checkbox" id="toggle-all-status">
<label class="form-check-label fw-bold" for="toggle-all-status">
Toggle all
</label>
</div>
<!-- Individual Status Checkboxes -->
{% for status in list_status %}
<div class="status-form-check">
<input class="form-check-input status-checkbox" type="checkbox" value="{{ status }}" id="status-{{ status }}">
<label class="form-check-label" for="status-{{ status }}">
{{ status }}
</label>
</div>
                    {% empty %}
                        <div class="status-form-check text-muted">No statuses available.</div>
                    {% endfor %}
</form>
</div>
<!-- URLs per page -->
<div class="nav-item mt-3">
<strong>URLs per page</strong>
<div class="card-body">
                    <!-- Items-per-page radio buttons -->
{% for url_per_page in list_urls_per_page %}
<div class="items-form-check">
<input class="form-check-input items" type="radio" name="items" id="value-{{ url_per_page }}" value="{{ url_per_page }}">
<label class="form-check-label" for="value-{{ url_per_page }}">{{ url_per_page }}</label>
</div>
                    {% empty %}
                        <div class="items-form-check text-muted">No options available.</div>
                    {% endfor %}
</div>
</div>
</ul>
</div>
<!-- Main Content Area -->
<div id="content" class="main-content">
<div class="container mt-4">
<!-- Table -->
<div id="item-list">
{% include 'item_list_partial.html' %}
</div>
<!-- Loading... -->
<div id="loading" class="text-center mt-3" style="display:none;">
<div class="spinner-border text-primary" role="status">
<span class="visually-hidden">Loading...</span>
</div>
</div>
</div>
</div>
</body>
</html>

View File

@@ -1,87 +0,0 @@
{% load custom_filters %}
<div class="table-responsive">
<table class="table table-hover">
<thead>
<tr>
<th scope="col"><strong>URL</strong></th>
<th scope="col"><strong>Fetch date</strong></th>
<th scope="col"><strong>Sources</strong></th>
<th scope="col"><strong>Status</strong></th>
<th scope="col"><strong>Action</strong></th>
</tr>
</thead>
<tbody>
{% for item in page_obj %}
<tr>
<td><a href="{{ item.url }}/" target="_blank">{{ item.url }}</a></td>
<td>{{ item.ts_fetch }}</td>
<td>
{% with sources_map|dict_get:item.id as sources %}
{% if sources %}
{% for source in sources %}
<span class="badge bg-secondary">{{ source }}</span>
{% endfor %}
{% else %}
<span class="text-muted">No sources</span>
{% endif %}
{% endwith %}
</td>
<td>
{% if item.status == 'raw' %}
<span class="badge bg-secondary">{{ item.status|capfirst }}</span>
{% elif item.status == 'error' %}
<span class="badge bg-danger">{{ item.status|capfirst }}</span>
{% elif item.status == 'valid' %}
<span class="badge bg-success">{{ item.status|capfirst }}</span>
{% elif item.status == 'unknown' %}
<span class="badge bg-warning">{{ item.status|capfirst }}</span>
{% elif item.status == 'invalid' %}
<span class="badge bg-danger">{{ item.status|capfirst }}</span>
{% elif item.status == 'duplicate' %}
<span class="badge bg-info">{{ item.status|capfirst }}</span>
{% else %}
<span class="badge bg-light">Unknown</span>
{% endif %}
</td>
<td>
<a href="url/{{ item.id }}" class="btn btn-primary btn-sm" target="_blank">Details</a>
</td>
</tr>
{% empty %}
<tr>
<td colspan="4" class="text-center">No items available.</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
<div class="d-flex justify-content-center mt-3">
<nav>
<ul class="pagination">
{% if page_obj.has_previous %}
<li class="page-item">
<a class="page-link" href="#" data-page="1">First</a>
</li>
<li class="page-item">
<a class="page-link" href="#" data-page="{{ page_obj.previous_page_number }}">Previous</a>
</li>
{% endif %}
<li class="page-item active">
<span class="page-link">Page {{ page_obj.number }} of {{ page_obj.paginator.num_pages }}</span>
</li>
{% if page_obj.has_next %}
<li class="page-item">
<a class="page-link" href="#" data-page="{{ page_obj.next_page_number }}">Next</a>
</li>
<li class="page-item">
<a class="page-link" href="#" data-page="{{ page_obj.paginator.num_pages }}">Last</a>
</li>
{% endif %}
</ul>
</nav>
</div>

View File

@@ -1,211 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{% block title %}News{% endblock %}</title>
<!-- Bootstrap CSS -->
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
<!-- Add jQuery from CDN (before other scripts) -->
<script src="https://code.jquery.com/jquery-3.6.4.min.js"></script>
<!-- Markdown -->
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
<!-- Custom Styles -->
<style>
body {
background-color: #f4f4f4;
}
.navbar-dark .navbar-nav .nav-link {
color: rgba(255,255,255,0.75);
}
.chat-box {
background-color: #fff;
border: 1px solid #ddd;
padding: 15px;
border-radius: 8px;
overflow-y: auto; /* Enable vertical scrolling */
max-width: 100%;
min-height: 150px;
max-height: 450px;
white-space: normal;
word-wrap: break-word;
word-break: break-word;
}
</style>
</head>
<script>
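        // fetchDetails: request a streamed LLM answer for this URL and progressively render the response as Markdown in the chat box.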
function fetchDetails(urlId, url) {
// Show the loading spinner
document.getElementById("loading-spinner").style.display = "block";
// Get the input value
let inputText = document.getElementById(`custom-input-${urlId}`).value;
// Get the input model
let selectedModel = document.getElementById(`options-${urlId}`).value;
            // Check if a model is selected
            if (!selectedModel) {
                alert("Please select a model before fetching details.");
                document.getElementById("loading-spinner").style.display = "none"; // hide the spinner shown above
                return;
            }
// Fetch URL
let fetchUrl = `/news/url/${urlId}/fetch/?url=${encodeURIComponent(url)}&model=${encodeURIComponent(selectedModel)}&text=${encodeURIComponent(inputText)}`;
let resultContainer = $("#chat-output");
resultContainer.html(""); // Clear previous content before fetching
let fetchButton = $("button[onclick^='fetchDetails']"); // Select the button
fetchButton.prop("disabled", true); // Disable button
fetch(fetchUrl)
.then(response => {
if (!response.ok) {
throw new Error("Error on network response");
}
const reader = response.body.getReader();
const decoder = new TextDecoder();
//////////////////////////////////////
let accumulatedText = ""; // Store streamed text before rendering Markdown
// Create a temporary container for streaming response
let messageContainer = $('<div class="chat-message"></div>');
//let messageContainer = $('');
resultContainer.append(messageContainer);
//////////////////////////////////////
function read() {
return reader.read().then(({ done, value }) => {
if (done) {
//////////////////////////////////////
messageContainer.html(marked.parse(accumulatedText));
//////////////////////////////////////
fetchButton.prop("disabled", false); // Re-enable button when done
return;
}
//////////////////////////////////////
// Decode the streamed chunk
let chunk = decoder.decode(value);
// Append to the accumulated text
accumulatedText += chunk;
// Render Markdown progressively (but safely)
messageContainer.html(marked.parse(accumulatedText));
//////////////////////////////////////
//////////////////////////////////////
// ORIGINAL:
//let text = decoder.decode(value).replace(/\n/g, "<br>");
//resultContainer.append(text); // Append streamed text
//////////////////////////////////////
resultContainer.scrollTop(resultContainer[0].scrollHeight); // Auto-scroll to bottom
return read();
});
}
return read();
})
.catch(error => {
resultContainer.html(`<p class="text-danger">Error fetching details: ${error.message}</p>`);
fetchButton.prop("disabled", false); // Re-enable button on error
})
.finally(() => {
// Hide the loading spinner after request is complete
document.getElementById("loading-spinner").style.display = "none";
});
}
</script>
<body>
<!-- Main Content -->
<div class="container mt-4">
<h2>URL Details</h2>
<table class="table table-bordered">
<tr>
<th>URL</th>
<td><a href="{{ url_item.url }}" target="_blank">{{ url_item.url }}</a></td>
</tr>
<tr>
<th>Fetch Date</th>
<td>{{ url_item.ts_fetch }}</td>
</tr>
<tr>
<th>Sources</th>
<td>{{ sources|join:", " }}</td>
</tr>
<tr>
<th>Status</th>
<td>{{ url_item.status }}</td>
</tr>
<tr>
<th>Title</th>
<td>{{ url_content.title }}</td>
</tr>
<tr>
<th>Description</th>
<td>{{ url_content.description }}</td>
</tr>
<tr>
<th>Content</th>
<td>{{ url_content.content }}</td>
</tr>
<tr>
<th>Tags</th>
<td>{{ url_content.tags }}</td>
</tr>
<tr>
<th>Authors</th>
<td>{{ url_content.authors }}</td>
</tr>
<tr>
<th>Image URLs</th>
<td>{{ url_content.image_urls }}</td>
</tr>
</table>
<!-- Independent form for optional values -->
<form onsubmit="fetchDetailsWithSelection(event, {{ url_item.id }}, '{{ url_item.url }}')">
<label for="options-{{ url_item.id }}">Model:</label>
<select id="options-{{ url_item.id }}" class="form-control mb-2">
<!-- <option value="">-- Select an option --</option> -->
{% for model in models %}
<option value="{{ model }}">{{ model }}</option>
{% endfor %}
</select>
</form>
<!-- Input field with a default value -->
<label for="custom-input-{{ url_item.id }}">Prompt:</label>
<textarea id="custom-input-{{ url_item.id }}" class="form-control mb-2" rows="3">{{ prompt }} {{ url_item.url }}</textarea>
<!-- Fetch details button -->
<button class="btn btn-primary" onclick="fetchDetails({{ url_item.id }}, '{{ url_item.url }}')">
Fetch Details
</button>
<!-- Chatbot-style response box -->
<div class="chat-box mt-3 p-3 border rounded">
<div id="chat-output"></div>
</div>
<!-- Loading Spinner (Hidden by Default) -->
<div id="loading-spinner" class="spinner-border text-primary mt-3" role="status" style="display: none;">
<span class="visually-hidden">Loading...</span>
</div>
</div>
<!-- Bootstrap JS -->
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>
{% block extra_js %}{% endblock %}
</body>
</html>

View File

@@ -1,8 +0,0 @@
from django import template
register = template.Library()
@register.filter
def dict_get(dictionary, key):
"""Custom filter to get a value from a dictionary in Django templates."""
return dictionary.get(key, [])

View File

@@ -1,3 +0,0 @@
from django.test import TestCase
# Create your tests here.

View File

@@ -1,8 +0,0 @@
from django.urls import path
from . import views
urlpatterns = [
path("", views.news, name="home"),
path('url/<int:id>/', views.url_detail_view, name='url_detail'),
    path('url/<int:id>/fetch/', views.fetch_details, name='fetch_details'),
]

View File

@@ -1,104 +0,0 @@
from django.http import StreamingHttpResponse, HttpResponse, JsonResponse
from django.shortcuts import render, get_object_or_404
from django.core.paginator import Paginator
import ollama
from .models import Urls, Source, UrlsSource, UrlContent
# Create your views here.
def index(request):
return HttpResponse("Hello, world. You're at the news index.")
def news(request):
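    """List URLs with pagination and source/status filters; AJAX requests receive the rendered table partial as JSON."""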
# URLs
urls = Urls.objects.all()
# Sources
sources = Source.objects.all()
# Parameters
page_number = request.GET.get("page", 1)
num_items = request.GET.get("items", 15)
source_ids = request.GET.get("sources", ','.join([str(s.id) for s in sources]))
status_filters = request.GET.get("status", None)
# Filters
if (status_filters) and (status_filters != "all"):
urls = urls.filter(status__in=status_filters.split(","))
if (source_ids) and (source_ids != "all"):
        # distinct() avoids duplicate rows when a URL matches more than one selected source
urls = urls.filter(urlssource__id_source__in=source_ids.split(",")).distinct()
# Pagination
paginator = Paginator(urls, num_items)
page_obj = paginator.get_page(page_number)
# Map URL IDs to their sources, only for subset of URLs (page of interest)
sources_map = {
url.id: list(Source.objects.filter(urlssource__id_url=url).values_list('source', flat=True))
for url in page_obj.object_list
}
context = {
"page_obj": page_obj,
"sources": sources,
"sources_map": sources_map,
"list_status": Urls.STATUS_ENUM.values,
"list_urls_per_page": [15, 50, 100],
}
# If request is AJAX, return JSON response
if request.headers.get("X-Requested-With") == "XMLHttpRequest":
return JsonResponse({'items_html': render(request, 'item_list_partial.html', context).content.decode('utf-8')})
return render(request, "item_list.html", context)
def url_detail_view(request, id):
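    """Show the stored details for one URL together with the Ollama models available for summarization."""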
url_item = get_object_or_404(Urls, id=id)
url_sources = list(Source.objects.filter(urlssource__id_url=url_item).values_list('source', flat=True))
try:
url_content = UrlContent.objects.get(pk=id)
except UrlContent.DoesNotExist:
url_content = {}
# TODO: https://github.com/ollama/ollama-python?tab=readme-ov-file#async-client
# LLM models available
    client = ollama.Client(host='https://ollamamodel.matitos.org')
models = sorted([m.model for m in client.list().models])
# default_model = "llama3.2:3b"
context = {
'url_item': url_item,
'sources': url_sources,
'models': models,
#'default_model': default_model,
'prompt': "Provide in one paragraph the what, why, when, where, who, and how of the content below. Also provide a one paragraph summary of the content:",
#"prompt": "Image you are a journalist, TLDR in a paragraph:",
#"prompt": "Below you will find the whole content of a news article:\n{}\nProvide a concise summary of one paragraph maximum of the content.".format(content)
'url_content': url_content,
}
return render(request, 'url_detail.html', context)
def fetch_details(request, id):
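    """Stream an LLM chat completion for the submitted prompt back to the client as plain text."""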
url_item = get_object_or_404(Urls, id=id)
url_param = request.GET.get("url", "") # Get URL
model = request.GET.get("model", "") # Get LLM model
text = request.GET.get("text", "") # Get LLM prompt
# LLM
    client = ollama.Client(host='https://ollamamodel.matitos.org')
def stream_response():
msg_content = {
"role": "user",
"content": text,
}
response = client.chat(model=model, messages=[msg_content], stream=True)
for chunk in response:
yield chunk["message"]["content"] # Stream each chunk of text
return StreamingHttpResponse(stream_response(), content_type="text/plain")