diff --git a/1-DB.ipynb b/1-DB.ipynb index f2306d3..ecba672 100644 --- a/1-DB.ipynb +++ b/1-DB.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 8, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -20,17 +20,108 @@ "text": [ "db_postgres\n", "db_redis\n", - "\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 1/0\n", - " ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n", - " ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n", - " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n", + "\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 0/0\n", + " ⠙ matitos_dozzle Pulling \u001b[39m\u001b[0m \u001b[34m0.1s \u001b[0m\n",
+ " ⠋ b5b68a794063 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.0s \u001b[0m\n", + " ⠋ 764914624645 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.0s \u001b[0m\n", + " ⠋ 82780b9b6d69 Pulling fs layer \u001b[39m\u001b[0m \u001b[34m0.0s \u001b[0m\n", + "\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 4/4\u001b[0m\n", + " \u001b[32m✔\u001b[0m matitos_dozzle \u001b[33m3 layers\u001b[0m [\u001b[32m\u001b[1m⣿⣿⣿\u001b[0m] 0B/0B Pulled \u001b[32m\u001b[0m \u001b[34m2.7s \u001b[0m\n", + " \u001b[32m✔\u001b[0m b5b68a794063 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n", + " \u001b[32m✔\u001b[0m 764914624645 Pull complete \u001b[32m\u001b[0m \u001b[34m0.4s \u001b[0m\n", + " \u001b[32m✔\u001b[0m 82780b9b6d69 Pull complete \u001b[32m\u001b[0m \u001b[34m0.9s \u001b[0m\n",
+ "\u001b[?25h\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 0/0\n", + " ⠋ Container db_redis \u001b[39mCreating\u001b[0m \u001b[34m0.0s \u001b[0m\n", + " ⠋ Container db_postgres \u001b[39mCreating\u001b[0m \u001b[34m0.0s \u001b[0m\n", + " ⠋ Container dozzle \u001b[39mCreating\u001b[0m \u001b[34m0.0s \u001b[0m\n", "\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 1/3\n", - " ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n", - " ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n", + " ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n", + " ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n", + " ⠿ Container dozzle \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n", " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n", - "\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 3/3\u001b[0m\n", - " \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.2s \u001b[0m\n", - " \u001b[32m✔\u001b[0m Container db_redis \u001b[32mStarted\u001b[0m \u001b[34m0.2s \u001b[0m\n", + "\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 1/4\n", + " ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n", + " ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n", + " ⠿ Container dozzle \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n", + " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n", + "\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 4/4\u001b[0m\n", + " \u001b[32m✔\u001b[0m Container db_redis \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n", + " \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n", + " \u001b[32m✔\u001b[0m Container dozzle \u001b[32mStarted\u001b[0m 
\u001b[34m0.3s \u001b[0m\n", " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n", "\u001b[?25h" ] @@ -42,7 +133,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -143,6 +234,7 @@ " # Feeds\n", " cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC', 'rss_feed');\" )\n", " # Websites of interest\n", + " cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('www.missingkids.org/poster', 'url_host');\" )\n", " cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('www.breitbart.com', 'url_host');\" )\n", " # Search keywords\n", " cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search');\" )\n", @@ -159,7 +251,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -211,7 +303,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -260,7 +352,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -285,7 +377,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 7, "metadata": {}, "outputs": [ { diff --git a/OBSOLETE_app_fetcher/Dev.ipynb b/OBSOLETE_app_fetcher/Dev.ipynb deleted file mode 100644 index 88f7eed..0000000 --- a/OBSOLETE_app_fetcher/Dev.ipynb +++ /dev/null @@ -1,46 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "conda create -n matitos_fetcher python=3.12\n", - "conda activate matitos_fetcher\n", - "conda install -c conda-forge curl\n", - "pip install ipykernel \"psycopg[binary]\" git+https://github.com/ranahaani/GNews.git GoogleNews duckduckgo_search newspaper4k numpy beautifulsoup4 requests feedparser pytz redis fastapi uvicorn fastapi-utils lxml[html_clean]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!uvicorn app:app --host 0.0.0.0 --port 5000 --reload" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "matitos_fetcher", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/OBSOLETE_app_fetcher/Dockerfile b/OBSOLETE_app_fetcher/Dockerfile deleted file mode 100644 index 9679bda..0000000 --- a/OBSOLETE_app_fetcher/Dockerfile +++ /dev/null @@ -1,17 +0,0 @@ -FROM continuumio/miniconda3:25.1.1-2 - -# App repository -COPY . 
/opt/app/ - -RUN conda install -c conda-forge curl -RUN pip install --no-cache-dir --upgrade "psycopg[binary]" git+https://github.com/ranahaani/GNews.git GoogleNews duckduckgo_search newspaper4k numpy beautifulsoup4 requests feedparser pytz redis fastapi uvicorn fastapi-utils lxml[html_clean] -RUN pip freeze -# GoogleNews-1.6.10 Pillow-10.1.0 PyYAML-6.0.1 aiofiles-23.2.1 anyio-3.7.1 beautifulsoup4-4.9.3 bs4-0.0.1 click-8.1.7 cssselect-1.2.0 dateparser-1.2.0 dnspython-1.16.0 duckduckgo_search-3.9.8 fastapi-0.104.1 fastapi-utils-0.2.1 feedfinder2-0.0.4 feedparser-6.0.10 filelock-3.13.1 gnews-0.3.6 greenlet-3.0.1 h11-0.14.0 h2-4.1.0 hpack-4.0.0 httpcore-1.0.2 httpx-0.25.2 hyperframe-6.0.1 jieba3k-0.35.1 joblib-1.3.2 lxml-4.9.3 newspaper3k-0.2.8 nltk-3.8.1 numpy-1.26.2 psycopg-3.1.13 psycopg-binary-3.1.13 pydantic-1.10.13 pymongo-3.12.3 python-dateutil-2.8.2 python-dotenv-0.19.2 pytz-2023.3.post1 redis-5.0.1 regex-2023.10.3 requests-2.26.0 requests-file-1.5.1 sgmllib3k-1.0.0 six-1.16.0 sniffio-1.3.0 socksio-1.0.0 soupsieve-2.5 sqlalchemy-1.4.50 starlette-0.27.0 tinysegmenter-0.3 tldextract-5.1.1 typing-extensions-4.8.0 tzlocal-5.2 uvicorn-0.24.0.post1 - -WORKDIR /opt/app - -# https://www.uvicorn.org/settings/#resource-limits -CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "80"] - -# docker build -t fetch_app . -# docker run --rm --name container_fetch_app fetch_app diff --git a/OBSOLETE_app_fetcher/README.md b/OBSOLETE_app_fetcher/README.md deleted file mode 100644 index b0827cb..0000000 --- a/OBSOLETE_app_fetcher/README.md +++ /dev/null @@ -1,20 +0,0 @@ -# Fetcher - -``` -conda create -n matitos_fetcher python=3.12 -conda activate matitos_fetcher -conda install -c conda-forge curl -pip install ipykernel "psycopg[binary]" git+https://github.com/ranahaani/GNews.git GoogleNews duckduckgo_search newspaper4k numpy beautifulsoup4 requests feedparser pytz redis fastapi uvicorn fastapi-utils lxml[html_clean] -``` - - -* Fetcher app - - Contains several endpoints to perform a specific fetching type task - - For more details, check in [app.py](app.py) /{fetch_type} - -* Build and run - - Important: To be deployed with other micro-services, [docker-compose.yml](../docker-compose.yml) -``` -docker build -t fetch_app . 
-docker run --rm --name container_fetch_app fetch_app -``` diff --git a/OBSOLETE_app_fetcher/app.py b/OBSOLETE_app_fetcher/app.py deleted file mode 100644 index 276ccdc..0000000 --- a/OBSOLETE_app_fetcher/app.py +++ /dev/null @@ -1,79 +0,0 @@ -from src.fetch_feed import FetchFeed -from src.fetch_parser import FetchParser -from src.fetch_search import FetchSearch - -from src.missing_kids_fetch import MissingKidsFetch -from src.missing_kids_status import MissingKidsStatus - -from src.url_status import UpdateErrorURLs -from src.db_utils import DB_Handler - -import src.credentials as cred -from logging_ import get_logger - -from fastapi import FastAPI, BackgroundTasks -################################################################################################## - -logger = get_logger() -logger.info("Environment: {}".format(cred.ENVIRONMENT)) - -db_handler = DB_Handler(cred.db_connect_info, cred.redis_connect_info) - -app = FastAPI() - -@app.get("/") -def hello_world(): - return {"message": "Ok"} - -@app.get("/{process_type}") -async def process(background_tasks: BackgroundTasks, process_type: str): - # Concurrent job running - logger.info("Triggered: {}".format(process_type)) - - if (process_type == "fetch_feeds"): - task_run = FetchFeed(db_handler).run - elif (process_type == "fetch_parser"): - task_run = FetchParser(db_handler).run - elif (process_type == "search") or (process_type == "search_full"): - task_run = FetchSearch(cred.db_connect_info, cred.redis_connect_info, full=True).run - elif (process_type == "search_reduced"): - task_run = FetchSearch(cred.db_connect_info, cred.redis_connect_info, full=False).run - - # Selenium based - elif (process_type == "fetch_missing_kids_reduced"): - task_run = MissingKidsFetch(db_handler, num_pages=4).run - elif (process_type == "fetch_missing_kids_full"): - task_run = MissingKidsFetch(db_handler, num_pages=100000).run - - elif (process_type == "update_missing_kids_status_reduced"): - task_run = MissingKidsStatus(cred.db_connect_info, cred.redis_connect_info, num_urls=50).update_missing_kids_status - elif (process_type == "update_missing_kids_status_full"): - task_run = MissingKidsStatus(cred.db_connect_info, cred.redis_connect_info, num_urls=None).update_missing_kids_status - - elif (process_type == "update_error_urls"): - task_run = UpdateErrorURLs(cred.db_connect_info, cred.redis_connect_info, num_urls=100).update_error_urls_status - else: - return {"message": "ERROR. Unknown fetcher type!"} - - # Run task - background_tasks.add_task(task_run) - # Return message - return {"message": "Started {}: Ok".format(process_type)} - -""" -# TODO: Instead of background tasks! - -import rq -import redis - -# Redis connection -redis_conn = redis.Redis(host='localhost', port=6379, db=0) -queue = rq.Queue(connection=redis_conn) - -# ... -# Queue the processing task -dict_args= {"db_handler": db_handler, } -queue.enqueue(task_run, **dict_args) - -# https://python-rq.org/ -""" \ No newline at end of file diff --git a/OBSOLETE_app_fetcher/src/db_utils.py b/OBSOLETE_app_fetcher/src/db_utils.py deleted file mode 100644 index eca1a73..0000000 --- a/OBSOLETE_app_fetcher/src/db_utils.py +++ /dev/null @@ -1,502 +0,0 @@ -import psycopg -import redis -import traceback -import random -import requests -import json -import os -from .url_utils import process_article -from .logger import get_logger -logger = get_logger() - -# TODO: URL_DB_HANDLER, _get_search_list, _get_url_host, _get_url_host_list, ... 
-# The rest, elsewhere - -class DB_Handler(): - def __init__(self, db_connect_info, redis_connect_info): - logger.debug("Initializing URL DB writer") - self.db_connect_info = db_connect_info - self.redis_instance = redis.Redis(host=redis_connect_info.get("host"), port=redis_connect_info.get("port")) - self.redis_expiry_seconds = redis_connect_info.get("expiry_seconds", 172800) # Default: 48 hours - - try: - self.redis_instance.ping() - logger.debug("Succesfully pinged Redis") - except Exception as e: - logger.warning("Error trying to ping Redis: {}".format(str(e))) - - def get_urls_count(self, last_minutes_check): - ##################### - ### Get number of URLs within last X minutes - ##################### - try: - # Update - with psycopg.connect(self.db_connect_info) as conn: - # Open cursor - cursor = conn.cursor() - num_urls = cursor.execute("SELECT COUNT(*) FROM URLS WHERE ts_fetch >= current_timestamp - interval '{} minutes';".format(last_minutes_check)).fetchone()[0] - except Exception as e: - logger.warning("Error updating URLs status: {}".format(str(e))) - num_urls = None - return num_urls - - def _get_url_host_list(self): - try: - with psycopg.connect(self.db_connect_info) as conn: - # List of URL host - list_url_host = [l[0] for l in conn.execute("SELECT url_host FROM WEBSITE_OF_INTEREST;").fetchall()] - # Clean http / https from URLs - list_url_host = [l.replace("https://", "").replace("http://", "") for l in list_url_host] - # Clean last slash if exists - list_url_host = [ l if not l.endswith("/") else l[:-1] for l in list_url_host] - except Exception as e: - logger.warning("Exception fetching URL host list: " + str(e)) - list_url_host = [] - return list_url_host - - def _get_search_list(self): - try: - with psycopg.connect(self.db_connect_info) as conn: - # List of keyword searches - list_search_text = [l[0] for l in conn.execute("SELECT keyword_search FROM SEARCH;").fetchall()] - except Exception as e: - logger.warning("Exception fetching searches list: " + str(e)) - list_search_text = [] - return list_search_text - - def _get_feed_urls(self): - try: - with psycopg.connect(self.db_connect_info) as conn: - list_url_feeds = conn.execute("SELECT rss_feed FROM FEED;").fetchall() - # Decode (tuple with 1 element) - list_url_feeds = [l[0] for l in list_url_feeds] - except Exception as e: - logger.warning("Exception fetching RSS sites: " + str(e)) - list_url_feeds = [] - return list_url_feeds - - def _get_url_hosts(self): - try: - with psycopg.connect(self.db_connect_info) as conn: - list_url_hosts = conn.execute("SELECT url_host FROM WEBSITE_OF_INTEREST;").fetchall() - # Decode (tuple with 1 element) - list_url_hosts = [l[0] for l in list_url_hosts] - except Exception as e: - logger.warning("Exception fetching RSS sites: " + str(e)) - list_url_hosts = [] - return list_url_hosts - - def _format(self, values): - # Repalce single quote ' with ''. 
Based on https://stackoverflow.com/a/12320729 - # String -> 'string', Int -> '1' (string-based), None -> NULL (no quotes for pgSQL to interpret Null value) - if (type(values) == list) or (type(values) == tuple): - insert_args = "(" + ", ".join([ "NULL" if v is None else "'" + str(v).replace("'", "''") + "'" for v in values]) + ")" - elif (type(values) == str): - insert_args = "({})".format( "NULL" if values is None else "'" + values.replace("'", "''") + "'" ) - else: - logger.warning("Error formatting input values: {}".format(values)) - assert False - return insert_args - - def _get_cached_canonical_url(self, url): - ### Redis: URL processed recently? -> Avoid increasing SERIAL counter & efficiency of DB - try: - filter_url = self.redis_instance.get(url) - if (filter_url is not None): - filter_url = filter_url.decode("utf-8") - except Exception as e: - logger.warning("Exception querying Redis: {}".format(str(e))) - filter_url = None - return filter_url - - def _update_urls_status(self, dict_status_ids): - ##################### - ### Update status to array of URL IDs - ##################### - try: - # Update - with psycopg.connect(self.db_connect_info) as conn: - # Open cursor - cursor = conn.cursor() - # Autocommit at end of transaction (Atomic insert of URLs and sources) - with conn.transaction() as tx: - for key_status, value_ids in dict_status_ids.items(): - cursor.execute("UPDATE URLS SET status='{}' WHERE id IN ({});".format(key_status, ",".join([str(v) for v in value_ids]))) - except Exception as e: - logger.warning("Error updating URLs status: {}".format(str(e))) - - def _get_missing_kids_urls(self, num_urls=None): - ##################### - ### Get list of Missing Kids URLs - ##################### - try: - missing_kids_ids_and_urls = [] - if (num_urls is None): - limit = 500 - else: - limit = num_urls - offset = 0 - with psycopg.connect(self.db_connect_info) as conn: - # Open cursor - cursor = conn.cursor() - while True: - # Query - missing_kids_ids_and_urls_query = cursor.execute("SELECT id, url, status FROM URLS WHERE url LIKE '%missingkids.org/poster%' ORDER BY ts_fetch DESC LIMIT {} OFFSET {};".format(limit, offset)).fetchall() - # Finished? - if (len(missing_kids_ids_and_urls_query) == 0): - break - # Extend - missing_kids_ids_and_urls = missing_kids_ids_and_urls + missing_kids_ids_and_urls_query - # Offset - offset += len(missing_kids_ids_and_urls_query) - # Stop? - if (num_urls is not None) and (len(missing_kids_ids_and_urls) >= num_urls): - break - - except Exception as e: - logger.warning("Error getting Missing Kids URLs: {}".format(str(e))) - missing_kids_ids_and_urls = [] - return missing_kids_ids_and_urls - - def _get_error_urls(self, num_urls=None): - ##################### - ### Get list of Missing Kids URLs - ##################### - try: - error_urls = [] - if (num_urls is None): - limit = 500 - else: - limit = num_urls - offset = 0 - with psycopg.connect(self.db_connect_info) as conn: - # Open cursor - cursor = conn.cursor() - while True: - # Query - error_urls_query = cursor.execute("SELECT id, url FROM URLS WHERE status='error' ORDER BY ts_fetch DESC LIMIT {} OFFSET {};".format(limit, offset)).fetchall() - # Finished? - if (len(error_urls_query) == 0): - break - # Extend - error_urls = error_urls + error_urls_query - # Offset - offset += len(error_urls_query) - # Stop? 
- if (num_urls is not None) and (len(error_urls) >= num_urls): - break - - except Exception as e: - logger.warning("Error getting Error URLs: {}".format(str(e))) - error_urls = [] - return error_urls - - def _decode_urls(self, urls_fetched, list_domains_to_filter, list_pattern_status_tuple): # TODO: language for urls_fetched... - """ - # TODO: REFACTOR - For each input url - - Already processed? - -> Update on Redis expire time - -> Associate to source - Not processed? Get main URL: - -> URL Canonical valid? - -> Rely on this as main URL - -> URL Canonical not valid? - -> Use input url, unless it's a news.google.com link - -> If news.google.com link, filter out. REDIS? - Main URL processing: - -> Update in REDIS, association url -> url_canonical - -> url != url_canonical: Add in duplicate table - If both != news.google.com - """ - - # URLs to insert, URLs duplicated association, URL to Canonical form - list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical = [], [], {} - - # URL VS CANONICAL: - # News URL returned: https://news.google.com/articles/CBMifmh0dHBzOi8vd3d3LmJyZWl0YmFydC5jb20vMm5kLWFtZW5kbWVudC8yMDIzLzA0LzAzL2dvdi1kZXNhbnRpcy1zaWducy1iaWxsLW1ha2luZy1mbG9yaWRhLXRoZS0yNnRoLWNvbnN0aXR1dGlvbmFsLWNhcnJ5LXN0YXRlL9IBAA?hl=en-US&gl=US&ceid=US%3Aen - # Corresponds to canonical URL: https://www.breitbart.com/2nd-amendment/2023/04/03/gov-desantis-signs-bill-making-florida-the-26th-constitutional-carry-state/ - - for url in urls_fetched: - # Domain to filter? Input url - filter_due_to_domain = False - for domain_to_filter in list_domains_to_filter: - if (domain_to_filter in url): - logger.debug("Domain filter applied based on {} for input URL: {}".format(domain_to_filter, url)) - filter_due_to_domain = True - if (filter_due_to_domain): - continue - - # URL processed recently? -> Filter and avoid increasing SERIAL counter & efficiency of DB - cached_canonical_url = self._get_cached_canonical_url(url) - if (cached_canonical_url is not None): - # Even if url processed, need to add url_canonical to list_filtered_urls, so as to associate search source to canonical URL (canonical is the main URL entry) - dict_full_urls_to_canonical[url] = cached_canonical_url # X -> Y - # If url has been processed, so was its canonical form - logger.debug("Filtering out already inserted (processed) URL and its canonical form: {} {}".format(url, cached_canonical_url)) - continue - - # Process TODO: Add language... - url_canonical, article_elements, article_status = process_article(url, list_pattern_status_tuple) - # TODO: Store article_elements information to insert into OS after inserted into DB (and therefore having associated url_id) - - # Could not retrieve redirection for news.google.com based URL? Continue (avoid inserting in DB) - if (url_canonical is None) and ("news.google.com" in url): - logger.debug("Filtering empty canonical link for base URL based on news.google.com: {}".format(url)) - continue - # Canonical URL still news.google.com? Continue (avoid inserting in DB) - if (url_canonical is not None) and ("news.google.com" in url_canonical): - logger.debug("Filtering canonical news.google.com based URL: {}".format(url_canonical)) - continue - - # Domain to filter? 
Input canonical_url - filter_due_to_domain = False - for domain_to_filter in list_domains_to_filter: - if (url_canonical is not None) and (domain_to_filter in url_canonical): - filter_due_to_domain = True - if (filter_due_to_domain): - logger.info("Filtering due to domain input URL, Canonical_URL: {} {}".format(url, url_canonical)) - continue - - if (url_canonical is None) or (article_status == "error"): - logger.debug("Processing failed for URL: {}".format(url)) - # Still insert URL with "error"? -> If processed later, might have inconsistent sources (url vs url_canonical). Only store if not news.google.com based - if ("news.google.com" in url) or ("consent.google.com" in url): - logging.debug("Not able to process Google News link, skipping: {}".format(url)) - else: - dict_full_urls_to_canonical[url] = url # X -> X - list_insert_url_tuple_args.append( (url, article_status) ) - continue - - # URL was not processed (not sure canonical yet). Generate URL_CANONICAL <-> URL_ORIGINAL association if they're different - if (url_canonical != url): - list_tuple_canonical_duplicate_urls.append( (url_canonical, url) ) - # Dict: url -> canonical (update association) - dict_full_urls_to_canonical[url] = url_canonical # X -> Y or X - - # Canonical URL processed recently? -> Filter and avoid increasing SERIAL counter & efficiency of DB - if (self._get_cached_canonical_url(url_canonical) is not None): - # Canonical URL was already processed - logger.debug("Filtering out already inserted (processed) URL canonical: {}".format(url_canonical)) - else: - # Insert url_canonical to DB formatted - list_insert_url_tuple_args.append( (url_canonical, article_status) ) - # Canonical URL different? Process - if (url_canonical != url): - if ("news.google.com" in url) or ("consent.google.com" in url): - logging.debug("Not adding google.news.com based link, skipping: {}".format(url)) - else: - # Fetched url -> duplicate (using canonical as main link) - article_status = "duplicate" - # Insert url (non-canonical) to DB formatted - list_insert_url_tuple_args.append( (url, article_status) ) - - return list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical - - def _insert_urls(self, cursor, list_insert_url_tuple_args): - ##################### - ### Insert URLs with status - ##################### - if (len(list_insert_url_tuple_args) > 0): - insert_args = ', '.join( [ self._format(t) for t in list_insert_url_tuple_args] ) - # Insert. (url_1, status_1), (url_2, status_2), ... 
- sql_code = "INSERT INTO URLS {} VALUES {} ON CONFLICT (url) DO NOTHING;".format("(url, status)", insert_args) - # logger.debug("SQL CODE: {}".format(sql_code)) - c = cursor.execute(sql_code) - # NOTE: Not using "RETURNING id" since previously inserted URLs are not returned (ON CONFLICT) - # https://stackoverflow.com/questions/35949877/how-to-include-excluded-rows-in-returning-from-insert-on-conflict/35953488#35953488 - - def _insert_urls_duplicated(self, cursor, list_tuple_canonical_duplicate_urls): - ##################### - ### Insert duplicated URLs - ##################### - if (len(list_tuple_canonical_duplicate_urls) > 0): - # Flatten, format, set to remove duplicates - args_duplicated_urls_set = "(" + ', '.join( set( [ "'" + str(y).replace("'", "''") + "'" for x in list_tuple_canonical_duplicate_urls for y in x] ) ) + ")" - - # Dict: url -> id - dict_url_to_id = {} - # Get url -> id association to populate duplicated URLs - for (id_, url_) in cursor.execute("SELECT id, url FROM URLS WHERE url IN {};".format(args_duplicated_urls_set)).fetchall(): - dict_url_to_id[url_] = id_ - - # Convert tuples (url_canonical, url) -> (id_url_canonical, id_url) to insert in DB - # ORIGINAL CODE. Issue, might not have found association to all urls - ### list_tuple_canonical_duplicate_urls_ids = [ (dict_url_to_id[t[0]], dict_url_to_id[t[1]]) for t in list_tuple_canonical_duplicate_urls] - - list_tuple_canonical_duplicate_urls_ids = [] - for (url_1, url_2) in list_tuple_canonical_duplicate_urls: - id_url_1, id_url_2 = dict_url_to_id.get(url_1), dict_url_to_id.get(url_2) - if (id_url_1 is None) or (id_url_2 is None): - logger.debug("Skipping duplicate association due to no url -> id_url mapping available for tuple: {} {}".format(url_1, url_2)) - else: - list_tuple_canonical_duplicate_urls_ids.append( (id_url_1, id_url_2) ) - - if (len(list_tuple_canonical_duplicate_urls_ids) > 0): - insert_args = ', '.join( [ self._format(t) for t in list_tuple_canonical_duplicate_urls_ids] ) - # Insert. (id_url_canonical_1, id_url_1), ... - sql_code = "INSERT INTO URLS_DUPLICATE {} VALUES {} ON CONFLICT DO NOTHING;".format("(id_url_canonical, id_url_duplicated)", insert_args) - # logger.debug("SQL CODE: {}".format(sql_code)) - c = cursor.execute(sql_code) - - def _get_pattern_status_list(self): - ##################### - ### Get list of domains to filter - ##################### - # TODO: Cache on redis and query once every N hours? ... - try: - with psycopg.connect(self.db_connect_info) as conn: - # Open cursor - cursor = conn.cursor() - # TODO: Cache on Redis - list_pattern_status = cursor.execute("SELECT pattern, priority, status FROM STATUS_PATTERN_MATCHING;").fetchall() - except Exception as e: - logger.warning("Error getting pattern status list: {}".format(str(e))) - list_pattern_status = [] - return list_pattern_status - - def _get_domains_to_filter(self): - ##################### - ### Get list of domains to filter - ##################### - # TODO: Cache on redis and query once every N hours? ... - try: - with psycopg.connect(self.db_connect_info) as conn: - # Open cursor - cursor = conn.cursor() - # TODO: Cache on Redis - sites_to_filter = [e[0] for e in cursor.execute("SELECT url_host FROM WEBSITE_TO_FILTER;").fetchall() ] - except Exception as e: - logger.warning("Error getting domains to filter: {}".format(str(e))) - sites_to_filter = [] - return sites_to_filter - - def _get_cached_source_id(self, source): - ### Redis: URL processed recently? 
-> Avoid increasing SERIAL counter & efficiency of DB - try: - source_id = self.redis_instance.get(source) - if (source_id is not None): - source_id = source_id.decode("utf-8") - except Exception as e: - logger.warning("Exception querying Redis: {}".format(str(e))) - source_id = None - return source_id - - def _get_source_id(self, cursor, source): - ##################### - ### Get source corresponding id - ##################### - # Cached? - id_source = self._get_cached_source_id(source) - if (id_source is None): - c = cursor.execute("SELECT id FROM SOURCE WHERE source='{}'".format(source.replace("'", "''"))).fetchone() - if (c is None) or (len(c) == 0): - # Source does not exist, insert and get id - c = cursor.execute("INSERT INTO SOURCE (source) VALUES ('{}') RETURNING id;".format(source.replace("'", "''"))).fetchone() - # Decode source id - id_source = c[0] - # Cache - print("*"*10, source, id_source) - self.redis_instance.set(source, id_source, ex=self.redis_expiry_seconds) - return id_source - - def _get_urls_id(self, cursor, urls_full): - ##################### - ### Get id of inserted and filtered URLs - ##################### - # TODO: Cache url -> url_id, url_canonical - if (len(urls_full) == 0): - return [] - # Get inserted and filtered URL ids (unnested). Filtered URLs are also retrieved since they might have been fetched from a new source - in_inserted_filtered_urls = "(" + ', '.join(["'" + u.replace("'", "''") + "'" for u in urls_full]) + ")" - id_urls_related = [ i[0] for i in cursor.execute("SELECT id FROM URLS WHERE url IN {};".format(in_inserted_filtered_urls)).fetchall() ] - return id_urls_related - - def _insert_urls_source(self, cursor, id_urls_related, id_source): - ##################### - ### Insert URL sources: (id_url_1, id_source), (id_url_2, id_source), ... 
- ##################### - if (len(id_urls_related) == 0) or (id_source is None): - return - columns = "(id_url, id_source)" - insert_args = ', '.join( [ self._format([id_url, id_source]) for id_url in id_urls_related ] ) - # Insert - sql_code = "INSERT INTO URLS_SOURCE {} VALUES {} ON CONFLICT DO NOTHING;".format(columns, insert_args) - # logger.debug("SQL CODE: {}".format(sql_code)) - c = cursor.execute(sql_code) - - def write_batch(self, urls_fetched, source): - # Chunks of 50 elements - n = 50 - # Divide in small chunks - urls_fetched_chunks = [urls_fetched[i:i + n] for i in range(0, len(urls_fetched), n)] - # Process - for urls_fetched_chunk_i in urls_fetched_chunks: - self._write_small_batch(urls_fetched_chunk_i, source) - - def _write_small_batch(self, urls_fetched, source): - try: - logger.info("Fetched #{} URLs, source: {}".format(len(urls_fetched), source)) - - if (len(urls_fetched) == 0): - logger.debug("Empty batch of urls (not writing to DB) for source: {}".format(source)) - return - - # Shuffle URLs to reduce continuous URLs of same URL host (minimize chance of being blocked for too many continuous requests) - random.shuffle(urls_fetched) - - # Get list of domains to filter - list_domains_to_filter = self._get_domains_to_filter() - # Get list of (pattern, priority, status) tuples to override status if required - list_pattern_status_tuple = self._get_pattern_status_list() - # Sort pattern tuples by priority - list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True) - - # Process URLs to update DB - list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical = self._decode_urls(urls_fetched, list_domains_to_filter, list_pattern_status_tuple) - # Full set of URL and its canonical form (to associate them to a search), both to insert and filter - urls_full = set(dict_full_urls_to_canonical.keys()).union( set(dict_full_urls_to_canonical.values()) ) - - # Insert - with psycopg.connect(self.db_connect_info) as conn: - # Open cursor - cursor = conn.cursor() - # Autocommit at end of transaction (Atomic insert of URLs and sources) - with conn.transaction() as tx: - # Insert processed URLs - self._insert_urls(cursor, list_insert_url_tuple_args) - # Insert URLs duplicated (canonical != fetched url) - self._insert_urls_duplicated(cursor, list_tuple_canonical_duplicate_urls) - - # Get source id in DB - id_source = self._get_source_id(cursor, source) - # Get IDs of all related URLs - id_urls_related = self._get_urls_id(cursor, urls_full) - # Insert search source associated to URLs - self._insert_urls_source(cursor, id_urls_related, id_source) - - # Update Redis status of inserted and filtered URLs after writing to DB - for url, url_canonical in dict_full_urls_to_canonical.items(): - try: - # Set with updated expiry time - self.redis_instance.set(url, url_canonical, ex=self.redis_expiry_seconds) - if (url != url_canonical): - self.redis_instance.set(url_canonical, url_canonical, ex=self.redis_expiry_seconds) - except Exception as e: - logger.warning("Exception running set in Redis: {}".format(str(e))) - - if (len(list_insert_url_tuple_args) > 0): - try: - webhook_token = os.environ.get("CLIQ_WEBHOOK_TOKEN") - endpoint_message = "https://cliq.zoho.com/api/v2/channelsbyname/urlretrievalbot/message?zapikey={}".format(webhook_token) - - payload = json.dumps({"text": "Fetched #{} new URLs, source: {}".format(len(list_insert_url_tuple_args), source) }) - r = requests.post(endpoint_message, data=payload) - except Exception as e: - logger.warning("Webhook 
failed: {}".format(str(e))) - - logger.debug("URL DB write finished") - except Exception as e: - logger.warning( "Exception writing to URL_DB:\n{}".format(traceback.format_exc()) ) - logger.debug( "Exception --- List of URLs: {}".format(str(urls_fetched)) ) \ No newline at end of file diff --git a/OBSOLETE_app_fetcher/src/fetch_feed.py b/OBSOLETE_app_fetcher/src/fetch_feed.py deleted file mode 100644 index b3398b0..0000000 --- a/OBSOLETE_app_fetcher/src/fetch_feed.py +++ /dev/null @@ -1,48 +0,0 @@ -from .db_utils import DB_Handler -import feedparser -import dateutil -from .logger import get_logger -logger = get_logger() - -class FetchFeed(): - def __init__(self, db_handler: DB_Handler) -> None: - logger.debug("Initializing News feed") - self.db_handler = db_handler - - def run(self): - try: - logger.debug("Starting NewsFeed.run()") - # Get feeds - list_url_feeds = self.db_handler._get_feed_urls() - logger.debug("Fetching news from feeds: {}".format(str(list_url_feeds))) - - # Process via RSS feeds - for url_feed in list_url_feeds: - # Initialize - urls_fetched, urls_publish_date = [], [] - # Fetch feeds - feeds = feedparser.parse(url_feed) - # Parse - for f in feeds.get("entries", []): - # Get URL - url = f.get("link", None) - # Process? - if (url is not None): - # Available publish date? - publish_date_parsed = f.get("published_parsed") - if (publish_date_parsed is None): - publish_date = f.get("published", None) - if (publish_date is not None): - publish_date_parsed = dateutil.parser.parse(publish_date) - - # Published date - urls_publish_date.append(publish_date_parsed) - # URL - urls_fetched.append(url) - - # URL fetching source - source = "feed {}".format(url_feed) - # Write to DB - self.db_handler.write_batch(urls_fetched, source) - except Exception as e: - logger.warning("Exception in NewsFeed.run(): {}".format(str(e))) diff --git a/OBSOLETE_app_fetcher/src/fetch_parser.py b/OBSOLETE_app_fetcher/src/fetch_parser.py deleted file mode 100644 index c3a73cb..0000000 --- a/OBSOLETE_app_fetcher/src/fetch_parser.py +++ /dev/null @@ -1,45 +0,0 @@ -from .db_utils import DB_Handler -import newspaper -from .logger import get_logger -logger = get_logger() - -class FetchParser(): - def __init__(self, db_handler: DB_Handler) -> None: - logger.debug("Initializing News SiteParsing newspaper4k") - self.db_handler = db_handler - - # TODO: MOVE LOGIC ELSEWHERE! - def _postprocess(self, article_urls): - return [url.replace("#comment-stream", "") for url in article_urls] - - def run(self): - try: - logger.debug("Starting NewsSiteParsing.run() for {}") - - # Get URL hosts - list_url_hosts = self.db_handler._get_url_hosts() - logger.info("Fetching news by parsing URL hosts: {}".format(str(list_url_hosts))) - - # Process newspaper4k build method - for url_host_feed in list_url_hosts: - # Protocol - if not (url_host_feed.startswith("http")): - url_host_feed_formatted = "https://" + url_host_feed - else: - url_host_feed_formatted = url_host_feed - - logger.debug("Fetching newspaper4k parsing based on URL: {}".format(url_host_feed_formatted)) - # Source object - url_host_built = newspaper.build(url_host_feed_formatted) - # Get articles URL list - urls_fetched = url_host_built.article_urls() - # TODO: MOVE! 
- # Post-processing - urls_fetched = self._postprocess(urls_fetched) - - # URL fetching source - source = "newspaper4k {}".format(url_host_feed) - # Write to DB - self.db_handler.write_batch(urls_fetched, source) - except Exception as e: - logger.warning("Exception in NewsSiteParsing.run(): {}".format(str(e))) \ No newline at end of file diff --git a/OBSOLETE_app_fetcher/src/fetch_search.py b/OBSOLETE_app_fetcher/src/fetch_search.py deleted file mode 100644 index 8c42c42..0000000 --- a/OBSOLETE_app_fetcher/src/fetch_search.py +++ /dev/null @@ -1,73 +0,0 @@ -from .db_utils import DB_Handler -from .utils import get_searxng_instances -from .fetch_search_sources import FetcherDuckDuckGo, FetcherGNews, FetcherGoogleNews, FetcherSearxNews, FetcherPreSearch -from .logger import get_logger -logger = get_logger() - -class FetchSearch(): - def __init__(self, db_handler: DB_Handler, full=True) -> None: - logger.debug("Initializing News feed") - self.db_handler = db_handler - self.full_search = full - - def _run_fetching(self, search_text): - logger.debug("Starting _run_fetching() for {}".format(search_text)) - - # Common parameters - lang, region = "en", "US" - - ### PreSearch - dict_params_news = {"search": search_text} - FetcherPreSearch(**dict_params_news).fetch_articles(self.db_handler) - - ### DuckDuckGo - period = "d" - dict_params_news = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "news", "period": period} - FetcherDuckDuckGo(**dict_params_news).fetch_articles(self.db_handler) - dict_params_general = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "general", "period": period} - FetcherDuckDuckGo(**dict_params_general).fetch_articles(self.db_handler) - - if (self.full_search): - # Avoid site:{} search due to G-Bypass required time - if ("site:" not in search_text): - ### GNews - dict_params = {"search": search_text, "lang": "wt", "region": "wt", "period": period} - FetcherGNews(**dict_params).fetch_articles(self.db_handler) - - ### GoogleNews - dict_params_news = {"search": search_text, "lang": lang, "region": region, "search_category": "news", "period": period} - FetcherGoogleNews(**dict_params_news).fetch_articles(self.db_handler) - # dict_params_general = {"search": search_text, "lang": lang, "region": region, "search_category": "general", "period": period} - - if False: - ### SearxNG - period = "day" - for searx_instance in get_searxng_instances(): - dict_params_news = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "news", "period": period} - dict_params_general = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "general", "period": period} - # Append thread - FetcherSearxNews(**dict_params_news).fetch_articles(self.db_handler) - FetcherSearxNews(**dict_params_general).fetch_articles(self.db_handler) - - logger.debug("Finished _run_fetching()") - - def run(self): - try: - logger.info("Fetching text searches & URL hosts of interest") - - # Get text searches of interest - list_search_text_of_interest = self.db_handler._get_search_list() - - # Get URL host of interest - list_url_host = self.db_handler._get_url_host_list() - # Get text searches for URL hosts - list_search_text_url_host = ["site:{}".format(l) for l in list_url_host] - - for search_text in list_search_text_of_interest + list_search_text_url_host: - logger.debug("Fetching news for search: {}".format(search_text)) - self._run_fetching(search_text) - - 
logger.info("Finished fetching text searches & URL hosts of interest") - except Exception as e: - logger.warning("Exception in NewsSearch.run(): {}".format(str(e))) - \ No newline at end of file diff --git a/OBSOLETE_app_fetcher/src/fetch_search_sources.py b/OBSOLETE_app_fetcher/src/fetch_search_sources.py deleted file mode 100644 index 25813b5..0000000 --- a/OBSOLETE_app_fetcher/src/fetch_search_sources.py +++ /dev/null @@ -1,384 +0,0 @@ -from duckduckgo_search import DDGS -from gnews import GNews -from GoogleNews import GoogleNews - -import requests -from bs4 import BeautifulSoup -import os -import time -import json -import numpy as np -import random -from .google_bypass import GoogleByPass -from abc import ABC, abstractmethod -from .logger import get_logger -logger = get_logger() - - - -# Generic fetcher (fetches articles, writes to DB) -class FetcherAbstract(ABC): - @abstractmethod - def _fetch(self): - pass - - def fetch_articles(self, db_writer): - logger.debug("Starting fetch() for {}".format(self.name)) - # Fetch articles - list_news = self._fetch() - logger.info("Found #{} articles for search: {}".format(len(list_news), self.name)) - # Write to DB - db_writer.write_batch(list_news, self.name) - -# https://techblog.willshouse.com/2012/01/03/most-common-user-agents/ - -user_agents_list = [ - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", - "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/111.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; rv:111.0) Gecko/20100101 Firefox/111.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", - "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0", - "Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36", - "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.44", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54", - "Mozilla/5.0 
(Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 OPR/96.0.0.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 OPR/97.0.0.0", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48", - "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.34", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.39", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.2 Safari/605.1.15", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; rv:112.0) Gecko/20100101 Firefox/112.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.51", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/112.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", - "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/110.0", - "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0", - "Mozilla/5.0 (X11; CrOS x86_64 14541.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.0.2246 Yowser/2.5 Safari/537.36", - "Mozilla/5.0 (X11; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.6.1 Safari/605.1.15", - "Mozilla/5.0 (Windows NT 6.1; rv:102.0) Gecko/20100101 
Goanna/6.0 Firefox/102.0 PaleMoon/32.0.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.41", - "Mozilla/5.0 (Windows NT 10.0; rv:110.0) Gecko/20100101 Firefox/110.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 YaBrowser/23.1.5.708 Yowser/2.5 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0", - "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" -] - - - - - -class FetcherPreSearch(FetcherAbstract): - def __init__(self, search): - """ - # period -> - - h = hours (eg: 12h) - - d = days (eg: 7d) - - m = months (eg: 6m) - - y = years (eg: 1y) - """ - self.search = search - self.period = "1d" # TODO Fixed for the moment - # self.lang = lang - # self.region = region - search_category = "news" - self.name = "presearch {} {} {}".format(search, search_category, self.period) - - def _fetch(self): - try: - # PreSearch fetching endpoint, parameter search keyword - presearch_fetch_endpoint = "http://selenium_app:80/fetch_presearch/?search_keyword={}".format(self.search) - # Timeout: 15 minutes - r = requests.get(presearch_fetch_endpoint, timeout=900) - # Decode - list_news = json.loads(r.text).get("list_urls", []) - except Exception as e: - logger.warning("Timeout on request: {}. {}".format(presearch_fetch_endpoint, str(e))) - list_news = [] - return list_news - - - -class FetcherGNews(FetcherAbstract): - def __init__(self, search, period, lang="en", region="US"): - """ - # period -> - - h = hours (eg: 12h) - - d = days (eg: 7d) - - m = months (eg: 6m) - - y = years (eg: 1y) - """ - self.search = search - self.period = period - self.lang = lang - self.region = region - search_category = "news" - self.name = "gnews {} {} {} {}".format(search, search_category, period, "{}-{}".format(lang, region)) - - def _fetch(self): - try: - list_dict_news = GNews(self.lang, self.region, period=self.period).get_news(self.search) - # Decode - list_news = [] - for l in list_dict_news: - list_news.append(l.get("url")) - except Exception as e: - logger.warning("Exception fetching {}: {}".format(self.name, str(e))) - list_news = [] - - # Bypass Google links - list_news_redirections = GoogleByPass().bypass_google_urls(list_news) - - return list_news_redirections - -class FetcherGoogleNews(FetcherAbstract): - def __init__(self, search, search_category="news", period="1d", lang="en", region="US"): - assert(search_category in ["news", "general"]) - - self.lang = lang - self.region = region - self.period = period - self.search_category = search_category - self.search = search - self.name = "googlenews {} {} {} {}".format(search, search_category, period, "{}-{}".format(lang, region)) - - def _fetch(self): - try: - # Initialize - g = GoogleNews(encode="utf-8", period=self.period, lang=self.lang, region=self.region) - g.enableException(True) - - if (self.search_category == "general"): - set_links = set() - # Search - g.search(self.search) - - # Iterate pages - MAX_ITER_PAGES = 15 - for i in range(MAX_ITER_PAGES): - time.sleep(random.uniform(1, 1.5)) - num_before = len(set_links) - - # Get page - try: - links = g.page_at(i) - except Exception as e: - logger.warning("Exception fetching page in GoogleNews {}: {}".format(self.name, str(e))) - break - # Links - for l in 
links: - # '/url?esrc=s&q=&rct=j&sa=U&url=https://www.breitbart.com/news/scent-of-luxury-indias-jasmine-infuses-global-perfume/&ved=2ahUKEwjOybGSiN-AAxX1gv0HHfqSBpMQxfQBegQICBAC&usg=AOvVaw06GdoHyzPbIopUaEuUSQPQ' - url = l.get("link").split("url=")[-1] - set_links.add(url) - - num_after = len(set_links) - - # Finished? - if (num_before == num_after): - logger.debug("Iterated {} pages on GoogleNews general search".format(i)) - break - # To list - list_news = list(set_links) - elif (self.search_category == "news"): - # Search - g.get_news(self.search) - # Fetch - list_news = g.get_links() - - except Exception as e: - logger.warning("Exception fetching {}: {}".format(self.name, str(e))) - list_news = [] - - # Bypass Google links - list_news_redirections = GoogleByPass().bypass_google_urls(list_news) - - return list_news_redirections - -class FetcherDuckDuckGo(FetcherAbstract): - def __init__(self, search, search_category, period, lang="wt", region="wt"): - assert(search_category in ["news", "general"]) - assert(period in ["d", "w", "m", "y"]) - self.search = search - self.search_category = search_category - self.period = period - self.lang_region = "{}-{}".format(lang, region) - self.name = "duckduckgo {} {} {} {}".format(search, search_category, "1{}".format(period), region) - - def _fetch(self): - try: - list_news = [] - with DDGS(timeout=10) as ddgs: - if (self.search_category == "general"): - generator_links = ddgs.text(keywords=self.search, timelimit=self.period, region=self.lang_region) - elif (self.search_category == "news"): - generator_links = ddgs.news(keywords=self.search, timelimit=self.period, region=self.lang_region) - - for l in generator_links: - list_news.append( l.get("url", l.get("href")) ) - - except Exception as e: - logger.warning("Exception fetching {}: {}".format(self.name, str(e))) - list_news = [] - return list_news - - -class FetcherSearxNews(FetcherAbstract): - def __init__(self, search="child abuse", searx_instance="https://serx.ml/", lang="en", region="US", search_category="news", period="day"): - assert(search_category in ["news", "general"]) - assert(period in [None, "day", "week", "month", "year"]) - # Random header (minimize prob of web-scrapping detection) - self.headers = { - 'User-agent': str(np.random.choice(user_agents_list)), - 'Accept-Encoding': 'gzip, deflate', - 'Accept': '*/*', - 'Connection': 'keep-alive', - } - """ # Optional header - self.headers = { - 'User-agent': str(np.random.choice(user_agents_list)), - 'Accept-Encoding': 'gzip, deflate, br', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', - 'Connection': 'keep-alive', - 'Upgrade-Insecure-Requests': '1', - 'TE': 'trailers', - 'Sec-Fetch-Site': 'cross-site', - 'Sec-Fetch-Mode': 'navigate', - 'Sec-Fetch-Dest': 'document', - } - """ - self.search = search - self.searx_instance = searx_instance - self.lang_region = "{}-{}".format(lang, region) - self.search_category = search_category - self.period = period - self.t_sleep_lower, self.t_sleep_higher = 0.5, 1.5 - self.request_timeout = 240 - - period_name_mapping = { - None: "no_date_range", - "day": "1d", - "week": "1w", - "month": "1m", - "year": "1y", - } - self.name = "searxng {} {} {} {} {}".format(searx_instance.replace("https://", "").replace("/", ""), search, search_category, period_name_mapping[period], self.lang_region) - logger.info("SearX - Initialized SearX fetcher: {}".format(self.name)) - - def _request_and_decode(self, url_search): - # Initial random time sleep (minimize chance of 
getting blocked) - time.sleep(random.uniform(self.t_sleep_lower, self.t_sleep_higher)) - # Request - logger.debug("SearX - Searching: {}".format(url_search)) - try: - r = requests.get(url_search, headers=self.headers, timeout=self.request_timeout) - except Exception as e: - logger.warning("SearX - Exception in request: {}".format(url_search), "\n", str(e)) - return [] - - if (r.status_code == 200): - # Status code Ok - pass - elif (r.status_code == 429): - # TooManyRequests, "Rate limit exceeded" - logger.warning("SearX {} - Too many requests while running: {}. Request output: {}".format(self.name, r.url, r.text)) - return [] - elif (r.status_code != 200): - logger.warning("SearX {} - Status code: {}. Request output: {}".format(self.name, r.status_code, r.text)) - return [] - else: - logger.debug("SearX - Status code: {}".format(r.status_code)) - - # Decode request - soup = BeautifulSoup(r.text, 'html.parser') - page_url_set = set() - # h3 links - for elem in soup.find_all('h3'): - # Get url - url = elem.find('a').get('href') - page_url_set.add(url) - return page_url_set - - def _get_news_list(self): - ############################################################ - # Domain & search parameter - search_domain = os.path.join(self.searx_instance, "search?q=") - # Search keywords - search_formatted = self.search.replace(" ", "+").replace(":", "%3A") - # Period formatted - period_formatted = "&time_range={}".format(self.period) if self.period is not None else "" - # Search parameters - search_parameters = "&category_{}=on&language={}{}".format(self.search_category, self.lang_region, period_formatted) - # Combined url search - url_search_nopage = "{}{}{}".format(search_domain, search_formatted, search_parameters) - ############################################################ - - # Request and decode on page=1 - url_set = self._request_and_decode(url_search_nopage) - # No results? - if (len(url_set) == 0): - logger.warning("SearX {} - Empty results on search: {}".format(self.name, url_search_nopage)) - return [] - - # Iterate pages - search_numpage = 2 - while True: - # Combine url search with page number - url_search_with_page = "{}&pageno={}".format(url_search_nopage, search_numpage) - # Request and decode on page=X - url_set_i = self._request_and_decode(url_search_with_page) - - # Length before merging - length_current = len(url_set) - # Merge - url_set = url_set.union(url_set_i) - # Length after merging - length_merged = len(url_set) - - # No new elements? 
- if (length_current == length_merged): - logger.debug("SearX {} - Finished processing search, #pages: {}".format(self.name, search_numpage)) - break - # Next page - search_numpage += 1 - - return list(url_set) - - def _fetch(self): - try: - # Fetch news - list_news = self._get_news_list() - except Exception as e: - logger.warning("Exception fetching {}: {}".format(self.name, str(e))) - list_news = [] - return list_news diff --git a/OBSOLETE_app_fetcher/src/google_bypass.py b/OBSOLETE_app_fetcher/src/google_bypass.py deleted file mode 100644 index 6e34e72..0000000 --- a/OBSOLETE_app_fetcher/src/google_bypass.py +++ /dev/null @@ -1,26 +0,0 @@ -import requests -import json -from .logger import get_logger -logger = get_logger() - -class GoogleByPass(): - def __init__(self) -> None: - pass - - def bypass_google_urls(self, list_urls): - if (len(list_urls) == 0): - return [] - - try: - # Endpoint - gbypass_endpoint = "http://selenium_app:80/get_redirection" - # Timeout: 20 minutes - timeout = 60*20 - r = requests.post(gbypass_endpoint, json={"list_urls": list_urls}, timeout=timeout) - # Decode - list_urls_redirections = json.loads(r.text).get("list_urls_redirections", []) - except Exception as e: - logger.warning("Exception on request: {}. {}".format(gbypass_endpoint, str(e))) - list_urls_redirections = [] - - return list_urls_redirections diff --git a/OBSOLETE_app_fetcher/src/logger.py b/OBSOLETE_app_fetcher/src/logger.py deleted file mode 100644 index 83f00b3..0000000 --- a/OBSOLETE_app_fetcher/src/logger.py +++ /dev/null @@ -1,22 +0,0 @@ -import logging - -import os -os.makedirs("logs", exist_ok=True) - -logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s') -logger = logging.getLogger("news_fetcher") -logger.setLevel(logging.INFO) - -# To file log: INFO / WARNING / ERROR -fh = logging.handlers.RotatingFileHandler(filename="logs/log_app_fetcher.log", mode="a", maxBytes=10000000, backupCount=4) -fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s')) -logger.addHandler(fh) - -# To file log: WARNING / ERROR -fh_ = logging.handlers.RotatingFileHandler(filename="logs/log_app_fetcher_error.log", mode="a", maxBytes=10000000, backupCount=1) -fh_.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s')) -fh_.setLevel(logging.WARNING) -logger.addHandler(fh_) - -def get_logger(): - return logger diff --git a/OBSOLETE_app_fetcher/src/missing_kids_fetch.py b/OBSOLETE_app_fetcher/src/missing_kids_fetch.py deleted file mode 100644 index ea92cb7..0000000 --- a/OBSOLETE_app_fetcher/src/missing_kids_fetch.py +++ /dev/null @@ -1,36 +0,0 @@ -from .db_utils import DB_Handler -import requests -import json -from .logger import get_logger -logger = get_logger() - -class MissingKidsFetch(): - def __init__(self, db_handler: DB_Handler, num_pages) -> None: - logger.debug("Initializing News MissingKids") - self.db_handler = db_handler - self.num_pages = num_pages - self.missingkids_fetch_endpoint = "http://selenium_app:80/get_missing_kids/?pages={}" - - def run(self): - try: - logger.debug("Starting NewsMissingKids.run()") - try: - # Timeout - if (self.num_pages > 15): - timeout = 60*90 # 1.5h - else: - timeout = 60*5 # 5 min - # Request - r = requests.get(self.missingkids_fetch_endpoint.format(self.num_pages), timeout=timeout) - # Decode - urls_fetched = json.loads(r.text).get("list_urls", []) - except Exception as e: - logger.warning("Timeout on request: {}. 
{}".format(missingkids_fetch_endpoint, str(e))) - urls_fetched = [] - - # URL fetching source - source = "missingkids fetcher" - # Write to DB - self.db_handler.write_batch(urls_fetched, source) - except Exception as e: - logger.warning("Exception in NewsMissingKids.run(): {}".format(str(e))) diff --git a/OBSOLETE_app_fetcher/src/missing_kids_status.py b/OBSOLETE_app_fetcher/src/missing_kids_status.py deleted file mode 100644 index df0768a..0000000 --- a/OBSOLETE_app_fetcher/src/missing_kids_status.py +++ /dev/null @@ -1,98 +0,0 @@ -from .db_utils import URL_DB_Writer -from .url_utils import get_missing_kid_status -from .logger import get_logger -logger = get_logger() - - -def get_missing_kid_status(url, return_canonical_url=False): - import time - import requests - - # Sleep - time.sleep(0.75) - try: - # Request - r = requests.get(url, timeout=300) - # Decode - status_code = r.status_code - # Canonical URL removing parameters - url_canonical = r.url - except Exception as e: - logger.warning("Exception on get URL status request: {}. {}".format(url, str(e))) - status_code = None - url_canonical = url - - if (status_code == 200): - status = "valid" - elif (status_code == 404): - status = "invalid" - else: - status = "unknown" - - logger.debug("Missing Kid URL {} status: {}".format(url, status)) - if (return_canonical_url): - return status, url_canonical - else: - return status - -class MissingKidsStatus(): - def __init__(self, db_connect_info, redis_connect_info, num_urls) -> None: - self.num_urls = num_urls - self.db_connect_info = db_connect_info - self.redis_connect_info = redis_connect_info - self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info) - - def update_missing_kids_status(self): - try: - logger.info("Starting updating status to Missing Kids URLs, limit #URLs: {}".format(self.num_urls)) - # List of URLs - list_ids_and_urls = self.db_writer._get_missing_kids_urls(self.num_urls) - # Dict: status -> IDs to update to new status - dict_status_ids, dict_status_urls = {}, {} - # Check URLs with invalid status? - skip_invalid_check = False - - flush_every, flush_current = 20, 0 - # Iterate URLs - for (id, url, current_status) in list_ids_and_urls: - # Skip duplicate URLs - if (current_status == "duplicate"): - continue - # Skip invalid URLs? - if (skip_invalid_check): - if (current_status == "invalid"): - continue - - # Get status - new_status = get_missing_kid_status(url) - # Different? Update - if (current_status != new_status): - # Extend array - dict_status_ids[new_status] = dict_status_ids.get(new_status, []) + [id] - # Debugging dict - dict_status_urls[new_status] = dict_status_urls.get(new_status, []) + [url] - # +1 processed - flush_current += 1 - - # Flush batch? 
- if (flush_every == flush_current): - logger.info("Updating status to Missing Kids URLs: {}".format(dict_status_urls)) - # Update DB - self.db_writer._update_urls_status(dict_status_ids) - # Reset - flush_current = 0 - dict_status_ids, dict_status_urls = {}, {} - - # Flush remaining batch - if (flush_current > 0): - logger.info("Updating status to Missing Kids URLs: {}".format(dict_status_urls)) - # Update DB - self.db_writer._update_urls_status(dict_status_ids) - # Reset - flush_current = 0 - dict_status_ids, dict_status_urls = {}, {} - - logger.info("Finished updating status to Missing Kids URLs") - except Exception as e: - logger.warning("Exception in MissingKidsStatus.run(): {}".format(str(e))) - \ No newline at end of file diff --git a/OBSOLETE_app_fetcher/src/url_status.py b/OBSOLETE_app_fetcher/src/url_status.py deleted file mode 100644 index 3948417..0000000 --- a/OBSOLETE_app_fetcher/src/url_status.py +++ /dev/null @@ -1,62 +0,0 @@ -from .db_utils import URL_DB_Writer -from .url_utils import process_article -from .logger import get_logger -logger = get_logger() - -class UpdateErrorURLs(): - def __init__(self, db_connect_info, redis_connect_info, num_urls) -> None: - self.num_urls = num_urls - self.db_connect_info = db_connect_info - self.redis_connect_info = redis_connect_info - self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info) - - def update_error_urls_status(self): - try: - logger.info("Starting updating status to URLs with error, limit #URLs: {}".format(self.num_urls)) - # List of URLs with status 'error' - list_ids_and_urls = self.db_writer._get_error_urls(self.num_urls) - # Current status - current_status = "error" - # Dict: status -> IDs to update to new status - dict_status_ids, dict_status_urls = {}, {} - - # Get list of (pattern, priority, status) tuples to override status if required - list_pattern_status_tuple = self.db_writer._get_pattern_status_list() - # Sort pattern tuples by priority - list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True) - - flush_every, flush_current = 20, 0 - # Iterate URLs - for (id, url) in list_ids_and_urls: - # Get status - url_canonical, article_elements, new_status = process_article(url, list_pattern_status_tuple) - # Different? Update - if (current_status != new_status): - # Extend array - dict_status_ids[new_status] = dict_status_ids.get(new_status, []) + [id] - # Debugging dict - dict_status_urls[new_status] = dict_status_urls.get(new_status, []) + [url] - # +1 processed - flush_current += 1 - - # Flush batch? 
- if (flush_every == flush_current): - logger.info("Updating status to URLs with error: {}".format(dict_status_urls)) - # Update DB - self.db_writer._update_urls_status(dict_status_ids) - # Reset - flush_current = 0 - dict_status_ids, dict_status_urls = {}, {} - - # Flush remaining batch - if (flush_current > 0): - logger.info("Updating status to URLs with error: {}".format(dict_status_urls)) - # Update DB - self.db_writer._update_urls_status(dict_status_ids) - # Reset - flush_current = 0 - dict_status_ids, dict_status_urls = {}, {} - - logger.info("Finished updating status to URLs with error") - except Exception as e: - logger.warning("Exception in UpdateErrorURLs.run(): {}".format(str(e))) diff --git a/OBSOLETE_app_fetcher/src/url_utils.py b/OBSOLETE_app_fetcher/src/url_utils.py deleted file mode 100644 index 169976f..0000000 --- a/OBSOLETE_app_fetcher/src/url_utils.py +++ /dev/null @@ -1,262 +0,0 @@ -from gnews import GNews -import dateutil.parser -from datetime import datetime, timedelta -from .utils import remove_http_s -import time -import random -import traceback -import requests -import json -import re -from bs4 import BeautifulSoup - -from .logger import get_logger -logger = get_logger() - -def get_published_date(article): - try: - """ - # Already fetched publish date information? - if (publish_date_ is not None): - return publish_date_ - """ - - # List of potential publish dates - potential_dates = [] - # Publish date is the best match - potential_dates.append(article.publish_date) - # Publish date metadata is the following best match - potential_dates.append(article.meta_data.get('article', {}).get("published_time", None)) - # Iterate remaining keys - for key in article.meta_data.keys(): - if ("date" in key): - potential_dates.append(article.meta_data[key]) - - def invalid_date(p_date): - # Today + 2 days, article from the future? - today_plus_two = datetime.utcnow() + timedelta(days=2) - # Article from the future? - return p_date.timestamp() > today_plus_two.timestamp() - - for date_ in potential_dates: - # String date? parse - if (type(date_) == str): - try: - date_ = dateutil.parser.parse(date_) - except Exception as e: - logger.info("Invalid date found while parsing potential date: {} for URL: {}".format(date_, article.url)) - date_ = None - # Valid? - if (date_ is not None) and (not invalid_date(date_)): - return date_ - - logger.debug("Article with no published date: {}".format(article.url)) - return None - except Exception as e: - logger.info("Error while retrieving published date for URL: {}".format(article.url)) - return None - -def get_url_host(article_source_url, url): - # https://www.blabla.com/blabla -> www.blabla.com - if (article_source_url != ""): - # Article source URL already extracted, save path if any - return remove_http_s(article_source_url) # .split("/")[0] - else: - return remove_http_s(url).split("/")[0] - -def get_status_pattern_matching(url, article_status, list_pattern_status_tuple): - # Regex pattern to update status on "valid", "invalid", and "unknown" status only - # Status "raw", "duplicated" and "error" should remain the way they are - # Assumption: List of patterns sorted by importance - if (article_status in ["valid", "invalid", "unknown"]): - # Regular expression pattern matching: https://regexr.com/ - for regex_pattern, regex_priority, status_if_match in list_pattern_status_tuple: - # Matching? 
- matching = bool(re.match(regex_pattern, url)) - # Update article status - if (matching): - if (status_if_match != article_status): - logger.debug("Regex pattern found, updating status from '{}' to '{}' for URL: {}".format(article_status, status_if_match, url)) - return status_if_match - # Pattern matching not required or not found, original article status - return article_status - - - -def bypass_google_link(article_url): - - def bypass_google_consent(article_url): - # Sample URL: https://consent.google.com/m?continue=https://news.google.com/rss/articles/CBMiMGh0dHBzOi8vd3d3Lm1pc3NpbmdraWRzLm9yZy9wb3N0ZXIvbmNtYy84NjAxMTkvMdIBAA?oc%3D5&gl=NL&m=0&pc=n&cm=2&hl=en-US&src=1 - article_url_no_consent = article_url.replace("https://consent.google.com/m?continue=", "") - - # https://stackoverflow.com/questions/76063646/how-can-i-have-redirection-link-from-google-news-link-using-requests - headers = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36' - } - cookies = {'CONSENT': 'YES+cb.20220419-08-p0.cs+FX+111'} - - try: - # Request - r = requests.get(article_url_no_consent, headers=headers, cookies=cookies, timeout=300) - # Decode - soup = BeautifulSoup(r.text, 'html.parser') - url_of_interest = soup.a['href'] - except Exception as e: - logger.warning("Exception on request trying to G_bypass with headers: {}. {}".format(article_url_no_consent, str(e))) - url_of_interest = None - - # Not able to bypass? - if (url_of_interest == "") or ("support.google.com" in url_of_interest) or ("news.google.com" in url_of_interest): - url_of_interest = None - return url_of_interest - - def bypass_google_using_service(article_url): - try: - # e.g.: url = "https://news.google.com/articles/CBMiX2h0dHBzOi8vd3d3LmZveGJ1c2luZXNzLmNvbS9wb2xpdGljcy9kaXNuZXktc3Vlcy1mbG9yaWRhLWdvdi1yb24tZGVzYW50aXMtbG9zcy1zcGVjaWFsLWRpc3RyaWN00gEA?hl=en-US&gl=US&ceid=US%3Aen" - gbypass_endpoint = "http://selenium_app:80/get_redirection" - # Timeout: 5 minutes - r = requests.post(gbypass_endpoint, json={"url": article_url}, timeout=300) - # Decode - redirect_url = json.loads(r.text).get("redirect_url", "") - except Exception as e: - logger.warning("Exception on request: {}. {}".format(gbypass_endpoint, str(e))) - redirect_url = "" - - return redirect_url - - logger.debug("Starting gbypass_endpoint()") - - article_url_bypassed = None - # Bypass using request - if ("consent.google.com" in article_url): - article_url_bypassed = bypass_google_consent(article_url) - # Not bypassed yet? 
Bypass using service - if (article_url_bypassed is None): - article_url_bypassed = bypass_google_using_service(article_url) - - # if (article_url_bypassed is None) or (article_url_bypassed == "") or ("news.google.com" in article_url_bypassed): - if (article_url_bypassed == "") or (article_url_bypassed is None): - # Empty URL returned by Gbypass - logger.warning("Error while bypassing Gnews for URL: {}".format(article_url)) - return None - else: - logger.debug("Correctly bypassed GNews to URL_redirect, from URL: {} {}".format(article_url_bypassed, article_url)) - return article_url_bypassed - -def process_article(article_url, list_pattern_status_tuple, language="en"): - # TODO: - """ - https://github.com/fhamborg/news-please - https://github.com/fhamborg/Giveme5W1H - https://github.com/santhoshse7en/news-fetch - """ - try: - logger.debug("Starting process_article()") - - if ("news.google.com" in article_url) or ("consent.google.com" in article_url): - # Bypass to get redirection - article_url = bypass_google_link(article_url) - # Error? - if (article_url is None): - return None, {}, "error" - elif ("missingkids.org/poster" in article_url): - # Get status - article_status, url_canonical = get_missing_kid_status(article_url, return_canonical_url=True) - article_elements = { - "url_full": article_url, - "url_canonical": url_canonical - } - return url_canonical, article_elements, article_status - else: - # Avoid Too many requests (feeds, ...) - time.sleep(0.75) - - logger.debug("Processing: {}".format(article_url)) - - # Default status unless something happens - article_status = "valid" - - # Parse article - # TODO: :param proxy: The proxy parameter is a dictionary with a single key-value pair. self._proxy = {'http': proxy, 'https': proxy} if proxy else None - # TODO: Language per config - article = GNews(language).get_full_article(url=article_url) - - # Article parsed? - if (article is None) or (not article.is_parsed): - logger.debug("Article not parsed: {}".format(article_url)) - return article_url, {}, "error" - - # Canonical link as main URL - url_canonical = article.canonical_link - # Empty canonical URL? - if (article.canonical_link is None) or (article.canonical_link == ""): - # URL with parameters? e.g. some zerohedge news fetched from newspaper3k end with #comment-stream -> Remove extra parameter in link - if ("?" in article.url) or (article.url.endswith("#comment-stream")) or (article.url.endswith("#disqus_thread")): - logger.debug("Article URL contains parameters, trying to clean URL: {}".format(article.url)) - try: - # Remove text after parameter call - url = article.url.split("?")[0] - # Remove comment-stream - url = url.replace("#comment-stream", "").replace("#disqus_thread", "") - # Article - article_attempt = GNews(language).get_full_article(url=url) - # Retrieving same title? Update article based on clean URL - if (article_attempt is not None) and (article_attempt.title == article.title): - article = article_attempt - except Exception as e: - logger.info("Article parsing of URL without parameters failed: {}".format(article.url)) - else: # Default behaviour - logger.debug("Article canonical link is empty, assuming URL=URL_CANONICAL: {}".format(article.url)) - - # By default, URL same as canonical - url_canonical = article.url - - elif (article.url != article.canonical_link): - # If different, stick to canonical URL - logger.debug("Article URL and canonical link are different: {} {}".format(article.url, article.canonical_link)) - else: - # If same, continue... 
- pass - - # Update config to determine if content is valid - article.config.MIN_WORD_COUNT = 150 - article.config.MIN_SENT_COUNT = 6 - - # Valid URL? - if (not article.is_valid_url()): - logger.debug("Not a valid news article: {}".format(url_canonical)) - article_status = "invalid" - # Is the article's body text is long enough to meet standard article requirements? - if (not article.is_valid_body()): - logger.debug("Article body not valid: {}".format(url_canonical)) - article_status = "unknown" - - if (article.images != article.imgs): - logger.debug("Article images and imgs are different: {} {}".format(article.images, article.imgs)) - - # article.keywords, article.meta_keywords, article.summary - # article.movies - # article.top_image - - # Check if article status needs to be updated - article_status = get_status_pattern_matching(url_canonical, article_status, list_pattern_status_tuple) - - article_elements = { - 'url_full': article.url, # https://www.breitbart.com/tech/2022/10/03/report-election-integrity-project-worked-with-feds-to-censor-news-sites-in-2020/ - 'url_host': get_url_host(article.source_url, url_canonical), # www.breitbart.com - 'title': article.title, # Report: ‘Election Integrity’ Partnership Worked with Feds to Censor News Sites in 2020 - 'description': article.meta_description, # Coalition committed to respond in ‘early 2022’ but failed to do so, while Labor has not issued a full response since taking office - 'text': article.text, # ${Article content} - 'published_date': get_published_date(article), # python.datetime format, obtained from "YYYY-MM-DD" or '2022-10-03T20:54:17+00:00' - 'authors': article.authors, # ['Christopher Knaus'] - 'language': article.meta_lang, # en - 'tags': list(article.tags), # ['Wide Open Border', '’My Son Hunter’ Movie', ...] - 'images': list(article.images), # [URL_IMAGE_1, URL_IMAGE_2, ...] 
- 'url_canonical': url_canonical, # Canonical URL (redirection) - # 'html': article.html, # HTML article - } - logger.debug("Processing OK: {}".format(url_canonical)) - return url_canonical, article_elements, article_status - except Exception as e: - logger.warning("Exception processing url: {}\n{}".format(article_url, traceback.format_exc())) - return None, {}, "error" \ No newline at end of file diff --git a/OBSOLETE_app_fetcher/src/utils.py b/OBSOLETE_app_fetcher/src/utils.py deleted file mode 100644 index 76ae07a..0000000 --- a/OBSOLETE_app_fetcher/src/utils.py +++ /dev/null @@ -1,33 +0,0 @@ - -def remove_http_s(url): - url = url.replace("https://", "") if url.startswith("https://") else url - url = url.replace("http://", "") if url.startswith("http://") else url - return url - -def is_valid_url(url): - if (url.startswith("https://")): - return True - else: - return False - -def get_searxng_instances(): - # SearxNG instances: https://searx.space/ - searx_instances = set() - searx_instances.add("https://searx.work/") - searx_instances.add("https://search.ononoki.org/") - searx_instances.add("https://searxng.nicfab.eu/") - searx_instances.add("https://searx.be/") - - # searx_instances.add("https://searx.fmac.xyz/") - # searx_instances.add("https://northboot.xyz/") # FIX - - # searx_instances.add("https://serx.ml/") # Offline - # searx_instances.add("https://searx.ru/") - # searx_instances.add("https://searx.sp-codes.de/") - # searx_instances.add("https://searxng.nicfab.eu/") - # searx_instances.add("https://s.frlt.one/") - # searx_instances.add("https://search.sapti.me/") - - # To list - list_searx_instances = list(searx_instances) - return list_searx_instances \ No newline at end of file diff --git a/app_selenium/README.md b/app_selenium/README.md new file mode 100644 index 0000000..deac652 --- /dev/null +++ b/app_selenium/README.md @@ -0,0 +1,3 @@ + +* Missing kids posters fetch (num_pages=X) +* ... 
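The first bullet of the new app_selenium README corresponds to the endpoint that the removed OBSOLETE_app_fetcher/src/missing_kids_fetch.py used to call. A minimal sketch of that call, assuming the same `selenium_app` service name, `/get_missing_kids/?pages=X` path, timeout policy and `list_urls` response key taken from the deleted module; the helper name and standalone-script form are illustrative only, not the actual app_selenium client:

```python
import json
import requests

# Endpoint path and "list_urls" response key as used by the removed
# OBSOLETE_app_fetcher/src/missing_kids_fetch.py; the hostname resolves
# on the compose network only.
SELENIUM_FETCH_ENDPOINT = "http://selenium_app:80/get_missing_kids/?pages={}"

def fetch_missing_kids_urls(num_pages):
    # The old fetcher scaled its timeout with crawl depth: 5 minutes for small
    # crawls, 1.5 hours once more than 15 pages were requested.
    timeout = 60 * 90 if num_pages > 15 else 60 * 5
    try:
        r = requests.get(SELENIUM_FETCH_ENDPOINT.format(num_pages), timeout=timeout)
        return json.loads(r.text).get("list_urls", [])
    except Exception as e:
        # Mirror the old log-and-continue behaviour and return an empty batch.
        print("Request to {} failed: {}".format(SELENIUM_FETCH_ENDPOINT, e))
        return []

if __name__ == "__main__":
    urls = fetch_missing_kids_urls(num_pages=2)
    print("Fetched {} poster URLs".format(len(urls)))
```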
diff --git a/app_urls/api/models.py b/app_urls/api/models.py index f23ae0b..3013403 100644 --- a/app_urls/api/models.py +++ b/app_urls/api/models.py @@ -17,7 +17,7 @@ class Search(models.Model): db_table = 'search' def __str__(self): - return "[{}]->{}".format(self.type, self.search) + return "[{}: {}]".format(self.type, self.search) class Source(models.Model): id = models.SmallAutoField(primary_key=True) diff --git a/app_urls/api/src/db_utils.py b/app_urls/api/src/db_utils.py index 96799fe..f38391e 100644 --- a/app_urls/api/src/db_utils.py +++ b/app_urls/api/src/db_utils.py @@ -130,7 +130,7 @@ class DB_Handler(): # Get or create URL with canonical form obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical")) # Get the source-search IDs associated to obj_url.id - list_url_source_search = UrlsSourceSearch.objects.fiter(id_url=obj_url) + list_url_source_search = UrlsSourceSearch.objects.filter(id_url=obj_url) for obj_url_source_search in list_url_source_search: # Associate same sources to url_canonical (it might already exist) UrlsSourceSearch.objects.get_or_create(id_url=obj_url_canonical, id_source=obj_url_source_search.id_source, id_search=obj_url_source_search.id_search) diff --git a/app_urls/api/templates/item_list.html b/app_urls/api/templates/item_list.html index f33e579..624c1ec 100644 --- a/app_urls/api/templates/item_list.html +++ b/app_urls/api/templates/item_list.html @@ -9,7 +9,7 @@ @@ -174,6 +245,9 @@ box-shadow: 2px 0 5px rgba(0, 0, 0, 0.1); padding: 15px; transition: width 0.3s ease; + /* Enable scrolling */ + overflow-y: auto; + max-height: 100vh; } #sidebar .nav-link { @@ -313,10 +387,10 @@ } th:nth-child(1), td:nth-child(1) { width: 50%; } /* URL column */ - th:nth-child(2), td:nth-child(2) { width: 20%; } /* Fetch Date */ - th:nth-child(3), td:nth-child(3) { width: 20%; } /* Sources */ - th:nth-child(4), td:nth-child(4) { width: 5%; } /* Status */ - th:nth-child(5), td:nth-child(5) { width: 5%; } /* Action */ + th:nth-child(2), td:nth-child(2) { width: 27.5%; } /* Fetch Date */ + th:nth-child(3), td:nth-child(3) { width: 10%; } /* Sources */ + th:nth-child(4), td:nth-child(4) { width: 10%; } /* Searches */ + th:nth-child(5), td:nth-child(5) { width: 2.5%; } /* Status */ /* ============================= */ /* Pagination Styling */ @@ -407,33 +481,23 @@ 🌙 - - -
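The db_utils.py hunk above is a one-character fix (`.fiter` → `.filter`, which would otherwise raise AttributeError at runtime), but it sits inside the canonical-URL bookkeeping, so it is worth spelling out what the corrected block does. A minimal sketch, assuming the model and field names shown in the diff (`Urls`, `UrlsSourceSearch`, `id_url`, `id_source`, `id_search`); the import path and wrapper function are illustrative only:

```python
# Sketch of the association copy that the corrected .filter(...) call feeds.
# Model and field names (Urls, UrlsSourceSearch, id_url, id_source, id_search)
# come from the diff; the import path and wrapper function are assumptions.
from api.models import Urls, UrlsSourceSearch

def link_sources_to_canonical(obj_url, url_canonical):
    # Row holding the canonical form of the URL (created on first sight).
    obj_url_canonical, _created = Urls.objects.get_or_create(url=url_canonical)
    # Copy every (source, search) pair attached to the original URL onto the
    # canonical row; get_or_create keeps the copy idempotent when the
    # association already exists.
    for link in UrlsSourceSearch.objects.filter(id_url=obj_url):
        UrlsSourceSearch.objects.get_or_create(
            id_url=obj_url_canonical,
            id_source=link.id_source,
            id_search=link.id_search,
        )
```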
+ + + - + diff --git a/app_urls/api/templates/item_list_partial.html b/app_urls/api/templates/item_list_partial.html index d41c3ea..e72a39f 100644 --- a/app_urls/api/templates/item_list_partial.html +++ b/app_urls/api/templates/item_list_partial.html @@ -7,15 +7,18 @@