From f84c7729f8db487633d6c4c8d053a178c652c666 Mon Sep 17 00:00:00 2001 From: Luciano Gervasoni Date: Thu, 20 Mar 2025 17:19:52 +0100 Subject: [PATCH] Urls source search, cleaning code --- 1-DB.ipynb | 104 +++-------------- A_Development.ipynb | 60 ++-------- app_urls/README.md | 12 +- ...0003_urlssourcesearch_delete_urlssource.py | 27 +++++ app_urls/api/models.py | 9 +- app_urls/api/src/db_utils.py | 110 +++++------------- app_urls/api/src/fetch_feed.py | 21 ++-- app_urls/api/src/fetch_parser.py | 28 ++--- app_urls/api/src/fetch_search.py | 44 ++++--- app_urls/api/src/fetch_search_utils.py | 72 ++++++++---- app_urls/api/src/url_processor.py | 32 +++++ app_urls/api/tasks.py | 5 + app_urls/core/settings.py | 17 ++- 13 files changed, 241 insertions(+), 300 deletions(-) create mode 100644 app_urls/api/migrations/0003_urlssourcesearch_delete_urlssource.py diff --git a/1-DB.ipynb b/1-DB.ipynb index 0430949..cc5bf8b 100644 --- a/1-DB.ipynb +++ b/1-DB.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -11,64 +11,18 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "db_postgres\n", - "db_redis\n", - "\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 0/0\n", - " ⠋ Container db_redis \u001b[39mCreating\u001b[0m \u001b[34m0.1s \u001b[0m\n", - " ⠋ Container db_postgres \u001b[39mCreating\u001b[0m \u001b[34m0.1s \u001b[0m\n", - "\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 1/2\n", - " ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n", - " ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n", - " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n", - "\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 1/3\n", - " ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.3s \u001b[0m\n", - " ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.3s \u001b[0m\n", - " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n", - "\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 1/3\n", - " ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.4s \u001b[0m\n", - " ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.4s \u001b[0m\n", - " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n", - "\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 1/3\n", - " ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.5s \u001b[0m\n", - " ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.5s \u001b[0m\n", - " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n", - "\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 3/3\u001b[0m\n", - " \u001b[32m✔\u001b[0m Container db_redis \u001b[32mStarted\u001b[0m \u001b[34m0.5s \u001b[0m\n", - " \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.5s \u001b[0m\n", - " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n", - "\u001b[?25h" - ] - } - ], + "outputs": [], "source": [ "!docker rm -f db_postgres db_redis; docker compose -f docker/docker-compose.yml up -d ; sleep 5" ] }, { 
"cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "UndefinedTable", - "evalue": "relation \"urls_source\" does not exist", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mUndefinedTable\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 19\u001b[39m\n\u001b[32m 15\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m conn.cursor() \u001b[38;5;28;01mas\u001b[39;00m cur:\n\u001b[32m 16\u001b[39m \u001b[38;5;66;03m# Autocommit at end of transaction (Atomic insert of URLs and sources)\u001b[39;00m\n\u001b[32m 17\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m conn.transaction() \u001b[38;5;28;01mas\u001b[39;00m tx:\n\u001b[32m 18\u001b[39m \u001b[38;5;66;03m# Create URLs table\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m19\u001b[39m c = \u001b[43mcur\u001b[49m\u001b[43m.\u001b[49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m\"\"\"\u001b[39;49m\n\u001b[32m 20\u001b[39m \u001b[33;43m CREATE TYPE URL_STATUS AS ENUM (\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mraw\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[33;43m, \u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[33;43merror\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[33;43m, \u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mvalid\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[33;43m, \u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[33;43munknown\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[33;43m, \u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[33;43minvalid\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[33;43m, \u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mduplicate\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[33;43m);\u001b[39;49m\n\u001b[32m 21\u001b[39m \n\u001b[32m 22\u001b[39m \u001b[33;43m CREATE TABLE URLS (\u001b[39;49m\n\u001b[32m 23\u001b[39m \u001b[33;43m id SERIAL PRIMARY KEY,\u001b[39;49m\n\u001b[32m 24\u001b[39m \u001b[33;43m url TEXT NOT NULL UNIQUE,\u001b[39;49m\n\u001b[32m 25\u001b[39m \u001b[33;43m ts_fetch TIMESTAMPTZ NOT NULL DEFAULT NOW(),\u001b[39;49m\n\u001b[32m 26\u001b[39m \u001b[33;43m status URL_STATUS NOT NULL DEFAULT \u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mraw\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[33;43m -- ,\u001b[39;49m\n\u001b[32m 27\u001b[39m \u001b[33;43m -- status_wendy WENDY_STATUS DEFAULT NULL,\u001b[39;49m\n\u001b[32m 28\u001b[39m \u001b[33;43m -- ts_wendy TIMESTAMPTZ DEFAULT NULL\u001b[39;49m\n\u001b[32m 29\u001b[39m \u001b[33;43m );\u001b[39;49m\n\u001b[32m 30\u001b[39m \u001b[33;43m CREATE INDEX idx_urls_status ON urls(status);\u001b[39;49m\n\u001b[32m 31\u001b[39m \u001b[33;43m CREATE INDEX idx_urls_ts_fetch ON urls(ts_fetch);\u001b[39;49m\n\u001b[32m 32\u001b[39m \n\u001b[32m 33\u001b[39m \u001b[33;43m CREATE TABLE URLS_DUPLICATE (\u001b[39;49m\n\u001b[32m 34\u001b[39m \u001b[33;43m id_url_canonical INTEGER REFERENCES URLS(id),\u001b[39;49m\n\u001b[32m 35\u001b[39m \u001b[33;43m id_url_duplicated INTEGER REFERENCES URLS(id),\u001b[39;49m\n\u001b[32m 36\u001b[39m \u001b[33;43m PRIMARY KEY (id_url_canonical, id_url_duplicated)\u001b[39;49m\n\u001b[32m 37\u001b[39m \u001b[33;43m );\u001b[39;49m\n\u001b[32m 38\u001b[39m \u001b[33;43m \u001b[39;49m\n\u001b[32m 39\u001b[39m \u001b[33;43m CREATE TYPE SEARCH_TYPE AS ENUM 
(\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mrss_feed\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[33;43m, \u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mkeyword_search\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[33;43m, \u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[33;43murl_host\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[33;43m);\u001b[39;49m\n\u001b[32m 40\u001b[39m \u001b[33;43m CREATE TABLE SEARCH (\u001b[39;49m\n\u001b[32m 41\u001b[39m \u001b[33;43m id SMALLSERIAL PRIMARY KEY,\u001b[39;49m\n\u001b[32m 42\u001b[39m \u001b[33;43m search TEXT NOT NULL UNIQUE,\u001b[39;49m\n\u001b[32m 43\u001b[39m \u001b[33;43m type SEARCH_TYPE NOT NULL\u001b[39;49m\n\u001b[32m 44\u001b[39m \u001b[33;43m );\u001b[39;49m\n\u001b[32m 45\u001b[39m \u001b[33;43m CREATE INDEX idx_search_type ON SEARCH(type);\u001b[39;49m\n\u001b[32m 46\u001b[39m \u001b[33;43m \u001b[39;49m\n\u001b[32m 47\u001b[39m \u001b[33;43m CREATE TABLE SOURCE (\u001b[39;49m\n\u001b[32m 48\u001b[39m \u001b[33;43m id SMALLSERIAL PRIMARY KEY,\u001b[39;49m\n\u001b[32m 49\u001b[39m \u001b[33;43m source TEXT NOT NULL UNIQUE\u001b[39;49m\n\u001b[32m 50\u001b[39m \u001b[33;43m );\u001b[39;49m\n\u001b[32m 51\u001b[39m \u001b[33;43m \u001b[39;49m\n\u001b[32m 52\u001b[39m \u001b[33;43m CREATE TABLE URLS_SOURCE_SEARCH (\u001b[39;49m\n\u001b[32m 53\u001b[39m \u001b[33;43m id_url INTEGER REFERENCES URLS(id),\u001b[39;49m\n\u001b[32m 54\u001b[39m \u001b[33;43m id_source SMALLINT REFERENCES SOURCE(id) ON UPDATE CASCADE ON DELETE RESTRICT,\u001b[39;49m\n\u001b[32m 55\u001b[39m \u001b[33;43m id_search SMALLINT REFERENCES SEARCH(id) ON UPDATE CASCADE ON DELETE RESTRICT,\u001b[39;49m\n\u001b[32m 56\u001b[39m \u001b[33;43m PRIMARY KEY(id_url, id_source)\u001b[39;49m\n\u001b[32m 57\u001b[39m \u001b[33;43m );\u001b[39;49m\n\u001b[32m 58\u001b[39m \u001b[33;43m CREATE INDEX idx_source ON urls_source(id_source);\u001b[39;49m\n\u001b[32m 59\u001b[39m \n\u001b[32m 60\u001b[39m \u001b[33;43m CREATE TABLE STATUS_PATTERN_MATCHING (\u001b[39;49m\n\u001b[32m 61\u001b[39m \u001b[33;43m pattern TEXT PRIMARY KEY,\u001b[39;49m\n\u001b[32m 62\u001b[39m \u001b[33;43m priority SMALLINT NOT NULL,\u001b[39;49m\n\u001b[32m 63\u001b[39m \u001b[33;43m status URL_STATUS NOT NULL\u001b[39;49m\n\u001b[32m 64\u001b[39m \u001b[33;43m );\u001b[39;49m\n\u001b[32m 65\u001b[39m \u001b[33;43m \u001b[39;49m\n\u001b[32m 66\u001b[39m \u001b[33;43m \u001b[39;49m\n\u001b[32m 67\u001b[39m \u001b[33;43m CREATE TABLE URL_CONTENT (\u001b[39;49m\n\u001b[32m 68\u001b[39m \u001b[33;43m id_url INTEGER PRIMARY KEY REFERENCES URLS(id),\u001b[39;49m\n\u001b[32m 69\u001b[39m \u001b[33;43m date_published TIMESTAMPTZ DEFAULT NOW(),\u001b[39;49m\n\u001b[32m 70\u001b[39m \u001b[33;43m title TEXT,\u001b[39;49m\n\u001b[32m 71\u001b[39m \u001b[33;43m description TEXT,\u001b[39;49m\n\u001b[32m 72\u001b[39m \u001b[33;43m content TEXT,\u001b[39;49m\n\u001b[32m 73\u001b[39m \u001b[33;43m valid_content BOOLEAN,\u001b[39;49m\n\u001b[32m 74\u001b[39m \u001b[33;43m language CHAR(2), -- ISO 639-1 Code\u001b[39;49m\n\u001b[32m 75\u001b[39m \u001b[33;43m keywords TEXT[],\u001b[39;49m\n\u001b[32m 76\u001b[39m \u001b[33;43m tags TEXT[],\u001b[39;49m\n\u001b[32m 77\u001b[39m \u001b[33;43m authors TEXT[],\u001b[39;49m\n\u001b[32m 78\u001b[39m \u001b[33;43m image_main_url TEXT,\u001b[39;49m\n\u001b[32m 79\u001b[39m \u001b[33;43m images_url TEXT[],\u001b[39;49m\n\u001b[32m 80\u001b[39m \u001b[33;43m videos_url TEXT[],\u001b[39;49m\n\u001b[32m 81\u001b[39m \u001b[33;43m url_host TEXT, -- 
www.breitbart.com\u001b[39;49m\n\u001b[32m 82\u001b[39m \u001b[33;43m site_name TEXT -- Breitbart News\u001b[39;49m\n\u001b[32m 83\u001b[39m \u001b[33;43m );\u001b[39;49m\n\u001b[32m 84\u001b[39m \u001b[33;43m CREATE INDEX idx_tags ON URL_CONTENT USING GIN(tags);\u001b[39;49m\n\u001b[32m 85\u001b[39m \u001b[33;43m CREATE INDEX idx_authors ON URL_CONTENT USING GIN(authors);\u001b[39;49m\n\u001b[32m 86\u001b[39m \u001b[33;43m CREATE INDEX idx_date_published ON URL_CONTENT (date_published);\u001b[39;49m\n\u001b[32m 87\u001b[39m \u001b[33;43m CREATE INDEX idx_valid_content ON URL_CONTENT (valid_content);\u001b[39;49m\n\u001b[32m 88\u001b[39m \u001b[33;43m CREATE INDEX idx_language ON URL_CONTENT (language);\u001b[39;49m\n\u001b[32m 89\u001b[39m \u001b[33;43m CREATE INDEX idx_url_host ON URL_CONTENT (url_host);\u001b[39;49m\n\u001b[32m 90\u001b[39m \u001b[33;43m \u001b[39;49m\u001b[33;43m\"\"\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m 92\u001b[39m \u001b[38;5;66;03m# Feeds\u001b[39;00m\n\u001b[32m 93\u001b[39m cur.execute( \u001b[33m\"\u001b[39m\u001b[33mINSERT INTO SEARCH (search, type) VALUES (\u001b[39m\u001b[33m'\u001b[39m\u001b[33mhttps://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC\u001b[39m\u001b[33m'\u001b[39m\u001b[33m, \u001b[39m\u001b[33m'\u001b[39m\u001b[33mrss_feed\u001b[39m\u001b[33m'\u001b[39m\u001b[33m);\u001b[39m\u001b[33m\"\u001b[39m )\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/anaconda3/envs/matitos/lib/python3.12/site-packages/psycopg/cursor.py:97\u001b[39m, in \u001b[36mCursor.execute\u001b[39m\u001b[34m(self, query, params, prepare, binary)\u001b[39m\n\u001b[32m 93\u001b[39m \u001b[38;5;28mself\u001b[39m._conn.wait(\n\u001b[32m 94\u001b[39m \u001b[38;5;28mself\u001b[39m._execute_gen(query, params, prepare=prepare, binary=binary)\n\u001b[32m 95\u001b[39m )\n\u001b[32m 96\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m e._NO_TRACEBACK \u001b[38;5;28;01mas\u001b[39;00m ex:\n\u001b[32m---> \u001b[39m\u001b[32m97\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m ex.with_traceback(\u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[32m 98\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\n", - "\u001b[31mUndefinedTable\u001b[39m: relation \"urls_source\" does not exist" - ] - } - ], + "outputs": [], "source": [ "INSERT_TABLES = True\n", "INSERT_SAMPLE_DATA = False\n", @@ -125,9 +79,10 @@ " id_url INTEGER REFERENCES URLS(id),\n", " id_source SMALLINT REFERENCES SOURCE(id) ON UPDATE CASCADE ON DELETE RESTRICT,\n", " id_search SMALLINT REFERENCES SEARCH(id) ON UPDATE CASCADE ON DELETE RESTRICT,\n", - " PRIMARY KEY(id_url, id_source)\n", + " PRIMARY KEY(id_url, id_source, id_search)\n", " );\n", - " CREATE INDEX idx_source ON urls_source(id_source);\n", + " CREATE INDEX idx_source ON URLS_SOURCE_SEARCH(id_source);\n", + " CREATE INDEX idx_search ON URLS_SOURCE_SEARCH(id_search);\n", "\n", " CREATE TABLE STATUS_PATTERN_MATCHING (\n", " pattern TEXT PRIMARY KEY,\n", @@ -160,11 +115,12 @@ " CREATE INDEX idx_language ON URL_CONTENT (language);\n", " CREATE INDEX idx_url_host ON URL_CONTENT (url_host);\n", " \"\"\")\n", + "\n", + " ### Default insert values\n", " \n", " # Feeds\n", " cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC', 'rss_feed');\" )\n", " # Websites of interest\n", - " cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('www.unicef.org', 'url_host');\" )\n", " cur.execute( 
\"INSERT INTO SEARCH (search, type) VALUES ('www.breitbart.com', 'url_host');\" )\n", " # Search keywords\n", " cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search');\" )\n", @@ -207,17 +163,7 @@ " cur.execute(\"INSERT INTO SOURCE (source) values ('news.google.com')\")\n", " cur.execute(\"INSERT INTO SOURCE (source) values ('qwant.com')\")\n", "\n", - " cur.execute(\"INSERT INTO URLS_SOURCE (id_url, id_source) values (1, 1)\")\n", - " cur.execute(\"INSERT INTO URLS_SOURCE (id_url, id_source) values (2, 1)\")\n", - " cur.execute(\"INSERT INTO URLS_SOURCE (id_url, id_source) values (3, 1)\")\n", - " cur.execute(\"INSERT INTO URLS_SOURCE (id_url, id_source) values (4, 1)\")\n", - " cur.execute(\"INSERT INTO URLS_SOURCE (id_url, id_source) values (5, 1)\")\n", - " cur.execute(\"INSERT INTO URLS_SOURCE (id_url, id_source) values (6, 1)\")\n", - " cur.execute(\"INSERT INTO URLS_SOURCE (id_url, id_source) values (7, 1)\")\n", - "\n", - " cur.execute(\"INSERT INTO URLS_SOURCE (id_url, id_source) values (1, 2)\")\n", - " cur.execute(\"INSERT INTO URLS_SOURCE (id_url, id_source) values (2, 2)\")\n", - " cur.execute(\"INSERT INTO URLS_SOURCE (id_url, id_source) values (3, 2)\")\n", + " cur.execute(\"INSERT INTO URLS_SOURCE (id_url, id_source, id_search) values (1, 1, 1)\")\n", "\n", " for j in range(5):\n", " import time\n", @@ -241,26 +187,6 @@ "outputs": [], "source": [] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Connect to an existing database\n", - "with psycopg.connect(connection_info) as conn:\n", - " # Open a cursor to perform database operations\n", - " with conn.cursor() as cur:\n", - " pprint( cur.execute(\"SELECT * FROM SEARCH;\").fetchall() )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": null, @@ -285,7 +211,13 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# Connect to an existing database\n", + "with psycopg.connect(connection_info) as conn:\n", + " # Open a cursor to perform database operations\n", + " with conn.cursor() as cur:\n", + " pprint( cur.execute(\"SELECT * FROM SEARCH;\").fetchall() )" + ] }, { "cell_type": "code", diff --git a/A_Development.ipynb b/A_Development.ipynb index fdf01d8..2f88e52 100644 --- a/A_Development.ipynb +++ b/A_Development.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -14,25 +14,16 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Searching Bing \n", - " \r" - ] - } - ], + "outputs": [], "source": [ "results = engine.search('news: \"child abuse\"', pages=2)" ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -41,18 +32,9 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Searching Brave \n", - " \r" - ] - } - ], + "outputs": [], "source": [ "query = 'news: child abuse'\n", "r = engine.search(query, pages=2)" @@ -60,20 +42,9 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ 
- "{'_results': []}" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "r.__dict__" ] @@ -87,20 +58,9 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "newspaper.exceptions.ArticleBinaryDataException" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import newspaper\n", "newspaper.ArticleBinaryDataException" diff --git a/app_urls/README.md b/app_urls/README.md index 477ee31..9edfd16 100644 --- a/app_urls/README.md +++ b/app_urls/README.md @@ -2,7 +2,7 @@ ``` conda create -n matitos_urls python=3.12 conda activate matitos_urls -pip install django psycopg[binary] django-rq +pip install django psycopg[binary] django-redis django-rq pip install feedparser python-dateutil newspaper4k lxml[html_clean] googlenewsdecoder gnews duckduckgo_search GoogleNews ``` @@ -77,8 +77,10 @@ DB_PORT=${DB_NAME:-5432} REDIS_HOST=${REDIS_HOST:-localhost} REDIS_PORT=${REDIS_PORT:-6379} -# Default RQ queue timeout +# Default RQ job timeout RQ_DEFAULT_TIMEOUT=${REDIS_PORT:-900} +# Default RQ job queue TTL +RQ_DEFAULT_RESULT_TTL=${RQ_DEFAULT_RESULT_TTL:-3600} ``` * Django DB @@ -94,9 +96,9 @@ python manage.py makemigrations api; python manage.py migrate --fake-initial # Server python manage.py runserver -# Worker -python manage.py rqworker default -while true; do python manage.py rqworker default --burst -v 0; sleep 5; done +# Workers +# python manage.py rqworker high default low +python manage.py rqworker high default low # Visualize DB http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id diff --git a/app_urls/api/migrations/0003_urlssourcesearch_delete_urlssource.py b/app_urls/api/migrations/0003_urlssourcesearch_delete_urlssource.py new file mode 100644 index 0000000..94c990a --- /dev/null +++ b/app_urls/api/migrations/0003_urlssourcesearch_delete_urlssource.py @@ -0,0 +1,27 @@ +# Generated by Django 4.2.20 on 2025-03-20 16:12 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('api', '0002_delete_feed_delete_websiteofinterest_and_more'), + ] + + operations = [ + migrations.CreateModel( + name='UrlsSourceSearch', + fields=[ + ('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='api.urls')), + ], + options={ + 'db_table': 'urls_source_search', + 'managed': False, + }, + ), + migrations.DeleteModel( + name='UrlsSource', + ), + ] diff --git a/app_urls/api/models.py b/app_urls/api/models.py index 8459cbc..8e9a048 100644 --- a/app_urls/api/models.py +++ b/app_urls/api/models.py @@ -87,11 +87,12 @@ class UrlsDuplicate(models.Model): unique_together = (('id_url_canonical', 'id_url_duplicated'),) -class UrlsSource(models.Model): - id_url = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url', primary_key=True) # The composite primary key (id_url, id_source) found, that is not supported. The first column is selected. +class UrlsSourceSearch(models.Model): + id_url = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url', primary_key=True) # The composite primary key (id_url, id_source, id_search) found, that is not supported. The first column is selected. 
id_source = models.ForeignKey(Source, models.DO_NOTHING, db_column='id_source') + id_search = models.ForeignKey(Search, models.DO_NOTHING, db_column='id_search') class Meta: managed = False - db_table = 'urls_source' - unique_together = (('id_url', 'id_source'),) \ No newline at end of file + db_table = 'urls_source_search' + unique_together = (('id_url', 'id_source', 'id_search'),) diff --git a/app_urls/api/src/db_utils.py b/app_urls/api/src/db_utils.py index 634d9c1..6ea318e 100644 --- a/app_urls/api/src/db_utils.py +++ b/app_urls/api/src/db_utils.py @@ -1,11 +1,9 @@ -from ..models import Urls, UrlContent, UrlsSource, UrlsDuplicate, Source, StatusPatternMatching +from ..models import Urls, UrlContent, UrlsSourceSearch, UrlsDuplicate, StatusPatternMatching, Source, Search from django.db.models import Q -from .url_processor import process_url from django.core.cache import cache from django.db import IntegrityError -import hashlib +from .url_processor import process_url, get_with_protocol import re -import time import traceback from .logger import get_logger logger = get_logger() @@ -19,61 +17,32 @@ class DB_Handler(): # URL host slowdown self.url_host_slowdown_seconds = 5 - def _get_safe_cache_key(self, raw_key): - """Generate a safe cache key using an MD5 hash""" - return hashlib.md5(raw_key.encode()).hexdigest() - - def _cache_key(self, cache_key, hash_encode, cache_timeout): - if (hash_encode): - cache.set(self._get_safe_cache_key(cache_key), True, timeout=cache_timeout) - else: - cache.set(cache_key, True, timeout=cache_timeout) - - def _is_cached_key(self, cache_key, hash_encoded): - # Returns True if cached - if (hash_encoded): - return cache.get(self._get_safe_cache_key(cache_key)) is not None - else: - return cache.get(cache_key) is not None - - def _clean_protocol(self, url): - # http:// -> https:// - url = url.replace("http://", "https://") - # "" -> https:// - if not (url.startswith("https://")): - url = "https://" + url - return url - - def insert_raw_urls(self, urls, source): + def insert_raw_urls(self, urls, obj_source, obj_search): try: logger.debug("Inserting raw URLs") # Empty? if (len(urls) == 0): - logger.debug("Empty batch of urls (not writing to DB) for source: {}".format(source)) + logger.debug("Empty batch of urls (not writing to DB) for source-search: {} - {}".format(obj_source.source, obj_search.search)) return - # Default protocol https:// - urls_clean = [self._clean_protocol(url) for url in urls] - - # Get the source (create if not exists) - source_obj, created = Source.objects.get_or_create(source=source) + urls_clean = [get_with_protocol(url) for url in urls] urls_to_insert = [] # Per URL for url in urls_clean: ### Already processed URL? 
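+                # Two-level cache: "insert_{url}" marks a URL that was inserted recently at all,
+                # while "insert_{url}{source}{search}" marks this exact (URL, source, search)
+                # combination, so a known URL surfaced by a new source or search still gets its
+                # URLS_SOURCE_SEARCH link created below.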
- if (self._is_cached_key(url, hash_encoded=True)): + if (cache.get("insert_{}".format(url)) is not None): logger.debug("Already cached URL: {}".format(url)) - if (self._is_cached_key("{}{}".format(source, url), hash_encoded=True)): - logger.debug("Already cached (source, URL): {} {}".format(source, url)) + if (cache.get("insert_{}{}{}".format(url, obj_source.source, obj_search.search)) is not None): + logger.debug("Already cached (URL, source, search): {} {} {}".format(url, obj_source.source, obj_search.search)) else: - ### Insert (URL_id, source_id), since not cached + ### Insert (URL_id, source_id, search_id), since not cached # Get URL ID (should already be created) - url_obj, created = Urls.objects.get_or_create(url=url) + obj_url, created = Urls.objects.get_or_create(url=url) # Create (id_source, id_url) (shouldn't exist) - UrlsSource.objects.get_or_create(id_source=source_obj, id_url=url_obj) + UrlsSourceSearch.objects.get_or_create(id_url=obj_url, id_source=obj_source, id_search=obj_search) else: # Add object to insert # url_object_to_insert.append(Urls(url=url)) @@ -85,16 +54,20 @@ class DB_Handler(): # URLs (ignore_conflicts=False to return IDs) bulk_created_urls = Urls.objects.bulk_create([Urls(url=url) for url in urls_to_insert], ignore_conflicts=False) # (URL_id, source_id) - UrlsSource.objects.bulk_create([UrlsSource(id_source=source_obj, id_url=url_obj) for url_obj in bulk_created_urls], ignore_conflicts=True) + UrlsSourceSearch.objects.bulk_create([UrlsSourceSearch(id_url=obj_url, id_source=obj_source, id_search=obj_search) for obj_url in bulk_created_urls], ignore_conflicts=True) except IntegrityError as e: ### Fallback to one-by-one insert logger.debug("bulk_create exception while inserting raw URLs (fails if duplicated URL), falling back to non-bulk method") # One by one for url in urls_to_insert: # URL - url_obj, created = Urls.objects.get_or_create(url=url) - # (URL, source) - UrlsSource.objects.get_or_create(id_source=source_obj, id_url=url_obj) + obj_url, created = Urls.objects.get_or_create(url=url) + if (created): + logger.info("CREATED: {}".format(obj_url.url)) + else: + logger.info("NOT CREATED: {}".format(obj_url.url)) + # (URL, source, search) + UrlsSourceSearch.objects.get_or_create(id_url=obj_url, id_source=obj_source, id_search=obj_search) except Exception as e: logger.warning("bulk_create unknown exception while inserting raw URLs: {}\n{}".format(e, traceback.format_exc())) # Avoid caching due to error on insertion @@ -102,37 +75,14 @@ class DB_Handler(): # Insert or update cache for url in urls_clean: - # Hash encode URLs for special characters - self._cache_key(url, hash_encode=True, cache_timeout=self._cache_timeout_insert_url) - self._cache_key("{}{}".format(source, url), hash_encode=True, cache_timeout=self._cache_timeout_insert_url) + cache.set("insert_{}".format(url), True, timeout=self._cache_timeout_insert_url) + cache.set("insert_{}{}{}".format(url, obj_source.source, obj_search.search), True, timeout=self._cache_timeout_insert_url) logger.info("Inserted #{} raw URLs".format(len(urls_to_insert))) except Exception as e: logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc())) - def _get_url_host(self, url): - # URL no protocol, first substring before '/' - url_host = url.replace("https://", "").replace("http://", "").split("/")[0] - return url_host - - def _url_host_slowdown(self, url, url_host_slowdown_seconds): - ### Avoid (frequent) too many requests to the same URL host - # Get URL host - url_host = 
self._get_url_host(url) - # Recently processed URL host? -> Slow down required - last_cached_timestamp = cache.get("processed_{}".format(url_host), None) - if last_cached_timestamp: - # Get time since last processed URL host (in seconds) - time_since_last_processed = time.time() - last_cached_timestamp - # Amount of time required to sleep? - slowdown_required = max(0, url_host_slowdown_seconds - time_since_last_processed) - logger.debug("Slow down (sleeping {:.2f}) for URL host {}".format(slowdown_required, url_host)) - # Sleep - time.sleep(slowdown_required) - # About to process URL host, cache time - cache.set("processed_{}".format(url_host), time.time(), timeout=60*5) # Expire after 5 minutes - def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error): def set_status(obj_url, status): @@ -158,8 +108,6 @@ class DB_Handler(): ##### Process URL try: - # Slow down if required to avoid too many requests error - self._url_host_slowdown(obj_url.url, self.url_host_slowdown_seconds) # Get data dict_url_data = process_url(obj_url.url) # Not none or handle as exception @@ -190,17 +138,17 @@ class DB_Handler(): # Get or create URL with canonical form obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical")) - # Get the sources id associated to obj_url.id - url_sources = UrlsSource.objects.filter(id_url=obj_url) - for url_source_obj in url_sources: + # Get the source-search IDs associated to obj_url.id + list_url_source_search = UrlsSourceSearch.objects.fiter(id_url=obj_url) + for obj_url_source_search in list_url_source_search: # Associate same sources to url_canonical (it might already exist) - obj_urls_source, created = UrlsSource.objects.get_or_create(id_source=url_source_obj.id_source, id_url=obj_url_canonical) + UrlsSourceSearch.objects.get_or_create(id_url=obj_url_canonical, id_source=obj_url_source_search.id_source, id_search=obj_url_source_search.id_search) # URLs duplciate association - obj_urls_duplicate, created = UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url) + UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url) # TODO: return obj_url_canonical so as to directly process the recently inserted URL - # Whever this function is called, add: + # Wherever this function is called, add: # self._process_single_url(obj_url_canonical, status_pattern_match, raise_exception_on_error) # Next URL @@ -281,7 +229,7 @@ class DB_Handler(): # Per URL for obj_url in error_urls: # URL ID cached? 
-> Tried to process recently already, skip - if (self._is_cached_key("error_{}".format(obj_url.id), hash_encoded=False)): + if (cache.get("error_{}".format(obj_url.id)) is not None): logger.debug("Already cached URL ID: {}".format(obj_url.id)) num_urls_skipped += 1 continue @@ -292,7 +240,7 @@ class DB_Handler(): num_urls_processed += 1 except Exception as e: # Error, cache to avoid re-processing for X time - self._cache_key("error_{}".format(obj_url.id), hash_encode=False, cache_timeout=self._cache_timeout_error_url) + cache.set("error_{}".format(obj_url.id), True, timeout=self._cache_timeout_insert_url) num_urls_skipped += 1 # Get following batch of URLs, status='error' diff --git a/app_urls/api/src/fetch_feed.py b/app_urls/api/src/fetch_feed.py index 8d7389b..bc2f809 100644 --- a/app_urls/api/src/fetch_feed.py +++ b/app_urls/api/src/fetch_feed.py @@ -1,5 +1,5 @@ from .db_utils import DB_Handler -from ..models import Search +from ..models import Search, Source import feedparser import dateutil import traceback @@ -14,16 +14,19 @@ class FetchFeeds(): try: logger.debug("Starting FetchFeeds.run()") - # Get feeds - list_url_feeds = list(Search.objects.filter(type=Search.TYPE_ENUM.RSS_FEED).values_list('search', flat=True)) - logger.debug("Fetching from feeds: {}".format(list_url_feeds)) + # Get source object + obj_source, created = Source.objects.get_or_create(source="feeds") + + # Get feeds objects + list_obj_search_feeds = Search.objects.filter(type=Search.TYPE_ENUM.RSS_FEED) + logger.debug("Fetching from feeds: {}".format([e.search for e in list_obj_search_feeds])) # Process via RSS feeds - for url_feed in list_url_feeds: + for obj_search in list_obj_search_feeds: # Initialize urls_fetched, urls_publish_date = [], [] # Fetch feeds - feeds = feedparser.parse(url_feed) + feeds = feedparser.parse(obj_search.search) # Parse for f in feeds.get("entries", []): # Get URL @@ -41,10 +44,8 @@ class FetchFeeds(): urls_publish_date.append(publish_date_parsed) # URL urls_fetched.append(url) - - # URL fetching source - source = "feed {}".format(url_feed) + # Write to DB - DB_Handler().insert_raw_urls(urls_fetched, source) + DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search) except Exception as e: logger.warning("Exception in FetchFeeds.run(): {}\n{}".format(e, traceback.format_exc())) diff --git a/app_urls/api/src/fetch_parser.py b/app_urls/api/src/fetch_parser.py index cea8580..04398e2 100644 --- a/app_urls/api/src/fetch_parser.py +++ b/app_urls/api/src/fetch_parser.py @@ -1,5 +1,6 @@ from .db_utils import DB_Handler -from ..models import Search +from ..models import Search, Source +from .url_processor import get_with_protocol, url_host_slowdown import newspaper import traceback from .logger import get_logger @@ -13,27 +14,26 @@ class FetchParser(): try: logger.debug("Starting FetchParser.run() for {}") + # Get source object + obj_source, created = Source.objects.get_or_create(source="newspaper4k") # Get URL hosts - list_url_host = list(Search.objects.filter(type=Search.TYPE_ENUM.URL_HOST).values_list('search', flat=True)) - logger.debug("Fetching news by parsing URL hosts: {}".format(list_url_host)) + list_url_host = Search.objects.filter(type=Search.TYPE_ENUM.URL_HOST) + logger.debug("Fetching news by parsing URL hosts: {}".format([e.search for e in list_url_host])) # Process newspaper4k build method - for url_host_feed in list_url_host: + for obj_search in list_url_host: # Protocol - if not (url_host_feed.startswith("http")): - url_host_feed_formatted = "https://" + url_host_feed - 
else: - url_host_feed_formatted = url_host_feed - - logger.debug("Fetching newspaper4k parsing based on URL: {}".format(url_host_feed_formatted)) + url_host_protocol = get_with_protocol(obj_search.search) + logger.debug("Fetching newspaper4k parsing based on URL: {}".format(url_host_protocol)) + + # Make sure no requests made for the last X seconds + url_host_slowdown(url_host_protocol, url_host_slowdown_seconds=5) # Source object - url_host_built = newspaper.build(url_host_feed_formatted) + url_host_built = newspaper.build(url_host_protocol) # Get articles URL list urls_fetched = url_host_built.article_urls() - # URL fetching source - source = "newspaper4k {}".format(url_host_feed) # Write to DB - DB_Handler().insert_raw_urls(urls_fetched, source) + DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search) except Exception as e: logger.warning("Exception in FetchParser.run(): {}\n{}".format(e, traceback.format_exc())) diff --git a/app_urls/api/src/fetch_search.py b/app_urls/api/src/fetch_search.py index 2b9949f..1554c0d 100644 --- a/app_urls/api/src/fetch_search.py +++ b/app_urls/api/src/fetch_search.py @@ -1,5 +1,6 @@ from .db_utils import DB_Handler -from ..models import Search +from ..models import Search, Source +from django.db.models import Q import traceback import time from .fetch_search_utils import search_gnews, search_ddg, search_googlenews_general, search_googlenews_news @@ -10,54 +11,59 @@ class FetchSearcher(): def __init__(self) -> None: logger.debug("Initializing Fetcher Searcher") + def _get_source_object(self, source): + # TODO: Cache + # self.cached_sources = {} + # Get source object + obj_source, created = Source.objects.get_or_create(source=source) + return obj_source + def run(self): try: logger.debug("Starting FetchSearcher.run()") - - # Get keyword searches of interest - list_keyword_search = list(Search.objects.filter(type=Search.TYPE_ENUM.KEYWORD_SEARCH).values_list('search', flat=True)) - # Get URL host of interest - list_url_host = list(Search.objects.filter(type=Search.TYPE_ENUM.URL_HOST).values_list('search', flat=True)) - - # TODO: allintitle: "child abuse" - # TODO: intitle: "child abuse" - # list_keyword_search + ['allintitle: "{}"'.format(s) for s in list_keyword_search] + ['intitle: "{}"'.format(s) for s in list_keyword_search] - # Merge searches - list_search = list_keyword_search + ["site:{}".format(u) for u in list_url_host] - logger.debug("Fetching from keyword search: {}".format(list_search)) + + # Get search objects of interest + list_search_obj = Search.objects.filter(Q(type=Search.TYPE_ENUM.URL_HOST) | Q(type=Search.TYPE_ENUM.KEYWORD_SEARCH)) + logger.debug("Fetching from search: {}".format(["{} ({})".format(e.search, e.type) for e in list_search_obj])) # Search - for keyword_search in list_search: + for obj_search in list_search_obj: # TODO: language & country customization + # TODO: allintitle: "child abuse" + # TODO: intitle: "child abuse" + + # Search + keyword_search = "{}{}".format("site:" if obj_search.type is Search.TYPE_ENUM.URL_HOST else "", obj_search.search) # DDG News time.sleep(5) raw_urls, source = search_ddg(keyword_search, category="news", timelimit="d", max_results=None, region = "wt-wt") # Write to DB - DB_Handler().insert_raw_urls(raw_urls, source) + DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search) # GNews time.sleep(5) raw_urls, source = search_gnews(keyword_search, language="en", country="US") # Write to DB - DB_Handler().insert_raw_urls(raw_urls, source) + 
DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search) # DDG Text time.sleep(5) raw_urls, source = search_ddg(keyword_search, category="text", timelimit="d", max_results=None, region = "wt-wt") # Write to DB - DB_Handler().insert_raw_urls(raw_urls, source) + DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search) # GoogleNews news time.sleep(5) raw_urls, source = search_googlenews_news(keyword_search, period="1d", language="en", country="US") # Write to DB - DB_Handler().insert_raw_urls(raw_urls, source) + DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search) + # GoogleNews general time.sleep(5) raw_urls, source = search_googlenews_general(keyword_search, period="1d", language="en", country="US", max_pages=5) # Write to DB - DB_Handler().insert_raw_urls(raw_urls, source) + DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search) # TODO: # SearxNG diff --git a/app_urls/api/src/fetch_search_utils.py b/app_urls/api/src/fetch_search_utils.py index 4bcc6af..e6cea03 100644 --- a/app_urls/api/src/fetch_search_utils.py +++ b/app_urls/api/src/fetch_search_utils.py @@ -1,3 +1,4 @@ +from django.core.cache import cache import traceback import random import time @@ -10,20 +11,31 @@ from duckduckgo_search import DDGS from GoogleNews import GoogleNews ########################################################################### -def decode_gnews_urls(encoded_urls): +def decode_gnews_urls(encoded_urls, interval=2): # DecodeURLs list_decoded_urls = [] for url in encoded_urls: - try: - # Decode URL, with interval time to avoid block - decoded_url = gnewsdecoder(url, interval=5) - # Ok? - if decoded_url.get("status"): - list_decoded_urls.append(decoded_url["decoded_url"]) - else: - logger.warning("Error decoding news.google.com, URL {}\nMessage: {}".format(url, decoded_url["message"])) - except Exception as e: - logger.warning("Error decoding news.google.com, URL: {}\n{}".format(url, traceback.format_exc())) + # Already cached? + decoded_url = cache.get("gnews_decode_{}".format(url)) + if (decoded_url is not None): + logger.debug("Already cached decoded URL: {} -> {}".format(url, decoded_url)) + # Append decoded URL + list_decoded_urls.append(decoded_url) + else: + try: + # Decode URL, with interval time to avoid block + decoded_url_dict = gnewsdecoder(url, interval=interval) + # Ok? 
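+                # gnewsdecoder() returns a dict: a truthy "status" means success and the real
+                # article URL sits under "decoded_url"; that value is cached for 12 hours below
+                # so repeated runs skip the slow decode step.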
+ if decoded_url_dict.get("status"): + # Append decoded URL + decoded_url = decoded_url_dict["decoded_url"] + list_decoded_urls.append(decoded_url) + # Cache decoded URL + cache.set("gnews_decode_{}".format(url), decoded_url, timeout=60*60*12) + else: + logger.warning("Error decoding news.google.com, URL {}\nMessage: {}".format(url, decoded_url["message"])) + except Exception as e: + logger.warning("Error decoding news.google.com, URL: {}\n{}".format(url, traceback.format_exc())) return list_decoded_urls ########################################################################### @@ -33,13 +45,18 @@ def search_gnews(keyword_search, period="1d", language="en", country="US", max_r source = "gnews {} {} {}-{} max_results={}".format("news", period, language, country, max_results).replace("None", "").strip() logger.debug("Searching: {} --- Source:{}".format(keyword_search, source)) - # Get news - results_gnews = GNews(language=language, country=country).get_news(keyword_search) - # Get list of encoded urls - encoded_urls = [e.get("url") for e in results_gnews] - # Decode - list_decoded_urls = decode_gnews_urls(encoded_urls) - return list_decoded_urls, source + try: + # Get news + results_gnews = GNews(language=language, country=country).get_news(keyword_search) + # Get list of encoded urls + encoded_urls = [e.get("url") for e in results_gnews] + # Decode + logger.debug("Decoding gnews URLs") + urls = decode_gnews_urls(encoded_urls) + except Exception as e: + logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc())) + urls = [] + return urls, source ########################################################################### @@ -51,14 +68,18 @@ def search_ddg(keyword_search, category="news", timelimit="d", max_results=None, # region="{}-{}".format(langauge, country.lower()) # timelimit= # Options: d, w, m # max_results # max number of results. If None, returns results only from the first response. 
Defaults to None - - if (category == "news"): - news = DDGS().news(keyword_search, region=region, timelimit=timelimit, max_results=max_results) - urls = [e.get("url") for e in news] - if (category == "text"): - news = DDGS().text(keyword_search, region=region, timelimit=timelimit, max_results=max_results) - urls = [e.get("href") for e in news] + try: + if (category == "news"): + news = DDGS().news(keyword_search, region=region, timelimit=timelimit, max_results=max_results) + urls = [e.get("url") for e in news] + if (category == "text"): + news = DDGS().text(keyword_search, region=region, timelimit=timelimit, max_results=max_results) + urls = [e.get("href") for e in news] + except Exception as e: + logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc())) + urls = [] + return urls, source ########################################################################### @@ -78,6 +99,7 @@ def search_googlenews_news(keyword_search, period="1d", language="en", country=" # Fetch encoded_urls = googlenews.get_links() # Decode + logger.debug("Decoding gnews URLs") urls = decode_gnews_urls(encoded_urls) except Exception as e: logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc())) diff --git a/app_urls/api/src/url_processor.py b/app_urls/api/src/url_processor.py index 6f1d6c0..a925dde 100644 --- a/app_urls/api/src/url_processor.py +++ b/app_urls/api/src/url_processor.py @@ -2,14 +2,46 @@ from django.core.cache import cache from .logger import get_logger logger = get_logger() import newspaper +import time from urllib.parse import unquote # pip install langdetect #import langdetect #langdetect.DetectorFactory.seed = 0 +def get_with_protocol(url): + # http:// -> https:// + url = url.replace("http://", "https://") + # "" -> https:// + if not (url.startswith("https://")): + url = "https://" + url + return url + +def get_url_host(url): + # URL no protocol, first substring before '/' + url_host = url.replace("https://", "").replace("http://", "").split("/")[0] + return url_host + +def url_host_slowdown(url, url_host_slowdown_seconds): + ### Avoid (frequent) too many requests to the same URL host + # Get URL host + url_host = get_url_host(url) + # Recently processed URL host? -> Slow down required + last_cached_timestamp = cache.get("process_{}".format(url_host).encode("utf-8"), None) + if last_cached_timestamp: + # Get time since last processed URL host (in seconds) + time_since_last_processed = time.time() - last_cached_timestamp + # Amount of time required to sleep? 
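+        # Example (hypothetical numbers): with url_host_slowdown_seconds=5 and a host last
+        # hit 3.2 s ago, time_since_last_processed == 3.2, so we sleep max(0, 5 - 3.2) == 1.8 s.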
+ slowdown_required = max(0, url_host_slowdown_seconds - time_since_last_processed) + logger.debug("Slow down (sleeping {:.2f}) for URL host {}".format(slowdown_required, url_host)) + # Sleep + time.sleep(slowdown_required) + # About to process URL host, cache time + cache.set("process_{}".format(url_host).encode("utf-8"), time.time(), timeout=60*5) # Expire after 5 minutes def process_url(url): try: + # Slow down if required to avoid too many requests error + url_host_slowdown(url, url_host_slowdown_seconds=2) # Process article = newspaper.article(url) except newspaper.ArticleBinaryDataException: diff --git a/app_urls/api/tasks.py b/app_urls/api/tasks.py index 0ccea1c..3e4d163 100644 --- a/app_urls/api/tasks.py +++ b/app_urls/api/tasks.py @@ -13,6 +13,11 @@ from src.missing_kids_status import MissingKidsStatus from .src.logger import get_logger logger = get_logger() +@job +def fetch_feeds(): + logger.info("Task triggered: {}".format("FetchFeeds")) + FetchFeeds().run() + @job def background_task(process_type: str): logger.info("Task triggered: {}".format(process_type)) diff --git a/app_urls/core/settings.py b/app_urls/core/settings.py index b94ad48..25720d3 100644 --- a/app_urls/core/settings.py +++ b/app_urls/core/settings.py @@ -21,7 +21,7 @@ BASE_DIR = Path(__file__).resolve().parent.parent # See https://docs.djangoproject.com/en/5.1/howto/deployment/checklist/ # SECURITY WARNING: keep the secret key used in production secret! -SECRET_KEY = 'django-insecure-kc0jj#_=7i$_79p(n5)p3taxvhnq=w*ori-%%iu_a6wye@$(*n' +SECRET_KEY = 'django-insecure-54mqLbW5NlO8OlVDsT3fcbg3Vf6C8Fgcoj8H0hXv3Pr8bpgqvOuiaeqvGn34sGwt' # SECURITY WARNING: don't run with debug turned on in production! DEBUG = True @@ -38,7 +38,6 @@ INSTALLED_APPS = [ 'django.contrib.sessions', 'django.contrib.messages', 'django.contrib.staticfiles', - # 'rest_framework', 'django_rq', 'api', ] @@ -93,11 +92,16 @@ DATABASES = { CACHES = { "default": { - "BACKEND": "django.core.cache.backends.redis.RedisCache", + #"BACKEND": "django.core.cache.backends.redis.RedisCache", + "BACKEND": "django_redis.cache.RedisCache", "LOCATION": "redis://{}:{}".format( - os.environ.get("REDIS_HOST", "localhost"), - os.environ.get("REDIS_PORT", 6379) - ), + os.environ.get("REDIS_HOST", "localhost"), + os.environ.get("REDIS_PORT", 6379) + ), + "OPTIONS": { + "MEMCACHE_MAX_KEY_LENGTH": 2048, + "CLIENT_CLASS": "django_redis.client.DefaultClient", + }, } } @@ -107,6 +111,7 @@ RQ_QUEUES = { 'PORT': os.environ.get("REDIS_PORT", 6379), 'DB': os.environ.get("REDIS_DB", 0), 'DEFAULT_TIMEOUT': os.environ.get("RQ_DEFAULT_TIMEOUT", 900), + 'DEFAULT_RESULT_TTL': os.environ.get("RQ_DEFAULT_RESULT_TTL", 3600), } }
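
For reference, a minimal sketch of how the new `fetch_feeds` task could be enqueued onto the queues configured in `RQ_QUEUES` (run from `python manage.py shell`, with a worker started as in the README, e.g. `python manage.py rqworker high default low`; the `api.tasks` import path and the `default` queue name are assumptions based on this patch):

```
# Minimal sketch (assumptions: the Django settings above are loaded and a worker is running).
import django_rq
from api.tasks import fetch_feeds

# The django-rq @job decorator adds .delay(), which enqueues on its queue ("default" by default).
job_a = fetch_feeds.delay()

# Equivalent explicit form: pick a queue from RQ_QUEUES and enqueue with a per-job timeout.
queue = django_rq.get_queue("default")
job_b = queue.enqueue(fetch_feeds, job_timeout=900)

# Job results/metadata are kept for RQ_DEFAULT_RESULT_TTL seconds (3600 by default here).
print(job_a.id, job_b.id)
```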