diff --git a/1-DB.ipynb b/1-DB.ipynb
index 4fcb228..0430949 100644
--- a/1-DB.ipynb
+++ b/1-DB.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -11,27 +11,74 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "db_postgres\n",
+      "db_redis\n",
+      "[+] Running 3/3\n",
+      " ✔ Container db_redis     Started   0.5s\n",
+      " ✔ Container db_postgres  Started   0.5s\n",
+      " ✔ Container adminer      Running   0.0s\n"
+     ]
+    }
+   ],
    "source": [
     "!docker rm -f db_postgres db_redis; docker compose -f docker/docker-compose.yml up -d ; sleep 5"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "UndefinedTable",
+     "evalue": "relation \"urls_source\" does not exist",
+     "output_type": "error",
+     "traceback": [
+      "UndefinedTable                            Traceback (most recent call last)",
+      "Cell In[3], line 19 (ANSI-coloured listing of the schema SQL elided; it matches the cell source below)",
+      "---> 19 c = cur.execute(\"\"\" ... CREATE INDEX idx_source ON urls_source(id_source); ... \"\"\")",
+      "File ~/anaconda3/envs/matitos/lib/python3.12/site-packages/psycopg/cursor.py:97, in Cursor.execute(self, query, params, prepare, binary)",
+      "---> 97 raise ex.with_traceback(None)",
+      "UndefinedTable: relation \"urls_source\" does not exist"
+     ]
+    }
+   ],
    "source": [
     "INSERT_TABLES = True\n",
-    "INSERT_SAMPLE_DATA = True\n",
+    "INSERT_SAMPLE_DATA = False\n",
     "\n",
     "import psycopg\n",
     "connection_info = \"host={} port={} user={} password={} dbname={}\".format(\"localhost\", \"5432\", \"supermatitos\", \"supermatitos\", \"matitos\")\n",
     "\n",
     "from datetime import datetime, timezone\n",
     "import re\n",
+    "from pprint import pprint\n",
     "\n",
     "if INSERT_TABLES:\n",
     "    # Connect to an existing database\n",
@@ -60,27 +107,24 @@
     "        id_url_duplicated INTEGER REFERENCES URLS(id),\n",
     "        PRIMARY KEY (id_url_canonical, id_url_duplicated)\n",
     "    );\n",
-    "\n",
-    "    CREATE TABLE FEED (\n",
-    "        id SMALLSERIAL PRIMARY KEY,\n",
-    "        rss_feed TEXT NOT NULL UNIQUE\n",
-    "    );\n",
-    "    CREATE TABLE WEBSITE_OF_INTEREST (\n",
-    "        id SMALLSERIAL PRIMARY KEY,\n",
-    "        url_host TEXT NOT NULL UNIQUE\n",
-    "    );\n",
+    "    \n",
+    "    CREATE TYPE SEARCH_TYPE AS ENUM ('rss_feed', 'keyword_search', 'url_host');\n",
     "    CREATE TABLE SEARCH (\n",
     "        id SMALLSERIAL PRIMARY KEY,\n",
-    "        keyword_search TEXT NOT NULL UNIQUE\n",
+    "        search TEXT NOT NULL UNIQUE,\n",
+    "        type SEARCH_TYPE NOT NULL\n",
     "    );\n",
+    "    CREATE INDEX idx_search_type ON SEARCH(type);\n",
+    "    \n",
     "    CREATE TABLE SOURCE (\n",
     "        id SMALLSERIAL PRIMARY KEY,\n",
     "        source TEXT NOT NULL UNIQUE\n",
     "    );\n",
-    "\n",
-    "    CREATE TABLE URLS_SOURCE (\n",
+    "    \n",
+    "    CREATE TABLE URLS_SOURCE_SEARCH (\n",
     "        id_url INTEGER REFERENCES URLS(id),\n",
-    "        id_source SMALLINT REFERENCES SOURCE(id) ON UPDATE CASCADE ON DELETE RESTRICT, -- Source encodes search information\n",
+    "        id_source SMALLINT REFERENCES SOURCE(id) ON UPDATE CASCADE ON DELETE RESTRICT,\n",
+    "        id_search SMALLINT REFERENCES SEARCH(id) ON UPDATE CASCADE ON DELETE RESTRICT,\n",
     "        PRIMARY KEY(id_url, id_source)\n",
     "    );\n",
     "    CREATE INDEX idx_source ON urls_source(id_source);\n",
@@ -116,14 +160,14 @@
     "    CREATE INDEX idx_language ON URL_CONTENT (language);\n",
     "    CREATE INDEX idx_url_host ON URL_CONTENT (url_host);\n",
     "    \"\"\")\n",
-    "\n",
+    "    \n",
     "    # Feeds\n",
-    "    cur.execute( \"INSERT INTO FEED (rss_feed) VALUES 
('https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC');\" )\n", + " cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC', 'rss_feed');\" )\n", " # Websites of interest\n", - " cur.execute( \"INSERT INTO WEBSITE_OF_INTEREST (url_host) VALUES ('www.unicef.org');\" )\n", - " cur.execute( \"INSERT INTO WEBSITE_OF_INTEREST (url_host) VALUES ('www.breitbart.com/');\" )\n", + " cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('www.unicef.org', 'url_host');\" )\n", + " cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('www.breitbart.com', 'url_host');\" )\n", " # Search keywords\n", - " cur.execute( \"INSERT INTO SEARCH (keyword_search) VALUES ('child abuse');\" )\n", + " cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search');\" )\n", " \n", " # Status update based on pattern matching (with priority to apply in order). Regex test https://regex101.com/\n", " # cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 75, 'valid');\".format(\".*{}.*\".format(re.escape(\"missingkids.org/poster/\"))) )\n", @@ -190,14 +234,39 @@ " (1, datetime.now(tz=timezone.utc), content, language, [\"child abuse\", \"social media\"], [\"Audrey Conklin\"], [\"https://a57.foxnews.com/static.foxnews.com/foxnews.com/content/uploads/2023/08/1440/810/image-58.jpg?ve=1&tl=1\"]))" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Connect to an existing database\n", + "with psycopg.connect(connection_info) as conn:\n", + " # Open a cursor to perform database operations\n", + " with conn.cursor() as cur:\n", + " pprint( cur.execute(\"SELECT * FROM SEARCH;\").fetchall() )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "from pprint import pprint\n", - "\n", "# Connect to an existing database\n", "with psycopg.connect(connection_info) as conn:\n", " # Open a cursor to perform database operations\n", @@ -224,8 +293,6 @@ "metadata": {}, "outputs": [], "source": [ - "from pprint import pprint\n", - "\n", "# Connect to an existing database\n", "with psycopg.connect(connection_info) as conn:\n", " # Open a cursor to perform database operations\n", diff --git a/A_Development.ipynb b/A_Development.ipynb index e651f2c..fdf01d8 100644 --- a/A_Development.ipynb +++ b/A_Development.ipynb @@ -2,15 +2,80 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ - "import newspaper\n", - "url = \"http://www.missingkids.org/poster/NCMC/2045193/1\"\n", - "#url = \"https://www.missingkids.org/new-poster/NCMC/2045193/1\"\n", + "# !pip install git+https://github.com/tasos-py/Search-Engines-Scraper.git\n", + "import search_engines\n", "\n", - "art = newspaper.article(url)" + "engine = search_engines.Bing()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Searching Bing \n", + " \r" + ] + } + ], + "source": [ + "results = engine.search('news: \"child abuse\"', pages=2)" + ] + }, + { + 
"cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "engine = search_engines.search_engines_dict[\"brave\"]()" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Searching Brave \n", + " \r" + ] + } + ], + "source": [ + "query = 'news: child abuse'\n", + "r = engine.search(query, pages=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'_results': []}" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "r.__dict__" ] }, { @@ -18,8 +83,57 @@ "execution_count": null, "metadata": {}, "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "newspaper.exceptions.ArticleBinaryDataException" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "art.__dict__" + "import newspaper\n", + "newspaper.ArticleBinaryDataException" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "import newspaper\n", + "\n", + "url = 'https://www.missingkids.org/poster/USVA/VA25-0820/1'\n", + "art_1 = newspaper.article(url)\n", + "url = 'https://www.missingkids.org/poster/NCMC/2045193/1'\n", + "art_2 = newspaper.article(url)\n", + "'''" ] }, { @@ -44,15 +158,8 @@ "l = client.list()\n", "list_models = [m.get(\"model\") for m in l.model_dump().get(\"models\")]\n", "\n", - "list_models" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + "print(list_models)\n", + "\n", "for m in list_models:\n", " context_key = [ k for k in client.show(m).model_dump().get(\"modelinfo\").keys() if \"context_length\" in k]\n", " if (len(context_key) != 1):\n", diff --git a/app_urls/README.md b/app_urls/README.md index eadabd0..477ee31 100644 --- a/app_urls/README.md +++ b/app_urls/README.md @@ -3,7 +3,7 @@ conda create -n matitos_urls python=3.12 conda activate matitos_urls pip install django psycopg[binary] django-rq -pip install feedparser python-dateutil newspaper4k lxml[html_clean] +pip install feedparser python-dateutil newspaper4k lxml[html_clean] googlenewsdecoder gnews duckduckgo_search GoogleNews ``` * From automated inspectdb @@ -11,38 +11,59 @@ pip install feedparser python-dateutil newspaper4k lxml[html_clean] # 1) Inspect DB, generate models.py python manage.py inspectdb -# 2) models.py, within class Urls, add: +# 2) Modify models.py +# URLS: +class Urls(models.Model): class STATUS_ENUM(models.TextChoices): - RAW = "raw" - ERROR = "error" - VALID = "valid" - UNKNOWN = "unknown" - INVALID = "invalid" - DUPLICATE = "duplicate" + RAW = "raw", "Raw" + ERROR = "error", "Error" + VALID = "valid", "Valid" + UNKNOWN = "unknown", "Unknown" + INVALID = "invalid", "Invalid" + DUPLICATE = "duplicate", "Duplicate" -# Update status - status = models.TextField(choices=STATUS_ENUM, default=STATUS_ENUM.RAW) # This field type is a guess. 
+ url = models.TextField(unique=True) + ts_fetch = models.DateTimeField(auto_now_add=True) + status = models.TextField(choices=STATUS_ENUM.choices, default=STATUS_ENUM.RAW) # This field type is a guess. -# To class Meta, add default ordering class Meta: managed = False - db_table = 'urls' # db_table = '{}_urls'.format(project_name) + db_table = 'urls' ordering = ["-ts_fetch"] -# Fields default: - ts_fetch = models.DateTimeField(auto_now_add=True) - status = models.TextField(default='raw') # This field type is a guess. +# SEARCH: +class Search(models.Model): + class TYPE_ENUM(models.TextChoices): + RSS_FEED = "rss_feed", "RSS_Feed" + KEYWORD_SEARCH = "keyword_search", "Keyword_Search" + URL_HOST = "url_host", "URL_Host" -# URLContent: -from django.contrib.postgres.fields import ArrayField + id = models.SmallAutoField(primary_key=True) + search = models.TextField(unique=True) + type = models.TextField(choices=TYPE_ENUM.choices) # This field type is a guess. +# URL_CONTENT: +class UrlContent(models.Model): + id_url = models.OneToOneField('Urls', models.DO_NOTHING, db_column='id_url', primary_key=True) + date_published = models.DateTimeField(blank=True, null=True) + title = models.TextField(blank=True, null=True) + description = models.TextField(blank=True, null=True) + content = models.TextField(blank=True, null=True) + valid_content = models.BooleanField(blank=True, null=True) + language = models.CharField(max_length=2, blank=True, null=True) keywords = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess. tags = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess. authors = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess. image_main_url = models.TextField(blank=True, null=True) images_url = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess. videos_url = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess. + url_host = models.TextField(blank=True, null=True) + site_name = models.TextField(blank=True, null=True) + +# TODO: Associate db_table name with a prefix on project_name +class Meta: + db_table = 'urls' # db_table = '{}_urls'.format(project_name) ``` * Environment variables @@ -55,6 +76,9 @@ DB_PORT=${DB_NAME:-5432} REDIS_HOST=${REDIS_HOST:-localhost} REDIS_PORT=${REDIS_PORT:-6379} + +# Default RQ queue timeout +RQ_DEFAULT_TIMEOUT=${REDIS_PORT:-900} ``` * Django DB diff --git a/app_urls/api/migrations/0002_delete_feed_delete_websiteofinterest_and_more.py b/app_urls/api/migrations/0002_delete_feed_delete_websiteofinterest_and_more.py new file mode 100644 index 0000000..1361219 --- /dev/null +++ b/app_urls/api/migrations/0002_delete_feed_delete_websiteofinterest_and_more.py @@ -0,0 +1,26 @@ +# Generated by Django 5.1.7 on 2025-03-19 09:06 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('api', '0001_initial'), + ] + + operations = [ + migrations.DeleteModel( + name='Feed', + ), + migrations.DeleteModel( + name='WebsiteOfInterest', + ), + migrations.DeleteModel( + name='WebsiteToFilter', + ), + migrations.AlterModelOptions( + name='urls', + options={'managed': False, 'ordering': ['-ts_fetch']}, + ), + ] diff --git a/app_urls/api/models.py b/app_urls/api/models.py index b51fcf9..8459cbc 100644 --- a/app_urls/api/models.py +++ b/app_urls/api/models.py @@ -2,18 +2,15 @@ from django.db import models from django.contrib.postgres.fields import ArrayField # Create your models here. 
-class Feed(models.Model): - id = models.SmallAutoField(primary_key=True) - rss_feed = models.TextField(unique=True) - - class Meta: - managed = False - db_table = 'feed' - - class Search(models.Model): + class TYPE_ENUM(models.TextChoices): + RSS_FEED = "rss_feed", "RSS_Feed" + KEYWORD_SEARCH = "keyword_search", "Keyword_Search" + URL_HOST = "url_host", "URL_Host" + id = models.SmallAutoField(primary_key=True) - keyword_search = models.TextField(unique=True) + search = models.TextField(unique=True) + type = models.TextField(choices=TYPE_ENUM.choices) # This field type is a guess. class Meta: managed = False @@ -77,6 +74,7 @@ class Urls(models.Model): class Meta: managed = False db_table = 'urls' + ordering = ["-ts_fetch"] class UrlsDuplicate(models.Model): @@ -96,13 +94,4 @@ class UrlsSource(models.Model): class Meta: managed = False db_table = 'urls_source' - unique_together = (('id_url', 'id_source'),) - - -class WebsiteOfInterest(models.Model): - id = models.SmallAutoField(primary_key=True) - url_host = models.TextField(unique=True) - - class Meta: - managed = False - db_table = 'website_of_interest' + unique_together = (('id_url', 'id_source'),) \ No newline at end of file diff --git a/app_urls/api/src/db_utils.py b/app_urls/api/src/db_utils.py index e0307a1..634d9c1 100644 --- a/app_urls/api/src/db_utils.py +++ b/app_urls/api/src/db_utils.py @@ -12,7 +12,6 @@ logger = get_logger() class DB_Handler(): def __init__(self): - logger.debug("Initializing URL DB Handler") # Inserting raw URL, cache time: 1 day self._cache_timeout_insert_url = 86400 # Processing error URL, cache time: 2 days @@ -37,16 +36,15 @@ class DB_Handler(): else: return cache.get(cache_key) is not None - def insert_raw_urls(self, urls, source): - - def clean_protocol(url): - # http:// -> https:// - url = url.replace("http://", "https://") - # "" -> https:// - if not (url.startswith("https://")): - url = "https://" + url - return url - + def _clean_protocol(self, url): + # http:// -> https:// + url = url.replace("http://", "https://") + # "" -> https:// + if not (url.startswith("https://")): + url = "https://" + url + return url + + def insert_raw_urls(self, urls, source): try: logger.debug("Inserting raw URLs") # Empty? @@ -55,7 +53,7 @@ class DB_Handler(): return # Default protocol https:// - urls_clean = [clean_protocol(url) for url in urls] + urls_clean = [self._clean_protocol(url) for url in urls] # Get the source (create if not exists) source_obj, created = Source.objects.get_or_create(source=source) @@ -90,7 +88,7 @@ class DB_Handler(): UrlsSource.objects.bulk_create([UrlsSource(id_source=source_obj, id_url=url_obj) for url_obj in bulk_created_urls], ignore_conflicts=True) except IntegrityError as e: ### Fallback to one-by-one insert - logger.debug("bulk_create exception while inserting raw URLs, falling back to non-bulk method") + logger.debug("bulk_create exception while inserting raw URLs (fails if duplicated URL), falling back to non-bulk method") # One by one for url in urls_to_insert: # URL @@ -177,9 +175,16 @@ class DB_Handler(): set_status(obj_url, Urls.STATUS_ENUM.ERROR) # Next URL return + + # Invalid? e.g. binary data + if (dict_url_data.get("override_status") == "invalid"): + # Update status + set_status(obj_url, Urls.STATUS_ENUM.INVALID) + # Next URL + return ##### Canonical URL different? 
-> Duplicate - if (dict_url_data.get("url") != dict_url_data.get("url_canonical")): + if (dict_url_data.get("url_canonical") is not None) and(dict_url_data.get("url") != dict_url_data.get("url_canonical")): # Update status set_status(obj_url, Urls.STATUS_ENUM.DUPLICATE) @@ -194,6 +199,10 @@ class DB_Handler(): # URLs duplciate association obj_urls_duplicate, created = UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url) + # TODO: return obj_url_canonical so as to directly process the recently inserted URL + # Whever this function is called, add: + # self._process_single_url(obj_url_canonical, status_pattern_match, raise_exception_on_error) + # Next URL return @@ -273,6 +282,7 @@ class DB_Handler(): for obj_url in error_urls: # URL ID cached? -> Tried to process recently already, skip if (self._is_cached_key("error_{}".format(obj_url.id), hash_encoded=False)): + logger.debug("Already cached URL ID: {}".format(obj_url.id)) num_urls_skipped += 1 continue @@ -299,7 +309,7 @@ class DB_Handler(): missingkids_urls = Urls.objects.order_by("-ts_fetch").filter( (Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster")) & - (Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID)) + (Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.ERROR)) )[:batch_size] # Per URL diff --git a/app_urls/api/src/fetch_feed.py b/app_urls/api/src/fetch_feed.py index 54d99df..8d7389b 100644 --- a/app_urls/api/src/fetch_feed.py +++ b/app_urls/api/src/fetch_feed.py @@ -1,5 +1,5 @@ from .db_utils import DB_Handler -from ..models import Feed +from ..models import Search import feedparser import dateutil import traceback @@ -15,7 +15,7 @@ class FetchFeeds(): logger.debug("Starting FetchFeeds.run()") # Get feeds - list_url_feeds = list(Feed.objects.values_list('rss_feed', flat=True)) + list_url_feeds = list(Search.objects.filter(type=Search.TYPE_ENUM.RSS_FEED).values_list('search', flat=True)) logger.debug("Fetching from feeds: {}".format(list_url_feeds)) # Process via RSS feeds diff --git a/app_urls/api/src/fetch_parser.py b/app_urls/api/src/fetch_parser.py index 73116e4..cea8580 100644 --- a/app_urls/api/src/fetch_parser.py +++ b/app_urls/api/src/fetch_parser.py @@ -1,5 +1,5 @@ from .db_utils import DB_Handler -from ..models import WebsiteOfInterest +from ..models import Search import newspaper import traceback from .logger import get_logger @@ -14,7 +14,7 @@ class FetchParser(): logger.debug("Starting FetchParser.run() for {}") # Get URL hosts - list_url_host = list(WebsiteOfInterest.objects.values_list('url_host', flat=True)) + list_url_host = list(Search.objects.filter(type=Search.TYPE_ENUM.URL_HOST).values_list('search', flat=True)) logger.debug("Fetching news by parsing URL hosts: {}".format(list_url_host)) # Process newspaper4k build method diff --git a/app_urls/api/src/fetch_search.py b/app_urls/api/src/fetch_search.py new file mode 100644 index 0000000..2b9949f --- /dev/null +++ b/app_urls/api/src/fetch_search.py @@ -0,0 +1,75 @@ +from .db_utils import DB_Handler +from ..models import Search +import traceback +import time +from .fetch_search_utils import search_gnews, search_ddg, search_googlenews_general, search_googlenews_news +from .logger import get_logger +logger = get_logger() + +class FetchSearcher(): + def __init__(self) -> None: + logger.debug("Initializing Fetcher Searcher") + + def run(self): + try: + logger.debug("Starting FetchSearcher.run()") + + # Get keyword 
searches of interest + list_keyword_search = list(Search.objects.filter(type=Search.TYPE_ENUM.KEYWORD_SEARCH).values_list('search', flat=True)) + # Get URL host of interest + list_url_host = list(Search.objects.filter(type=Search.TYPE_ENUM.URL_HOST).values_list('search', flat=True)) + + # TODO: allintitle: "child abuse" + # TODO: intitle: "child abuse" + # list_keyword_search + ['allintitle: "{}"'.format(s) for s in list_keyword_search] + ['intitle: "{}"'.format(s) for s in list_keyword_search] + # Merge searches + list_search = list_keyword_search + ["site:{}".format(u) for u in list_url_host] + logger.debug("Fetching from keyword search: {}".format(list_search)) + + # Search + for keyword_search in list_search: + # TODO: language & country customization + + # DDG News + time.sleep(5) + raw_urls, source = search_ddg(keyword_search, category="news", timelimit="d", max_results=None, region = "wt-wt") + # Write to DB + DB_Handler().insert_raw_urls(raw_urls, source) + + # GNews + time.sleep(5) + raw_urls, source = search_gnews(keyword_search, language="en", country="US") + # Write to DB + DB_Handler().insert_raw_urls(raw_urls, source) + + # DDG Text + time.sleep(5) + raw_urls, source = search_ddg(keyword_search, category="text", timelimit="d", max_results=None, region = "wt-wt") + # Write to DB + DB_Handler().insert_raw_urls(raw_urls, source) + + # GoogleNews news + time.sleep(5) + raw_urls, source = search_googlenews_news(keyword_search, period="1d", language="en", country="US") + # Write to DB + DB_Handler().insert_raw_urls(raw_urls, source) + # GoogleNews general + time.sleep(5) + raw_urls, source = search_googlenews_general(keyword_search, period="1d", language="en", country="US", max_pages=5) + # Write to DB + DB_Handler().insert_raw_urls(raw_urls, source) + + # TODO: + # SearxNG + """ + period = "day" + for searx_instance in get_searxng_instances(): + dict_params_news = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "news", "period": period} + dict_params_general = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "general", "period": period} + # Append thread + FetcherSearxNews(**dict_params_news).fetch_articles(self.db_handler) + FetcherSearxNews(**dict_params_general).fetch_articles(self.db_handler)" + """ + # TODO: https://github.com/tasos-py/Search-Engines-Scraper/tree/master + except Exception as e: + logger.warning("Exception in FetchSearcher.run(): {}\n{}".format(e, traceback.format_exc())) diff --git a/app_urls/api/src/fetch_search_utils.py b/app_urls/api/src/fetch_search_utils.py new file mode 100644 index 0000000..4bcc6af --- /dev/null +++ b/app_urls/api/src/fetch_search_utils.py @@ -0,0 +1,129 @@ +import traceback +import random +import time +from .logger import get_logger +logger = get_logger() + +from googlenewsdecoder import gnewsdecoder +from gnews import GNews +from duckduckgo_search import DDGS +from GoogleNews import GoogleNews + +########################################################################### +def decode_gnews_urls(encoded_urls): + # DecodeURLs + list_decoded_urls = [] + for url in encoded_urls: + try: + # Decode URL, with interval time to avoid block + decoded_url = gnewsdecoder(url, interval=5) + # Ok? 
+ if decoded_url.get("status"): + list_decoded_urls.append(decoded_url["decoded_url"]) + else: + logger.warning("Error decoding news.google.com, URL {}\nMessage: {}".format(url, decoded_url["message"])) + except Exception as e: + logger.warning("Error decoding news.google.com, URL: {}\n{}".format(url, traceback.format_exc())) + return list_decoded_urls + +########################################################################### + +def search_gnews(keyword_search, period="1d", language="en", country="US", max_results=100): + # [source] [category] [period] [language-country] [max_results] + source = "gnews {} {} {}-{} max_results={}".format("news", period, language, country, max_results).replace("None", "").strip() + logger.debug("Searching: {} --- Source:{}".format(keyword_search, source)) + + # Get news + results_gnews = GNews(language=language, country=country).get_news(keyword_search) + # Get list of encoded urls + encoded_urls = [e.get("url") for e in results_gnews] + # Decode + list_decoded_urls = decode_gnews_urls(encoded_urls) + return list_decoded_urls, source + +########################################################################### + +def search_ddg(keyword_search, category="news", timelimit="d", max_results=None, region="wt-wt"): + # [source] [category] [period] [language-country] [max_results] + source = "ddg {} {} {} max_results={}".format(category, timelimit, region, max_results).replace("None", "").strip() + logger.debug("Searching: {} --- Source:{}".format(keyword_search, source)) + + # region="{}-{}".format(langauge, country.lower()) + # timelimit= # Options: d, w, m + # max_results # max number of results. If None, returns results only from the first response. Defaults to None + + if (category == "news"): + news = DDGS().news(keyword_search, region=region, timelimit=timelimit, max_results=max_results) + urls = [e.get("url") for e in news] + if (category == "text"): + news = DDGS().text(keyword_search, region=region, timelimit=timelimit, max_results=max_results) + urls = [e.get("href") for e in news] + + return urls, source +########################################################################### + +def search_googlenews_news(keyword_search, period="1d", language="en", country="US"): + category = "news" + # [source] [category] [period] [language-country] + source = "googlenews {} {} {}-{}".format(category, period, language, country).replace("None", "").strip() + logger.debug("Searching: {} --- Source:{}".format(keyword_search, source)) + + # Initialize + googlenews = GoogleNews(period=period, lang=language, region=country) + googlenews.enableException(True) + + try: + # Search + googlenews.get_news(keyword_search) + # Fetch + encoded_urls = googlenews.get_links() + # Decode + urls = decode_gnews_urls(encoded_urls) + except Exception as e: + logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc())) + urls = [] + + return urls, source + +def search_googlenews_general(keyword_search, period="1d", language="en", country="US", max_pages=5): + category="general" + # [source] [category] [period] [language-country] [max_results] + source = "googlenews {} {} {}-{} max_pages={}".format(category, period, language, country, max_pages).replace("None", "").strip() + logger.debug("Searching: {} --- Source:{}".format(keyword_search, source)) + + # Initialize + googlenews = GoogleNews(period=period, lang=language, region=country) + googlenews.enableException(True) + + try: + set_links = set() + # Search + googlenews.search(keyword_search) + + # 
Iterate pages
+        for i in range(max_pages):
+            time.sleep(random.uniform(1, 2.5))
+            num_before = len(set_links)
+
+            # Get page
+            try:
+                links = googlenews.page_at(i+1)
+            except Exception as e:
+                logger.warning("Exception fetching page in GoogleNews {}: {}".format(source, str(e)))
+                break
+            # Links
+            for l in links:
+                # 'link': 'https://uk.news.yahoo.com/leaving-neverland-2-michael-jackson-lawyer-channel-4-102017088.html&ved=2ahUKEwjl38eJm5aMAxVvqJUCHXgnGzwQxfQBegQICRAC&usg=AOvVaw1osa6b3o_xXfcNinMDpLoK'
+                set_links.add( l.get("link").split("&ved=")[0] )
+            # Finished?
+            if (num_before == len(set_links)):
+                break
+        # To list
+        urls = list(set_links)
+    except Exception as e:
+        logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc()))
+        urls = []
+
+    return urls, source
+
+###########################################################################
\ No newline at end of file
diff --git a/app_urls/api/src/url_processor.py b/app_urls/api/src/url_processor.py
index 86b4100..6f1d6c0 100644
--- a/app_urls/api/src/url_processor.py
+++ b/app_urls/api/src/url_processor.py
@@ -12,6 +12,9 @@ def process_url(url):
     try:
         # Process
         article = newspaper.article(url)
+    except newspaper.ArticleBinaryDataException as e:
+        logger.warning("ArticleBinaryDataException for input URL {}\n{}".format(url, str(e)))
+        return {"override_status": "invalid"}
     except newspaper.ArticleException as e:
         logger.warning("ArticleException for input URL {}\n{}".format(url, str(e)))
         return None
diff --git a/app_urls/api/tasks.py b/app_urls/api/tasks.py
index 86b56c6..0ccea1c 100644
--- a/app_urls/api/tasks.py
+++ b/app_urls/api/tasks.py
@@ -2,6 +2,7 @@ from django_rq import job
 
 from .src.fetch_feed import FetchFeeds
 from .src.fetch_parser import FetchParser
+from .src.fetch_search import FetchSearcher
 from .src.db_utils import DB_Handler
 '''
 from src.fetch_search import FetchSearcher
@@ -21,16 +22,20 @@ def background_task(process_type: str):
             FetchFeeds().run()
         elif (process_type == "fetch_parser"):
             FetchParser().run()
-        # TODO: ENCODE BATCH_SIZE IN PROCESS_tYPE.. 
- elif (process_type == "process_raw_urls"): - DB_Handler().process_raw_urls(batch_size=50) - elif (process_type == "process_error_urls"): - DB_Handler().process_error_urls(batch_size=50) - elif (process_type == "process_missing_kids_urls"): - DB_Handler().process_missing_kids_urls(batch_size=50) - elif ("process_missing_kids_urls" in process_type): + elif (process_type == "fetch_search"): + FetchSearcher().run() + #elif (process_type == "fetch_missingkids"): + # FetchMissingKids().run() + elif ("process_" in process_type): + # Batch size encoded in URL batch_size = int(process_type.split("_")[-1]) - DB_Handler().process_missing_kids_urls(batch_size=batch_size) + # Task type + if ("process_raw_urls" in process_type): + DB_Handler().process_raw_urls(batch_size=batch_size) + elif ("process_error_urls" in process_type): + DB_Handler().process_error_urls(batch_size=batch_size) + elif ("process_missing_kids_urls" in process_type): + DB_Handler().process_missing_kids_urls(batch_size=batch_size) else: logger.info("Task unknown!: {}".format(process_type)) @@ -47,15 +52,7 @@ def background_task(process_type: str): MissingKidsFetch(db_handler, num_pages=4).run() elif (process_type == "fetch_missing_kids_full"): MissingKidsFetch(db_handler, num_pages=100000).run() - - elif (process_type == "update_missing_kids_status_reduced"): - MissingKidsStatus(cred.db_connect_info, cred.redis_connect_info, num_urls=50).update_missing_kids_status() - elif (process_type == "update_missing_kids_status_full"): - MissingKidsStatus(cred.db_connect_info, cred.redis_connect_info, num_urls=None).update_missing_kids_status() - elif (process_type == "update_error_urls"): - UpdateErrorURLs(cred.db_connect_info, cred.redis_connect_info, num_urls=100).update_error_urls_status() - else: logger.error("Task error, unknown type: {}".format(process_type)) return diff --git a/app_urls/api/views.py b/app_urls/api/views.py index a2ca67e..07b8294 100644 --- a/app_urls/api/views.py +++ b/app_urls/api/views.py @@ -9,11 +9,22 @@ logger = get_logger() def trigger_task(request, task): """View that enqueues a task.""" + + """ + if ("fetch_" in task): + priority = "low" + job_timeout="30m" + elif ("process_" in task): + priority = "medium" + job_timeout="30m" + """ + queue = django_rq.get_queue('default') # Get the default queue - job = queue.enqueue(background_task, task) + job = queue.enqueue(background_task, task, job_timeout="30m") return JsonResponse({"message": "Task has been enqueued!", "job_id": job.id}) def link_list(request): prefix = "http://localhost:8000/api" - links = ["fetch_feeds", "fetch_parser", "process_raw_urls", "process_error_urls", "process_missing_kids_urls_50", "process_missing_kids_urls_500000"] - return JsonResponse({"links": ["http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id"] + [os.path.join(prefix, l) for l in links]}) + links = ["fetch_feeds", "fetch_parser", "fetch_search", "process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_500000"] + db_links = ["http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id&limit=500"] + return JsonResponse({"links": db_links + [os.path.join(prefix, l) for l in links]}) diff --git a/app_urls/core/settings.py b/app_urls/core/settings.py index e5eed47..b94ad48 100644 --- a/app_urls/core/settings.py +++ b/app_urls/core/settings.py @@ -106,7 +106,7 @@ RQ_QUEUES = { 'HOST': os.environ.get("REDIS_HOST", "localhost"), 
'PORT': os.environ.get("REDIS_PORT", 6379), 'DB': os.environ.get("REDIS_DB", 0), - 'DEFAULT_TIMEOUT': os.environ.get("REDIS_DEFAULT_TIMEOUT", 360), + 'DEFAULT_TIMEOUT': os.environ.get("RQ_DEFAULT_TIMEOUT", 900), } }
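
A note on the committed 1-DB.ipynb output above: the cell fails with UndefinedTable: relation "urls_source" does not exist because the join table is renamed to URLS_SOURCE_SEARCH while the schema script still indexes the old urls_source name. Below is a minimal sketch of the presumably intended statement, reusing the notebook's connection settings; the target table name is an assumption based on the rename, not something the diff itself fixes.

```python
# Sketch only: recreate the join-table index against the renamed table.
# Assumption: idx_source is meant to follow the URLS_SOURCE -> URLS_SOURCE_SEARCH rename.
import psycopg

connection_info = "host=localhost port=5432 user=supermatitos password=supermatitos dbname=matitos"

with psycopg.connect(connection_info) as conn:
    with conn.cursor() as cur:
        cur.execute("CREATE INDEX IF NOT EXISTS idx_source ON urls_source_search(id_source);")
```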
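
For reference, the protocol normalisation that moved from a local closure into DB_Handler._clean_protocol (db_utils.py) behaves as below. This standalone copy only makes the expected outputs explicit; note that str.replace rewrites every occurrence of "http://", not just a leading one.

```python
# Standalone copy of the normalisation now in DB_Handler._clean_protocol,
# shown here only to illustrate its behaviour on typical inputs.
def clean_protocol(url):
    # http:// -> https://
    url = url.replace("http://", "https://")
    # "" -> https://
    if not (url.startswith("https://")):
        url = "https://" + url
    return url

print(clean_protocol("http://example.org/a"))    # https://example.org/a
print(clean_protocol("example.org/a"))           # https://example.org/a
print(clean_protocol("https://example.org/a"))   # https://example.org/a
```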
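
The reworked background_task in api/tasks.py now derives the batch size from the trailing integer of the task name (e.g. process_raw_urls_50, as listed in views.link_list). Below is a small illustrative helper, not part of the app, that mirrors that convention and can be used to sanity-check task names before enqueueing them.

```python
# Illustrative only: mirrors the "process_<name>_<batch_size>" convention used in
# api/tasks.py, where the batch size is parsed with int(process_type.split("_")[-1]).
def parse_process_task(process_type):
    base, _, suffix = process_type.rpartition("_")
    if not suffix.isdigit():
        raise ValueError("Expected a trailing integer batch size in: {}".format(process_type))
    return base, int(suffix)

# Example with the links exposed by views.link_list:
for task in ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_500000"]:
    print(parse_process_task(task))
# ('process_raw_urls', 50)
# ('process_error_urls', 50)
# ('process_missing_kids_urls', 500000)
```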