diff --git a/.gitignore b/.gitignore index b4cf9f1..051548c 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ __pycache__/ *.pyc **/credentials.py logs/ +postgres/ \ No newline at end of file diff --git a/A_Development.ipynb b/A_Development.ipynb deleted file mode 100644 index 2f88e52..0000000 --- a/A_Development.ipynb +++ /dev/null @@ -1,363 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# !pip install git+https://github.com/tasos-py/Search-Engines-Scraper.git\n", - "import search_engines\n", - "\n", - "engine = search_engines.Bing()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "results = engine.search('news: \"child abuse\"', pages=2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "engine = search_engines.search_engines_dict[\"brave\"]()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "query = 'news: child abuse'\n", - "r = engine.search(query, pages=2)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "r.__dict__" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import newspaper\n", - "newspaper.ArticleBinaryDataException" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "'''\n", - "import newspaper\n", - "\n", - "url = 'https://www.missingkids.org/poster/USVA/VA25-0820/1'\n", - "art_1 = newspaper.article(url)\n", - "url = 'https://www.missingkids.org/poster/NCMC/2045193/1'\n", - "art_2 = newspaper.article(url)\n", - "'''" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import ollama\n", - "\n", - "#model = \"llama3.2:1b\"\n", - "client = ollama.Client(\n", - " host = 'https://ollamamodel.matitos.org',\n", - ")\n", - "l = client.list()\n", - "list_models = [m.get(\"model\") for m in l.model_dump().get(\"models\")]\n", - "\n", - "print(list_models)\n", - "\n", - "for m in list_models:\n", - " context_key = [ k for k in client.show(m).model_dump().get(\"modelinfo\").keys() if \"context_length\" in k]\n", - " if (len(context_key) != 1):\n", - " print(\"Problem!!!\")\n", - " print(m, client.show(m).model_dump().get(\"modelinfo\").get(context_key[0]))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "text = \"...\"\n", - "model = \"falcon3:1b\"\n", - "\n", - "msg_content = {\n", - " \"role\": \"user\", \n", - " \"content\": text,\n", - "}\n", - "response = client.chat(model=model, messages=[msg_content], stream=False)\n", - "print(response[\"message\"][\"content\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": 
{}, - "outputs": [], - "source": [ - "import requests\n", - "import cv2\n", - "import base64\n", - "import numpy as np\n", - "\n", - "endpoint = \"http://192.168.2.64:12343/image\"\n", - "\n", - "\n", - "\n", - "prompt = \"Majestic mountain landscape with snow-capped peaks, autumn foliage in vibrant reds and oranges, a turquoise river winding through a valley, crisp and serene atmosphere, ultra-realistic style.\"\n", - "prompt = \"A group of kids happily playing in a joy environment\"\n", - "#prompt = \"A bitcoin behaving like a king, surrounded by small alternative coins. Detailed, geometric style\"\n", - "\n", - "json = {\n", - " \"prompt\": prompt,\n", - " \"num_inference_steps\": 10,\n", - " \"size\": \"512x512\",\n", - " \"seed\": 123456,\n", - "}\n", - "\n", - "for inf_step in [1, 4, 10, 20, 25, 30, 35, 40, 45, 50, 60, 70, 80, 90, 100]:\n", - " json[\"num_inference_steps\"] = inf_step\n", - "\n", - " %time r = requests.post(endpoint, json=json)\n", - " print(\"Status code\", r.status_code)\n", - "\n", - " # Image\n", - " png_as_np = np.frombuffer(base64.b64decode(r.text), dtype=np.uint8)\n", - " image_bgr = cv2.imdecode(png_as_np, cv2.IMREAD_COLOR)\n", - "\n", - " cv2.imwrite(\"sample_img_{}.png\".format(json[\"num_inference_steps\"]), image_bgr)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# !pip install trafilatura\n", - "import trafilatura\n", - "from pprint import pprint\n", - "\n", - "url = \"https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n", - "# url = \"https://www.missingkids.org/poster/USVA/VA25-0820/1\"\n", - "url = \"https://www.bloomberg.com/news/articles/2025-03-12/eu-launches-metals-tariff-retaliation-on-26-billion-of-us-goods\"\n", - "\n", - "# Fetch\n", - "doc = trafilatura.fetch_url(url)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Content & metadata\n", - "metadata = trafilatura.extract_metadata(doc)\n", - "content = trafilatura.extract(doc)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pprint(metadata.as_dict())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(content)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# !pip install newspaper4k\n", - "# !pip install langdetect \n", - "import newspaper\n", - "import langdetect\n", - "langdetect.DetectorFactory.seed = 0\n", - "\n", - "\n", - "\n", - "# url = \"https://www.missingkids.org/poster/USVA/VA25-0820/1\"\n", - "#url = \"https://www.waff.com/2025/03/11/colbert-heights-high-school-employee-arrested-child-abuse/\"\n", - "\n", - "\n", - "\n", - "#url = \"https://www.bloomberg.com/news/articles/2025-03-12/eu-launches-metals-tariff-retaliation-on-26-billion-of-us-goods\"\n", - "\n", - "\n", - "url = \"https://apnews.com/article/canada-trump-us-tariffs-steel-2517a6a2baf0596cb1a43d3a7d1e7939\"\n", - "url = 
\"https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n", - "#url = \"https://www.ft.com/content/6d7c6915-4ceb-43fc-9896-590036b12a87\"\n", - "#url = \"https://www.lanacion.com.ar/politica/milei-en-bahia-blanca-un-viaje-sorpresa-para-frenar-las-criticas-y-mostrar-cercania-nid12032025/\"\n", - "#url = \"https://www.missingkids.org/poster/NCMC/2043547/1\"\n", - "\n", - "try:\n", - " article = newspaper.article(url)\n", - "except newspaper.ArticleException as e:\n", - " print(\"ArticleException: {}\".format(str(e)))\n", - "except Exception as e:\n", - " print(\"Err: {}\".format(str(e)))\n", - "\n", - "# url_photo = set([i for i in article.images if \"api.missingkids.org/photographs\" in i])\n", - "# article.is_valid_url(), article.is_parsed, article.is_media_news(), article.is_valid_body()\n", - "article.meta_data\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# !pip install news-please\n", - "from newsplease import NewsPlease\n", - "\n", - "url = \"https://variety.com/2025/film/news/gene-hackman-death-suspicious-gas-leak-search-warrant-1236322610/\"\n", - "url = \"https://www.bbc.com/news/articles/cewkkkvkzn9o\"\n", - "url = \"https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n", - "article = NewsPlease.from_url(url)\n", - "print(article.title)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(article.maintext)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "matitos", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/README.md b/README.md index a865a3e..ae32f84 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,14 @@ - Fetch parsing URL host - Fetch from RSS feed - Fetch searching (Google search & news, DuckDuckGo, ...) + ++ Sources -> Robustness to TooManyRequests block + - Selenium based + - Sites change their logic, request captcha, ... + - Brave Search API + - Free up to X requests per day. Need credit card association (no charges) + - Bing API + - Subscription required + - Yandex. No API? 
- Process URLs -> Updates raw URLs - Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date - Determines if it is a valid article content diff --git a/app_selenium/logger.py b/app_selenium/logger.py index 28a3099..6905c20 100644 --- a/app_selenium/logger.py +++ b/app_selenium/logger.py @@ -2,30 +2,29 @@ import logging import os # Get env var -path_logs_parameterization = os.getenv("PATH_LOGS_PARAMETERIZATION", "logs/log_app_selenium_{}.log") +logs_directory = os.getenv("PATH_LOGS_DIRECTORY", "logs") # Directory of logs -directory = '/'.join(path_logs_parameterization.split("/")[:-1]) -os.makedirs(directory, exist_ok=True) +os.makedirs(logs_directory, exist_ok=True) logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s') -logger = logging.getLogger("news_fetcher") +logger = logging.getLogger("selenium") logger.setLevel(logging.DEBUG) # To file log: INFO / WARNING / ERROR / CRITICAL -fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("debug"), mode="a", maxBytes=10000000, backupCount=1) +fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "debug.log"), mode="a", maxBytes=10000000, backupCount=1) fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s')) fh.setLevel(logging.DEBUG) logger.addHandler(fh) # To file log: INFO / WARNING / ERROR -fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("info"), mode="a", maxBytes=10000000, backupCount=1) +fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "info.log"), mode="a", maxBytes=10000000, backupCount=1) fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s')) fh.setLevel(logging.INFO) logger.addHandler(fh) # To file log: WARNING / ERROR / CRITICAL -fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("warning"), mode="a", maxBytes=10000000, backupCount=1) +fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "warning.log"), mode="a", maxBytes=10000000, backupCount=1) fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s')) fh.setLevel(logging.WARNING) logger.addHandler(fh) diff --git a/app_selenium/missing_kids.py b/app_selenium/missing_kids.py index 11b38ba..4a97da7 100644 --- a/app_selenium/missing_kids.py +++ b/app_selenium/missing_kids.py @@ -28,7 +28,7 @@ class MissingKidsFetcher(): logger.debug("Processing page: {}...".format(i)) try: - time.sleep(os.getenv("SELENIUM_SLEEP_PER_PAGE", 4)); #driver.implicitly_wait(3) + time.sleep(int(os.getenv("SELENIUM_SLEEP_PER_PAGE", 4))) #driver.implicitly_wait(3) # Fetch poster URLs for element_type in ["a"]: # ["a", "p", "div"]: for elem in driver.find_elements(By.TAG_NAME, element_type): diff --git a/app_urls/1-DB.ipynb b/app_urls/1-DB.ipynb deleted file mode 100644 index 46757c3..0000000 --- a/app_urls/1-DB.ipynb +++ /dev/null @@ -1,341 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# !pip install psycopg[binary]" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "db_postgres\n", - "db_redis\n", - "\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 2/0\n", - " ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n", - " ⠿ Container db_postgres 
\u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n", - " \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n", - " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n", - "\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 2/4\n", - " ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n", - " ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n", - " \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n", - " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n", - "\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 4/4\u001b[0m\n", - " \u001b[32m✔\u001b[0m Container db_redis \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n", - " \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n", - " \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n", - " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n", - "\u001b[?25h" - ] - } - ], - "source": [ - "!docker rm -f db_postgres db_redis; docker compose -f ../docker/docker-compose.yml up -d ; sleep 5\n", - "!rm logs/*" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "INSERT_TABLES = True\n", - "INSERT_SAMPLE_DATA = False\n", - "\n", - "import psycopg\n", - "connection_info = \"host={} port={} user={} password={} dbname={}\".format(\"localhost\", \"5432\", \"supermatitos\", \"supermatitos\", \"matitos\")\n", - "\n", - "from datetime import datetime, timezone\n", - "import re\n", - "from pprint import pprint" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "if INSERT_TABLES:\n", - " # Connect to an existing database\n", - " with psycopg.connect(connection_info) as conn:\n", - " # Open a cursor to perform database operations\n", - " with conn.cursor() as cur:\n", - " # Autocommit at end of transaction (Atomic insert of URLs and sources)\n", - " with conn.transaction() as tx:\n", - " # Create URLs table\n", - " c = cur.execute(\"\"\"\n", - " CREATE TYPE URL_STATUS AS ENUM ('raw', 'error', 'valid', 'unknown', 'invalid', 'duplicate');\n", - "\n", - " CREATE TABLE URLS (\n", - " id SERIAL PRIMARY KEY,\n", - " url TEXT NOT NULL UNIQUE,\n", - " ts_fetch TIMESTAMPTZ NOT NULL DEFAULT NOW(),\n", - " status URL_STATUS NOT NULL DEFAULT 'raw' -- ,\n", - " -- status_wendy WENDY_STATUS DEFAULT NULL,\n", - " -- ts_wendy TIMESTAMPTZ DEFAULT NULL\n", - " );\n", - " CREATE INDEX idx_urls_status ON urls(status);\n", - " CREATE INDEX idx_urls_ts_fetch ON urls(ts_fetch);\n", - "\n", - " CREATE TABLE URLS_DUPLICATE (\n", - " id_url_canonical INTEGER REFERENCES URLS(id),\n", - " id_url_duplicated INTEGER REFERENCES URLS(id),\n", - " PRIMARY KEY (id_url_canonical, id_url_duplicated)\n", - " );\n", - " \n", - " CREATE TYPE SEARCH_TYPE AS ENUM ('rss_feed', 'keyword_search', 'url_host');\n", - " CREATE TABLE SEARCH (\n", - " id SMALLSERIAL PRIMARY KEY,\n", - " search TEXT NOT NULL UNIQUE,\n", - " type SEARCH_TYPE NOT NULL\n", - " -- language_country CHAR(5), -- Language: ISO 639-1 Code. Country: ISO 3166 ALPHA-2. e.g.: en-us. 
Required for search\n", - " -- UNIQUE(search, language_country)\n", - " );\n", - " CREATE INDEX idx_search_type ON SEARCH(type);\n", - " \n", - " CREATE TABLE SOURCE (\n", - " id SMALLSERIAL PRIMARY KEY,\n", - " source TEXT NOT NULL UNIQUE\n", - " );\n", - " \n", - " -- CREATE TABLE SEARCH_LANGUAGE (\n", - " -- language CHAR(2) NOT NULL, -- ISO 639-1 Code, e.g. \"en\"\n", - " -- country CHAR(2) NOT NULL, -- ISO 3166 ALPHA-2, e.g. \"us\"\n", - " -- PRIMARY KEY (language, country)\n", - " -- );\n", - " \n", - " CREATE TABLE URLS_SOURCE_SEARCH (\n", - " id_url INTEGER REFERENCES URLS(id),\n", - " id_source SMALLINT REFERENCES SOURCE(id) ON UPDATE CASCADE ON DELETE RESTRICT,\n", - " id_search SMALLINT REFERENCES SEARCH(id) ON UPDATE CASCADE ON DELETE RESTRICT,\n", - " PRIMARY KEY(id_url, id_source, id_search)\n", - " );\n", - " CREATE INDEX idx_source ON URLS_SOURCE_SEARCH(id_source);\n", - " CREATE INDEX idx_search ON URLS_SOURCE_SEARCH(id_search);\n", - "\n", - " CREATE TABLE STATUS_PATTERN_MATCHING (\n", - " pattern TEXT PRIMARY KEY,\n", - " priority SMALLINT NOT NULL,\n", - " status URL_STATUS NOT NULL\n", - " );\n", - " \n", - " \n", - " CREATE TABLE URL_CONTENT (\n", - " id_url INTEGER PRIMARY KEY REFERENCES URLS(id),\n", - " date_published TIMESTAMPTZ DEFAULT NOW(),\n", - " title TEXT,\n", - " description TEXT,\n", - " content TEXT,\n", - " valid_content BOOLEAN,\n", - " language CHAR(2), -- ISO 639-1 Code\n", - " keywords TEXT[],\n", - " tags TEXT[],\n", - " authors TEXT[],\n", - " image_main_url TEXT,\n", - " images_url TEXT[],\n", - " videos_url TEXT[],\n", - " url_host TEXT, -- www.breitbart.com\n", - " site_name TEXT -- Breitbart News\n", - " );\n", - " CREATE INDEX idx_tags ON URL_CONTENT USING GIN(tags);\n", - " CREATE INDEX idx_authors ON URL_CONTENT USING GIN(authors);\n", - " CREATE INDEX idx_date_published ON URL_CONTENT (date_published);\n", - " CREATE INDEX idx_valid_content ON URL_CONTENT (valid_content);\n", - " CREATE INDEX idx_language ON URL_CONTENT (language);\n", - " CREATE INDEX idx_url_host ON URL_CONTENT (url_host);\n", - " \"\"\")\n", - "\n", - " ### Default insert values\n", - " \n", - " # Feeds\n", - " cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC', 'rss_feed');\" )\n", - " # Websites of interest\n", - " cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/poster', 'url_host');\" )\n", - " cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/new-poster', 'url_host');\" )\n", - " cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('breitbart.com', 'url_host');\" )\n", - " # Search keywords\n", - " cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search');\" )\n", - " # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search', 'en-us');\" )\n", - " # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search', 'en-gb');\" )\n", - " \n", - " # Status update based on pattern matching (with priority to apply in order). 
Regex test https://regex101.com/\n", - " # cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 75, 'valid');\".format(\".*{}.*\".format(re.escape(\"missingkids.org/poster/\"))) )\n", - " cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"youtube.com/\"))) )\n", - " cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"tiktok.com/\"))) )\n", - " cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"twitter.com/\"))) )\n", - " cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"reddit.com/\"))) )\n", - " cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"libreddit.de/\"))) )\n", - " cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"radio.foxnews.com/\"))) )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\t urls\n", - "[]\n", - "\t urls_duplicate\n", - "[]\n", - "\t urls_source_search\n", - "[]\n", - "\t source\n", - "[]\n", - "\t search\n", - "[(1,\n", - " 'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC',\n", - " 'rss_feed'),\n", - " (2, 'missingkids.org/poster', 'url_host'),\n", - " (3, 'missingkids.org/new-poster', 'url_host'),\n", - " (4, 'breitbart.com', 'url_host'),\n", - " (5, 'child abuse', 'keyword_search')]\n", - "\t status_pattern_matching\n", - "[('.*youtube\\\\.com/.*', 50, 'invalid'),\n", - " ('.*tiktok\\\\.com/.*', 50, 'invalid'),\n", - " ('.*twitter\\\\.com/.*', 50, 'invalid'),\n", - " ('.*reddit\\\\.com/.*', 50, 'invalid'),\n", - " ('.*libreddit\\\\.de/.*', 50, 'invalid'),\n", - " ('.*radio\\\\.foxnews\\\\.com/.*', 50, 'invalid')]\n", - "\t url_content\n", - "[]\n" - ] - } - ], - "source": [ - "# Connect to an existing database\n", - "with psycopg.connect(connection_info) as conn:\n", - " # Open a cursor to perform database operations\n", - " with conn.cursor() as cur:\n", - " # Get tables\n", - " cur.execute(\"SELECT table_name FROM information_schema.tables WHERE table_schema='public';\")\n", - " tables = [t[0] for t in cur.fetchall()]\n", - "\n", - " for t in tables:\n", - " print(\"\\t\", t)\n", - " pprint( cur.execute(\"SELECT * FROM {} LIMIT 50;\".format(t)).fetchall() )" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[(1,\n", - " 'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC',\n", - " 'rss_feed'),\n", - " (2, 'missingkids.org/poster', 'url_host'),\n", - " (3, 'missingkids.org/new-poster', 'url_host'),\n", - " (4, 'breitbart.com', 'url_host'),\n", - " (5, 'child abuse', 'keyword_search')]\n" - ] - } - ], - "source": [ - "# Connect to an existing database\n", - "with psycopg.connect(connection_info) as conn:\n", - " # Open a 
cursor to perform database operations\n", - " with conn.cursor() as cur:\n", - " pprint( cur.execute(\"SELECT * FROM SEARCH;\").fetchall() )" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[]\n" - ] - } - ], - "source": [ - "# Connect to an existing database\n", - "with psycopg.connect(connection_info) as conn:\n", - " # Open a cursor to perform database operations\n", - " with conn.cursor() as cur:\n", - " pprint( cur.execute(\"SELECT * FROM URLS LIMIT 50;\").fetchall() )\n", - " #pprint( cur.execute(\"SELECT id_url, title, valid_content FROM URL_CONTENT LIMIT 10;\").fetchall() )" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'\\n!docker rm -f db_redis; docker compose -f ../docker/docker-compose.yml up -d\\n\\n# Connect to an existing database\\nwith psycopg.connect(connection_info) as conn:\\n # Open a cursor to perform database operations\\n with conn.cursor() as cur:\\n pprint( cur.execute(\"TRUNCATE URLS, URL_CONTENT, URLS_SOURCE_SEARCH, URLS_DUPLICATE;\") )\\n # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES (\\'missingkids.org\\', \\'url_host\\');\" )\\n'" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "'''\n", - "!docker rm -f db_redis; docker compose -f ../docker/docker-compose.yml up -d\n", - "\n", - "# Connect to an existing database\n", - "with psycopg.connect(connection_info) as conn:\n", - " # Open a cursor to perform database operations\n", - " with conn.cursor() as cur:\n", - " pprint( cur.execute(\"TRUNCATE URLS, URL_CONTENT, URLS_SOURCE_SEARCH, URLS_DUPLICATE;\") )\n", - " # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('missingkids.org', 'url_host');\" )\n", - "'''" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "matitos", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/app_urls/Dockerfile b/app_urls/Dockerfile new file mode 100644 index 0000000..1a1ddd0 --- /dev/null +++ b/app_urls/Dockerfile @@ -0,0 +1,48 @@ +FROM python:3.12 + +# Prevents Python from writing pyc files to disk +ENV PYTHONDONTWRITEBYTECODE=1 +#Prevents Python from buffering stdout and stderr +ENV PYTHONUNBUFFERED=1 + +# User +RUN useradd -m -r appuser && \ + mkdir /opt/app && \ + chown -R appuser /opt/app + +WORKDIR /opt/app + +# Copy the Django project and install dependencies +COPY requirements.txt /opt/app/ +# run this command to install all dependencies +RUN pip install --no-cache-dir -r requirements.txt + +COPY --chown=appuser:appuser . 
/opt/app/ + +RUN chmod -R 755 /opt/app +RUN chown -R appuser:appuser /opt/app +USER appuser + +# Initialization script +RUN echo '#!/bin/bash' > /opt/app/initialize.sh && \ + echo 'if [ "${INITIALIZE_DB}" = false ]; then' >> /opt/app/initialize.sh && \ + echo 'echo "Initialization not required"' >> /opt/app/initialize.sh && \ + echo 'else' >> /opt/app/initialize.sh && \ + echo 'echo "Initializating database"' >> /opt/app/initialize.sh && \ + echo 'sleep 5' >> /opt/app/initialize.sh && \ + echo 'python db.py --initialize_tables --initialize_data' >> /opt/app/initialize.sh && \ + echo 'python manage.py makemigrations fetcher; python manage.py migrate --fake-initial' >> /opt/app/initialize.sh && \ + echo 'python manage.py createsuperuser --noinput' >> /opt/app/initialize.sh && \ + echo 'python manage.py collectstatic --no-input' >> /opt/app/initialize.sh && \ + echo 'python manage.py import --filename scheduled_tasks.json' >> /opt/app/initialize.sh && \ + echo 'fi' >> /opt/app/initialize.sh && \ + chmod +x /opt/app/initialize.sh + +# Serving script +RUN echo '#!/bin/bash' > /opt/app/run.sh && \ + echo 'gunicorn core.wsgi:application --bind 0.0.0.0:8000 & python manage.py rqworker high default low' >> /opt/app/run.sh && \ + #echo 'python manage.py runserver & python manage.py rqworker high default low' >> /opt/app/run.sh && \ + chmod +x /opt/app/run.sh + +# Run Django’s server & workers +CMD ["sh", "-c", "/opt/app/initialize.sh && /opt/app/run.sh"] diff --git a/app_urls/README.md b/app_urls/README.md index c9d6b39..70f5486 100644 --- a/app_urls/README.md +++ b/app_urls/README.md @@ -2,18 +2,9 @@ ``` conda create -n matitos_urls python=3.12 conda activate matitos_urls -# Core -pip install django==5.1 psycopg[binary] django-redis django-tasks-scheduler -# Fetcher -pip install feedparser python-dateutil newspaper4k[all] lxml[html_clean] googlenewsdecoder gnews duckduckgo_search GoogleNews langdetect -# News visualization -pip install ollama +pip install -r requirements.txt ``` -* Database - * Database initialization -> 1-DB.ipynb - - * From automated inspectdb ``` # 1) Inspect DB, generate models.py @@ -74,60 +65,19 @@ class Meta: db_table = 'urls' # db_table = '{}_urls'.format(project_name) ``` +* Database & initialization + * Check initialize.sh on Dockerfile + * Environment variables -``` -# Database -DB_NAME=${DB_NAME:-matitos} -DB_USER=${DB_NAME:-supermatitos} -DB_PASSWORD=${DB_NAME:-supermatitos} -DB_HOST=${DB_NAME:-localhost} -DB_PORT=${DB_NAME:-5432} -REDIS_HOST=${REDIS_HOST:-localhost} -REDIS_PORT=${REDIS_PORT:-6379} - -# Job timeout: 30 min -JOB_DEFAULT_TIMEOUT=${RQ_DEFAULT_TIMEOUT:-1800} - -# Logs path -PATH_LOGS_PARAMETERIZATION="logs/log_app_fetcher_{}.log" - -# Fetcher -FETCHER_GNEWS_DECODE_SLEEP=2 -FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=4 -FETCHER_BETWEEN_SEARCHES_SLEEP=5 -FETCHER_URL_HOST_SLEEP=5 -FETCHER_LANGUAGE_DETECTION_MIN_CHAR=100 - -SELENIUM_ENDPOINT="http://selenium_app:80" -``` + * In docker-compose.yml * Deploy ``` -# Migrations -python manage.py makemigrations api; python manage.py migrate --fake-initial -# Create user -python manage.py createsuperuser +# Check environments variables on docker-compose.yml -# 1) Server -python manage.py runserver +# Remove previous instances +docker compose down -v -# 2) Workers -python manage.py rqworker high default low - -# Visualize DB -http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id +# Build & up +docker compose up -d --build ``` - -* Scheduled tasks -``` -# Import tasks 
-python manage.py import --filename scheduled_tasks.json - -# Modify using the admin panel, then save -# python manage.py export > scheduled_tasks.json -``` - -* Utils. TODO: To endpoint... -``` -python manage.py rqstats -``` \ No newline at end of file diff --git a/app_urls/api/templates/charts.html b/app_urls/api/templates/charts.html deleted file mode 100644 index 9cffde6..0000000 --- a/app_urls/api/templates/charts.html +++ /dev/null @@ -1,295 +0,0 @@ - - - - - - Charts - - - - - -

- [charts.html body: "Charts" page with a "Data Visualizations" heading; the remaining ~295 lines of HTML/JS markup are not recoverable from the extracted diff]
- - - - diff --git a/app_urls/core/settings.py b/app_urls/core/settings.py index 23323d9..de1dc9c 100644 --- a/app_urls/core/settings.py +++ b/app_urls/core/settings.py @@ -20,12 +20,13 @@ BASE_DIR = Path(__file__).resolve().parent.parent # Quick-start development settings - unsuitable for production # SECURITY WARNING: keep the secret key used in production secret! -SECRET_KEY = 'django-insecure-54mqLbW5NlO8OlVDsT3fcbg3Vf6C8Fgcoj8H0hXv3Pr8bpgqvOuiaeqvGn34sGwt' +SECRET_KEY = os.getenv("DJANGO_SECRET_KEY", 'django-insecure-54mqLbW5NlO8OlVDsT3fcbg3Vf6C8Fgcoj8H0hXv3Pr8bpgqvOuiaeqvGn34sGwt') # SECURITY WARNING: don't run with debug turned on in production! -DEBUG = True +DEBUG = (os.environ.get('DJANGO_DEBUG') == "True") +print("Django debug mode:", DEBUG) -ALLOWED_HOSTS = [] +ALLOWED_HOSTS = os.environ.get('DJANGO_ALLOWED_HOSTS', "*").split(",") # Application definition @@ -38,11 +39,12 @@ INSTALLED_APPS = [ 'django.contrib.messages', 'django.contrib.staticfiles', 'scheduler', - 'api', + 'fetcher', ] MIDDLEWARE = [ 'django.middleware.security.SecurityMiddleware', + 'whitenoise.middleware.WhiteNoiseMiddleware', # Serving static files 'django.contrib.sessions.middleware.SessionMiddleware', 'django.middleware.common.CommonMiddleware', 'django.middleware.csrf.CsrfViewMiddleware', @@ -51,6 +53,8 @@ MIDDLEWARE = [ 'django.middleware.clickjacking.XFrameOptionsMiddleware', ] +STATICFILES_STORAGE = 'whitenoise.storage.CompressedManifestStaticFilesStorage' + ROOT_URLCONF = 'core.urls' TEMPLATES = [ @@ -121,7 +125,7 @@ SCHEDULER_QUEUES = { } } SCHEDULER_CONFIG = { - 'DEFAULT_TIMEOUT': os.environ.get("JOB_DEFAULT_TIMEOUT", 60*30), # 15 minutes + 'DEFAULT_TIMEOUT': os.environ.get("JOB_DEFAULT_TIMEOUT", 60*30), # 30 minutes 'DEFAULT_RESULT_TTL': 60*60*12, # 12 hours 'EXECUTIONS_IN_PAGE': 20, 'SCHEDULER_INTERVAL': 10, # 10 seconds @@ -158,7 +162,8 @@ USE_TZ = True # Static files (CSS, JavaScript, Images) -STATIC_URL = 'static/' +STATIC_URL = '/static/' +STATIC_ROOT = os.path.join(BASE_DIR, 'static') # Default primary key field type diff --git a/app_urls/core/urls.py b/app_urls/core/urls.py index bfeb5c2..62d899a 100644 --- a/app_urls/core/urls.py +++ b/app_urls/core/urls.py @@ -20,5 +20,5 @@ from django.urls import path, include urlpatterns = [ path('admin/', admin.site.urls), path('scheduler/', include('scheduler.urls')), - path('', include('api.urls')), + path('', include('fetcher.urls')), ] diff --git a/app_urls/db.py b/app_urls/db.py new file mode 100644 index 0000000..ecf7583 --- /dev/null +++ b/app_urls/db.py @@ -0,0 +1,145 @@ +import argparse +import os +import psycopg +import re + +connection_info = "host={} port={} dbname={} user={} password={} connect_timeout=60".format( + os.environ.get("DB_HOST", "localhost"), + os.environ.get("DB_PORT", "5432"), + os.environ.get("DB_NAME", "matitos"), + os.environ.get("DB_USER", "supermatitos"), + os.environ.get("DB_PASSWORD", "supermatitos") +) + +def initialize_tables(): + # Connect to an existing database + with psycopg.connect(connection_info) as conn: + # Open a cursor to perform database operations + with conn.cursor() as cur: + # Autocommit at end of transaction (Atomic creation of tables) + with conn.transaction() as tx: + # Create URLs table + c = cur.execute(""" + CREATE TYPE URL_STATUS AS ENUM ('raw', 'error', 'valid', 'unknown', 'invalid', 'duplicate'); + + CREATE TABLE URLS ( + id SERIAL PRIMARY KEY, + url TEXT NOT NULL UNIQUE, + ts_fetch TIMESTAMPTZ NOT NULL DEFAULT NOW(), + status URL_STATUS NOT NULL DEFAULT 'raw' -- , + -- status_wendy 
WENDY_STATUS DEFAULT NULL, + -- ts_wendy TIMESTAMPTZ DEFAULT NULL + ); + CREATE INDEX idx_urls_status ON urls(status); + CREATE INDEX idx_urls_ts_fetch ON urls(ts_fetch); + + CREATE TABLE URLS_DUPLICATE ( + id_url_canonical INTEGER REFERENCES URLS(id), + id_url_duplicated INTEGER REFERENCES URLS(id), + PRIMARY KEY (id_url_canonical, id_url_duplicated) + ); + + CREATE TYPE SEARCH_TYPE AS ENUM ('rss_feed', 'keyword_search', 'url_host'); + CREATE TABLE SEARCH ( + id SMALLSERIAL PRIMARY KEY, + search TEXT NOT NULL UNIQUE, + type SEARCH_TYPE NOT NULL + -- language_country CHAR(5), -- Language: ISO 639-1 Code. Country: ISO 3166 ALPHA-2. e.g.: en-us. Required for search + -- UNIQUE(search, language_country) + ); + CREATE INDEX idx_search_type ON SEARCH(type); + + CREATE TABLE SOURCE ( + id SMALLSERIAL PRIMARY KEY, + source TEXT NOT NULL UNIQUE + ); + + -- CREATE TABLE SEARCH_LANGUAGE ( + -- language CHAR(2) NOT NULL, -- ISO 639-1 Code, e.g. "en" + -- country CHAR(2) NOT NULL, -- ISO 3166 ALPHA-2, e.g. "us" + -- PRIMARY KEY (language, country) + -- ); + + CREATE TABLE URLS_SOURCE_SEARCH ( + id_url INTEGER REFERENCES URLS(id), + id_source SMALLINT REFERENCES SOURCE(id) ON UPDATE CASCADE ON DELETE RESTRICT, + id_search SMALLINT REFERENCES SEARCH(id) ON UPDATE CASCADE ON DELETE RESTRICT, + PRIMARY KEY(id_url, id_source, id_search) + ); + CREATE INDEX idx_source ON URLS_SOURCE_SEARCH(id_source); + CREATE INDEX idx_search ON URLS_SOURCE_SEARCH(id_search); + + CREATE TABLE STATUS_PATTERN_MATCHING ( + pattern TEXT PRIMARY KEY, + priority SMALLINT NOT NULL, + status URL_STATUS NOT NULL + ); + + + CREATE TABLE URL_CONTENT ( + id_url INTEGER PRIMARY KEY REFERENCES URLS(id), + date_published TIMESTAMPTZ DEFAULT NOW(), + title TEXT, + description TEXT, + content TEXT, + valid_content BOOLEAN, + language CHAR(2), -- ISO 639-1 Code + keywords TEXT[], + tags TEXT[], + authors TEXT[], + image_main_url TEXT, + images_url TEXT[], + videos_url TEXT[], + url_host TEXT, -- www.breitbart.com + site_name TEXT -- Breitbart News + ); + CREATE INDEX idx_tags ON URL_CONTENT USING GIN(tags); + CREATE INDEX idx_authors ON URL_CONTENT USING GIN(authors); + CREATE INDEX idx_date_published ON URL_CONTENT (date_published); + CREATE INDEX idx_valid_content ON URL_CONTENT (valid_content); + CREATE INDEX idx_language ON URL_CONTENT (language); + CREATE INDEX idx_url_host ON URL_CONTENT (url_host); + """) + +def initialize_data(): + # Connect to an existing database + with psycopg.connect(connection_info) as conn: + # Open a cursor to perform database operations + with conn.cursor() as cur: + # Autocommit at end of transaction (Atomic creation of data) + with conn.transaction() as tx: + # Feeds + cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC', 'rss_feed');" ) + # Websites of interest + cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/poster', 'url_host');" ) + cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/new-poster', 'url_host');" ) + cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('breitbart.com', 'url_host');" ) + # Search keywords + cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search');" ) + # TODO: Language per search + # cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search', 'en-us');" ) + # cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search', 'en-gb');" 
) + + # Status update based on pattern matching (with priority to apply in order). Regex test https://regex101.com/ + cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("youtube.com/"))) ) + cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("tiktok.com/"))) ) + cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("twitter.com/"))) ) + cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("reddit.com/"))) ) + cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("libreddit.de/"))) ) + cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("radio.foxnews.com/"))) ) + +def main(name): + print('Hello, %s!' % name) + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Database initialization') + parser.add_argument('--initialize_tables', help='Create DB tables', action='store_true', default=False) + parser.add_argument('--initialize_data', help='Insert data', action='store_true', default=False) + args = parser.parse_args() + + if (args.initialize_tables): + print("Initializing tables") + initialize_tables() + if (args.initialize_data): + print("Initializing data") + initialize_data() diff --git a/app_urls/api/__init__.py b/app_urls/fetcher/__init__.py similarity index 100% rename from app_urls/api/__init__.py rename to app_urls/fetcher/__init__.py diff --git a/app_urls/api/admin.py b/app_urls/fetcher/admin.py similarity index 100% rename from app_urls/api/admin.py rename to app_urls/fetcher/admin.py diff --git a/app_urls/api/apps.py b/app_urls/fetcher/apps.py similarity index 63% rename from app_urls/api/apps.py rename to app_urls/fetcher/apps.py index 66656fd..9d99d4b 100644 --- a/app_urls/api/apps.py +++ b/app_urls/fetcher/apps.py @@ -1,6 +1,6 @@ from django.apps import AppConfig -class ApiConfig(AppConfig): +class FetcherConfig(AppConfig): default_auto_field = 'django.db.models.BigAutoField' - name = 'api' + name = 'fetcher' diff --git a/app_urls/api/migrations/0001_initial.py b/app_urls/fetcher/migrations/0001_initial.py similarity index 96% rename from app_urls/api/migrations/0001_initial.py rename to app_urls/fetcher/migrations/0001_initial.py index 829da62..8c1d1ef 100644 --- a/app_urls/api/migrations/0001_initial.py +++ b/app_urls/fetcher/migrations/0001_initial.py @@ -65,7 +65,7 @@ class Migration(migrations.Migration): migrations.CreateModel( name='UrlContent', fields=[ - ('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='api.urls')), + ('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='fetcher.urls')), ('date_published', models.DateTimeField(blank=True, null=True)), ('title', models.TextField(blank=True, null=True)), ('description', models.TextField(blank=True, null=True)), @@ -89,7 +89,7 @@ class Migration(migrations.Migration): migrations.CreateModel( name='UrlsDuplicate', fields=[ - ('id_url_canonical', models.OneToOneField(db_column='id_url_canonical', 
on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='api.urls')), + ('id_url_canonical', models.OneToOneField(db_column='id_url_canonical', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='fetcher.urls')), ], options={ 'db_table': 'urls_duplicate', @@ -99,7 +99,7 @@ class Migration(migrations.Migration): migrations.CreateModel( name='UrlsSourceSearch', fields=[ - ('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='api.urls')), + ('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='fetcher.urls')), ], options={ 'db_table': 'urls_source_search', diff --git a/app_urls/api/migrations/__init__.py b/app_urls/fetcher/migrations/__init__.py similarity index 100% rename from app_urls/api/migrations/__init__.py rename to app_urls/fetcher/migrations/__init__.py diff --git a/app_urls/api/models.py b/app_urls/fetcher/models.py similarity index 100% rename from app_urls/api/models.py rename to app_urls/fetcher/models.py diff --git a/app_urls/api/src/db_utils.py b/app_urls/fetcher/src/db_utils.py similarity index 100% rename from app_urls/api/src/db_utils.py rename to app_urls/fetcher/src/db_utils.py diff --git a/app_urls/api/src/fetch_feed.py b/app_urls/fetcher/src/fetch_feed.py similarity index 100% rename from app_urls/api/src/fetch_feed.py rename to app_urls/fetcher/src/fetch_feed.py diff --git a/app_urls/api/src/fetch_missing_kids.py b/app_urls/fetcher/src/fetch_missing_kids.py similarity index 100% rename from app_urls/api/src/fetch_missing_kids.py rename to app_urls/fetcher/src/fetch_missing_kids.py diff --git a/app_urls/api/src/fetch_parser.py b/app_urls/fetcher/src/fetch_parser.py similarity index 100% rename from app_urls/api/src/fetch_parser.py rename to app_urls/fetcher/src/fetch_parser.py diff --git a/app_urls/api/src/fetch_search.py b/app_urls/fetcher/src/fetch_search.py similarity index 100% rename from app_urls/api/src/fetch_search.py rename to app_urls/fetcher/src/fetch_search.py diff --git a/app_urls/api/src/fetch_search_instances.py b/app_urls/fetcher/src/fetch_search_instances.py similarity index 81% rename from app_urls/api/src/fetch_search_instances.py rename to app_urls/fetcher/src/fetch_search_instances.py index 22ba642..7be0105 100644 --- a/app_urls/api/src/fetch_search_instances.py +++ b/app_urls/fetcher/src/fetch_search_instances.py @@ -1,6 +1,8 @@ import time import feedparser import os +from django.utils import timezone +from datetime import timedelta from ..models import Search, Source from .fetch_utils import decode_gnews_urls from .logger import get_logger @@ -9,6 +11,7 @@ logger = get_logger() from gnews import GNews from duckduckgo_search import DDGS from GoogleNews import GoogleNews +from search_engines import Yahoo, Aol ########################################################################### ########################################################################### @@ -42,11 +45,19 @@ class FetcherAbstract(ABC): return raw_urls def fetch_articles(self, db_writer, obj_search): - # Search - keyword_search = "{}{}".format("site:" if obj_search.type == Search.TYPE_ENUM.URL_HOST else "", obj_search.search) # Source name source_name = self._get_name() - + + # Search + keyword_search = obj_search.search + # URL Host search? 
-> site:${URL_HOST} + if (obj_search.type == Search.TYPE_ENUM.URL_HOST): + keyword_search = "{}{}".format("site:", keyword_search) + # Keyword search & using a General search? -> ${SEARCH} news after:${LAST_WEEK} + if ("general" in source_name) and (obj_search.type == Search.TYPE_ENUM.KEYWORD_SEARCH): + start_date = timezone.now() - timedelta(days=7) + keyword_search = "{}{}".format(keyword_search, "news after:{}-{}-{}".format(start_date.month, start_date.day, start_date.year)) + logger.debug("Starting search: {} - {}".format(keyword_search, source_name)) # Fetch raw_urls = self._fetch_raw_urls(keyword_search) @@ -165,11 +176,11 @@ class SearchGoogleGeneral(FetcherAbstract): self.language = args.get("language", "en") self.country = args.get("country", "US") self.period = args.get("period", "7d") - self.max_pages = args.get("max_pages", 1) + self.pages = args.get("pages", 1) def _get_name(self): # [source] [period] [language-country] [pages] - return "google-general {} {}-{} pages={}".format(self.period, self.language, self.country, self.max_pages).replace("pages=None", "").strip() + return "google-general {} {}-{} pages={}".format(self.period, self.language, self.country, self.pages).replace("pages=None", "").strip() def _fetch_raw_urls(self, keyword_search): try: @@ -181,7 +192,7 @@ class SearchGoogleGeneral(FetcherAbstract): set_links = set() # Iterate pages - for i in range(self.max_pages): + for i in range(self.pages): # Sleep between pages fetch time.sleep(int(os.getenv("FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP", 4))) # Number of URLs fetched so far @@ -253,7 +264,45 @@ class SearchGoogleNewsRSS(FetcherAbstract): urls = [] return urls + +class SearchYahooGeneral(FetcherAbstract): + def __init__(self, args={}): + super().__init__() + # Parameters + self.pages = args.get("pages", 2) + + def _get_name(self): + # [source] [language-country] [pages] + return "yahoo-general pages={}".format(self.pages).replace("pages=None", "").strip() + + def _fetch_raw_urls(self, keyword_search): + try: + results = Yahoo().search(keyword_search, pages=self.pages) + urls = results.links() + except Exception as e: + logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e))) + urls = [] + return urls + +class SearchAOLGeneral(FetcherAbstract): + def __init__(self, args={}): + super().__init__() + # Parameters + self.pages = args.get("pages", 2) + + def _get_name(self): + # [source] [language-country] [pages] + return "aol-general pages={}".format(self.pages).replace("pages=None", "").strip() + + def _fetch_raw_urls(self, keyword_search): + try: + results = Aol().search(keyword_search, pages=self.pages) + urls = results.links() + except Exception as e: + logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e))) + urls = [] + return urls ########################################################################### # List of instances -ListSearchInstances = [SearchGNews, SearchDuckDuckGoNews, SearchGoogleNews, SearchDuckDuckGoGeneral, SearchGoogleGeneral, SearchGoogleNewsRSS] +ListSearchInstances = [SearchGNews, SearchDuckDuckGoNews, SearchGoogleNews, SearchAOLGeneral, SearchYahooGeneral, SearchDuckDuckGoGeneral, SearchGoogleGeneral, SearchGoogleNewsRSS] diff --git a/app_urls/api/src/fetch_utils.py b/app_urls/fetcher/src/fetch_utils.py similarity index 100% rename from app_urls/api/src/fetch_utils.py rename to app_urls/fetcher/src/fetch_utils.py diff --git a/app_urls/api/src/logger.py b/app_urls/fetcher/src/logger.py similarity index 52% rename from app_urls/api/src/logger.py 
rename to app_urls/fetcher/src/logger.py index fbc4405..03aa8c3 100644 --- a/app_urls/api/src/logger.py +++ b/app_urls/fetcher/src/logger.py @@ -2,30 +2,29 @@ import logging import os # Get env var -path_logs_parameterization = os.getenv("PATH_LOGS_PARAMETERIZATION", "logs/log_app_fetcher_{}.log") +logs_directory = os.getenv("PATH_LOGS_DIRECTORY", "logs") # Directory of logs -directory = '/'.join(path_logs_parameterization.split("/")[:-1]) -os.makedirs(directory, exist_ok=True) +os.makedirs(logs_directory, exist_ok=True) logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s') -logger = logging.getLogger("news_fetcher") +logger = logging.getLogger("fetcher") logger.setLevel(logging.DEBUG) # To file log: INFO / WARNING / ERROR / CRITICAL -fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("debug"), mode="a", maxBytes=10000000, backupCount=4) +fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "debug.log"), mode="a", maxBytes=10000000, backupCount=1) fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s')) fh.setLevel(logging.DEBUG) logger.addHandler(fh) # To file log: INFO / WARNING / ERROR -fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("info"), mode="a", maxBytes=10000000, backupCount=2) +fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "info.log"), mode="a", maxBytes=10000000, backupCount=1) fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s')) fh.setLevel(logging.INFO) logger.addHandler(fh) # To file log: WARNING / ERROR / CRITICAL -fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("warning"), mode="a", maxBytes=10000000, backupCount=1) +fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "warning.log"), mode="a", maxBytes=10000000, backupCount=1) fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s')) fh.setLevel(logging.WARNING) logger.addHandler(fh) diff --git a/app_urls/api/src/url_processor.py b/app_urls/fetcher/src/url_processor.py similarity index 100% rename from app_urls/api/src/url_processor.py rename to app_urls/fetcher/src/url_processor.py diff --git a/app_urls/api/tasks.py b/app_urls/fetcher/tasks.py similarity index 99% rename from app_urls/api/tasks.py rename to app_urls/fetcher/tasks.py index 09fb107..b22dc5c 100644 --- a/app_urls/api/tasks.py +++ b/app_urls/fetcher/tasks.py @@ -73,9 +73,6 @@ def process_missing_kids_urls_all(batch_size=None): logger.info("Task completed: {}".format(task)) - - - @job('default') def background_task(process_type: str): logger.info("Task triggered: {}".format(process_type)) diff --git a/app_urls/fetcher/templates/charts.html b/app_urls/fetcher/templates/charts.html new file mode 100644 index 0000000..2b45226 --- /dev/null +++ b/app_urls/fetcher/templates/charts.html @@ -0,0 +1,179 @@ + + + + + + Charts + + + + + +

+ [charts.html body: rebuilt "Charts" page with a "Data Visualizations" heading; the remaining ~179 lines of HTML/JS markup are not recoverable from the extracted diff]
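The rebuilt template feeds on the chart endpoints reworked later in this diff: `views.py` now returns a uniform `labels`/`values` payload and accepts a `days` query parameter, and `urls.py` moves the page to `urls/charts/`. A quick way to sanity-check that JSON contract outside the browser, assuming a dev server on `localhost:8000` as in the README examples:

```
# Sketch: query one chart endpoint and print its labels/values payload.
# The endpoint path, the 'days' parameter and the labels/values keys come
# from the views.py changes further down; host and port are assumptions.
import requests

resp = requests.get("http://localhost:8000/urls-by-fetch-date/",
                    params={"days": 7}, timeout=10)
resp.raise_for_status()
data = resp.json()
for label, value in zip(data["labels"], data["values"]):
    print(label, value)
```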
+ + + + diff --git a/app_urls/api/templates/filtered_urls.html b/app_urls/fetcher/templates/filtered_urls.html similarity index 90% rename from app_urls/api/templates/filtered_urls.html rename to app_urls/fetcher/templates/filtered_urls.html index be3e5bd..81efd19 100644 --- a/app_urls/api/templates/filtered_urls.html +++ b/app_urls/fetcher/templates/filtered_urls.html @@ -113,11 +113,11 @@ input[type="checkbox"] { } /* Themed Toggle Button */ -.theme-button, .home-button { +.theme-button, .home-button, .chart-button { background-color: var(--sidebar); border: 1px solid var(--sidebar); border-radius: 50%; - width: 45px; + width: 30px; height: 45px; font-size: 25px; display: flex; @@ -127,10 +127,10 @@ input[type="checkbox"] { cursor: pointer; } -.theme-button:hover, .home-button:hover { +.theme-button:hover, .home-button:hover, .chart-button:hover { transform: rotate(20deg); } -.theme-button:active, .home-button:active { +.theme-button:active, .home-button:active, .chart-button:acive { transform: scale(0.95); } @@ -235,6 +235,7 @@ input[type="checkbox"] {
+
@@ -477,6 +478,10 @@ input[type="checkbox"] { document.getElementById("homeButton").addEventListener("click", function () { window.location.href = "./"; // Change this to your homepage URL if different }); + // Charts + document.getElementById("chartButton").addEventListener("click", function () { + window.location.href = "./charts"; // Change this to your homepage URL if different + }); ////////////////////////////////////////////// // Timestamp to local timezone @@ -508,26 +513,32 @@ input[type="checkbox"] { }); }); - ////////////////////////////////////////////////////////////////////// - // Function to update the form parameter before submitting - function updateFormParameter(section) { - const checkboxes = document.querySelectorAll(`[name='${section}']`); - const allChecked = Array.from(checkboxes).every(checkbox => checkbox.checked); - - // If all are checked, replace them with a hidden input with value "all" - if (allChecked) { - checkboxes.forEach(checkbox => checkbox.removeAttribute("name")); - let hiddenInput = document.createElement("input"); - hiddenInput.type = "hidden"; - hiddenInput.name = section; - hiddenInput.value = "all"; - document.getElementById("filterForm").appendChild(hiddenInput); - } else { - checkboxes.forEach(checkbox => checkbox.setAttribute("name", section)); - document.querySelectorAll(`input[name="${section}"][type="hidden"]`).forEach(hiddenInput => hiddenInput.remove()); - } + // Function to update the form parameters for all sections before submitting + function updateFormParameters() { + // Get all distinct sections by selecting all checkboxes and extracting their "name" attributes + const sections = new Set([...document.querySelectorAll("input[type='checkbox']")].map(cb => cb.name)); - // Submit form after changes + sections.forEach(section => { + if (!section) return; // Skip any checkboxes without a name + + const checkboxes = document.querySelectorAll(`[name='${section}']`); + const allChecked = Array.from(checkboxes).every(checkbox => checkbox.checked); + + // If all checkboxes in a section are checked, remove them and add a hidden input + if (allChecked) { + checkboxes.forEach(checkbox => checkbox.removeAttribute("name")); + let hiddenInput = document.createElement("input"); + hiddenInput.type = "hidden"; + hiddenInput.name = section; + hiddenInput.value = "all"; + document.getElementById("filterForm").appendChild(hiddenInput); + } else { + checkboxes.forEach(checkbox => checkbox.setAttribute("name", section)); + document.querySelectorAll(`input[name="${section}"][type="hidden"]`).forEach(hiddenInput => hiddenInput.remove()); + } + }); + + // Submit the form after updating all sections document.getElementById("filterForm").submit(); } @@ -537,7 +548,7 @@ input[type="checkbox"] { const checkboxes = document.querySelectorAll(`[name='${section}']`); const allChecked = Array.from(checkboxes).every(checkbox => checkbox.checked); checkboxes.forEach(cb => cb.checked = !allChecked); - updateFormParameter(section); + updateFormParameters(); } // Attach event listeners to "Toggle All" buttons @@ -552,14 +563,14 @@ input[type="checkbox"] { // Automatically submit the form when any checkbox changes document.querySelectorAll('input[type="checkbox"]').forEach(function(checkbox) { checkbox.addEventListener('change', function() { - updateFormParameter(this.name); + updateFormParameters(); }); }); document.getElementById('perPageSelect').addEventListener('change', function() { - document.getElementById('filterForm').submit(); + updateFormParameters(); }); 
document.getElementById('timeFilterSelect').addEventListener('change', function() { - document.getElementById('filterForm').submit(); + updateFormParameters(); }); diff --git a/app_urls/api/templates/url_detail.html b/app_urls/fetcher/templates/url_detail.html similarity index 99% rename from app_urls/api/templates/url_detail.html rename to app_urls/fetcher/templates/url_detail.html index d09e4b0..4a4377f 100644 --- a/app_urls/api/templates/url_detail.html +++ b/app_urls/fetcher/templates/url_detail.html @@ -167,13 +167,14 @@ +
diff --git a/app_urls/api/templatetags/__init__.py b/app_urls/fetcher/templatetags/__init__.py similarity index 100% rename from app_urls/api/templatetags/__init__.py rename to app_urls/fetcher/templatetags/__init__.py diff --git a/app_urls/api/templatetags/custom_filters.py b/app_urls/fetcher/templatetags/custom_filters.py similarity index 100% rename from app_urls/api/templatetags/custom_filters.py rename to app_urls/fetcher/templatetags/custom_filters.py diff --git a/app_urls/api/tests.py b/app_urls/fetcher/tests.py similarity index 100% rename from app_urls/api/tests.py rename to app_urls/fetcher/tests.py diff --git a/app_urls/api/urls.py b/app_urls/fetcher/urls.py similarity index 93% rename from app_urls/api/urls.py rename to app_urls/fetcher/urls.py index e45911d..8aa2720 100644 --- a/app_urls/api/urls.py +++ b/app_urls/fetcher/urls.py @@ -8,7 +8,7 @@ urlpatterns = [ # path('task/', views.trigger_task, name='trigger_task'), # - path('charts/', views.charts, name='charts'), + path('urls/charts/', views.charts, name='charts'), path('urls-by-fetch-date/', views.urls_by_fetch_date, name='urls_by_fetch_date'), path('urls-per-status/', views.urls_per_status, name='urls_per_status'), path('urls-per-source/', views.urls_per_source, name='urls_per_source'), diff --git a/app_urls/api/views.py b/app_urls/fetcher/views.py similarity index 91% rename from app_urls/api/views.py rename to app_urls/fetcher/views.py index f8c3c86..e94fe6c 100644 --- a/app_urls/api/views.py +++ b/app_urls/fetcher/views.py @@ -2,6 +2,7 @@ from .tasks import background_task from django.core.paginator import Paginator from django.shortcuts import render, get_object_or_404 from django.http import StreamingHttpResponse, JsonResponse, HttpResponse +from django.contrib.auth.decorators import login_required import ollama from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch import os @@ -29,17 +30,18 @@ def link_list(request): # URLs "http://localhost:8000/urls", # Charts - "http://localhost:8000/charts", - # API tasks + "http://localhost:8000/urls/charts", + # Fetcher tasks ] + [os.path.join(prefix, l) for l in links] # Json return JsonResponse({"links": list_links }) #################################################################################################### +# @login_required(login_url='/admin') def logs(request, log_type): # Capture output: python manage.py rqstats try: - with open(os.getenv("PATH_LOGS_DEBUG", "logs/log_app_fetcher_{}.log".format(log_type)), "r") as f: + with open( os.path.join( os.getenv("PATH_LOGS_DIRECTORY", "logs"), "{}.log".format(log_type) ), "r") as f: file_content = f.read() except Exception as e: file_content = "Error reading logs for log type :{}".format(log_type) @@ -130,8 +132,9 @@ def charts(request): return render(request, 'charts.html') def urls_by_fetch_date(request): - # Get the date for 30 days ago - start_date = timezone.now() - timedelta(days=30) + # Get the filtering date parameter + days = float(request.GET.get('days', 30)) # Default is 30 days + start_date = timezone.now() - timedelta(days=days) # Count the number of URLs grouped by fetch date urls_data = Urls.objects.filter(ts_fetch__gte=start_date) \ @@ -141,8 +144,8 @@ def urls_by_fetch_date(request): # Format data to return as JSON data = { - 'dates': [item['ts_fetch__date'] for item in urls_data], - 'counts': [item['count'] for item in urls_data], + 'labels': [item['ts_fetch__date'] for item in urls_data], + 'values': [item['count'] for item in urls_data], } return JsonResponse(data) @@ -160,38 +163,48 
diff --git a/app_urls/requirements.txt b/app_urls/requirements.txt
new file mode 100644
index 0000000..04c2c38
--- /dev/null
+++ b/app_urls/requirements.txt
@@ -0,0 +1,17 @@
+django==5.1
+psycopg[binary]
+django-redis
+django-tasks-scheduler
+gunicorn
+whitenoise
+feedparser
+python-dateutil
+newspaper4k[all]
+lxml[html_clean]
+googlenewsdecoder
+gnews
+GoogleNews
+duckduckgo_search
+git+https://github.com/tasos-py/Search-Engines-Scraper.git
+langdetect
+ollama
\ No newline at end of file
diff --git a/app_urls/scheduled_tasks.json b/app_urls/scheduled_tasks.json
index 0451899..8de7cad 100644
--- a/app_urls/scheduled_tasks.json
+++ b/app_urls/scheduled_tasks.json
@@ -2,10 +2,10 @@
     {
         "model": "RepeatableTaskType",
         "name": "Process error URLs",
-        "callable": "api.tasks.process_error_urls",
+        "callable": "fetcher.tasks.process_error_urls",
        "callable_args": [],
         "callable_kwargs": [],
-        "enabled": true,
+        "enabled": false,
         "queue": "low",
         "repeat": null,
         "at_front": false,
@@ -15,18 +15,39 @@
         "scheduled_time": "2025-04-01T12:36:21+00:00",
         "interval": 4,
         "interval_unit": "hours",
-        "successful_runs": 15,
+        "successful_runs": 0,
         "failed_runs": 0,
-        "last_successful_run": "2025-04-01 08:37:06.722770+00:00",
+        "last_successful_run": null,
+        "last_failed_run": null
+    },
+    {
+        "model": "RepeatableTaskType",
+        "name": "Process raw URLs",
+        "callable": "fetcher.tasks.process_raw_urls",
+        "callable_args": [],
+        "callable_kwargs": [],
+        "enabled": false,
+        "queue": "low",
+        "repeat": null,
+        "at_front": false,
+        "timeout": null,
+        "result_ttl": 86400,
+        "cron_string": null,
+        "scheduled_time": "2025-04-01T10:20:08+00:00",
+        "interval": 10,
+        "interval_unit": "minutes",
+        "successful_runs": 0,
+        "failed_runs": 0,
+        "last_successful_run": null,
         "last_failed_run": null
     },
     {
         "model": "RepeatableTaskType",
URLs", - "callable": "api.tasks.process_missing_kids_urls", + "callable": "fetcher.tasks.process_missing_kids_urls", "callable_args": [], "callable_kwargs": [], - "enabled": true, + "enabled": false, "queue": "default", "repeat": null, "at_front": false, @@ -34,20 +55,20 @@ "result_ttl": 86400, "cron_string": null, "scheduled_time": "2025-04-01T10:37:50+00:00", - "interval": 2, + "interval": 4, "interval_unit": "hours", - "successful_runs": 29, + "successful_runs": 0, "failed_runs": 0, - "last_successful_run": "2025-04-01 08:42:05.864064+00:00", + "last_successful_run": null, "last_failed_run": null }, { "model": "RepeatableTaskType", "name": "Process MissingKids URLs ALL", - "callable": "api.tasks.process_missing_kids_urls_all", + "callable": "fetcher.tasks.process_missing_kids_urls_all", "callable_args": [], "callable_kwargs": [], - "enabled": true, + "enabled": false, "queue": "default", "repeat": null, "at_front": false, @@ -65,10 +86,10 @@ { "model": "RepeatableTaskType", "name": "Fetch Feeds", - "callable": "api.tasks.fetch_feeds", + "callable": "fetcher.tasks.fetch_feeds", "callable_args": [], "callable_kwargs": [], - "enabled": true, + "enabled": false, "queue": "default", "repeat": null, "at_front": false, @@ -78,39 +99,18 @@ "scheduled_time": "2025-04-01T10:18:56+00:00", "interval": 15, "interval_unit": "minutes", - "successful_runs": 288, + "successful_runs": 0, "failed_runs": 0, - "last_successful_run": "2025-04-01 10:03:58.363856+00:00", - "last_failed_run": null - }, - { - "model": "RepeatableTaskType", - "name": "Process raw URLs", - "callable": "api.tasks.process_raw_urls", - "callable_args": [], - "callable_kwargs": [], - "enabled": true, - "queue": "low", - "repeat": null, - "at_front": false, - "timeout": null, - "result_ttl": 86400, - "cron_string": null, - "scheduled_time": "2025-04-01T10:20:08+00:00", - "interval": 15, - "interval_unit": "minutes", - "successful_runs": 78, - "failed_runs": 0, - "last_successful_run": "2025-04-01 10:05:08.394472+00:00", + "last_successful_run": null, "last_failed_run": null }, { "model": "RepeatableTaskType", "name": "Fetch Parser", - "callable": "api.tasks.fetch_parser", + "callable": "fetcher.tasks.fetch_parser", "callable_args": [], "callable_kwargs": [], - "enabled": true, + "enabled": false, "queue": "default", "repeat": null, "at_front": false, @@ -120,18 +120,18 @@ "scheduled_time": "2025-04-01T10:25:42+00:00", "interval": 1, "interval_unit": "hours", - "successful_runs": 62, + "successful_runs": 0, "failed_runs": 0, - "last_successful_run": "2025-04-01 09:25:57.977051+00:00", + "last_successful_run": null, "last_failed_run": null }, { "model": "RepeatableTaskType", "name": "Fetch Search", - "callable": "api.tasks.fetch_search", + "callable": "fetcher.tasks.fetch_search", "callable_args": [], "callable_kwargs": [], - "enabled": true, + "enabled": false, "queue": "default", "repeat": null, "at_front": false, @@ -141,9 +141,51 @@ "scheduled_time": "2025-04-01T10:29:33+00:00", "interval": 1, "interval_unit": "hours", - "successful_runs": 63, + "successful_runs": 0, "failed_runs": 0, - "last_successful_run": "2025-04-01 09:37:20.671072+00:00", + "last_successful_run": null, + "last_failed_run": null + }, + { + "model": "RepeatableTaskType", + "name": "Fetch MissingKids", + "callable": "fetcher.tasks.fetch_missing_kids", + "callable_args": [], + "callable_kwargs": [], + "enabled": false, + "queue": "default", + "repeat": null, + "at_front": false, + "timeout": null, + "result_ttl": 86400, + "cron_string": null, + "scheduled_time": 
"2025-04-01T10:29:33+00:00", + "interval": 4, + "interval_unit": "hours", + "successful_runs": 0, + "failed_runs": 0, + "last_successful_run": null, + "last_failed_run": null + }, + { + "model": "RepeatableTaskType", + "name": "Fetch MissingKids ALL", + "callable": "fetcher.tasks.fetch_missing_kids_all", + "callable_args": [], + "callable_kwargs": [], + "enabled": false, + "queue": "default", + "repeat": null, + "at_front": false, + "timeout": null, + "result_ttl": 86400, + "cron_string": null, + "scheduled_time": "2025-04-01T10:29:33+00:00", + "interval": 1, + "interval_unit": "weeks", + "successful_runs": 0, + "failed_runs": 0, + "last_successful_run": null, "last_failed_run": null } ] diff --git a/docker-compose.yml b/docker-compose.yml index 252c4c7..0c0ce9e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -2,101 +2,106 @@ version: '3.9' services: - fetcher_selenium: + fetcher_app_selenium: + image: fetcher_app_selenium build: context: ./app_selenium - container_name: selenium_app - restart: unless-stopped + container_name: fetcher_app_selenium + # restart: unless-stopped shm_size: 512mb environment: - - SELENIUM_SLEEP_PER_PAGE=4 - - PATH_LOGS_PARAMETERIZATION="logs/log_app_selenium_{}.log" + - SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE:-4} + - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY:-logs} ports: - 80 + dns: + - 1.1.1.1 + - 1.0.0.1 + deploy: + resources: + limits: + cpus: '4' + memory: 4G - fetcher_urls_app: + fetcher_app_urls: + image: fetcher_app_urls build: context: ./app_urls - container_name: urls_app - restart: unless-stopped + container_name: fetcher_app_urls + # restart: unless-stopped environment: - #- name=value + # Initialization + - INITIALIZE_DB=${INITIALIZE_DB:-true} + - DJANGO_SUPERUSER_USERNAME=${DJANGO_SUPERUSER_USERNAME:-matitos} + - DJANGO_SUPERUSER_PASSWORD=${DJANGO_SUPERUSER_PASSWORD:-matitos} + - DJANGO_SUPERUSER_EMAIL=${DJANGO_SUPERUSER_EMAIL:-matitos@matitos.org} + # Django + - DJANGO_SECRET_KEY=${DJANGO_SECRET_KEY:-abc123456789qwerty} + - DJANGO_DEBUG=${DJANGO_DEBUG:-False} + - DJANGO_ALLOWED_HOSTS=${DJANGO_ALLOWED_HOSTS:-*} # host1,host2 # Database - DB_NAME=${DB_NAME:-matitos} - - DB_USER=${DB_NAME:-supermatitos} - - DB_PASSWORD=${DB_NAME:-supermatitos} - - DB_HOST=${DB_NAME:-localhost} # db_postgres - - DB_PORT=${DB_NAME:-5432} - - REDIS_HOST=${REDIS_HOST:-localhost} + - DB_USER=${DB_USER:-supermatitos} + - DB_PASSWORD=${DB_PASSWORD:-supermatitos} + - DB_HOST=${DB_HOST:-fetcher_db} + - DB_PORT=${DB_PORT:-5432} + - REDIS_HOST=${REDIS_HOST:-fetcher_redis} - REDIS_PORT=${REDIS_PORT:-6379} # Job timeout: 30 min - JOB_DEFAULT_TIMEOUT=${RQ_DEFAULT_TIMEOUT:-1800} # Logs path - - PATH_LOGS_PARAMETERIZATION="logs/log_app_fetcher_{}.log" + - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY:-logs} # Fetcher - - FETCHER_GNEWS_DECODE_SLEEP=2 - - FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=4 - - FETCHER_BETWEEN_SEARCHES_SLEEP=5 - - FETCHER_URL_HOST_SLEEP=5 + - FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP-2} + - FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP:-5} + - FETCHER_BETWEEN_SEARCHES_SLEEP=${FETCHER_BETWEEN_SEARCHES_SLEEP:-1} + - FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP:-2} # Selenium - - SELENIUM_ENDPOINT="http://selenium_app:80" + - SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT:-http://fetcher_app_selenium:80} + - ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA:-https://ollamamodel.matitos.org} ports: - - 80 + - 8000:8000 + depends_on: + - fetcher_db + - fetcher_redis + dns: + - 1.1.1.1 + - 1.0.0.1 + deploy: + resources: + 
+        limits:
+          cpus: '4'
+          memory: 4G
 
   fetcher_db:
     image: postgres:17
-    container_name: db_postgres
+    container_name: fetcher_db
     restart: unless-stopped
     # Set shared memory limit when using docker-compose
     shm_size: 128mb
     environment:
+      POSTGRES_DB: ${DB_NAME:-matitos}
       POSTGRES_PASSWORD: ${DB_PASSWORD:-supermatitos}
-      POSTGRES_USER: ${DB_USERNAME:-supermatitos}
-      POSTGRES_DB: ${DB_DATABASE_NAME:-matitos}
+      POSTGRES_USER: ${DB_USER:-supermatitos}
       POSTGRES_INITDB_ARGS: '--data-checksums'
-    #volumes:
-    #  - ${PATH_BASE:-.}/postgres:/var/lib/postgresql/data
+    #volumes: # Persistent DB?
+    #  - ${PATH_DB_DATA:-.}/postgres:/var/lib/postgresql/data
     ports:
-      - 5432:5432
+      - 5432 #:5432
 
   fetcher_redis:
     image: redis:alpine
-    container_name: db_redis
+    container_name: fetcher_redis
     restart: unless-stopped
     ports:
-      - 6379:6379
-    #expose:
-    #  - 6379
-
-  fetcher_adminer:
-    # http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public
-    image: adminer
-    container_name: adminer
-    restart: unless-stopped
-    environment:
-      - ADMINER_DEFAULT_DB_DRIVER=pgsql
-      #- ADMINER_DEFAULT_DB_HOST
-      #- ADMINER_DEFAULT_DB_NAME
-    depends_on:
-      - matitos_db
-    ports:
-      - 8080:8080
+      - 6379 #:6379
 
   fetcher_dozzle:
-    container_name: dozzle
+    container_name: fetcher_dozzle
     image: amir20/dozzle:latest
    volumes:
       - /var/run/docker.sock:/var/run/docker.sock:ro
     ports:
       - 8888:8080
     environment:
-      - DOZZLE_FILTER="name=matitos_" # Need container name matitos_ ?
-
-
-# django:
-# Env: DB_HOST=matitos_db
-# DJANGO_DB_NAME=${DB_DATABASE_NAME:-matitos}
-# DJANGO_DB_USER=${DB_USERNAME:-supermatitos}
-# DJANGO_DB_PASSWORD=${DB_PASSWORD:-supermatitos}
-# DJANGO_DB_HOST=${DB_HOST:-localhost}
-# DJANGO_DB_PORT=${DB_PORT:-5432}
+      - DOZZLE_FILTER="name=fetcher_"
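For reference, a minimal sketch of how the Django side could pick up the container environment defined above. The variable names and fallbacks mirror docker-compose.yml and the os.getenv("PATH_LOGS_DIRECTORY", ...) call in the logs() view, but the project's actual settings.py is not part of this diff, so treat this as an assumption rather than the shipped configuration:

import os

# Database/Redis wiring, defaulting to the compose service names (fetcher_db, fetcher_redis)
DATABASES = {
    "default": {
        "ENGINE": "django.db.backends.postgresql",
        "NAME": os.getenv("DB_NAME", "matitos"),
        "USER": os.getenv("DB_USER", "supermatitos"),
        "PASSWORD": os.getenv("DB_PASSWORD", "supermatitos"),
        "HOST": os.getenv("DB_HOST", "fetcher_db"),
        "PORT": os.getenv("DB_PORT", "5432"),
    }
}
REDIS_HOST = os.getenv("REDIS_HOST", "fetcher_redis")
REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))

# Paths and endpoints referenced elsewhere in this diff
PATH_LOGS_DIRECTORY = os.getenv("PATH_LOGS_DIRECTORY", "logs")
SELENIUM_ENDPOINT = os.getenv("SELENIUM_ENDPOINT", "http://fetcher_app_selenium:80")
ENDPOINT_OLLAMA = os.getenv("ENDPOINT_OLLAMA", "https://ollamamodel.matitos.org")

Keeping the defaults identical on both sides means the stack comes up with docker compose alone, while any value can still be overridden per environment without editing the compose file.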