Dockerization, WhiteNoise static file serving, refactor

Luciano Gervasoni
2025-04-04 10:53:16 +02:00
parent 5addfa5ba9
commit 4dbe2e55ef
39 changed files with 708 additions and 1238 deletions

1
.gitignore vendored

@@ -2,3 +2,4 @@ __pycache__/
*.pyc
**/credentials.py
logs/
postgres/


@@ -1,363 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# !pip install git+https://github.com/tasos-py/Search-Engines-Scraper.git\n",
"import search_engines\n",
"\n",
"engine = search_engines.Bing()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"results = engine.search('news: \"child abuse\"', pages=2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"engine = search_engines.search_engines_dict[\"brave\"]()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"query = 'news: child abuse'\n",
"r = engine.search(query, pages=2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"r.__dict__"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import newspaper\n",
"newspaper.ArticleBinaryDataException"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"'''\n",
"import newspaper\n",
"\n",
"url = 'https://www.missingkids.org/poster/USVA/VA25-0820/1'\n",
"art_1 = newspaper.article(url)\n",
"url = 'https://www.missingkids.org/poster/NCMC/2045193/1'\n",
"art_2 = newspaper.article(url)\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import ollama\n",
"\n",
"#model = \"llama3.2:1b\"\n",
"client = ollama.Client(\n",
" host = 'https://ollamamodel.matitos.org',\n",
")\n",
"l = client.list()\n",
"list_models = [m.get(\"model\") for m in l.model_dump().get(\"models\")]\n",
"\n",
"print(list_models)\n",
"\n",
"for m in list_models:\n",
" context_key = [ k for k in client.show(m).model_dump().get(\"modelinfo\").keys() if \"context_length\" in k]\n",
" if (len(context_key) != 1):\n",
" print(\"Problem!!!\")\n",
" print(m, client.show(m).model_dump().get(\"modelinfo\").get(context_key[0]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"text = \"...\"\n",
"model = \"falcon3:1b\"\n",
"\n",
"msg_content = {\n",
" \"role\": \"user\", \n",
" \"content\": text,\n",
"}\n",
"response = client.chat(model=model, messages=[msg_content], stream=False)\n",
"print(response[\"message\"][\"content\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import cv2\n",
"import base64\n",
"import numpy as np\n",
"\n",
"endpoint = \"http://192.168.2.64:12343/image\"\n",
"\n",
"\n",
"\n",
"prompt = \"Majestic mountain landscape with snow-capped peaks, autumn foliage in vibrant reds and oranges, a turquoise river winding through a valley, crisp and serene atmosphere, ultra-realistic style.\"\n",
"prompt = \"A group of kids happily playing in a joy environment\"\n",
"#prompt = \"A bitcoin behaving like a king, surrounded by small alternative coins. Detailed, geometric style\"\n",
"\n",
"json = {\n",
" \"prompt\": prompt,\n",
" \"num_inference_steps\": 10,\n",
" \"size\": \"512x512\",\n",
" \"seed\": 123456,\n",
"}\n",
"\n",
"for inf_step in [1, 4, 10, 20, 25, 30, 35, 40, 45, 50, 60, 70, 80, 90, 100]:\n",
" json[\"num_inference_steps\"] = inf_step\n",
"\n",
" %time r = requests.post(endpoint, json=json)\n",
" print(\"Status code\", r.status_code)\n",
"\n",
" # Image\n",
" png_as_np = np.frombuffer(base64.b64decode(r.text), dtype=np.uint8)\n",
" image_bgr = cv2.imdecode(png_as_np, cv2.IMREAD_COLOR)\n",
"\n",
" cv2.imwrite(\"sample_img_{}.png\".format(json[\"num_inference_steps\"]), image_bgr)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# !pip install trafilatura\n",
"import trafilatura\n",
"from pprint import pprint\n",
"\n",
"url = \"https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n",
"# url = \"https://www.missingkids.org/poster/USVA/VA25-0820/1\"\n",
"url = \"https://www.bloomberg.com/news/articles/2025-03-12/eu-launches-metals-tariff-retaliation-on-26-billion-of-us-goods\"\n",
"\n",
"# Fetch\n",
"doc = trafilatura.fetch_url(url)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Content & metadata\n",
"metadata = trafilatura.extract_metadata(doc)\n",
"content = trafilatura.extract(doc)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pprint(metadata.as_dict())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(content)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# !pip install newspaper4k\n",
"# !pip install langdetect \n",
"import newspaper\n",
"import langdetect\n",
"langdetect.DetectorFactory.seed = 0\n",
"\n",
"\n",
"\n",
"# url = \"https://www.missingkids.org/poster/USVA/VA25-0820/1\"\n",
"#url = \"https://www.waff.com/2025/03/11/colbert-heights-high-school-employee-arrested-child-abuse/\"\n",
"\n",
"\n",
"\n",
"#url = \"https://www.bloomberg.com/news/articles/2025-03-12/eu-launches-metals-tariff-retaliation-on-26-billion-of-us-goods\"\n",
"\n",
"\n",
"url = \"https://apnews.com/article/canada-trump-us-tariffs-steel-2517a6a2baf0596cb1a43d3a7d1e7939\"\n",
"url = \"https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n",
"#url = \"https://www.ft.com/content/6d7c6915-4ceb-43fc-9896-590036b12a87\"\n",
"#url = \"https://www.lanacion.com.ar/politica/milei-en-bahia-blanca-un-viaje-sorpresa-para-frenar-las-criticas-y-mostrar-cercania-nid12032025/\"\n",
"#url = \"https://www.missingkids.org/poster/NCMC/2043547/1\"\n",
"\n",
"try:\n",
" article = newspaper.article(url)\n",
"except newspaper.ArticleException as e:\n",
" print(\"ArticleException: {}\".format(str(e)))\n",
"except Exception as e:\n",
" print(\"Err: {}\".format(str(e)))\n",
"\n",
"# url_photo = set([i for i in article.images if \"api.missingkids.org/photographs\" in i])\n",
"# article.is_valid_url(), article.is_parsed, article.is_media_news(), article.is_valid_body()\n",
"article.meta_data\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# !pip install news-please\n",
"from newsplease import NewsPlease\n",
"\n",
"url = \"https://variety.com/2025/film/news/gene-hackman-death-suspicious-gas-leak-search-warrant-1236322610/\"\n",
"url = \"https://www.bbc.com/news/articles/cewkkkvkzn9o\"\n",
"url = \"https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n",
"article = NewsPlease.from_url(url)\n",
"print(article.title)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(article.maintext)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "matitos",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@@ -5,6 +5,14 @@
- Fetch parsing URL host
- Fetch from RSS feed
- Fetch searching (Google search & news, DuckDuckGo, ...)
+ Sources -> Robustness to TooManyRequests block (see the sketch below this list)
- Selenium based
- Sites change their logic, request captcha, ...
- Brave Search API
- Free up to X requests per day. Need credit card association (no charges)
- Bing API
- Subscription required
- Yandex. No API?
- Process URLs -> Updates raw URLs
- Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date
- Determines if it is a valid article content
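A minimal sketch of the kind of backoff the "Robustness to TooManyRequests" item points at, wrapping a generic search call; `fetch_with_backoff` and `search_fn` are illustrative names, not part of this repository:

```
import time

def fetch_with_backoff(search_fn, query, max_retries=3, base_delay=5):
    # Retry a search call with exponential backoff when the engine starts blocking
    for attempt in range(max_retries):
        try:
            return search_fn(query)
        except Exception as e:  # e.g. an HTTP 429 / TooManyRequests response
            delay = base_delay * (2 ** attempt)
            print("Search blocked ({}), retrying in {}s".format(e, delay))
            time.sleep(delay)
    return []  # give up and return no URLs after max_retries attempts
```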


@@ -2,30 +2,29 @@ import logging
import os
# Get env var
path_logs_parameterization = os.getenv("PATH_LOGS_PARAMETERIZATION", "logs/log_app_selenium_{}.log")
logs_directory = os.getenv("PATH_LOGS_DIRECTORY", "logs")
# Directory of logs
directory = '/'.join(path_logs_parameterization.split("/")[:-1])
os.makedirs(directory, exist_ok=True)
os.makedirs(logs_directory, exist_ok=True)
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
logger = logging.getLogger("selenium")
logger.setLevel(logging.DEBUG)
# To file log: INFO / WARNING / ERROR / CRITICAL
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("debug"), mode="a", maxBytes=10000000, backupCount=1)
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "debug.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)
# To file log: INFO / WARNING / ERROR
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("info"), mode="a", maxBytes=10000000, backupCount=1)
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "info.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.INFO)
logger.addHandler(fh)
# To file log: WARNING / ERROR / CRITICAL
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("warning"), mode="a", maxBytes=10000000, backupCount=1)
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "warning.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.WARNING)
logger.addHandler(fh)


@@ -28,7 +28,7 @@ class MissingKidsFetcher():
logger.debug("Processing page: {}...".format(i))
try:
time.sleep(os.getenv("SELENIUM_SLEEP_PER_PAGE", 4)); #driver.implicitly_wait(3)
time.sleep(int(os.getenv("SELENIUM_SLEEP_PER_PAGE", 4))) #driver.implicitly_wait(3)
# Fetch poster URLs
for element_type in ["a"]: # ["a", "p", "div"]:
for elem in driver.find_elements(By.TAG_NAME, element_type):
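The `int()` wrapper added above is needed because environment variables are always read as strings; a small illustration (the variable name matches the line above):

```
import os, time

os.environ["SELENIUM_SLEEP_PER_PAGE"] = "4"       # env vars are always strings
value = os.getenv("SELENIUM_SLEEP_PER_PAGE", 4)   # -> "4" (str), not 4 (int)
time.sleep(int(value))                            # time.sleep("4") would raise TypeError
```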


@@ -1,341 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# !pip install psycopg[binary]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"db_postgres\n",
"db_redis\n",
"\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 2/0\n",
" ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 2/4\n",
" ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 4/4\u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_redis \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h"
]
}
],
"source": [
"!docker rm -f db_postgres db_redis; docker compose -f ../docker/docker-compose.yml up -d ; sleep 5\n",
"!rm logs/*"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"INSERT_TABLES = True\n",
"INSERT_SAMPLE_DATA = False\n",
"\n",
"import psycopg\n",
"connection_info = \"host={} port={} user={} password={} dbname={}\".format(\"localhost\", \"5432\", \"supermatitos\", \"supermatitos\", \"matitos\")\n",
"\n",
"from datetime import datetime, timezone\n",
"import re\n",
"from pprint import pprint"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"if INSERT_TABLES:\n",
" # Connect to an existing database\n",
" with psycopg.connect(connection_info) as conn:\n",
" # Open a cursor to perform database operations\n",
" with conn.cursor() as cur:\n",
" # Autocommit at end of transaction (Atomic insert of URLs and sources)\n",
" with conn.transaction() as tx:\n",
" # Create URLs table\n",
" c = cur.execute(\"\"\"\n",
" CREATE TYPE URL_STATUS AS ENUM ('raw', 'error', 'valid', 'unknown', 'invalid', 'duplicate');\n",
"\n",
" CREATE TABLE URLS (\n",
" id SERIAL PRIMARY KEY,\n",
" url TEXT NOT NULL UNIQUE,\n",
" ts_fetch TIMESTAMPTZ NOT NULL DEFAULT NOW(),\n",
" status URL_STATUS NOT NULL DEFAULT 'raw' -- ,\n",
" -- status_wendy WENDY_STATUS DEFAULT NULL,\n",
" -- ts_wendy TIMESTAMPTZ DEFAULT NULL\n",
" );\n",
" CREATE INDEX idx_urls_status ON urls(status);\n",
" CREATE INDEX idx_urls_ts_fetch ON urls(ts_fetch);\n",
"\n",
" CREATE TABLE URLS_DUPLICATE (\n",
" id_url_canonical INTEGER REFERENCES URLS(id),\n",
" id_url_duplicated INTEGER REFERENCES URLS(id),\n",
" PRIMARY KEY (id_url_canonical, id_url_duplicated)\n",
" );\n",
" \n",
" CREATE TYPE SEARCH_TYPE AS ENUM ('rss_feed', 'keyword_search', 'url_host');\n",
" CREATE TABLE SEARCH (\n",
" id SMALLSERIAL PRIMARY KEY,\n",
" search TEXT NOT NULL UNIQUE,\n",
" type SEARCH_TYPE NOT NULL\n",
" -- language_country CHAR(5), -- Language: ISO 639-1 Code. Country: ISO 3166 ALPHA-2. e.g.: en-us. Required for search\n",
" -- UNIQUE(search, language_country)\n",
" );\n",
" CREATE INDEX idx_search_type ON SEARCH(type);\n",
" \n",
" CREATE TABLE SOURCE (\n",
" id SMALLSERIAL PRIMARY KEY,\n",
" source TEXT NOT NULL UNIQUE\n",
" );\n",
" \n",
" -- CREATE TABLE SEARCH_LANGUAGE (\n",
" -- language CHAR(2) NOT NULL, -- ISO 639-1 Code, e.g. \"en\"\n",
" -- country CHAR(2) NOT NULL, -- ISO 3166 ALPHA-2, e.g. \"us\"\n",
" -- PRIMARY KEY (language, country)\n",
" -- );\n",
" \n",
" CREATE TABLE URLS_SOURCE_SEARCH (\n",
" id_url INTEGER REFERENCES URLS(id),\n",
" id_source SMALLINT REFERENCES SOURCE(id) ON UPDATE CASCADE ON DELETE RESTRICT,\n",
" id_search SMALLINT REFERENCES SEARCH(id) ON UPDATE CASCADE ON DELETE RESTRICT,\n",
" PRIMARY KEY(id_url, id_source, id_search)\n",
" );\n",
" CREATE INDEX idx_source ON URLS_SOURCE_SEARCH(id_source);\n",
" CREATE INDEX idx_search ON URLS_SOURCE_SEARCH(id_search);\n",
"\n",
" CREATE TABLE STATUS_PATTERN_MATCHING (\n",
" pattern TEXT PRIMARY KEY,\n",
" priority SMALLINT NOT NULL,\n",
" status URL_STATUS NOT NULL\n",
" );\n",
" \n",
" \n",
" CREATE TABLE URL_CONTENT (\n",
" id_url INTEGER PRIMARY KEY REFERENCES URLS(id),\n",
" date_published TIMESTAMPTZ DEFAULT NOW(),\n",
" title TEXT,\n",
" description TEXT,\n",
" content TEXT,\n",
" valid_content BOOLEAN,\n",
" language CHAR(2), -- ISO 639-1 Code\n",
" keywords TEXT[],\n",
" tags TEXT[],\n",
" authors TEXT[],\n",
" image_main_url TEXT,\n",
" images_url TEXT[],\n",
" videos_url TEXT[],\n",
" url_host TEXT, -- www.breitbart.com\n",
" site_name TEXT -- Breitbart News\n",
" );\n",
" CREATE INDEX idx_tags ON URL_CONTENT USING GIN(tags);\n",
" CREATE INDEX idx_authors ON URL_CONTENT USING GIN(authors);\n",
" CREATE INDEX idx_date_published ON URL_CONTENT (date_published);\n",
" CREATE INDEX idx_valid_content ON URL_CONTENT (valid_content);\n",
" CREATE INDEX idx_language ON URL_CONTENT (language);\n",
" CREATE INDEX idx_url_host ON URL_CONTENT (url_host);\n",
" \"\"\")\n",
"\n",
" ### Default insert values\n",
" \n",
" # Feeds\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC', 'rss_feed');\" )\n",
" # Websites of interest\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/poster', 'url_host');\" )\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/new-poster', 'url_host');\" )\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('breitbart.com', 'url_host');\" )\n",
" # Search keywords\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search');\" )\n",
" # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search', 'en-us');\" )\n",
" # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search', 'en-gb');\" )\n",
" \n",
" # Status update based on pattern matching (with priority to apply in order). Regex test https://regex101.com/\n",
" # cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 75, 'valid');\".format(\".*{}.*\".format(re.escape(\"missingkids.org/poster/\"))) )\n",
" cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"youtube.com/\"))) )\n",
" cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"tiktok.com/\"))) )\n",
" cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"twitter.com/\"))) )\n",
" cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"reddit.com/\"))) )\n",
" cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"libreddit.de/\"))) )\n",
" cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"radio.foxnews.com/\"))) )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\t urls\n",
"[]\n",
"\t urls_duplicate\n",
"[]\n",
"\t urls_source_search\n",
"[]\n",
"\t source\n",
"[]\n",
"\t search\n",
"[(1,\n",
" 'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC',\n",
" 'rss_feed'),\n",
" (2, 'missingkids.org/poster', 'url_host'),\n",
" (3, 'missingkids.org/new-poster', 'url_host'),\n",
" (4, 'breitbart.com', 'url_host'),\n",
" (5, 'child abuse', 'keyword_search')]\n",
"\t status_pattern_matching\n",
"[('.*youtube\\\\.com/.*', 50, 'invalid'),\n",
" ('.*tiktok\\\\.com/.*', 50, 'invalid'),\n",
" ('.*twitter\\\\.com/.*', 50, 'invalid'),\n",
" ('.*reddit\\\\.com/.*', 50, 'invalid'),\n",
" ('.*libreddit\\\\.de/.*', 50, 'invalid'),\n",
" ('.*radio\\\\.foxnews\\\\.com/.*', 50, 'invalid')]\n",
"\t url_content\n",
"[]\n"
]
}
],
"source": [
"# Connect to an existing database\n",
"with psycopg.connect(connection_info) as conn:\n",
" # Open a cursor to perform database operations\n",
" with conn.cursor() as cur:\n",
" # Get tables\n",
" cur.execute(\"SELECT table_name FROM information_schema.tables WHERE table_schema='public';\")\n",
" tables = [t[0] for t in cur.fetchall()]\n",
"\n",
" for t in tables:\n",
" print(\"\\t\", t)\n",
" pprint( cur.execute(\"SELECT * FROM {} LIMIT 50;\".format(t)).fetchall() )"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(1,\n",
" 'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC',\n",
" 'rss_feed'),\n",
" (2, 'missingkids.org/poster', 'url_host'),\n",
" (3, 'missingkids.org/new-poster', 'url_host'),\n",
" (4, 'breitbart.com', 'url_host'),\n",
" (5, 'child abuse', 'keyword_search')]\n"
]
}
],
"source": [
"# Connect to an existing database\n",
"with psycopg.connect(connection_info) as conn:\n",
" # Open a cursor to perform database operations\n",
" with conn.cursor() as cur:\n",
" pprint( cur.execute(\"SELECT * FROM SEARCH;\").fetchall() )"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[]\n"
]
}
],
"source": [
"# Connect to an existing database\n",
"with psycopg.connect(connection_info) as conn:\n",
" # Open a cursor to perform database operations\n",
" with conn.cursor() as cur:\n",
" pprint( cur.execute(\"SELECT * FROM URLS LIMIT 50;\").fetchall() )\n",
" #pprint( cur.execute(\"SELECT id_url, title, valid_content FROM URL_CONTENT LIMIT 10;\").fetchall() )"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'\\n!docker rm -f db_redis; docker compose -f ../docker/docker-compose.yml up -d\\n\\n# Connect to an existing database\\nwith psycopg.connect(connection_info) as conn:\\n # Open a cursor to perform database operations\\n with conn.cursor() as cur:\\n pprint( cur.execute(\"TRUNCATE URLS, URL_CONTENT, URLS_SOURCE_SEARCH, URLS_DUPLICATE;\") )\\n # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES (\\'missingkids.org\\', \\'url_host\\');\" )\\n'"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'''\n",
"!docker rm -f db_redis; docker compose -f ../docker/docker-compose.yml up -d\n",
"\n",
"# Connect to an existing database\n",
"with psycopg.connect(connection_info) as conn:\n",
" # Open a cursor to perform database operations\n",
" with conn.cursor() as cur:\n",
" pprint( cur.execute(\"TRUNCATE URLS, URL_CONTENT, URLS_SOURCE_SEARCH, URLS_DUPLICATE;\") )\n",
" # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('missingkids.org', 'url_host');\" )\n",
"'''"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "matitos",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

48
app_urls/Dockerfile Normal file

@@ -0,0 +1,48 @@
FROM python:3.12
# Prevents Python from writing pyc files to disk
ENV PYTHONDONTWRITEBYTECODE=1
# Prevents Python from buffering stdout and stderr
ENV PYTHONUNBUFFERED=1
# User
RUN useradd -m -r appuser && \
mkdir /opt/app && \
chown -R appuser /opt/app
WORKDIR /opt/app
# Copy the Django project and install dependencies
COPY requirements.txt /opt/app/
# run this command to install all dependencies
RUN pip install --no-cache-dir -r requirements.txt
COPY --chown=appuser:appuser . /opt/app/
RUN chmod -R 755 /opt/app
RUN chown -R appuser:appuser /opt/app
USER appuser
# Initialization script
RUN echo '#!/bin/bash' > /opt/app/initialize.sh && \
echo 'if [ "${INITIALIZE_DB}" = false ]; then' >> /opt/app/initialize.sh && \
echo 'echo "Initialization not required"' >> /opt/app/initialize.sh && \
echo 'else' >> /opt/app/initialize.sh && \
echo 'echo "Initializating database"' >> /opt/app/initialize.sh && \
echo 'sleep 5' >> /opt/app/initialize.sh && \
echo 'python db.py --initialize_tables --initialize_data' >> /opt/app/initialize.sh && \
echo 'python manage.py makemigrations fetcher; python manage.py migrate --fake-initial' >> /opt/app/initialize.sh && \
echo 'python manage.py createsuperuser --noinput' >> /opt/app/initialize.sh && \
echo 'python manage.py collectstatic --no-input' >> /opt/app/initialize.sh && \
echo 'python manage.py import --filename scheduled_tasks.json' >> /opt/app/initialize.sh && \
echo 'fi' >> /opt/app/initialize.sh && \
chmod +x /opt/app/initialize.sh
# Serving script
RUN echo '#!/bin/bash' > /opt/app/run.sh && \
echo 'gunicorn core.wsgi:application --bind 0.0.0.0:8000 & python manage.py rqworker high default low' >> /opt/app/run.sh && \
#echo 'python manage.py runserver & python manage.py rqworker high default low' >> /opt/app/run.sh && \
chmod +x /opt/app/run.sh
# Run Django's server & workers
CMD ["sh", "-c", "/opt/app/initialize.sh && /opt/app/run.sh"]


@@ -2,18 +2,9 @@
```
conda create -n matitos_urls python=3.12
conda activate matitos_urls
# Core
pip install django==5.1 psycopg[binary] django-redis django-tasks-scheduler
# Fetcher
pip install feedparser python-dateutil newspaper4k[all] lxml[html_clean] googlenewsdecoder gnews duckduckgo_search GoogleNews langdetect
# News visualization
pip install ollama
pip install -r requirements.txt
```
* Database
* Database initialization -> 1-DB.ipynb
* From automated inspectdb
```
# 1) Inspect DB, generate models.py
@@ -74,60 +65,19 @@ class Meta:
db_table = 'urls' # db_table = '{}_urls'.format(project_name)
```
* Database & initialization
* Check initialize.sh on Dockerfile
* Environment variables
```
# Database
DB_NAME=${DB_NAME:-matitos}
DB_USER=${DB_USER:-supermatitos}
DB_PASSWORD=${DB_PASSWORD:-supermatitos}
DB_HOST=${DB_HOST:-localhost}
DB_PORT=${DB_PORT:-5432}
REDIS_HOST=${REDIS_HOST:-localhost}
REDIS_PORT=${REDIS_PORT:-6379}
# Job timeout: 30 min
JOB_DEFAULT_TIMEOUT=${JOB_DEFAULT_TIMEOUT:-1800}
# Logs path
PATH_LOGS_PARAMETERIZATION="logs/log_app_fetcher_{}.log"
# Fetcher
FETCHER_GNEWS_DECODE_SLEEP=2
FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=4
FETCHER_BETWEEN_SEARCHES_SLEEP=5
FETCHER_URL_HOST_SLEEP=5
FETCHER_LANGUAGE_DETECTION_MIN_CHAR=100
SELENIUM_ENDPOINT="http://selenium_app:80"
```
* In docker-compose.yml
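At runtime these variables are read with `os.getenv` / `os.environ.get`; a rough sketch (defaults are illustrative, mirroring the block above where values are given):

```
import os

DB_HOST = os.environ.get("DB_HOST", "localhost")
DB_PORT = os.environ.get("DB_PORT", "5432")
REDIS_HOST = os.environ.get("REDIS_HOST", "localhost")
JOB_DEFAULT_TIMEOUT = os.environ.get("JOB_DEFAULT_TIMEOUT", 60 * 30)  # seconds (30 min default)
# Numeric sleep settings arrive as strings and are converted before use
FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP = int(os.getenv("FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP", 4))
SELENIUM_ENDPOINT = os.getenv("SELENIUM_ENDPOINT", "http://selenium_app:80")
```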
* Deploy
```
# Migrations
python manage.py makemigrations api; python manage.py migrate --fake-initial
# Create user
python manage.py createsuperuser
# Check environments variables on docker-compose.yml
# 1) Server
python manage.py runserver
# Remove previous instances
docker compose down -v
# 2) Workers
python manage.py rqworker high default low
# Visualize DB
http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id
# Build & up
docker compose up -d --build
```
* Scheduled tasks
```
# Import tasks
python manage.py import --filename scheduled_tasks.json
# Modify using the admin panel, then save
# python manage.py export > scheduled_tasks.json
```
* Utils. TODO: To endpoint...
```
python manage.py rqstats
```


@@ -1,295 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Charts</title>
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
<style>
body {
background-color: #333;
color: #fff;
font-family: Arial, sans-serif;
}
h2 {
color: #fff;
text-align: center;
margin-bottom: 40px;
}
.chart-container {
width: 45%;
display: inline-block;
margin: 20px;
background-color: #444;
border-radius: 10px;
padding: 5px;
}
canvas {
background-color: #2c2c2c;
border-radius: 5px;
}
.container {
display: flex;
justify-content: center;
flex-wrap: wrap;
}
.filter-container {
text-align: center;
margin-bottom: 20px;
}
select {
padding: 8px;
background-color: #555;
color: white;
border: 1px solid #444;
border-radius: 5px;
}
</style>
</head>
<body>
<h2>Data Visualizations</h2>
<!-- Filter for Number of Days -->
<div class="filter-container">
<label for="daysFilter">Select Number of Days:</label>
<select id="daysFilter">
<option value="0.25">Last 6 Hours</option>
<option value="1">Last 24 Hours</option>
<option value="3">Last 3 Days</option>
<option value="7" selected>Last 7 Days</option>
<option value="30">Last 30 Days</option>
<option value="90">Last 90 Days</option>
<option value="365">Last 365 Days</option>
</select>
</div>
<div class="container">
<div class="chart-container">
<canvas id="urlFetchDateChart"></canvas>
</div>
<div class="chart-container">
<canvas id="urlStatusChart"></canvas>
</div>
<div class="chart-container">
<canvas id="urlsPerSourceChart"></canvas>
</div>
<div class="chart-container">
<canvas id="urlsPerSearchChart"></canvas>
</div>
</div>
<script>
$(document).ready(function () {
// Fetch initial data (default 7 days)
const defaultDays = 7;
fetchDataAndRenderCharts(defaultDays);
// Apply the filter automatically when the user changes the selection
$('#daysFilter').change(function () {
const selectedDays = $(this).val();
fetchDataAndRenderCharts(selectedDays);
});
});
function fetchDataAndRenderCharts(days) {
// Fetch and render the URL Fetch Date chart
$.getJSON(`/urls-by-fetch-date/?days=${days}`, function (data) {
renderUrlFetchDateChart(data);
});
// Fetch and render the URL Status chart (with dynamic date filtering)
$.getJSON(`/urls-per-status/?days=${days}`, function (data) {
renderUrlStatusChart(data);
});
// Fetch and render the URLs per Source chart
$.getJSON(`/urls-per-source/?days=${days}`, function (data) {
renderUrlsPerSourceChart(data);
});
// Fetch and render the URLs per Search chart
$.getJSON(`/urls-per-search/?days=${days}`, function (data) {
renderUrlsPerSearchChart(data);
});
}
function renderUrlFetchDateChart(data) {
new Chart(document.getElementById("urlFetchDateChart"), {
type: 'bar',
data: {
labels: data.dates,
datasets: [{
label: 'URLs by Fetch Date',
data: data.counts,
backgroundColor: 'blue',
}]
},
options: {
responsive: true,
plugins: {
legend: {
labels: {
color: '#fff' // Change the legend text color to white
}
}
},
scales: {
x: {
ticks: {
color: "#fff" // Set x-axis ticks color
},
grid: {
color: "#444" // Set grid lines color
}
},
y: {
ticks: {
color: "#fff" // Set y-axis ticks color
},
grid: {
color: "#444" // Set grid lines color
}
}
}
}
});
}
function renderUrlStatusChart(data) {
new Chart(document.getElementById("urlStatusChart"), {
type: 'bar',
data: {
labels: data.statuses,
datasets: [{
label: 'URLs by Status',
data: data.counts,
backgroundColor: 'green',
}]
},
options: {
responsive: true,
plugins: {
legend: {
labels: {
color: '#fff' // Change the legend text color to white
}
}
},
scales: {
x: {
ticks: {
color: "#fff" // Set x-axis ticks color
},
grid: {
color: "#444" // Set grid lines color
}
},
y: {
ticks: {
color: "#fff" // Set y-axis ticks color
},
grid: {
color: "#444" // Set grid lines color
}
}
}
}
});
}
function renderUrlsPerSourceChart(data) {
new Chart(document.getElementById("urlsPerSourceChart"), {
type: 'bar',
data: {
labels: data.sources,
datasets: [{
label: 'URLs by Source',
data: data.counts,
backgroundColor: 'purple',
}]
},
options: {
responsive: true,
plugins: {
legend: {
labels: {
color: '#fff' // Change the legend text color to white
}
}
},
scales: {
x: {
ticks: {
color: "#fff" // Set x-axis ticks color
},
grid: {
color: "#444" // Set grid lines color
}
},
y: {
ticks: {
color: "#fff" // Set y-axis ticks color
},
grid: {
color: "#444" // Set grid lines color
}
}
}
}
});
}
function renderUrlsPerSearchChart(data) {
new Chart(document.getElementById("urlsPerSearchChart"), {
type: 'bar',
data: {
labels: data.searches,
datasets: [{
label: 'URLs by Search',
data: data.counts,
backgroundColor: 'orange',
}]
},
options: {
responsive: true,
plugins: {
legend: {
labels: {
color: '#fff' // Change the legend text color to white
}
}
},
scales: {
x: {
ticks: {
color: "#fff" // Set x-axis ticks color
},
grid: {
color: "#444" // Set grid lines color
}
},
y: {
ticks: {
color: "#fff" // Set y-axis ticks color
},
grid: {
color: "#444" // Set grid lines color
}
}
}
}
});
}
</script>
</body>
</html>


@@ -20,12 +20,13 @@ BASE_DIR = Path(__file__).resolve().parent.parent
# Quick-start development settings - unsuitable for production
# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = 'django-insecure-54mqLbW5NlO8OlVDsT3fcbg3Vf6C8Fgcoj8H0hXv3Pr8bpgqvOuiaeqvGn34sGwt'
SECRET_KEY = os.getenv("DJANGO_SECRET_KEY", 'django-insecure-54mqLbW5NlO8OlVDsT3fcbg3Vf6C8Fgcoj8H0hXv3Pr8bpgqvOuiaeqvGn34sGwt')
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True
DEBUG = (os.environ.get('DJANGO_DEBUG') == "True")
print("Django debug mode:", DEBUG)
ALLOWED_HOSTS = []
ALLOWED_HOSTS = os.environ.get('DJANGO_ALLOWED_HOSTS', "*").split(",")
# Application definition
@@ -38,11 +39,12 @@ INSTALLED_APPS = [
'django.contrib.messages',
'django.contrib.staticfiles',
'scheduler',
'api',
'fetcher',
]
MIDDLEWARE = [
'django.middleware.security.SecurityMiddleware',
'whitenoise.middleware.WhiteNoiseMiddleware', # Serving static files
'django.contrib.sessions.middleware.SessionMiddleware',
'django.middleware.common.CommonMiddleware',
'django.middleware.csrf.CsrfViewMiddleware',
@@ -51,6 +53,8 @@ MIDDLEWARE = [
'django.middleware.clickjacking.XFrameOptionsMiddleware',
]
STATICFILES_STORAGE = 'whitenoise.storage.CompressedManifestStaticFilesStorage'
ROOT_URLCONF = 'core.urls'
TEMPLATES = [
@@ -121,7 +125,7 @@ SCHEDULER_QUEUES = {
}
}
SCHEDULER_CONFIG = {
'DEFAULT_TIMEOUT': os.environ.get("JOB_DEFAULT_TIMEOUT", 60*30), # 15 minutes
'DEFAULT_TIMEOUT': os.environ.get("JOB_DEFAULT_TIMEOUT", 60*30), # 30 minutes
'DEFAULT_RESULT_TTL': 60*60*12, # 12 hours
'EXECUTIONS_IN_PAGE': 20,
'SCHEDULER_INTERVAL': 10, # 10 seconds
@@ -158,7 +162,8 @@ USE_TZ = True
# Static files (CSS, JavaScript, Images)
STATIC_URL = 'static/'
STATIC_URL = '/static/'
STATIC_ROOT = os.path.join(BASE_DIR, 'static')
# Default primary key field type
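Assembled from the hunks above, the WhiteNoise-related configuration amounts to roughly the following excerpt (a sketch, not the complete settings.py):

```
# core/settings.py (excerpt, reconstructed from the diff above)
import os
from pathlib import Path

BASE_DIR = Path(__file__).resolve().parent.parent

MIDDLEWARE = [
    'django.middleware.security.SecurityMiddleware',
    'whitenoise.middleware.WhiteNoiseMiddleware',  # serve static files from the WSGI process
    'django.contrib.sessions.middleware.SessionMiddleware',
    # ... remaining default middleware ...
]

# Compressed, hashed static files collected into STATIC_ROOT and served by WhiteNoise
STATICFILES_STORAGE = 'whitenoise.storage.CompressedManifestStaticFilesStorage'
STATIC_URL = '/static/'
STATIC_ROOT = os.path.join(BASE_DIR, 'static')
```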


@@ -20,5 +20,5 @@ from django.urls import path, include
urlpatterns = [
path('admin/', admin.site.urls),
path('scheduler/', include('scheduler.urls')),
path('', include('api.urls')),
path('', include('fetcher.urls')),
]

145
app_urls/db.py Normal file

@@ -0,0 +1,145 @@
import argparse
import os
import psycopg
import re
connection_info = "host={} port={} dbname={} user={} password={} connect_timeout=60".format(
os.environ.get("DB_HOST", "localhost"),
os.environ.get("DB_PORT", "5432"),
os.environ.get("DB_NAME", "matitos"),
os.environ.get("DB_USER", "supermatitos"),
os.environ.get("DB_PASSWORD", "supermatitos")
)
def initialize_tables():
# Connect to an existing database
with psycopg.connect(connection_info) as conn:
# Open a cursor to perform database operations
with conn.cursor() as cur:
# Autocommit at end of transaction (Atomic creation of tables)
with conn.transaction() as tx:
# Create URLs table
c = cur.execute("""
CREATE TYPE URL_STATUS AS ENUM ('raw', 'error', 'valid', 'unknown', 'invalid', 'duplicate');
CREATE TABLE URLS (
id SERIAL PRIMARY KEY,
url TEXT NOT NULL UNIQUE,
ts_fetch TIMESTAMPTZ NOT NULL DEFAULT NOW(),
status URL_STATUS NOT NULL DEFAULT 'raw' -- ,
-- status_wendy WENDY_STATUS DEFAULT NULL,
-- ts_wendy TIMESTAMPTZ DEFAULT NULL
);
CREATE INDEX idx_urls_status ON urls(status);
CREATE INDEX idx_urls_ts_fetch ON urls(ts_fetch);
CREATE TABLE URLS_DUPLICATE (
id_url_canonical INTEGER REFERENCES URLS(id),
id_url_duplicated INTEGER REFERENCES URLS(id),
PRIMARY KEY (id_url_canonical, id_url_duplicated)
);
CREATE TYPE SEARCH_TYPE AS ENUM ('rss_feed', 'keyword_search', 'url_host');
CREATE TABLE SEARCH (
id SMALLSERIAL PRIMARY KEY,
search TEXT NOT NULL UNIQUE,
type SEARCH_TYPE NOT NULL
-- language_country CHAR(5), -- Language: ISO 639-1 Code. Country: ISO 3166 ALPHA-2. e.g.: en-us. Required for search
-- UNIQUE(search, language_country)
);
CREATE INDEX idx_search_type ON SEARCH(type);
CREATE TABLE SOURCE (
id SMALLSERIAL PRIMARY KEY,
source TEXT NOT NULL UNIQUE
);
-- CREATE TABLE SEARCH_LANGUAGE (
-- language CHAR(2) NOT NULL, -- ISO 639-1 Code, e.g. "en"
-- country CHAR(2) NOT NULL, -- ISO 3166 ALPHA-2, e.g. "us"
-- PRIMARY KEY (language, country)
-- );
CREATE TABLE URLS_SOURCE_SEARCH (
id_url INTEGER REFERENCES URLS(id),
id_source SMALLINT REFERENCES SOURCE(id) ON UPDATE CASCADE ON DELETE RESTRICT,
id_search SMALLINT REFERENCES SEARCH(id) ON UPDATE CASCADE ON DELETE RESTRICT,
PRIMARY KEY(id_url, id_source, id_search)
);
CREATE INDEX idx_source ON URLS_SOURCE_SEARCH(id_source);
CREATE INDEX idx_search ON URLS_SOURCE_SEARCH(id_search);
CREATE TABLE STATUS_PATTERN_MATCHING (
pattern TEXT PRIMARY KEY,
priority SMALLINT NOT NULL,
status URL_STATUS NOT NULL
);
CREATE TABLE URL_CONTENT (
id_url INTEGER PRIMARY KEY REFERENCES URLS(id),
date_published TIMESTAMPTZ DEFAULT NOW(),
title TEXT,
description TEXT,
content TEXT,
valid_content BOOLEAN,
language CHAR(2), -- ISO 639-1 Code
keywords TEXT[],
tags TEXT[],
authors TEXT[],
image_main_url TEXT,
images_url TEXT[],
videos_url TEXT[],
url_host TEXT, -- www.breitbart.com
site_name TEXT -- Breitbart News
);
CREATE INDEX idx_tags ON URL_CONTENT USING GIN(tags);
CREATE INDEX idx_authors ON URL_CONTENT USING GIN(authors);
CREATE INDEX idx_date_published ON URL_CONTENT (date_published);
CREATE INDEX idx_valid_content ON URL_CONTENT (valid_content);
CREATE INDEX idx_language ON URL_CONTENT (language);
CREATE INDEX idx_url_host ON URL_CONTENT (url_host);
""")
def initialize_data():
# Connect to an existing database
with psycopg.connect(connection_info) as conn:
# Open a cursor to perform database operations
with conn.cursor() as cur:
# Autocommit at end of transaction (Atomic creation of data)
with conn.transaction() as tx:
# Feeds
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC', 'rss_feed');" )
# Websites of interest
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/poster', 'url_host');" )
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/new-poster', 'url_host');" )
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('breitbart.com', 'url_host');" )
# Search keywords
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search');" )
# TODO: Language per search
# cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search', 'en-us');" )
# cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search', 'en-gb');" )
# Status update based on pattern matching (with priority to apply in order). Regex test https://regex101.com/
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("youtube.com/"))) )
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("tiktok.com/"))) )
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("twitter.com/"))) )
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("reddit.com/"))) )
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("libreddit.de/"))) )
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("radio.foxnews.com/"))) )
def main(name):
print('Hello, %s!' % name)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Database initialization')
parser.add_argument('--initialize_tables', help='Create DB tables', action='store_true', default=False)
parser.add_argument('--initialize_data', help='Insert data', action='store_true', default=False)
args = parser.parse_args()
if (args.initialize_tables):
print("Initializing tables")
initialize_tables()
if (args.initialize_data):
print("Initializing data")
initialize_data()


@@ -1,6 +1,6 @@
from django.apps import AppConfig
class ApiConfig(AppConfig):
class FetcherConfig(AppConfig):
default_auto_field = 'django.db.models.BigAutoField'
name = 'api'
name = 'fetcher'


@@ -65,7 +65,7 @@ class Migration(migrations.Migration):
migrations.CreateModel(
name='UrlContent',
fields=[
('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='api.urls')),
('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='fetcher.urls')),
('date_published', models.DateTimeField(blank=True, null=True)),
('title', models.TextField(blank=True, null=True)),
('description', models.TextField(blank=True, null=True)),
@@ -89,7 +89,7 @@ class Migration(migrations.Migration):
migrations.CreateModel(
name='UrlsDuplicate',
fields=[
('id_url_canonical', models.OneToOneField(db_column='id_url_canonical', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='api.urls')),
('id_url_canonical', models.OneToOneField(db_column='id_url_canonical', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='fetcher.urls')),
],
options={
'db_table': 'urls_duplicate',
@@ -99,7 +99,7 @@ class Migration(migrations.Migration):
migrations.CreateModel(
name='UrlsSourceSearch',
fields=[
('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='api.urls')),
('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='fetcher.urls')),
],
options={
'db_table': 'urls_source_search',


@@ -1,6 +1,8 @@
import time
import feedparser
import os
from django.utils import timezone
from datetime import timedelta
from ..models import Search, Source
from .fetch_utils import decode_gnews_urls
from .logger import get_logger
@@ -9,6 +11,7 @@ logger = get_logger()
from gnews import GNews
from duckduckgo_search import DDGS
from GoogleNews import GoogleNews
from search_engines import Yahoo, Aol
###########################################################################
###########################################################################
@@ -42,11 +45,19 @@ class FetcherAbstract(ABC):
return raw_urls
def fetch_articles(self, db_writer, obj_search):
# Search
keyword_search = "{}{}".format("site:" if obj_search.type == Search.TYPE_ENUM.URL_HOST else "", obj_search.search)
# Source name
source_name = self._get_name()
# Search
keyword_search = obj_search.search
# URL Host search? -> site:${URL_HOST}
if (obj_search.type == Search.TYPE_ENUM.URL_HOST):
keyword_search = "{}{}".format("site:", keyword_search)
# Keyword search & using a General search? -> ${SEARCH} news after:${LAST_WEEK}
if ("general" in source_name) and (obj_search.type == Search.TYPE_ENUM.KEYWORD_SEARCH):
start_date = timezone.now() - timedelta(days=7)
keyword_search = "{}{}".format(keyword_search, "news after:{}-{}-{}".format(start_date.month, start_date.day, start_date.year))
logger.debug("Starting search: {} - {}".format(keyword_search, source_name))
# Fetch
raw_urls = self._fetch_raw_urls(keyword_search)
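For reference, the refactored query construction above follows these rules, shown here as a standalone sketch (function and argument names are illustrative, not the actual method):

```
from datetime import datetime, timedelta

def build_query(search, search_type, source_name):
    # URL-host searches are restricted with a site: prefix
    query = "site:{}".format(search) if search_type == "url_host" else search
    # Keyword searches on "general" engines get a one-week "news after:" window
    if "general" in source_name and search_type == "keyword_search":
        start_date = datetime.now() - timedelta(days=7)
        query = "{} news after:{}-{}-{}".format(query, start_date.month, start_date.day, start_date.year)
    return query
```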
@@ -165,11 +176,11 @@ class SearchGoogleGeneral(FetcherAbstract):
self.language = args.get("language", "en")
self.country = args.get("country", "US")
self.period = args.get("period", "7d")
self.max_pages = args.get("max_pages", 1)
self.pages = args.get("pages", 1)
def _get_name(self):
# [source] [period] [language-country] [pages]
return "google-general {} {}-{} pages={}".format(self.period, self.language, self.country, self.max_pages).replace("pages=None", "").strip()
return "google-general {} {}-{} pages={}".format(self.period, self.language, self.country, self.pages).replace("pages=None", "").strip()
def _fetch_raw_urls(self, keyword_search):
try:
@@ -181,7 +192,7 @@ class SearchGoogleGeneral(FetcherAbstract):
set_links = set()
# Iterate pages
for i in range(self.max_pages):
for i in range(self.pages):
# Sleep between pages fetch
time.sleep(int(os.getenv("FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP", 4)))
# Number of URLs fetched so far
@@ -253,7 +264,45 @@ class SearchGoogleNewsRSS(FetcherAbstract):
urls = []
return urls
class SearchYahooGeneral(FetcherAbstract):
def __init__(self, args={}):
super().__init__()
# Parameters
self.pages = args.get("pages", 2)
def _get_name(self):
# [source] [language-country] [pages]
return "yahoo-general pages={}".format(self.pages).replace("pages=None", "").strip()
def _fetch_raw_urls(self, keyword_search):
try:
results = Yahoo().search(keyword_search, pages=self.pages)
urls = results.links()
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchAOLGeneral(FetcherAbstract):
def __init__(self, args={}):
super().__init__()
# Parameters
self.pages = args.get("pages", 2)
def _get_name(self):
# [source] [language-country] [pages]
return "aol-general pages={}".format(self.pages).replace("pages=None", "").strip()
def _fetch_raw_urls(self, keyword_search):
try:
results = Aol().search(keyword_search, pages=self.pages)
urls = results.links()
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
###########################################################################
# List of instances
ListSearchInstances = [SearchGNews, SearchDuckDuckGoNews, SearchGoogleNews, SearchDuckDuckGoGeneral, SearchGoogleGeneral, SearchGoogleNewsRSS]
ListSearchInstances = [SearchGNews, SearchDuckDuckGoNews, SearchGoogleNews, SearchAOLGeneral, SearchYahooGeneral, SearchDuckDuckGoGeneral, SearchGoogleGeneral, SearchGoogleNewsRSS]


@@ -2,30 +2,29 @@ import logging
import os
# Get env var
path_logs_parameterization = os.getenv("PATH_LOGS_PARAMETERIZATION", "logs/log_app_fetcher_{}.log")
logs_directory = os.getenv("PATH_LOGS_DIRECTORY", "logs")
# Directory of logs
directory = '/'.join(path_logs_parameterization.split("/")[:-1])
os.makedirs(directory, exist_ok=True)
os.makedirs(logs_directory, exist_ok=True)
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
logger = logging.getLogger("fetcher")
logger.setLevel(logging.DEBUG)
# To file log: INFO / WARNING / ERROR / CRITICAL
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("debug"), mode="a", maxBytes=10000000, backupCount=4)
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "debug.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)
# To file log: INFO / WARNING / ERROR
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("info"), mode="a", maxBytes=10000000, backupCount=2)
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "info.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.INFO)
logger.addHandler(fh)
# To file log: WARNING / ERROR / CRITICAL
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("warning"), mode="a", maxBytes=10000000, backupCount=1)
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "warning.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.WARNING)
logger.addHandler(fh)


@@ -73,9 +73,6 @@ def process_missing_kids_urls_all(batch_size=None):
logger.info("Task completed: {}".format(task))
@job('default')
def background_task(process_type: str):
logger.info("Task triggered: {}".format(process_type))


@@ -0,0 +1,179 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Charts</title>
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
<style>
body {
background-color: #333;
color: #fff;
font-family: Arial, sans-serif;
}
h2 {
color: #fff;
text-align: center;
margin-bottom: 40px;
}
.chart-container {
width: 45%;
display: inline-block;
margin: 20px;
background-color: #444;
border-radius: 10px;
padding: 5px;
}
canvas {
background-color: #2c2c2c;
border-radius: 5px;
}
.container {
display: flex;
justify-content: center;
flex-wrap: wrap;
}
.filter-container {
text-align: center;
margin-bottom: 20px;
}
select {
padding: 8px;
background-color: #555;
color: white;
border: 1px solid #444;
border-radius: 5px;
}
</style>
</head>
<body>
<h2>Data Visualizations</h2>
<!-- Filter for Number of Days -->
<div class="filter-container">
<label for="daysFilter">Select Number of Days:</label>
<select id="daysFilter">
<option value="0.0625">Last 90 Minutes</option>
<option value="0.25">Last 6 Hours</option>
<option value="1">Last 24 Hours</option>
<option value="7" selected>Last 7 Days</option>
<option value="30">Last 30 Days</option>
<option value="90">Last 90 Days</option>
<option value="365">Last 365 Days</option>
</select>
</div>
<div class="container">
<div class="chart-container">
<canvas id="urlFetchDateChart"></canvas>
</div>
<div class="chart-container">
<canvas id="urlStatusChart"></canvas>
</div>
<div class="chart-container">
<canvas id="urlsPerSourceChart"></canvas>
</div>
<div class="chart-container">
<canvas id="urlsPerSearchChart"></canvas>
</div>
</div>
<script>
$(document).ready(function () {
let chartInstances = {}; // Store chart instances
// Fetch initial data (default 7 days)
const defaultDays = 7;
fetchDataAndRenderCharts(defaultDays);
// Apply the filter automatically when the user changes the selection
$('#daysFilter').on('change', function () {
const selectedDays = $(this).val();
fetchDataAndRenderCharts(selectedDays);
});
function fetchDataAndRenderCharts(days) {
fetchAndRenderChart(`/urls-by-fetch-date/?days=${days}`, 'urlFetchDateChart', 'URLs by Fetch Date', 'bar');
fetchAndRenderChart(`/urls-per-status/?days=${days}`, 'urlStatusChart', 'URLs by Status', 'bar');
fetchAndRenderChart(`/urls-per-source/?days=${days}`, 'urlsPerSourceChart', 'URLs by Source', 'bar');
fetchAndRenderChart(`/urls-per-search/?days=${days}`, 'urlsPerSearchChart', 'URLs by Search', 'bar');
}
const categoryColors = {
'URLs by Fetch Date': '#4BC0C0', // Color for this category
'URLs by Status': '#36A2EB', // Color for this category
'URLs by Source': '#4BC0C0', // Color for this category
'URLs by Search': '#36A2EB' // Color for this category
};
const maxLabelLength = 35; // Truncate X-axis labels to 35 characters
function fetchAndRenderChart(url, canvasId, chartTitle, chartType) {
$.getJSON(url, function (data) {
if (chartInstances[canvasId]) {
chartInstances[canvasId].destroy(); // Destroy previous chart
}
const ctx = document.getElementById(canvasId).getContext('2d');
chartInstances[canvasId] = new Chart(ctx, {
type: chartType,
data: {
labels: data.labels, // Ensure labels are passed as strings
datasets: [{
label: chartTitle,
data: data.values,
backgroundColor: categoryColors[chartTitle], // Assign the same color based on category
}]
},
options: {
responsive: true,
plugins: {
legend: {
labels: { color: '#fff' }
}
},
scales: {
x: {
ticks: {
color: "#fff", // Set the color of x-axis ticks
callback: function (value) {
let label = data.labels[value];
if (label.length > maxLabelLength) { return label.slice(0, maxLabelLength) + '...'; }
return label;
}
},
grid: {
color: "#444" // Set the grid lines color
}
},
y: {
ticks: {
color: "#fff" // Set the color of y-axis ticks
},
grid: {
color: "#444" // Set the grid lines color
}
}
}
}
});
});
}
});
</script>
</body>
</html>

View File

@@ -113,11 +113,11 @@ input[type="checkbox"] {
}
/* Themed Toggle Button */
.theme-button, .home-button {
.theme-button, .home-button, .chart-button {
background-color: var(--sidebar);
border: 1px solid var(--sidebar);
border-radius: 50%;
width: 45px;
width: 30px;
height: 45px;
font-size: 25px;
display: flex;
@@ -127,10 +127,10 @@ input[type="checkbox"] {
cursor: pointer;
}
.theme-button:hover, .home-button:hover {
.theme-button:hover, .home-button:hover, .chart-button:hover {
transform: rotate(20deg);
}
.theme-button:active, .home-button:active {
.theme-button:active, .home-button:active, .chart-button:active {
transform: scale(0.95);
}
@@ -235,6 +235,7 @@ input[type="checkbox"] {
<div class="button-container">
<button id="homeButton" class="home-button">🏠</button>
<button id="themeToggle" class="theme-button">🌙</button>
<button id="chartButton" class="chart-button">📊</button>
</div>
<form method="GET" action="" id="filterForm">
@@ -477,6 +478,10 @@ input[type="checkbox"] {
document.getElementById("homeButton").addEventListener("click", function () {
window.location.href = "./"; // Change this to your homepage URL if different
});
// Charts
document.getElementById("chartButton").addEventListener("click", function () {
window.location.href = "./charts"; // Change this to your homepage URL if different
});
//////////////////////////////////////////////
// Timestamp to local timezone
@@ -508,26 +513,32 @@ input[type="checkbox"] {
});
});
//////////////////////////////////////////////////////////////////////
// Function to update the form parameter before submitting
function updateFormParameter(section) {
const checkboxes = document.querySelectorAll(`[name='${section}']`);
const allChecked = Array.from(checkboxes).every(checkbox => checkbox.checked);
// If all are checked, replace them with a hidden input with value "all"
if (allChecked) {
checkboxes.forEach(checkbox => checkbox.removeAttribute("name"));
let hiddenInput = document.createElement("input");
hiddenInput.type = "hidden";
hiddenInput.name = section;
hiddenInput.value = "all";
document.getElementById("filterForm").appendChild(hiddenInput);
} else {
checkboxes.forEach(checkbox => checkbox.setAttribute("name", section));
document.querySelectorAll(`input[name="${section}"][type="hidden"]`).forEach(hiddenInput => hiddenInput.remove());
}
// Function to update the form parameters for all sections before submitting
function updateFormParameters() {
// Get all distinct sections by selecting all checkboxes and extracting their "name" attributes
const sections = new Set([...document.querySelectorAll("input[type='checkbox']")].map(cb => cb.name));
// Submit form after changes
sections.forEach(section => {
if (!section) return; // Skip any checkboxes without a name
const checkboxes = document.querySelectorAll(`[name='${section}']`);
const allChecked = Array.from(checkboxes).every(checkbox => checkbox.checked);
// If all checkboxes in a section are checked, remove them and add a hidden input
if (allChecked) {
checkboxes.forEach(checkbox => checkbox.removeAttribute("name"));
let hiddenInput = document.createElement("input");
hiddenInput.type = "hidden";
hiddenInput.name = section;
hiddenInput.value = "all";
document.getElementById("filterForm").appendChild(hiddenInput);
} else {
checkboxes.forEach(checkbox => checkbox.setAttribute("name", section));
document.querySelectorAll(`input[name="${section}"][type="hidden"]`).forEach(hiddenInput => hiddenInput.remove());
}
});
// Submit the form after updating all sections
document.getElementById("filterForm").submit();
}
@@ -537,7 +548,7 @@ input[type="checkbox"] {
const checkboxes = document.querySelectorAll(`[name='${section}']`);
const allChecked = Array.from(checkboxes).every(checkbox => checkbox.checked);
checkboxes.forEach(cb => cb.checked = !allChecked);
updateFormParameter(section);
updateFormParameters();
}
// Attach event listeners to "Toggle All" buttons
@@ -552,14 +563,14 @@ input[type="checkbox"] {
// Automatically submit the form when any checkbox changes
document.querySelectorAll('input[type="checkbox"]').forEach(function(checkbox) {
checkbox.addEventListener('change', function() {
updateFormParameter(this.name);
updateFormParameters();
});
});
document.getElementById('perPageSelect').addEventListener('change', function() {
document.getElementById('filterForm').submit();
updateFormParameters();
});
document.getElementById('timeFilterSelect').addEventListener('change', function() {
document.getElementById('filterForm').submit();
updateFormParameters();
});
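The rewritten updateFormParameters() collapses a fully-checked section into one hidden input named after the section with the value "all", so the submitted query string stays short. Below is a minimal sketch of how a receiving Django view could honour that convention; the parameter name and the queryset filter are assumptions, not part of this commit:

def filter_by_section(request, queryset, section="status"):
    # "all" (or an absent parameter) means: do not restrict this section
    values = request.GET.getlist(section)
    if not values or values == ["all"]:
        return queryset
    # Otherwise keep only rows whose field matches one of the checked boxes
    return queryset.filter(**{f"{section}__in": values})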

View File

@@ -167,13 +167,14 @@
</script>
<body>
<!--
<div class="sidebar">
<div class="button-container">
<button id="homeButton" class="home-button">🏠</button>
<button id="themeToggle" class="theme-button">🌙</button>
</div>
</div>
-->
<!-- Main Content -->
<div class="container mt-4">

View File

@@ -8,7 +8,7 @@ urlpatterns = [
#
path('task/<str:task>', views.trigger_task, name='trigger_task'),
#
path('charts/', views.charts, name='charts'),
path('urls/charts/', views.charts, name='charts'),
path('urls-by-fetch-date/', views.urls_by_fetch_date, name='urls_by_fetch_date'),
path('urls-per-status/', views.urls_per_status, name='urls_per_status'),
path('urls-per-source/', views.urls_per_source, name='urls_per_source'),

View File

@@ -2,6 +2,7 @@ from .tasks import background_task
from django.core.paginator import Paginator
from django.shortcuts import render, get_object_or_404
from django.http import StreamingHttpResponse, JsonResponse, HttpResponse
from django.contrib.auth.decorators import login_required
import ollama
from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch
import os
@@ -29,17 +30,18 @@ def link_list(request):
# URLs
"http://localhost:8000/urls",
# Charts
"http://localhost:8000/charts",
# API tasks
"http://localhost:8000/urls/charts",
# Fetcher tasks
] + [os.path.join(prefix, l) for l in links]
# Json
return JsonResponse({"links": list_links })
####################################################################################################
# @login_required(login_url='/admin')
def logs(request, log_type):
# Capture output: python manage.py rqstats
try:
with open(os.getenv("PATH_LOGS_DEBUG", "logs/log_app_fetcher_{}.log".format(log_type)), "r") as f:
with open( os.path.join( os.getenv("PATH_LOGS_DIRECTORY", "logs"), "{}.log".format(log_type) ), "r") as f:
file_content = f.read()
except Exception as e:
file_content = "Error reading logs for log type: {}".format(log_type)
@@ -130,8 +132,9 @@ def charts(request):
return render(request, 'charts.html')
def urls_by_fetch_date(request):
# Get the date for 30 days ago
start_date = timezone.now() - timedelta(days=30)
# Get the filtering date parameter
days = float(request.GET.get('days', 30)) # Default is 30 days
start_date = timezone.now() - timedelta(days=days)
# Count the number of URLs grouped by fetch date
urls_data = Urls.objects.filter(ts_fetch__gte=start_date) \
@@ -141,8 +144,8 @@ def urls_by_fetch_date(request):
# Format data to return as JSON
data = {
'dates': [item['ts_fetch__date'] for item in urls_data],
'counts': [item['count'] for item in urls_data],
'labels': [item['ts_fetch__date'] for item in urls_data],
'values': [item['count'] for item in urls_data],
}
return JsonResponse(data)
@@ -160,38 +163,48 @@ def urls_per_status(request):
# Format data for JSON
data = {
'statuses': [item['status'] for item in urls_data],
'counts': [item['count'] for item in urls_data],
'labels': [item['status'] for item in urls_data],
'values': [item['count'] for item in urls_data],
}
return JsonResponse(data)
def urls_per_source(request):
# Get the filtering date parameter
days = float(request.GET.get('days', 30)) # Default is 30 days
start_date = timezone.now() - timedelta(days=days)
# Count the number of URLs grouped by source
urls_data = UrlsSourceSearch.objects \
.filter(id_url__ts_fetch__gte=start_date) \
.values('id_source__source') \
.annotate(count=Count('id_url')) \
.order_by('id_source__source')
# Format data for JSON
data = {
'sources': [item['id_source__source'] for item in urls_data],
'counts': [item['count'] for item in urls_data],
'labels': [item['id_source__source'] for item in urls_data],
'values': [item['count'] for item in urls_data],
}
return JsonResponse(data)
def urls_per_search(request):
# Get the filtering date parameter
days = float(request.GET.get('days', 30)) # Default is 30 days
start_date = timezone.now() - timedelta(days=days)
# Count the number of URLs grouped by search
urls_data = UrlsSourceSearch.objects \
.filter(id_url__ts_fetch__gte=start_date) \
.values('id_search__search') \
.annotate(count=Count('id_url')) \
.order_by('id_search__search')
# Format data for JSON
data = {
'searches': [item['id_search__search'] for item in urls_data],
'counts': [item['count'] for item in urls_data],
'labels': [item['id_search__search'] for item in urls_data],
'values': [item['count'] for item in urls_data],
}
return JsonResponse(data)
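All four chart endpoints now return the same {"labels": [...], "values": [...]} payload, and most of them read a days query parameter that defaults to 30. A short sketch of calling one of them from a script; the host, the URL mount point and the use of the requests library are assumptions, not something this commit ships:

import requests

# urls-per-source/ is declared in urlpatterns above; the mount point is assumed.
resp = requests.get(
    "http://localhost:8000/urls-per-source/",
    params={"days": 7},   # optional, the view falls back to 30 days
    timeout=10,
)
data = resp.json()
for label, value in zip(data["labels"], data["values"]):
    print(label, value)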

17
app_urls/requirements.txt Normal file
View File

@@ -0,0 +1,17 @@
django==5.1
psycopg[binary]
django-redis
django-tasks-scheduler
gunicorn
whitenoise
feedparser
python-dateutil
newspaper4k[all]
lxml[html_clean]
googlenewsdecoder
gnews
GoogleNews
duckduckgo_search
git+https://github.com/tasos-py/Search-Engines-Scraper.git
langdetect
ollama
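whitenoise and gunicorn are added here for serving static files and running the app, but the matching settings.py changes are not part of this hunk. A typical WhiteNoise wiring, written as an assumption about how it is used rather than a copy of the project's settings:

# settings.py (sketch; BASE_DIR is the standard Django settings variable)
MIDDLEWARE = [
    "django.middleware.security.SecurityMiddleware",
    "whitenoise.middleware.WhiteNoiseMiddleware",   # directly after SecurityMiddleware
    # ... remaining middleware unchanged ...
]
STATIC_ROOT = BASE_DIR / "staticfiles"              # populated by collectstatic
STORAGES = {
    "default": {"BACKEND": "django.core.files.storage.FileSystemStorage"},
    "staticfiles": {"BACKEND": "whitenoise.storage.CompressedManifestStaticFilesStorage"},
}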

View File

@@ -2,10 +2,10 @@
{
"model": "RepeatableTaskType",
"name": "Process error URLs",
"callable": "api.tasks.process_error_urls",
"callable": "fetcher.tasks.process_error_urls",
"callable_args": [],
"callable_kwargs": [],
"enabled": true,
"enabled": false,
"queue": "low",
"repeat": null,
"at_front": false,
@@ -15,18 +15,39 @@
"scheduled_time": "2025-04-01T12:36:21+00:00",
"interval": 4,
"interval_unit": "hours",
"successful_runs": 15,
"successful_runs": 0,
"failed_runs": 0,
"last_successful_run": "2025-04-01 08:37:06.722770+00:00",
"last_successful_run": null,
"last_failed_run": null
},
{
"model": "RepeatableTaskType",
"name": "Process raw URLs",
"callable": "fetcher.tasks.process_raw_urls",
"callable_args": [],
"callable_kwargs": [],
"enabled": false,
"queue": "low",
"repeat": null,
"at_front": false,
"timeout": null,
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-04-01T10:20:08+00:00",
"interval": 10,
"interval_unit": "minutes",
"successful_runs": 0,
"failed_runs": 0,
"last_successful_run": null,
"last_failed_run": null
},
{
"model": "RepeatableTaskType",
"name": "Process MissingKids URLs",
"callable": "api.tasks.process_missing_kids_urls",
"callable": "fetcher.tasks.process_missing_kids_urls",
"callable_args": [],
"callable_kwargs": [],
"enabled": true,
"enabled": false,
"queue": "default",
"repeat": null,
"at_front": false,
@@ -34,20 +55,20 @@
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-04-01T10:37:50+00:00",
"interval": 2,
"interval": 4,
"interval_unit": "hours",
"successful_runs": 29,
"successful_runs": 0,
"failed_runs": 0,
"last_successful_run": "2025-04-01 08:42:05.864064+00:00",
"last_successful_run": null,
"last_failed_run": null
},
{
"model": "RepeatableTaskType",
"name": "Process MissingKids URLs ALL",
"callable": "api.tasks.process_missing_kids_urls_all",
"callable": "fetcher.tasks.process_missing_kids_urls_all",
"callable_args": [],
"callable_kwargs": [],
"enabled": true,
"enabled": false,
"queue": "default",
"repeat": null,
"at_front": false,
@@ -65,10 +86,10 @@
{
"model": "RepeatableTaskType",
"name": "Fetch Feeds",
"callable": "api.tasks.fetch_feeds",
"callable": "fetcher.tasks.fetch_feeds",
"callable_args": [],
"callable_kwargs": [],
"enabled": true,
"enabled": false,
"queue": "default",
"repeat": null,
"at_front": false,
@@ -78,39 +99,18 @@
"scheduled_time": "2025-04-01T10:18:56+00:00",
"interval": 15,
"interval_unit": "minutes",
"successful_runs": 288,
"successful_runs": 0,
"failed_runs": 0,
"last_successful_run": "2025-04-01 10:03:58.363856+00:00",
"last_failed_run": null
},
{
"model": "RepeatableTaskType",
"name": "Process raw URLs",
"callable": "api.tasks.process_raw_urls",
"callable_args": [],
"callable_kwargs": [],
"enabled": true,
"queue": "low",
"repeat": null,
"at_front": false,
"timeout": null,
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-04-01T10:20:08+00:00",
"interval": 15,
"interval_unit": "minutes",
"successful_runs": 78,
"failed_runs": 0,
"last_successful_run": "2025-04-01 10:05:08.394472+00:00",
"last_successful_run": null,
"last_failed_run": null
},
{
"model": "RepeatableTaskType",
"name": "Fetch Parser",
"callable": "api.tasks.fetch_parser",
"callable": "fetcher.tasks.fetch_parser",
"callable_args": [],
"callable_kwargs": [],
"enabled": true,
"enabled": false,
"queue": "default",
"repeat": null,
"at_front": false,
@@ -120,18 +120,18 @@
"scheduled_time": "2025-04-01T10:25:42+00:00",
"interval": 1,
"interval_unit": "hours",
"successful_runs": 62,
"successful_runs": 0,
"failed_runs": 0,
"last_successful_run": "2025-04-01 09:25:57.977051+00:00",
"last_successful_run": null,
"last_failed_run": null
},
{
"model": "RepeatableTaskType",
"name": "Fetch Search",
"callable": "api.tasks.fetch_search",
"callable": "fetcher.tasks.fetch_search",
"callable_args": [],
"callable_kwargs": [],
"enabled": true,
"enabled": false,
"queue": "default",
"repeat": null,
"at_front": false,
@@ -141,9 +141,51 @@
"scheduled_time": "2025-04-01T10:29:33+00:00",
"interval": 1,
"interval_unit": "hours",
"successful_runs": 63,
"successful_runs": 0,
"failed_runs": 0,
"last_successful_run": "2025-04-01 09:37:20.671072+00:00",
"last_successful_run": null,
"last_failed_run": null
},
{
"model": "RepeatableTaskType",
"name": "Fetch MissingKids",
"callable": "fetcher.tasks.fetch_missing_kids",
"callable_args": [],
"callable_kwargs": [],
"enabled": false,
"queue": "default",
"repeat": null,
"at_front": false,
"timeout": null,
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-04-01T10:29:33+00:00",
"interval": 4,
"interval_unit": "hours",
"successful_runs": 0,
"failed_runs": 0,
"last_successful_run": null,
"last_failed_run": null
},
{
"model": "RepeatableTaskType",
"name": "Fetch MissingKids ALL",
"callable": "fetcher.tasks.fetch_missing_kids_all",
"callable_args": [],
"callable_kwargs": [],
"enabled": false,
"queue": "default",
"repeat": null,
"at_front": false,
"timeout": null,
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-04-01T10:29:33+00:00",
"interval": 1,
"interval_unit": "weeks",
"successful_runs": 0,
"failed_runs": 0,
"last_successful_run": null,
"last_failed_run": null
}
]
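Every task in this fixture now ships disabled and points at fetcher.tasks instead of api.tasks. A short sketch of loading it from a Django shell or management context; the fixture file name is an assumption, it is not visible in this hunk:

from django.core.management import call_command

call_command("loaddata", "scheduled_tasks.json")  # assumed file name
# All entries have "enabled": false, so each task still has to be switched on
# (for example via the admin) before the scheduler will run it.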

View File

@@ -2,101 +2,106 @@ version: '3.9'
services:
fetcher_selenium:
fetcher_app_selenium:
image: fetcher_app_selenium
build:
context: ./app_selenium
container_name: selenium_app
restart: unless-stopped
container_name: fetcher_app_selenium
# restart: unless-stopped
shm_size: 512mb
environment:
- SELENIUM_SLEEP_PER_PAGE=4
- PATH_LOGS_PARAMETERIZATION="logs/log_app_selenium_{}.log"
- SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE:-4}
- PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY:-logs}
ports:
- 80
dns:
- 1.1.1.1
- 1.0.0.1
deploy:
resources:
limits:
cpus: '4'
memory: 4G
fetcher_urls_app:
fetcher_app_urls:
image: fetcher_app_urls
build:
context: ./app_urls
container_name: urls_app
restart: unless-stopped
container_name: fetcher_app_urls
# restart: unless-stopped
environment:
#- name=value
# Initialization
- INITIALIZE_DB=${INITIALIZE_DB:-true}
- DJANGO_SUPERUSER_USERNAME=${DJANGO_SUPERUSER_USERNAME:-matitos}
- DJANGO_SUPERUSER_PASSWORD=${DJANGO_SUPERUSER_PASSWORD:-matitos}
- DJANGO_SUPERUSER_EMAIL=${DJANGO_SUPERUSER_EMAIL:-matitos@matitos.org}
# Django
- DJANGO_SECRET_KEY=${DJANGO_SECRET_KEY:-abc123456789qwerty}
- DJANGO_DEBUG=${DJANGO_DEBUG:-False}
- DJANGO_ALLOWED_HOSTS=${DJANGO_ALLOWED_HOSTS:-*} # host1,host2
# Database
- DB_NAME=${DB_NAME:-matitos}
- DB_USER=${DB_NAME:-supermatitos}
- DB_PASSWORD=${DB_NAME:-supermatitos}
- DB_HOST=${DB_NAME:-localhost} # db_postgres
- DB_PORT=${DB_NAME:-5432}
- REDIS_HOST=${REDIS_HOST:-localhost}
- DB_USER=${DB_USER:-supermatitos}
- DB_PASSWORD=${DB_PASSWORD:-supermatitos}
- DB_HOST=${DB_HOST:-fetcher_db}
- DB_PORT=${DB_PORT:-5432}
- REDIS_HOST=${REDIS_HOST:-fetcher_redis}
- REDIS_PORT=${REDIS_PORT:-6379}
# Job timeout: 30 min
- JOB_DEFAULT_TIMEOUT=${RQ_DEFAULT_TIMEOUT:-1800}
# Logs path
- PATH_LOGS_PARAMETERIZATION="logs/log_app_fetcher_{}.log"
- PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY:-logs}
# Fetcher
- FETCHER_GNEWS_DECODE_SLEEP=2
- FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=4
- FETCHER_BETWEEN_SEARCHES_SLEEP=5
- FETCHER_URL_HOST_SLEEP=5
- FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP:-2}
- FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP:-5}
- FETCHER_BETWEEN_SEARCHES_SLEEP=${FETCHER_BETWEEN_SEARCHES_SLEEP:-1}
- FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP:-2}
# Selenium
- SELENIUM_ENDPOINT="http://selenium_app:80"
- SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT:-http://fetcher_app_selenium:80}
- ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA:-https://ollamamodel.matitos.org}
ports:
- 80
- 8000:8000
depends_on:
- fetcher_db
- fetcher_redis
dns:
- 1.1.1.1
- 1.0.0.1
deploy:
resources:
limits:
cpus: '4'
memory: 4G
fetcher_db:
image: postgres:17
container_name: db_postgres
container_name: fetcher_db
restart: unless-stopped
# Set shared memory limit when using docker-compose
shm_size: 128mb
environment:
POSTGRES_DB: ${DB_NAME:-matitos}
POSTGRES_PASSWORD: ${DB_PASSWORD:-supermatitos}
POSTGRES_USER: ${DB_USERNAME:-supermatitos}
POSTGRES_DB: ${DB_DATABASE_NAME:-matitos}
POSTGRES_USER: ${DB_USER:-supermatitos}
POSTGRES_INITDB_ARGS: '--data-checksums'
#volumes:
# - ${PATH_BASE:-.}/postgres:/var/lib/postgresql/data
#volumes: # Persistent DB?
# - ${PATH_DB_DATA:-.}/postgres:/var/lib/postgresql/data
ports:
- 5432:5432
- 5432 #:5432
fetcher_redis:
image: redis:alpine
container_name: db_redis
container_name: fetcher_redis
restart: unless-stopped
ports:
- 6379:6379
#expose:
# - 6379
fetcher_adminer:
# http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public
image: adminer
container_name: adminer
restart: unless-stopped
environment:
- ADMINER_DEFAULT_DB_DRIVER=pgsql
#- ADMINER_DEFAULT_DB_HOST
#- ADMINER_DEFAULT_DB_NAME
depends_on:
- matitos_db
ports:
- 8080:8080
- 6379 #:6379
fetcher_dozzle:
container_name: dozzle
container_name: fetcher_dozzle
image: amir20/dozzle:latest
volumes:
- /var/run/docker.sock:/var/run/docker.sock:ro
ports:
- 8888:8080
environment:
- DOZZLE_FILTER="name=matitos_" # Need container name matitos_ ?
# django:
# Env: DB_HOST=matitos_db
# DJANGO_DB_NAME=${DB_DATABASE_NAME:-matitos}
# DJANGO_DB_USER=${DB_USERNAME:-supermatitos}
# DJANGO_DB_PASSWORD=${DB_PASSWORD:-supermatitos}
# DJANGO_DB_HOST=${DB_HOST:-localhost}
# DJANGO_DB_PORT=${DB_PORT:-5432}
- DOZZLE_FILTER="name=fetcher_"
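Nearly every service setting above now falls back to a ${VAR:-default} substitution, so the stack can be tuned from a single .env file that docker compose reads from the project directory. A minimal sketch; the values are placeholders, not recommendations:

# .env
DJANGO_DEBUG=False
DB_PASSWORD=change-me
PATH_LOGS_DIRECTORY=logs
SELENIUM_SLEEP_PER_PAGE=2
FETCHER_BETWEEN_SEARCHES_SLEEP=1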