Dockerization, WhiteNoise static file serving, refactor

This commit is contained in:
Luciano Gervasoni
2025-04-04 10:53:16 +02:00
parent 5addfa5ba9
commit 4dbe2e55ef
39 changed files with 708 additions and 1238 deletions

1
.gitignore vendored
View File

@@ -2,3 +2,4 @@ __pycache__/
*.pyc *.pyc
**/credentials.py **/credentials.py
logs/ logs/
postgres/

View File

@@ -1,363 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# !pip install git+https://github.com/tasos-py/Search-Engines-Scraper.git\n",
"import search_engines\n",
"\n",
"engine = search_engines.Bing()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"results = engine.search('news: \"child abuse\"', pages=2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"engine = search_engines.search_engines_dict[\"brave\"]()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"query = 'news: child abuse'\n",
"r = engine.search(query, pages=2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"r.__dict__"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import newspaper\n",
"newspaper.ArticleBinaryDataException"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"'''\n",
"import newspaper\n",
"\n",
"url = 'https://www.missingkids.org/poster/USVA/VA25-0820/1'\n",
"art_1 = newspaper.article(url)\n",
"url = 'https://www.missingkids.org/poster/NCMC/2045193/1'\n",
"art_2 = newspaper.article(url)\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import ollama\n",
"\n",
"#model = \"llama3.2:1b\"\n",
"client = ollama.Client(\n",
" host = 'https://ollamamodel.matitos.org',\n",
")\n",
"l = client.list()\n",
"list_models = [m.get(\"model\") for m in l.model_dump().get(\"models\")]\n",
"\n",
"print(list_models)\n",
"\n",
"for m in list_models:\n",
" context_key = [ k for k in client.show(m).model_dump().get(\"modelinfo\").keys() if \"context_length\" in k]\n",
" if (len(context_key) != 1):\n",
" print(\"Problem!!!\")\n",
" print(m, client.show(m).model_dump().get(\"modelinfo\").get(context_key[0]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"text = \"...\"\n",
"model = \"falcon3:1b\"\n",
"\n",
"msg_content = {\n",
" \"role\": \"user\", \n",
" \"content\": text,\n",
"}\n",
"response = client.chat(model=model, messages=[msg_content], stream=False)\n",
"print(response[\"message\"][\"content\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import cv2\n",
"import base64\n",
"import numpy as np\n",
"\n",
"endpoint = \"http://192.168.2.64:12343/image\"\n",
"\n",
"\n",
"\n",
"prompt = \"Majestic mountain landscape with snow-capped peaks, autumn foliage in vibrant reds and oranges, a turquoise river winding through a valley, crisp and serene atmosphere, ultra-realistic style.\"\n",
"prompt = \"A group of kids happily playing in a joy environment\"\n",
"#prompt = \"A bitcoin behaving like a king, surrounded by small alternative coins. Detailed, geometric style\"\n",
"\n",
"json = {\n",
" \"prompt\": prompt,\n",
" \"num_inference_steps\": 10,\n",
" \"size\": \"512x512\",\n",
" \"seed\": 123456,\n",
"}\n",
"\n",
"for inf_step in [1, 4, 10, 20, 25, 30, 35, 40, 45, 50, 60, 70, 80, 90, 100]:\n",
" json[\"num_inference_steps\"] = inf_step\n",
"\n",
" %time r = requests.post(endpoint, json=json)\n",
" print(\"Status code\", r.status_code)\n",
"\n",
" # Image\n",
" png_as_np = np.frombuffer(base64.b64decode(r.text), dtype=np.uint8)\n",
" image_bgr = cv2.imdecode(png_as_np, cv2.IMREAD_COLOR)\n",
"\n",
" cv2.imwrite(\"sample_img_{}.png\".format(json[\"num_inference_steps\"]), image_bgr)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# !pip install trafilatura\n",
"import trafilatura\n",
"from pprint import pprint\n",
"\n",
"url = \"https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n",
"# url = \"https://www.missingkids.org/poster/USVA/VA25-0820/1\"\n",
"url = \"https://www.bloomberg.com/news/articles/2025-03-12/eu-launches-metals-tariff-retaliation-on-26-billion-of-us-goods\"\n",
"\n",
"# Fetch\n",
"doc = trafilatura.fetch_url(url)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Content & metadata\n",
"metadata = trafilatura.extract_metadata(doc)\n",
"content = trafilatura.extract(doc)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pprint(metadata.as_dict())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(content)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# !pip install newspaper4k\n",
"# !pip install langdetect \n",
"import newspaper\n",
"import langdetect\n",
"langdetect.DetectorFactory.seed = 0\n",
"\n",
"\n",
"\n",
"# url = \"https://www.missingkids.org/poster/USVA/VA25-0820/1\"\n",
"#url = \"https://www.waff.com/2025/03/11/colbert-heights-high-school-employee-arrested-child-abuse/\"\n",
"\n",
"\n",
"\n",
"#url = \"https://www.bloomberg.com/news/articles/2025-03-12/eu-launches-metals-tariff-retaliation-on-26-billion-of-us-goods\"\n",
"\n",
"\n",
"url = \"https://apnews.com/article/canada-trump-us-tariffs-steel-2517a6a2baf0596cb1a43d3a7d1e7939\"\n",
"url = \"https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n",
"#url = \"https://www.ft.com/content/6d7c6915-4ceb-43fc-9896-590036b12a87\"\n",
"#url = \"https://www.lanacion.com.ar/politica/milei-en-bahia-blanca-un-viaje-sorpresa-para-frenar-las-criticas-y-mostrar-cercania-nid12032025/\"\n",
"#url = \"https://www.missingkids.org/poster/NCMC/2043547/1\"\n",
"\n",
"try:\n",
" article = newspaper.article(url)\n",
"except newspaper.ArticleException as e:\n",
" print(\"ArticleException: {}\".format(str(e)))\n",
"except Exception as e:\n",
" print(\"Err: {}\".format(str(e)))\n",
"\n",
"# url_photo = set([i for i in article.images if \"api.missingkids.org/photographs\" in i])\n",
"# article.is_valid_url(), article.is_parsed, article.is_media_news(), article.is_valid_body()\n",
"article.meta_data\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# !pip install news-please\n",
"from newsplease import NewsPlease\n",
"\n",
"url = \"https://variety.com/2025/film/news/gene-hackman-death-suspicious-gas-leak-search-warrant-1236322610/\"\n",
"url = \"https://www.bbc.com/news/articles/cewkkkvkzn9o\"\n",
"url = \"https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n",
"article = NewsPlease.from_url(url)\n",
"print(article.title)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(article.maintext)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "matitos",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -5,6 +5,14 @@
- Fetch parsing URL host - Fetch parsing URL host
- Fetch from RSS feed - Fetch from RSS feed
- Fetch searching (Google search & news, DuckDuckGo, ...) - Fetch searching (Google search & news, DuckDuckGo, ...)
+ Sources -> Robustness to TooManyRequests block
- Selenium based
- Sites change their logic, request captcha, ...
- Brave Search API
- Free up to X requests per day. Need credit card association (no charges)
- Bing API
- Subscription required
- Yandex. No API?
- Process URLs -> Updates raw URLs - Process URLs -> Updates raw URLs
- Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date - Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date
- Determines if it is a valid article content - Determines if it is a valid article content

View File

@@ -2,30 +2,29 @@ import logging
import os import os
# Get env var # Get env var
path_logs_parameterization = os.getenv("PATH_LOGS_PARAMETERIZATION", "logs/log_app_selenium_{}.log") logs_directory = os.getenv("PATH_LOGS_DIRECTORY", "logs")
# Directory of logs # Directory of logs
directory = '/'.join(path_logs_parameterization.split("/")[:-1]) os.makedirs(logs_directory, exist_ok=True)
os.makedirs(directory, exist_ok=True)
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s') logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher") logger = logging.getLogger("selenium")
logger.setLevel(logging.DEBUG) logger.setLevel(logging.DEBUG)
# To file log: INFO / WARNING / ERROR / CRITICAL # To file log: INFO / WARNING / ERROR / CRITICAL
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("debug"), mode="a", maxBytes=10000000, backupCount=1) fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "debug.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s')) fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.DEBUG) fh.setLevel(logging.DEBUG)
logger.addHandler(fh) logger.addHandler(fh)
# To file log: INFO / WARNING / ERROR # To file log: INFO / WARNING / ERROR
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("info"), mode="a", maxBytes=10000000, backupCount=1) fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "info.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s')) fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.INFO) fh.setLevel(logging.INFO)
logger.addHandler(fh) logger.addHandler(fh)
# To file log: WARNING / ERROR / CRITICAL # To file log: WARNING / ERROR / CRITICAL
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("warning"), mode="a", maxBytes=10000000, backupCount=1) fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "warning.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s')) fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.WARNING) fh.setLevel(logging.WARNING)
logger.addHandler(fh) logger.addHandler(fh)

View File

@@ -28,7 +28,7 @@ class MissingKidsFetcher():
logger.debug("Processing page: {}...".format(i)) logger.debug("Processing page: {}...".format(i))
try: try:
time.sleep(os.getenv("SELENIUM_SLEEP_PER_PAGE", 4)); #driver.implicitly_wait(3) time.sleep(int(os.getenv("SELENIUM_SLEEP_PER_PAGE", 4))) #driver.implicitly_wait(3)
# Fetch poster URLs # Fetch poster URLs
for element_type in ["a"]: # ["a", "p", "div"]: for element_type in ["a"]: # ["a", "p", "div"]:
for elem in driver.find_elements(By.TAG_NAME, element_type): for elem in driver.find_elements(By.TAG_NAME, element_type):

View File

@@ -1,341 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# !pip install psycopg[binary]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"db_postgres\n",
"db_redis\n",
"\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 2/0\n",
" ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 2/4\n",
" ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 4/4\u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_redis \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h"
]
}
],
"source": [
"!docker rm -f db_postgres db_redis; docker compose -f ../docker/docker-compose.yml up -d ; sleep 5\n",
"!rm logs/*"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"INSERT_TABLES = True\n",
"INSERT_SAMPLE_DATA = False\n",
"\n",
"import psycopg\n",
"connection_info = \"host={} port={} user={} password={} dbname={}\".format(\"localhost\", \"5432\", \"supermatitos\", \"supermatitos\", \"matitos\")\n",
"\n",
"from datetime import datetime, timezone\n",
"import re\n",
"from pprint import pprint"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"if INSERT_TABLES:\n",
" # Connect to an existing database\n",
" with psycopg.connect(connection_info) as conn:\n",
" # Open a cursor to perform database operations\n",
" with conn.cursor() as cur:\n",
" # Autocommit at end of transaction (Atomic insert of URLs and sources)\n",
" with conn.transaction() as tx:\n",
" # Create URLs table\n",
" c = cur.execute(\"\"\"\n",
" CREATE TYPE URL_STATUS AS ENUM ('raw', 'error', 'valid', 'unknown', 'invalid', 'duplicate');\n",
"\n",
" CREATE TABLE URLS (\n",
" id SERIAL PRIMARY KEY,\n",
" url TEXT NOT NULL UNIQUE,\n",
" ts_fetch TIMESTAMPTZ NOT NULL DEFAULT NOW(),\n",
" status URL_STATUS NOT NULL DEFAULT 'raw' -- ,\n",
" -- status_wendy WENDY_STATUS DEFAULT NULL,\n",
" -- ts_wendy TIMESTAMPTZ DEFAULT NULL\n",
" );\n",
" CREATE INDEX idx_urls_status ON urls(status);\n",
" CREATE INDEX idx_urls_ts_fetch ON urls(ts_fetch);\n",
"\n",
" CREATE TABLE URLS_DUPLICATE (\n",
" id_url_canonical INTEGER REFERENCES URLS(id),\n",
" id_url_duplicated INTEGER REFERENCES URLS(id),\n",
" PRIMARY KEY (id_url_canonical, id_url_duplicated)\n",
" );\n",
" \n",
" CREATE TYPE SEARCH_TYPE AS ENUM ('rss_feed', 'keyword_search', 'url_host');\n",
" CREATE TABLE SEARCH (\n",
" id SMALLSERIAL PRIMARY KEY,\n",
" search TEXT NOT NULL UNIQUE,\n",
" type SEARCH_TYPE NOT NULL\n",
" -- language_country CHAR(5), -- Language: ISO 639-1 Code. Country: ISO 3166 ALPHA-2. e.g.: en-us. Required for search\n",
" -- UNIQUE(search, language_country)\n",
" );\n",
" CREATE INDEX idx_search_type ON SEARCH(type);\n",
" \n",
" CREATE TABLE SOURCE (\n",
" id SMALLSERIAL PRIMARY KEY,\n",
" source TEXT NOT NULL UNIQUE\n",
" );\n",
" \n",
" -- CREATE TABLE SEARCH_LANGUAGE (\n",
" -- language CHAR(2) NOT NULL, -- ISO 639-1 Code, e.g. \"en\"\n",
" -- country CHAR(2) NOT NULL, -- ISO 3166 ALPHA-2, e.g. \"us\"\n",
" -- PRIMARY KEY (language, country)\n",
" -- );\n",
" \n",
" CREATE TABLE URLS_SOURCE_SEARCH (\n",
" id_url INTEGER REFERENCES URLS(id),\n",
" id_source SMALLINT REFERENCES SOURCE(id) ON UPDATE CASCADE ON DELETE RESTRICT,\n",
" id_search SMALLINT REFERENCES SEARCH(id) ON UPDATE CASCADE ON DELETE RESTRICT,\n",
" PRIMARY KEY(id_url, id_source, id_search)\n",
" );\n",
" CREATE INDEX idx_source ON URLS_SOURCE_SEARCH(id_source);\n",
" CREATE INDEX idx_search ON URLS_SOURCE_SEARCH(id_search);\n",
"\n",
" CREATE TABLE STATUS_PATTERN_MATCHING (\n",
" pattern TEXT PRIMARY KEY,\n",
" priority SMALLINT NOT NULL,\n",
" status URL_STATUS NOT NULL\n",
" );\n",
" \n",
" \n",
" CREATE TABLE URL_CONTENT (\n",
" id_url INTEGER PRIMARY KEY REFERENCES URLS(id),\n",
" date_published TIMESTAMPTZ DEFAULT NOW(),\n",
" title TEXT,\n",
" description TEXT,\n",
" content TEXT,\n",
" valid_content BOOLEAN,\n",
" language CHAR(2), -- ISO 639-1 Code\n",
" keywords TEXT[],\n",
" tags TEXT[],\n",
" authors TEXT[],\n",
" image_main_url TEXT,\n",
" images_url TEXT[],\n",
" videos_url TEXT[],\n",
" url_host TEXT, -- www.breitbart.com\n",
" site_name TEXT -- Breitbart News\n",
" );\n",
" CREATE INDEX idx_tags ON URL_CONTENT USING GIN(tags);\n",
" CREATE INDEX idx_authors ON URL_CONTENT USING GIN(authors);\n",
" CREATE INDEX idx_date_published ON URL_CONTENT (date_published);\n",
" CREATE INDEX idx_valid_content ON URL_CONTENT (valid_content);\n",
" CREATE INDEX idx_language ON URL_CONTENT (language);\n",
" CREATE INDEX idx_url_host ON URL_CONTENT (url_host);\n",
" \"\"\")\n",
"\n",
" ### Default insert values\n",
" \n",
" # Feeds\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC', 'rss_feed');\" )\n",
" # Websites of interest\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/poster', 'url_host');\" )\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/new-poster', 'url_host');\" )\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('breitbart.com', 'url_host');\" )\n",
" # Search keywords\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search');\" )\n",
" # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search', 'en-us');\" )\n",
" # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search', 'en-gb');\" )\n",
" \n",
" # Status update based on pattern matching (with priority to apply in order). Regex test https://regex101.com/\n",
" # cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 75, 'valid');\".format(\".*{}.*\".format(re.escape(\"missingkids.org/poster/\"))) )\n",
" cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"youtube.com/\"))) )\n",
" cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"tiktok.com/\"))) )\n",
" cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"twitter.com/\"))) )\n",
" cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"reddit.com/\"))) )\n",
" cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"libreddit.de/\"))) )\n",
" cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"radio.foxnews.com/\"))) )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\t urls\n",
"[]\n",
"\t urls_duplicate\n",
"[]\n",
"\t urls_source_search\n",
"[]\n",
"\t source\n",
"[]\n",
"\t search\n",
"[(1,\n",
" 'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC',\n",
" 'rss_feed'),\n",
" (2, 'missingkids.org/poster', 'url_host'),\n",
" (3, 'missingkids.org/new-poster', 'url_host'),\n",
" (4, 'breitbart.com', 'url_host'),\n",
" (5, 'child abuse', 'keyword_search')]\n",
"\t status_pattern_matching\n",
"[('.*youtube\\\\.com/.*', 50, 'invalid'),\n",
" ('.*tiktok\\\\.com/.*', 50, 'invalid'),\n",
" ('.*twitter\\\\.com/.*', 50, 'invalid'),\n",
" ('.*reddit\\\\.com/.*', 50, 'invalid'),\n",
" ('.*libreddit\\\\.de/.*', 50, 'invalid'),\n",
" ('.*radio\\\\.foxnews\\\\.com/.*', 50, 'invalid')]\n",
"\t url_content\n",
"[]\n"
]
}
],
"source": [
"# Connect to an existing database\n",
"with psycopg.connect(connection_info) as conn:\n",
" # Open a cursor to perform database operations\n",
" with conn.cursor() as cur:\n",
" # Get tables\n",
" cur.execute(\"SELECT table_name FROM information_schema.tables WHERE table_schema='public';\")\n",
" tables = [t[0] for t in cur.fetchall()]\n",
"\n",
" for t in tables:\n",
" print(\"\\t\", t)\n",
" pprint( cur.execute(\"SELECT * FROM {} LIMIT 50;\".format(t)).fetchall() )"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(1,\n",
" 'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC',\n",
" 'rss_feed'),\n",
" (2, 'missingkids.org/poster', 'url_host'),\n",
" (3, 'missingkids.org/new-poster', 'url_host'),\n",
" (4, 'breitbart.com', 'url_host'),\n",
" (5, 'child abuse', 'keyword_search')]\n"
]
}
],
"source": [
"# Connect to an existing database\n",
"with psycopg.connect(connection_info) as conn:\n",
" # Open a cursor to perform database operations\n",
" with conn.cursor() as cur:\n",
" pprint( cur.execute(\"SELECT * FROM SEARCH;\").fetchall() )"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[]\n"
]
}
],
"source": [
"# Connect to an existing database\n",
"with psycopg.connect(connection_info) as conn:\n",
" # Open a cursor to perform database operations\n",
" with conn.cursor() as cur:\n",
" pprint( cur.execute(\"SELECT * FROM URLS LIMIT 50;\").fetchall() )\n",
" #pprint( cur.execute(\"SELECT id_url, title, valid_content FROM URL_CONTENT LIMIT 10;\").fetchall() )"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'\\n!docker rm -f db_redis; docker compose -f ../docker/docker-compose.yml up -d\\n\\n# Connect to an existing database\\nwith psycopg.connect(connection_info) as conn:\\n # Open a cursor to perform database operations\\n with conn.cursor() as cur:\\n pprint( cur.execute(\"TRUNCATE URLS, URL_CONTENT, URLS_SOURCE_SEARCH, URLS_DUPLICATE;\") )\\n # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES (\\'missingkids.org\\', \\'url_host\\');\" )\\n'"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'''\n",
"!docker rm -f db_redis; docker compose -f ../docker/docker-compose.yml up -d\n",
"\n",
"# Connect to an existing database\n",
"with psycopg.connect(connection_info) as conn:\n",
" # Open a cursor to perform database operations\n",
" with conn.cursor() as cur:\n",
" pprint( cur.execute(\"TRUNCATE URLS, URL_CONTENT, URLS_SOURCE_SEARCH, URLS_DUPLICATE;\") )\n",
" # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('missingkids.org', 'url_host');\" )\n",
"'''"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "matitos",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

48
app_urls/Dockerfile Normal file
View File

@@ -0,0 +1,48 @@
FROM python:3.12
# Prevents Python from writing pyc files to disk
ENV PYTHONDONTWRITEBYTECODE=1
# Prevents Python from buffering stdout and stderr
ENV PYTHONUNBUFFERED=1
# User
RUN useradd -m -r appuser && \
mkdir /opt/app && \
chown -R appuser /opt/app
WORKDIR /opt/app
# Copy the Django project and install dependencies
COPY requirements.txt /opt/app/
# run this command to install all dependencies
RUN pip install --no-cache-dir -r requirements.txt
COPY --chown=appuser:appuser . /opt/app/
RUN chmod -R 755 /opt/app
RUN chown -R appuser:appuser /opt/app
USER appuser
# Initialization script
RUN echo '#!/bin/bash' > /opt/app/initialize.sh && \
echo 'if [ "${INITIALIZE_DB}" = false ]; then' >> /opt/app/initialize.sh && \
echo 'echo "Initialization not required"' >> /opt/app/initialize.sh && \
echo 'else' >> /opt/app/initialize.sh && \
echo 'echo "Initializating database"' >> /opt/app/initialize.sh && \
echo 'sleep 5' >> /opt/app/initialize.sh && \
echo 'python db.py --initialize_tables --initialize_data' >> /opt/app/initialize.sh && \
echo 'python manage.py makemigrations fetcher; python manage.py migrate --fake-initial' >> /opt/app/initialize.sh && \
echo 'python manage.py createsuperuser --noinput' >> /opt/app/initialize.sh && \
echo 'python manage.py collectstatic --no-input' >> /opt/app/initialize.sh && \
echo 'python manage.py import --filename scheduled_tasks.json' >> /opt/app/initialize.sh && \
echo 'fi' >> /opt/app/initialize.sh && \
chmod +x /opt/app/initialize.sh
# Serving script
RUN echo '#!/bin/bash' > /opt/app/run.sh && \
echo 'gunicorn core.wsgi:application --bind 0.0.0.0:8000 & python manage.py rqworker high default low' >> /opt/app/run.sh && \
#echo 'python manage.py runserver & python manage.py rqworker high default low' >> /opt/app/run.sh && \
chmod +x /opt/app/run.sh
# Run Django's server & workers
CMD ["sh", "-c", "/opt/app/initialize.sh && /opt/app/run.sh"]

View File

@@ -2,18 +2,9 @@
``` ```
conda create -n matitos_urls python=3.12 conda create -n matitos_urls python=3.12
conda activate matitos_urls conda activate matitos_urls
# Core pip install -r requirements.txt
pip install django==5.1 psycopg[binary] django-redis django-tasks-scheduler
# Fetcher
pip install feedparser python-dateutil newspaper4k[all] lxml[html_clean] googlenewsdecoder gnews duckduckgo_search GoogleNews langdetect
# News visualization
pip install ollama
``` ```
* Database
* Database initialization -> 1-DB.ipynb
* From automated inspectdb * From automated inspectdb
``` ```
# 1) Inspect DB, generate models.py # 1) Inspect DB, generate models.py
@@ -74,60 +65,19 @@ class Meta:
db_table = 'urls' # db_table = '{}_urls'.format(project_name) db_table = 'urls' # db_table = '{}_urls'.format(project_name)
``` ```
* Database & initialization
* Check initialize.sh on Dockerfile
* Environment variables * Environment variables
``` * In docker-compose.yml
# Database
DB_NAME=${DB_NAME:-matitos}
DB_USER=${DB_NAME:-supermatitos}
DB_PASSWORD=${DB_NAME:-supermatitos}
DB_HOST=${DB_NAME:-localhost}
DB_PORT=${DB_NAME:-5432}
REDIS_HOST=${REDIS_HOST:-localhost}
REDIS_PORT=${REDIS_PORT:-6379}
# Job timeout: 30 min
JOB_DEFAULT_TIMEOUT=${RQ_DEFAULT_TIMEOUT:-1800}
# Logs path
PATH_LOGS_PARAMETERIZATION="logs/log_app_fetcher_{}.log"
# Fetcher
FETCHER_GNEWS_DECODE_SLEEP=2
FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=4
FETCHER_BETWEEN_SEARCHES_SLEEP=5
FETCHER_URL_HOST_SLEEP=5
FETCHER_LANGUAGE_DETECTION_MIN_CHAR=100
SELENIUM_ENDPOINT="http://selenium_app:80"
```
* Deploy * Deploy
``` ```
# Migrations # Check environments variables on docker-compose.yml
python manage.py makemigrations api; python manage.py migrate --fake-initial
# Create user
python manage.py createsuperuser
# 1) Server # Remove previous instances
python manage.py runserver docker compose down -v
# 2) Workers # Build & up
python manage.py rqworker high default low docker compose up -d --build
# Visualize DB
http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id
``` ```
* Scheduled tasks
```
# Import tasks
python manage.py import --filename scheduled_tasks.json
# Modify using the admin panel, then save
# python manage.py export > scheduled_tasks.json
```
* Utils. TODO: To endpoint...
```
python manage.py rqstats
```

View File

@@ -1,295 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Charts</title>
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
<style>
body {
background-color: #333;
color: #fff;
font-family: Arial, sans-serif;
}
h2 {
color: #fff;
text-align: center;
margin-bottom: 40px;
}
.chart-container {
width: 45%;
display: inline-block;
margin: 20px;
background-color: #444;
border-radius: 10px;
padding: 5px;
}
canvas {
background-color: #2c2c2c;
border-radius: 5px;
}
.container {
display: flex;
justify-content: center;
flex-wrap: wrap;
}
.filter-container {
text-align: center;
margin-bottom: 20px;
}
select {
padding: 8px;
background-color: #555;
color: white;
border: 1px solid #444;
border-radius: 5px;
}
</style>
</head>
<body>
<h2>Data Visualizations</h2>
<!-- Filter for Number of Days -->
<div class="filter-container">
<label for="daysFilter">Select Number of Days:</label>
<select id="daysFilter">
<option value="0.25">Last 6 Hours</option>
<option value="1">Last 24 Hours</option>
<option value="3">Last 3 Days</option>
<option value="7" selected>Last 7 Days</option>
<option value="30">Last 30 Days</option>
<option value="90">Last 90 Days</option>
<option value="365">Last 365 Days</option>
</select>
</div>
<div class="container">
<div class="chart-container">
<canvas id="urlFetchDateChart"></canvas>
</div>
<div class="chart-container">
<canvas id="urlStatusChart"></canvas>
</div>
<div class="chart-container">
<canvas id="urlsPerSourceChart"></canvas>
</div>
<div class="chart-container">
<canvas id="urlsPerSearchChart"></canvas>
</div>
</div>
<script>
$(document).ready(function () {
// Fetch initial data (default 30 days)
const defaultDays = 7;
fetchDataAndRenderCharts(defaultDays);
// Apply the filter automatically when the user changes the selection
$('#daysFilter').change(function () {
const selectedDays = $(this).val();
fetchDataAndRenderCharts(selectedDays);
});
});
function fetchDataAndRenderCharts(days) {
// Fetch and render the URL Fetch Date chart
$.getJSON(`/urls-by-fetch-date/?days=${days}`, function (data) {
renderUrlFetchDateChart(data);
});
// Fetch and render the URL Status chart (with dynamic date filtering)
$.getJSON(`/urls-per-status/?days=${days}`, function (data) {
renderUrlStatusChart(data);
});
// Fetch and render the URLs per Source chart
$.getJSON(`/urls-per-source/?days=${days}`, function (data) {
renderUrlsPerSourceChart(data);
});
// Fetch and render the URLs per Search chart
$.getJSON(`/urls-per-search/?days=${days}`, function (data) {
renderUrlsPerSearchChart(data);
});
}
function renderUrlFetchDateChart(data) {
new Chart(document.getElementById("urlFetchDateChart"), {
type: 'bar',
data: {
labels: data.dates,
datasets: [{
label: 'URLs by Fetch Date',
data: data.counts,
backgroundColor: 'blue',
}]
},
options: {
responsive: true,
plugins: {
legend: {
labels: {
color: '#fff' // Change the legend text color to white
}
}
},
scales: {
x: {
ticks: {
color: "#fff" // Set x-axis ticks color
},
grid: {
color: "#444" // Set grid lines color
}
},
y: {
ticks: {
color: "#fff" // Set y-axis ticks color
},
grid: {
color: "#444" // Set grid lines color
}
}
}
}
});
}
function renderUrlStatusChart(data) {
new Chart(document.getElementById("urlStatusChart"), {
type: 'bar',
data: {
labels: data.statuses,
datasets: [{
label: 'URLs by Status',
data: data.counts,
backgroundColor: 'green',
}]
},
options: {
responsive: true,
plugins: {
legend: {
labels: {
color: '#fff' // Change the legend text color to white
}
}
},
scales: {
x: {
ticks: {
color: "#fff" // Set x-axis ticks color
},
grid: {
color: "#444" // Set grid lines color
}
},
y: {
ticks: {
color: "#fff" // Set y-axis ticks color
},
grid: {
color: "#444" // Set grid lines color
}
}
}
}
});
}
function renderUrlsPerSourceChart(data) {
new Chart(document.getElementById("urlsPerSourceChart"), {
type: 'bar',
data: {
labels: data.sources,
datasets: [{
label: 'URLs by Source',
data: data.counts,
backgroundColor: 'purple',
}]
},
options: {
responsive: true,
plugins: {
legend: {
labels: {
color: '#fff' // Change the legend text color to white
}
}
},
scales: {
x: {
ticks: {
color: "#fff" // Set x-axis ticks color
},
grid: {
color: "#444" // Set grid lines color
}
},
y: {
ticks: {
color: "#fff" // Set y-axis ticks color
},
grid: {
color: "#444" // Set grid lines color
}
}
}
}
});
}
function renderUrlsPerSearchChart(data) {
new Chart(document.getElementById("urlsPerSearchChart"), {
type: 'bar',
data: {
labels: data.searches,
datasets: [{
label: 'URLs by Search',
data: data.counts,
backgroundColor: 'orange',
}]
},
options: {
responsive: true,
plugins: {
legend: {
labels: {
color: '#fff' // Change the legend text color to white
}
}
},
scales: {
x: {
ticks: {
color: "#fff" // Set x-axis ticks color
},
grid: {
color: "#444" // Set grid lines color
}
},
y: {
ticks: {
color: "#fff" // Set y-axis ticks color
},
grid: {
color: "#444" // Set grid lines color
}
}
}
}
});
}
</script>
</body>
</html>

View File

@@ -20,12 +20,13 @@ BASE_DIR = Path(__file__).resolve().parent.parent
# Quick-start development settings - unsuitable for production # Quick-start development settings - unsuitable for production
# SECURITY WARNING: keep the secret key used in production secret! # SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = 'django-insecure-54mqLbW5NlO8OlVDsT3fcbg3Vf6C8Fgcoj8H0hXv3Pr8bpgqvOuiaeqvGn34sGwt' SECRET_KEY = os.getenv("DJANGO_SECRET_KEY", 'django-insecure-54mqLbW5NlO8OlVDsT3fcbg3Vf6C8Fgcoj8H0hXv3Pr8bpgqvOuiaeqvGn34sGwt')
# SECURITY WARNING: don't run with debug turned on in production! # SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True DEBUG = (os.environ.get('DJANGO_DEBUG') == "True")
print("Django debug mode:", DEBUG)
ALLOWED_HOSTS = [] ALLOWED_HOSTS = os.environ.get('DJANGO_ALLOWED_HOSTS', "*").split(",")
# Application definition # Application definition
@@ -38,11 +39,12 @@ INSTALLED_APPS = [
'django.contrib.messages', 'django.contrib.messages',
'django.contrib.staticfiles', 'django.contrib.staticfiles',
'scheduler', 'scheduler',
'api', 'fetcher',
] ]
MIDDLEWARE = [ MIDDLEWARE = [
'django.middleware.security.SecurityMiddleware', 'django.middleware.security.SecurityMiddleware',
'whitenoise.middleware.WhiteNoiseMiddleware', # Serving static files
'django.contrib.sessions.middleware.SessionMiddleware', 'django.contrib.sessions.middleware.SessionMiddleware',
'django.middleware.common.CommonMiddleware', 'django.middleware.common.CommonMiddleware',
'django.middleware.csrf.CsrfViewMiddleware', 'django.middleware.csrf.CsrfViewMiddleware',
@@ -51,6 +53,8 @@ MIDDLEWARE = [
'django.middleware.clickjacking.XFrameOptionsMiddleware', 'django.middleware.clickjacking.XFrameOptionsMiddleware',
] ]
STATICFILES_STORAGE = 'whitenoise.storage.CompressedManifestStaticFilesStorage'
ROOT_URLCONF = 'core.urls' ROOT_URLCONF = 'core.urls'
TEMPLATES = [ TEMPLATES = [
@@ -121,7 +125,7 @@ SCHEDULER_QUEUES = {
} }
} }
SCHEDULER_CONFIG = { SCHEDULER_CONFIG = {
'DEFAULT_TIMEOUT': os.environ.get("JOB_DEFAULT_TIMEOUT", 60*30), # 15 minutes 'DEFAULT_TIMEOUT': os.environ.get("JOB_DEFAULT_TIMEOUT", 60*30), # 30 minutes
'DEFAULT_RESULT_TTL': 60*60*12, # 12 hours 'DEFAULT_RESULT_TTL': 60*60*12, # 12 hours
'EXECUTIONS_IN_PAGE': 20, 'EXECUTIONS_IN_PAGE': 20,
'SCHEDULER_INTERVAL': 10, # 10 seconds 'SCHEDULER_INTERVAL': 10, # 10 seconds
@@ -158,7 +162,8 @@ USE_TZ = True
# Static files (CSS, JavaScript, Images) # Static files (CSS, JavaScript, Images)
STATIC_URL = 'static/' STATIC_URL = '/static/'
STATIC_ROOT = os.path.join(BASE_DIR, 'static')
# Default primary key field type # Default primary key field type
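
Taken together, the static-file changes in this settings diff amount to roughly the following fragment (a minimal sketch assembled from the hunks above, not copied verbatim; the WhiteNoise middleware sits directly after SecurityMiddleware):

```python
# Sketch of the WhiteNoise-related settings introduced in this commit.
import os
from pathlib import Path

BASE_DIR = Path(__file__).resolve().parent.parent

MIDDLEWARE = [
    'django.middleware.security.SecurityMiddleware',
    'whitenoise.middleware.WhiteNoiseMiddleware',   # serve collected static files from the app process
    # ... remaining Django middleware ...
]

# Compressed files with hashed names, so aggressive caching headers are safe
STATICFILES_STORAGE = 'whitenoise.storage.CompressedManifestStaticFilesStorage'

STATIC_URL = '/static/'
STATIC_ROOT = os.path.join(BASE_DIR, 'static')      # target of `collectstatic` in initialize.sh
```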

View File

@@ -20,5 +20,5 @@ from django.urls import path, include
urlpatterns = [ urlpatterns = [
path('admin/', admin.site.urls), path('admin/', admin.site.urls),
path('scheduler/', include('scheduler.urls')), path('scheduler/', include('scheduler.urls')),
path('', include('api.urls')), path('', include('fetcher.urls')),
] ]

145
app_urls/db.py Normal file
View File

@@ -0,0 +1,145 @@
import argparse
import os
import psycopg
import re
connection_info = "host={} port={} dbname={} user={} password={} connect_timeout=60".format(
os.environ.get("DB_HOST", "localhost"),
os.environ.get("DB_PORT", "5432"),
os.environ.get("DB_NAME", "matitos"),
os.environ.get("DB_USER", "supermatitos"),
os.environ.get("DB_PASSWORD", "supermatitos")
)
def initialize_tables():
# Connect to an existing database
with psycopg.connect(connection_info) as conn:
# Open a cursor to perform database operations
with conn.cursor() as cur:
# Autocommit at end of transaction (Atomic creation of tables)
with conn.transaction() as tx:
# Create URLs table
c = cur.execute("""
CREATE TYPE URL_STATUS AS ENUM ('raw', 'error', 'valid', 'unknown', 'invalid', 'duplicate');
CREATE TABLE URLS (
id SERIAL PRIMARY KEY,
url TEXT NOT NULL UNIQUE,
ts_fetch TIMESTAMPTZ NOT NULL DEFAULT NOW(),
status URL_STATUS NOT NULL DEFAULT 'raw' -- ,
-- status_wendy WENDY_STATUS DEFAULT NULL,
-- ts_wendy TIMESTAMPTZ DEFAULT NULL
);
CREATE INDEX idx_urls_status ON urls(status);
CREATE INDEX idx_urls_ts_fetch ON urls(ts_fetch);
CREATE TABLE URLS_DUPLICATE (
id_url_canonical INTEGER REFERENCES URLS(id),
id_url_duplicated INTEGER REFERENCES URLS(id),
PRIMARY KEY (id_url_canonical, id_url_duplicated)
);
CREATE TYPE SEARCH_TYPE AS ENUM ('rss_feed', 'keyword_search', 'url_host');
CREATE TABLE SEARCH (
id SMALLSERIAL PRIMARY KEY,
search TEXT NOT NULL UNIQUE,
type SEARCH_TYPE NOT NULL
-- language_country CHAR(5), -- Language: ISO 639-1 Code. Country: ISO 3166 ALPHA-2. e.g.: en-us. Required for search
-- UNIQUE(search, language_country)
);
CREATE INDEX idx_search_type ON SEARCH(type);
CREATE TABLE SOURCE (
id SMALLSERIAL PRIMARY KEY,
source TEXT NOT NULL UNIQUE
);
-- CREATE TABLE SEARCH_LANGUAGE (
-- language CHAR(2) NOT NULL, -- ISO 639-1 Code, e.g. "en"
-- country CHAR(2) NOT NULL, -- ISO 3166 ALPHA-2, e.g. "us"
-- PRIMARY KEY (language, country)
-- );
CREATE TABLE URLS_SOURCE_SEARCH (
id_url INTEGER REFERENCES URLS(id),
id_source SMALLINT REFERENCES SOURCE(id) ON UPDATE CASCADE ON DELETE RESTRICT,
id_search SMALLINT REFERENCES SEARCH(id) ON UPDATE CASCADE ON DELETE RESTRICT,
PRIMARY KEY(id_url, id_source, id_search)
);
CREATE INDEX idx_source ON URLS_SOURCE_SEARCH(id_source);
CREATE INDEX idx_search ON URLS_SOURCE_SEARCH(id_search);
CREATE TABLE STATUS_PATTERN_MATCHING (
pattern TEXT PRIMARY KEY,
priority SMALLINT NOT NULL,
status URL_STATUS NOT NULL
);
CREATE TABLE URL_CONTENT (
id_url INTEGER PRIMARY KEY REFERENCES URLS(id),
date_published TIMESTAMPTZ DEFAULT NOW(),
title TEXT,
description TEXT,
content TEXT,
valid_content BOOLEAN,
language CHAR(2), -- ISO 639-1 Code
keywords TEXT[],
tags TEXT[],
authors TEXT[],
image_main_url TEXT,
images_url TEXT[],
videos_url TEXT[],
url_host TEXT, -- www.breitbart.com
site_name TEXT -- Breitbart News
);
CREATE INDEX idx_tags ON URL_CONTENT USING GIN(tags);
CREATE INDEX idx_authors ON URL_CONTENT USING GIN(authors);
CREATE INDEX idx_date_published ON URL_CONTENT (date_published);
CREATE INDEX idx_valid_content ON URL_CONTENT (valid_content);
CREATE INDEX idx_language ON URL_CONTENT (language);
CREATE INDEX idx_url_host ON URL_CONTENT (url_host);
""")
def initialize_data():
# Connect to an existing database
with psycopg.connect(connection_info) as conn:
# Open a cursor to perform database operations
with conn.cursor() as cur:
# Autocommit at end of transaction (Atomic creation of data)
with conn.transaction() as tx:
# Feeds
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC', 'rss_feed');" )
# Websites of interest
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/poster', 'url_host');" )
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/new-poster', 'url_host');" )
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('breitbart.com', 'url_host');" )
# Search keywords
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search');" )
# TODO: Language per search
# cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search', 'en-us');" )
# cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search', 'en-gb');" )
# Status update based on pattern matching (with priority to apply in order). Regex test https://regex101.com/
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("youtube.com/"))) )
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("tiktok.com/"))) )
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("twitter.com/"))) )
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("reddit.com/"))) )
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("libreddit.de/"))) )
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("radio.foxnews.com/"))) )
def main(name):
print('Hello, %s!' % name)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Database initialization')
parser.add_argument('--initialize_tables', help='Create DB tables', action='store_true', default=False)
parser.add_argument('--initialize_data', help='Insert data', action='store_true', default=False)
args = parser.parse_args()
if (args.initialize_tables):
print("Initializing tables")
initialize_tables()
if (args.initialize_data):
print("Initializing data")
initialize_data()
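
For context, a hedged sketch of how the schema created by initialize_tables() is meant to be used: insert a fetched URL and link it to its source and search. Table and column names come from the diff above; the ON CONFLICT handling and the sample values are assumptions, not part of this commit.

```python
# Illustrative only: store one URL and its (source, search) provenance atomically.
import psycopg

url = "https://example.org/some-article"   # hypothetical URL

with psycopg.connect(connection_info) as conn:
    with conn.cursor() as cur, conn.transaction():
        # Upsert the URL and get its id (RETURNING yields no row on conflict, so fall back to SELECT)
        cur.execute("INSERT INTO URLS (url) VALUES (%s) ON CONFLICT (url) DO NOTHING RETURNING id;", (url,))
        row = cur.fetchone()
        if row is None:
            cur.execute("SELECT id FROM URLS WHERE url = %s;", (url,))
            row = cur.fetchone()
        id_url = row[0]

        # Assumed source name; SOURCE rows are normally created by the fetchers themselves
        cur.execute("INSERT INTO SOURCE (source) VALUES (%s) ON CONFLICT (source) DO NOTHING;", ("rss",))
        cur.execute("SELECT id FROM SOURCE WHERE source = %s;", ("rss",))
        id_source = cur.fetchone()[0]
        cur.execute("SELECT id FROM SEARCH WHERE search = %s;", ("child abuse",))
        id_search = cur.fetchone()[0]

        # Link URL to the source/search combination that produced it
        cur.execute(
            "INSERT INTO URLS_SOURCE_SEARCH (id_url, id_source, id_search) VALUES (%s, %s, %s) ON CONFLICT DO NOTHING;",
            (id_url, id_source, id_search),
        )
```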

View File

@@ -1,6 +1,6 @@
from django.apps import AppConfig from django.apps import AppConfig
class ApiConfig(AppConfig): class FetcherConfig(AppConfig):
default_auto_field = 'django.db.models.BigAutoField' default_auto_field = 'django.db.models.BigAutoField'
name = 'api' name = 'fetcher'

View File

@@ -65,7 +65,7 @@ class Migration(migrations.Migration):
migrations.CreateModel( migrations.CreateModel(
name='UrlContent', name='UrlContent',
fields=[ fields=[
('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='api.urls')), ('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='fetcher.urls')),
('date_published', models.DateTimeField(blank=True, null=True)), ('date_published', models.DateTimeField(blank=True, null=True)),
('title', models.TextField(blank=True, null=True)), ('title', models.TextField(blank=True, null=True)),
('description', models.TextField(blank=True, null=True)), ('description', models.TextField(blank=True, null=True)),
@@ -89,7 +89,7 @@ class Migration(migrations.Migration):
migrations.CreateModel( migrations.CreateModel(
name='UrlsDuplicate', name='UrlsDuplicate',
fields=[ fields=[
('id_url_canonical', models.OneToOneField(db_column='id_url_canonical', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='api.urls')), ('id_url_canonical', models.OneToOneField(db_column='id_url_canonical', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='fetcher.urls')),
], ],
options={ options={
'db_table': 'urls_duplicate', 'db_table': 'urls_duplicate',
@@ -99,7 +99,7 @@ class Migration(migrations.Migration):
migrations.CreateModel( migrations.CreateModel(
name='UrlsSourceSearch', name='UrlsSourceSearch',
fields=[ fields=[
('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='api.urls')), ('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='fetcher.urls')),
], ],
options={ options={
'db_table': 'urls_source_search', 'db_table': 'urls_source_search',

View File

@@ -1,6 +1,8 @@
import time import time
import feedparser import feedparser
import os import os
from django.utils import timezone
from datetime import timedelta
from ..models import Search, Source from ..models import Search, Source
from .fetch_utils import decode_gnews_urls from .fetch_utils import decode_gnews_urls
from .logger import get_logger from .logger import get_logger
@@ -9,6 +11,7 @@ logger = get_logger()
from gnews import GNews from gnews import GNews
from duckduckgo_search import DDGS from duckduckgo_search import DDGS
from GoogleNews import GoogleNews from GoogleNews import GoogleNews
from search_engines import Yahoo, Aol
########################################################################### ###########################################################################
########################################################################### ###########################################################################
@@ -42,11 +45,19 @@ class FetcherAbstract(ABC):
return raw_urls return raw_urls
def fetch_articles(self, db_writer, obj_search): def fetch_articles(self, db_writer, obj_search):
# Search
keyword_search = "{}{}".format("site:" if obj_search.type == Search.TYPE_ENUM.URL_HOST else "", obj_search.search)
# Source name # Source name
source_name = self._get_name() source_name = self._get_name()
# Search
keyword_search = obj_search.search
# URL Host search? -> site:${URL_HOST}
if (obj_search.type == Search.TYPE_ENUM.URL_HOST):
keyword_search = "{}{}".format("site:", keyword_search)
# Keyword search & using a General search? -> ${SEARCH} news after:${LAST_WEEK}
if ("general" in source_name) and (obj_search.type == Search.TYPE_ENUM.KEYWORD_SEARCH):
start_date = timezone.now() - timedelta(days=7)
keyword_search = "{}{}".format(keyword_search, "news after:{}-{}-{}".format(start_date.month, start_date.day, start_date.year))
logger.debug("Starting search: {} - {}".format(keyword_search, source_name)) logger.debug("Starting search: {} - {}".format(keyword_search, source_name))
# Fetch # Fetch
raw_urls = self._fetch_raw_urls(keyword_search) raw_urls = self._fetch_raw_urls(keyword_search)
@@ -165,11 +176,11 @@ class SearchGoogleGeneral(FetcherAbstract):
self.language = args.get("language", "en") self.language = args.get("language", "en")
self.country = args.get("country", "US") self.country = args.get("country", "US")
self.period = args.get("period", "7d") self.period = args.get("period", "7d")
self.max_pages = args.get("max_pages", 1) self.pages = args.get("pages", 1)
def _get_name(self): def _get_name(self):
# [source] [period] [language-country] [pages] # [source] [period] [language-country] [pages]
return "google-general {} {}-{} pages={}".format(self.period, self.language, self.country, self.max_pages).replace("pages=None", "").strip() return "google-general {} {}-{} pages={}".format(self.period, self.language, self.country, self.pages).replace("pages=None", "").strip()
def _fetch_raw_urls(self, keyword_search): def _fetch_raw_urls(self, keyword_search):
try: try:
@@ -181,7 +192,7 @@ class SearchGoogleGeneral(FetcherAbstract):
set_links = set() set_links = set()
# Iterate pages # Iterate pages
for i in range(self.max_pages): for i in range(self.pages):
# Sleep between pages fetch # Sleep between pages fetch
time.sleep(int(os.getenv("FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP", 4))) time.sleep(int(os.getenv("FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP", 4)))
# Number of URLs fetched so far # Number of URLs fetched so far
@@ -253,7 +264,45 @@ class SearchGoogleNewsRSS(FetcherAbstract):
urls = [] urls = []
return urls return urls
class SearchYahooGeneral(FetcherAbstract):
def __init__(self, args={}):
super().__init__()
# Parameters
self.pages = args.get("pages", 2)
def _get_name(self):
# [source] [language-country] [pages]
return "yahoo-general pages={}".format(self.pages).replace("pages=None", "").strip()
def _fetch_raw_urls(self, keyword_search):
try:
results = Yahoo().search(keyword_search, pages=self.pages)
urls = results.links()
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchAOLGeneral(FetcherAbstract):
def __init__(self, args={}):
super().__init__()
# Parameters
self.pages = args.get("pages", 2)
def _get_name(self):
# [source] [language-country] [pages]
return "aol-general pages={}".format(self.pages).replace("pages=None", "").strip()
def _fetch_raw_urls(self, keyword_search):
try:
results = Aol().search(keyword_search, pages=self.pages)
urls = results.links()
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
########################################################################### ###########################################################################
# List of instances # List of instances
ListSearchInstances = [SearchGNews, SearchDuckDuckGoNews, SearchGoogleNews, SearchDuckDuckGoGeneral, SearchGoogleGeneral, SearchGoogleNewsRSS] ListSearchInstances = [SearchGNews, SearchDuckDuckGoNews, SearchGoogleNews, SearchAOLGeneral, SearchYahooGeneral, SearchDuckDuckGoGeneral, SearchGoogleGeneral, SearchGoogleNewsRSS]
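
For reference, a standalone sketch of what the reworked query construction in fetch_articles() produces for the two search types. The helper below is hypothetical (not part of the diff); only the prefix/suffix logic mirrors the code above.

```python
# Hypothetical helper mirroring the keyword_search construction in fetch_articles().
from datetime import datetime, timedelta

def build_query(search: str, search_type: str, source_name: str) -> str:
    query = search
    if search_type == "url_host":
        query = "site:" + query                      # restrict results to one site
    if "general" in source_name and search_type == "keyword_search":
        start = datetime.now() - timedelta(days=7)   # general web search: only last week's news
        query = "{} news after:{}-{}-{}".format(query, start.month, start.day, start.year)
    return query

print(build_query("breitbart.com", "url_host", "google-general 7d en-US pages=1"))
# -> site:breitbart.com
print(build_query("child abuse", "keyword_search", "google-general 7d en-US pages=1"))
# -> child abuse news after:3-28-2025 (date depends on the current day)
```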

View File

@@ -2,30 +2,29 @@ import logging
import os import os
# Get env var # Get env var
path_logs_parameterization = os.getenv("PATH_LOGS_PARAMETERIZATION", "logs/log_app_fetcher_{}.log") logs_directory = os.getenv("PATH_LOGS_DIRECTORY", "logs")
# Directory of logs # Directory of logs
directory = '/'.join(path_logs_parameterization.split("/")[:-1]) os.makedirs(logs_directory, exist_ok=True)
os.makedirs(directory, exist_ok=True)
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s') logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher") logger = logging.getLogger("fetcher")
logger.setLevel(logging.DEBUG) logger.setLevel(logging.DEBUG)
# To file log: INFO / WARNING / ERROR / CRITICAL # To file log: INFO / WARNING / ERROR / CRITICAL
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("debug"), mode="a", maxBytes=10000000, backupCount=4) fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "debug.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s')) fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.DEBUG) fh.setLevel(logging.DEBUG)
logger.addHandler(fh) logger.addHandler(fh)
# To file log: INFO / WARNING / ERROR # To file log: INFO / WARNING / ERROR
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("info"), mode="a", maxBytes=10000000, backupCount=2) fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "info.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s')) fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.INFO) fh.setLevel(logging.INFO)
logger.addHandler(fh) logger.addHandler(fh)
# To file log: WARNING / ERROR / CRITICAL # To file log: WARNING / ERROR / CRITICAL
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("warning"), mode="a", maxBytes=10000000, backupCount=1) fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "warning.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s')) fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.WARNING) fh.setLevel(logging.WARNING)
logger.addHandler(fh) logger.addHandler(fh)
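
A small usage sketch of the refactored logger setup above. The import path is an assumption; PATH_LOGS_DIRECTORY has to be set before the module is first imported, because the log directory is created at import time.

```python
# Hypothetical usage of the refactored logging setup (module path assumed).
import os
os.environ.setdefault("PATH_LOGS_DIRECTORY", "logs")   # debug.log / info.log / warning.log rotate here

from fetcher.fetch.logger import get_logger            # assumed import path for the module above
logger = get_logger()

logger.debug("only written to debug.log")
logger.info("written to info.log and debug.log")
logger.warning("written to warning.log, info.log and debug.log")
```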

View File

@@ -73,9 +73,6 @@ def process_missing_kids_urls_all(batch_size=None):
logger.info("Task completed: {}".format(task)) logger.info("Task completed: {}".format(task))
@job('default') @job('default')
def background_task(process_type: str): def background_task(process_type: str):
logger.info("Task triggered: {}".format(process_type)) logger.info("Task triggered: {}".format(process_type))

View File

@@ -0,0 +1,179 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Charts</title>
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
<style>
body {
background-color: #333;
color: #fff;
font-family: Arial, sans-serif;
}
h2 {
color: #fff;
text-align: center;
margin-bottom: 40px;
}
.chart-container {
width: 45%;
display: inline-block;
margin: 20px;
background-color: #444;
border-radius: 10px;
padding: 5px;
}
canvas {
background-color: #2c2c2c;
border-radius: 5px;
}
.container {
display: flex;
justify-content: center;
flex-wrap: wrap;
}
.filter-container {
text-align: center;
margin-bottom: 20px;
}
select {
padding: 8px;
background-color: #555;
color: white;
border: 1px solid #444;
border-radius: 5px;
}
</style>
</head>
<body>
<h2>Data Visualizations</h2>
<!-- Filter for Number of Days -->
<div class="filter-container">
<label for="daysFilter">Select Number of Days:</label>
<select id="daysFilter">
<option value="0.0625">Last 90 Minutes</option>
<option value="0.25">Last 6 Hours</option>
<option value="1">Last 24 Hours</option>
<option value="7" selected>Last 7 Days</option>
<option value="30">Last 30 Days</option>
<option value="90">Last 90 Days</option>
<option value="365">Last 365 Days</option>
</select>
</div>
<div class="container">
<div class="chart-container">
<canvas id="urlFetchDateChart"></canvas>
</div>
<div class="chart-container">
<canvas id="urlStatusChart"></canvas>
</div>
<div class="chart-container">
<canvas id="urlsPerSourceChart"></canvas>
</div>
<div class="chart-container">
<canvas id="urlsPerSearchChart"></canvas>
</div>
</div>
<script>
$(document).ready(function () {
let chartInstances = {}; // Store chart instances
// Fetch initial data (default 7 days)
const defaultDays = 7;
fetchDataAndRenderCharts(defaultDays);
// Apply the filter automatically when the user changes the selection
$('#daysFilter').on('change', function () {
const selectedDays = $(this).val();
fetchDataAndRenderCharts(selectedDays);
});
function fetchDataAndRenderCharts(days) {
fetchAndRenderChart(`/urls-by-fetch-date/?days=${days}`, 'urlFetchDateChart', 'URLs by Fetch Date', 'bar');
fetchAndRenderChart(`/urls-per-status/?days=${days}`, 'urlStatusChart', 'URLs by Status', 'bar');
fetchAndRenderChart(`/urls-per-source/?days=${days}`, 'urlsPerSourceChart', 'URLs by Source', 'bar');
fetchAndRenderChart(`/urls-per-search/?days=${days}`, 'urlsPerSearchChart', 'URLs by Search', 'bar');
}
const categoryColors = {
'URLs by Fetch Date': '#4BC0C0', // Color for this category
'URLs by Status': '#36A2EB', // Color for this category
'URLs by Source': '#4BC0C0', // Color for this category
'URLs by Search': '#36A2EB' // Color for this category
};
const maxLabelLength = 35; // Truncate X-axis labels to 35 characters
function fetchAndRenderChart(url, canvasId, chartTitle, chartType) {
$.getJSON(url, function (data) {
if (chartInstances[canvasId]) {
chartInstances[canvasId].destroy(); // Destroy previous chart
}
const ctx = document.getElementById(canvasId).getContext('2d');
chartInstances[canvasId] = new Chart(ctx, {
type: chartType,
data: {
labels: data.labels, // Ensure labels are passed as strings
datasets: [{
label: chartTitle,
data: data.values,
backgroundColor: categoryColors[chartTitle], // Assign the same color based on category
}]
},
options: {
responsive: true,
plugins: {
legend: {
labels: { color: '#fff' }
}
},
scales: {
x: {
ticks: {
color: "#fff", // Set the color of x-axis ticks
callback: function (value) {
let label = data.labels[value];
if (label.length > maxLabelLength) { return label.slice(0, maxLabelLength) + '...'; }
return label;
}
},
grid: {
color: "#444" // Set the grid lines color
}
},
y: {
ticks: {
color: "#fff" // Set the color of y-axis ticks
},
grid: {
color: "#444" // Set the grid lines color
}
}
}
}
});
});
}
});
</script>
</body>
</html>
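The fetchAndRenderChart helper above assumes every data endpoint returns a flat {labels, values} JSON payload, which is the shape the views.py changes later in this commit produce. For reference, a hypothetical endpoint satisfying that contract:

    # Illustrative only; the real endpoints live in the app's views.py (see below)
    from django.http import JsonResponse

    def example_chart_data(request):
        return JsonResponse({
            "labels": ["error", "ok", "raw"],  # x-axis categories, same length as values
            "values": [12, 340, 57],           # bar heights rendered by Chart.js
        })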

View File

@@ -113,11 +113,11 @@ input[type="checkbox"] {
} }
/* Themed Toggle Button */ /* Themed Toggle Button */
.theme-button, .home-button { .theme-button, .home-button, .chart-button {
background-color: var(--sidebar); background-color: var(--sidebar);
border: 1px solid var(--sidebar); border: 1px solid var(--sidebar);
border-radius: 50%; border-radius: 50%;
width: 45px; width: 30px;
height: 45px; height: 45px;
font-size: 25px; font-size: 25px;
display: flex; display: flex;
@@ -127,10 +127,10 @@ input[type="checkbox"] {
cursor: pointer; cursor: pointer;
} }
.theme-button:hover, .home-button:hover { .theme-button:hover, .home-button:hover, .chart-button:hover {
transform: rotate(20deg); transform: rotate(20deg);
} }
.theme-button:active, .home-button:active { .theme-button:active, .home-button:active, .chart-button:active {
transform: scale(0.95); transform: scale(0.95);
} }
@@ -235,6 +235,7 @@ input[type="checkbox"] {
<div class="button-container"> <div class="button-container">
<button id="homeButton" class="home-button">🏠</button> <button id="homeButton" class="home-button">🏠</button>
<button id="themeToggle" class="theme-button">🌙</button> <button id="themeToggle" class="theme-button">🌙</button>
<button id="chartButton" class="chart-button">📊</button>
</div> </div>
<form method="GET" action="" id="filterForm"> <form method="GET" action="" id="filterForm">
@@ -477,6 +478,10 @@ input[type="checkbox"] {
document.getElementById("homeButton").addEventListener("click", function () { document.getElementById("homeButton").addEventListener("click", function () {
window.location.href = "./"; // Change this to your homepage URL if different window.location.href = "./"; // Change this to your homepage URL if different
}); });
// Charts
document.getElementById("chartButton").addEventListener("click", function () {
window.location.href = "./charts"; // Change this to your homepage URL if different
});
////////////////////////////////////////////// //////////////////////////////////////////////
// Timestamp to local timezone // Timestamp to local timezone
@@ -508,26 +513,32 @@ input[type="checkbox"] {
}); });
}); });
////////////////////////////////////////////////////////////////////// // Function to update the form parameters for all sections before submitting
// Function to update the form parameter before submitting function updateFormParameters() {
function updateFormParameter(section) { // Get all distinct sections by selecting all checkboxes and extracting their "name" attributes
const checkboxes = document.querySelectorAll(`[name='${section}']`); const sections = new Set([...document.querySelectorAll("input[type='checkbox']")].map(cb => cb.name));
const allChecked = Array.from(checkboxes).every(checkbox => checkbox.checked);
// If all are checked, replace them with a hidden input with value "all"
if (allChecked) {
checkboxes.forEach(checkbox => checkbox.removeAttribute("name"));
let hiddenInput = document.createElement("input");
hiddenInput.type = "hidden";
hiddenInput.name = section;
hiddenInput.value = "all";
document.getElementById("filterForm").appendChild(hiddenInput);
} else {
checkboxes.forEach(checkbox => checkbox.setAttribute("name", section));
document.querySelectorAll(`input[name="${section}"][type="hidden"]`).forEach(hiddenInput => hiddenInput.remove());
}
// Submit form after changes sections.forEach(section => {
if (!section) return; // Skip any checkboxes without a name
const checkboxes = document.querySelectorAll(`[name='${section}']`);
const allChecked = Array.from(checkboxes).every(checkbox => checkbox.checked);
// If all checkboxes in a section are checked, remove them and add a hidden input
if (allChecked) {
checkboxes.forEach(checkbox => checkbox.removeAttribute("name"));
let hiddenInput = document.createElement("input");
hiddenInput.type = "hidden";
hiddenInput.name = section;
hiddenInput.value = "all";
document.getElementById("filterForm").appendChild(hiddenInput);
} else {
checkboxes.forEach(checkbox => checkbox.setAttribute("name", section));
document.querySelectorAll(`input[name="${section}"][type="hidden"]`).forEach(hiddenInput => hiddenInput.remove());
}
});
// Submit the form after updating all sections
document.getElementById("filterForm").submit(); document.getElementById("filterForm").submit();
} }
@@ -537,7 +548,7 @@ input[type="checkbox"] {
const checkboxes = document.querySelectorAll(`[name='${section}']`); const checkboxes = document.querySelectorAll(`[name='${section}']`);
const allChecked = Array.from(checkboxes).every(checkbox => checkbox.checked); const allChecked = Array.from(checkboxes).every(checkbox => checkbox.checked);
checkboxes.forEach(cb => cb.checked = !allChecked); checkboxes.forEach(cb => cb.checked = !allChecked);
updateFormParameter(section); updateFormParameters();
} }
// Attach event listeners to "Toggle All" buttons // Attach event listeners to "Toggle All" buttons
@@ -552,14 +563,14 @@ input[type="checkbox"] {
// Automatically submit the form when any checkbox changes // Automatically submit the form when any checkbox changes
document.querySelectorAll('input[type="checkbox"]').forEach(function(checkbox) { document.querySelectorAll('input[type="checkbox"]').forEach(function(checkbox) {
checkbox.addEventListener('change', function() { checkbox.addEventListener('change', function() {
updateFormParameter(this.name); updateFormParameters();
}); });
}); });
document.getElementById('perPageSelect').addEventListener('change', function() { document.getElementById('perPageSelect').addEventListener('change', function() {
document.getElementById('filterForm').submit(); updateFormParameters();
}); });
document.getElementById('timeFilterSelect').addEventListener('change', function() { document.getElementById('timeFilterSelect').addEventListener('change', function() {
document.getElementById('filterForm').submit(); updateFormParameters();
}); });

View File

@@ -167,13 +167,14 @@
</script> </script>
<body> <body>
<!--
<div class="sidebar"> <div class="sidebar">
<div class="button-container"> <div class="button-container">
<button id="homeButton" class="home-button">🏠</button> <button id="homeButton" class="home-button">🏠</button>
<button id="themeToggle" class="theme-button">🌙</button> <button id="themeToggle" class="theme-button">🌙</button>
</div> </div>
</div> </div>
-->
<!-- Main Content --> <!-- Main Content -->
<div class="container mt-4"> <div class="container mt-4">

View File

@@ -8,7 +8,7 @@ urlpatterns = [
# #
path('task/<str:task>', views.trigger_task, name='trigger_task'), path('task/<str:task>', views.trigger_task, name='trigger_task'),
# #
path('charts/', views.charts, name='charts'), path('urls/charts/', views.charts, name='charts'),
path('urls-by-fetch-date/', views.urls_by_fetch_date, name='urls_by_fetch_date'), path('urls-by-fetch-date/', views.urls_by_fetch_date, name='urls_by_fetch_date'),
path('urls-per-status/', views.urls_per_status, name='urls_per_status'), path('urls-per-status/', views.urls_per_status, name='urls_per_status'),
path('urls-per-source/', views.urls_per_source, name='urls_per_source'), path('urls-per-source/', views.urls_per_source, name='urls_per_source'),

View File

@@ -2,6 +2,7 @@ from .tasks import background_task
from django.core.paginator import Paginator from django.core.paginator import Paginator
from django.shortcuts import render, get_object_or_404 from django.shortcuts import render, get_object_or_404
from django.http import StreamingHttpResponse, JsonResponse, HttpResponse from django.http import StreamingHttpResponse, JsonResponse, HttpResponse
from django.contrib.auth.decorators import login_required
import ollama import ollama
from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch
import os import os
@@ -29,17 +30,18 @@ def link_list(request):
# URLs # URLs
"http://localhost:8000/urls", "http://localhost:8000/urls",
# Charts # Charts
"http://localhost:8000/charts", "http://localhost:8000/urls/charts",
# API tasks # Fetcher tasks
] + [os.path.join(prefix, l) for l in links] ] + [os.path.join(prefix, l) for l in links]
# Json # Json
return JsonResponse({"links": list_links }) return JsonResponse({"links": list_links })
#################################################################################################### ####################################################################################################
# @login_required(login_url='/admin')
def logs(request, log_type): def logs(request, log_type):
# Capture output: python manage.py rqstats # Capture output: python manage.py rqstats
try: try:
with open(os.getenv("PATH_LOGS_DEBUG", "logs/log_app_fetcher_{}.log".format(log_type)), "r") as f: with open( os.path.join( os.getenv("PATH_LOGS_DIRECTORY", "logs"), "{}.log".format(log_type) ), "r") as f:
file_content = f.read() file_content = f.read()
except Exception as e: except Exception as e:
file_content = "Error reading logs for log type :{}".format(log_type) file_content = "Error reading logs for log type :{}".format(log_type)
@@ -130,8 +132,9 @@ def charts(request):
return render(request, 'charts.html') return render(request, 'charts.html')
def urls_by_fetch_date(request): def urls_by_fetch_date(request):
# Get the date for 30 days ago # Get the filtering date parameter
start_date = timezone.now() - timedelta(days=30) days = float(request.GET.get('days', 30)) # Default is 30 days
start_date = timezone.now() - timedelta(days=days)
# Count the number of URLs grouped by fetch date # Count the number of URLs grouped by fetch date
urls_data = Urls.objects.filter(ts_fetch__gte=start_date) \ urls_data = Urls.objects.filter(ts_fetch__gte=start_date) \
@@ -141,8 +144,8 @@ def urls_by_fetch_date(request):
# Format data to return as JSON # Format data to return as JSON
data = { data = {
'dates': [item['ts_fetch__date'] for item in urls_data], 'labels': [item['ts_fetch__date'] for item in urls_data],
'counts': [item['count'] for item in urls_data], 'values': [item['count'] for item in urls_data],
} }
return JsonResponse(data) return JsonResponse(data)
@@ -160,38 +163,48 @@ def urls_per_status(request):
# Format data for JSON # Format data for JSON
data = { data = {
'statuses': [item['status'] for item in urls_data], 'labels': [item['status'] for item in urls_data],
'counts': [item['count'] for item in urls_data], 'values': [item['count'] for item in urls_data],
} }
return JsonResponse(data) return JsonResponse(data)
def urls_per_source(request): def urls_per_source(request):
# Get the filtering date parameter
days = float(request.GET.get('days', 30)) # Default is 30 days
start_date = timezone.now() - timedelta(days=days)
# Count the number of URLs grouped by source # Count the number of URLs grouped by source
urls_data = UrlsSourceSearch.objects \ urls_data = UrlsSourceSearch.objects \
.filter(id_url__ts_fetch__gte=start_date) \
.values('id_source__source') \ .values('id_source__source') \
.annotate(count=Count('id_url')) \ .annotate(count=Count('id_url')) \
.order_by('id_source__source') .order_by('id_source__source')
# Format data for JSON # Format data for JSON
data = { data = {
'sources': [item['id_source__source'] for item in urls_data], 'labels': [item['id_source__source'] for item in urls_data],
'counts': [item['count'] for item in urls_data], 'values': [item['count'] for item in urls_data],
} }
return JsonResponse(data) return JsonResponse(data)
def urls_per_search(request): def urls_per_search(request):
# Get the filtering date parameter
days = float(request.GET.get('days', 30)) # Default is 30 days
start_date = timezone.now() - timedelta(days=days)
# Count the number of URLs grouped by search # Count the number of URLs grouped by search
urls_data = UrlsSourceSearch.objects \ urls_data = UrlsSourceSearch.objects \
.filter(id_url__ts_fetch__gte=start_date) \
.values('id_search__search') \ .values('id_search__search') \
.annotate(count=Count('id_url')) \ .annotate(count=Count('id_url')) \
.order_by('id_search__search') .order_by('id_search__search')
# Format data for JSON # Format data for JSON
data = { data = {
'searches': [item['id_search__search'] for item in urls_data], 'labels': [item['id_search__search'] for item in urls_data],
'counts': [item['count'] for item in urls_data], 'values': [item['count'] for item in urls_data],
} }
return JsonResponse(data) return JsonResponse(data)
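The four chart views now repeat the same days parsing and cut-off computation. A possible follow-up (not part of this commit) would be a small shared helper:

    # Hypothetical consolidation sketch
    from datetime import timedelta
    from django.utils import timezone

    def _chart_window(request, default_days=30):
        # "days" may be fractional (e.g. 0.25 for the last 6 hours), matching the template filter
        days = float(request.GET.get("days", default_days))
        return timezone.now() - timedelta(days=days)

Each view would then start with start_date = _chart_window(request) before building its queryset.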

17
app_urls/requirements.txt Normal file
View File

@@ -0,0 +1,17 @@
django==5.1
psycopg[binary]
django-redis
django-tasks-scheduler
gunicorn
whitenoise
feedparser
python-dateutil
newspaper4k[all]
lxml[html_clean]
googlenewsdecoder
gnews
GoogleNews
duckduckgo_search
git+https://github.com/tasos-py/Search-Engines-Scraper.git
langdetect
ollama
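whitenoise is added here in line with the commit's "whitenoise serving static" change. The matching settings.py is not part of this diff, but a minimal Django 5.x setup would typically look like the following (BASE_DIR and the surrounding middleware list are assumptions):

    MIDDLEWARE = [
        "django.middleware.security.SecurityMiddleware",
        "whitenoise.middleware.WhiteNoiseMiddleware",  # placed right after SecurityMiddleware
        # ... remaining middleware ...
    ]

    STATIC_URL = "static/"
    STATIC_ROOT = BASE_DIR / "staticfiles"  # collectstatic output served by WhiteNoise

    STORAGES = {
        "default": {"BACKEND": "django.core.files.storage.FileSystemStorage"},
        "staticfiles": {"BACKEND": "whitenoise.storage.CompressedManifestStaticFilesStorage"},
    }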

View File

@@ -2,10 +2,10 @@
{ {
"model": "RepeatableTaskType", "model": "RepeatableTaskType",
"name": "Process error URLs", "name": "Process error URLs",
"callable": "api.tasks.process_error_urls", "callable": "fetcher.tasks.process_error_urls",
"callable_args": [], "callable_args": [],
"callable_kwargs": [], "callable_kwargs": [],
"enabled": true, "enabled": false,
"queue": "low", "queue": "low",
"repeat": null, "repeat": null,
"at_front": false, "at_front": false,
@@ -15,18 +15,39 @@
"scheduled_time": "2025-04-01T12:36:21+00:00", "scheduled_time": "2025-04-01T12:36:21+00:00",
"interval": 4, "interval": 4,
"interval_unit": "hours", "interval_unit": "hours",
"successful_runs": 15, "successful_runs": 0,
"failed_runs": 0, "failed_runs": 0,
"last_successful_run": "2025-04-01 08:37:06.722770+00:00", "last_successful_run": null,
"last_failed_run": null
},
{
"model": "RepeatableTaskType",
"name": "Process raw URLs",
"callable": "fetcher.tasks.process_raw_urls",
"callable_args": [],
"callable_kwargs": [],
"enabled": false,
"queue": "low",
"repeat": null,
"at_front": false,
"timeout": null,
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-04-01T10:20:08+00:00",
"interval": 10,
"interval_unit": "minutes",
"successful_runs": 0,
"failed_runs": 0,
"last_successful_run": null,
"last_failed_run": null "last_failed_run": null
}, },
{ {
"model": "RepeatableTaskType", "model": "RepeatableTaskType",
"name": "Process MissingKids URLs", "name": "Process MissingKids URLs",
"callable": "api.tasks.process_missing_kids_urls", "callable": "fetcher.tasks.process_missing_kids_urls",
"callable_args": [], "callable_args": [],
"callable_kwargs": [], "callable_kwargs": [],
"enabled": true, "enabled": false,
"queue": "default", "queue": "default",
"repeat": null, "repeat": null,
"at_front": false, "at_front": false,
@@ -34,20 +55,20 @@
"result_ttl": 86400, "result_ttl": 86400,
"cron_string": null, "cron_string": null,
"scheduled_time": "2025-04-01T10:37:50+00:00", "scheduled_time": "2025-04-01T10:37:50+00:00",
"interval": 2, "interval": 4,
"interval_unit": "hours", "interval_unit": "hours",
"successful_runs": 29, "successful_runs": 0,
"failed_runs": 0, "failed_runs": 0,
"last_successful_run": "2025-04-01 08:42:05.864064+00:00", "last_successful_run": null,
"last_failed_run": null "last_failed_run": null
}, },
{ {
"model": "RepeatableTaskType", "model": "RepeatableTaskType",
"name": "Process MissingKids URLs ALL", "name": "Process MissingKids URLs ALL",
"callable": "api.tasks.process_missing_kids_urls_all", "callable": "fetcher.tasks.process_missing_kids_urls_all",
"callable_args": [], "callable_args": [],
"callable_kwargs": [], "callable_kwargs": [],
"enabled": true, "enabled": false,
"queue": "default", "queue": "default",
"repeat": null, "repeat": null,
"at_front": false, "at_front": false,
@@ -65,10 +86,10 @@
{ {
"model": "RepeatableTaskType", "model": "RepeatableTaskType",
"name": "Fetch Feeds", "name": "Fetch Feeds",
"callable": "api.tasks.fetch_feeds", "callable": "fetcher.tasks.fetch_feeds",
"callable_args": [], "callable_args": [],
"callable_kwargs": [], "callable_kwargs": [],
"enabled": true, "enabled": false,
"queue": "default", "queue": "default",
"repeat": null, "repeat": null,
"at_front": false, "at_front": false,
@@ -78,39 +99,18 @@
"scheduled_time": "2025-04-01T10:18:56+00:00", "scheduled_time": "2025-04-01T10:18:56+00:00",
"interval": 15, "interval": 15,
"interval_unit": "minutes", "interval_unit": "minutes",
"successful_runs": 288, "successful_runs": 0,
"failed_runs": 0, "failed_runs": 0,
"last_successful_run": "2025-04-01 10:03:58.363856+00:00", "last_successful_run": null,
"last_failed_run": null
},
{
"model": "RepeatableTaskType",
"name": "Process raw URLs",
"callable": "api.tasks.process_raw_urls",
"callable_args": [],
"callable_kwargs": [],
"enabled": true,
"queue": "low",
"repeat": null,
"at_front": false,
"timeout": null,
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-04-01T10:20:08+00:00",
"interval": 15,
"interval_unit": "minutes",
"successful_runs": 78,
"failed_runs": 0,
"last_successful_run": "2025-04-01 10:05:08.394472+00:00",
"last_failed_run": null "last_failed_run": null
}, },
{ {
"model": "RepeatableTaskType", "model": "RepeatableTaskType",
"name": "Fetch Parser", "name": "Fetch Parser",
"callable": "api.tasks.fetch_parser", "callable": "fetcher.tasks.fetch_parser",
"callable_args": [], "callable_args": [],
"callable_kwargs": [], "callable_kwargs": [],
"enabled": true, "enabled": false,
"queue": "default", "queue": "default",
"repeat": null, "repeat": null,
"at_front": false, "at_front": false,
@@ -120,18 +120,18 @@
"scheduled_time": "2025-04-01T10:25:42+00:00", "scheduled_time": "2025-04-01T10:25:42+00:00",
"interval": 1, "interval": 1,
"interval_unit": "hours", "interval_unit": "hours",
"successful_runs": 62, "successful_runs": 0,
"failed_runs": 0, "failed_runs": 0,
"last_successful_run": "2025-04-01 09:25:57.977051+00:00", "last_successful_run": null,
"last_failed_run": null "last_failed_run": null
}, },
{ {
"model": "RepeatableTaskType", "model": "RepeatableTaskType",
"name": "Fetch Search", "name": "Fetch Search",
"callable": "api.tasks.fetch_search", "callable": "fetcher.tasks.fetch_search",
"callable_args": [], "callable_args": [],
"callable_kwargs": [], "callable_kwargs": [],
"enabled": true, "enabled": false,
"queue": "default", "queue": "default",
"repeat": null, "repeat": null,
"at_front": false, "at_front": false,
@@ -141,9 +141,51 @@
"scheduled_time": "2025-04-01T10:29:33+00:00", "scheduled_time": "2025-04-01T10:29:33+00:00",
"interval": 1, "interval": 1,
"interval_unit": "hours", "interval_unit": "hours",
"successful_runs": 63, "successful_runs": 0,
"failed_runs": 0, "failed_runs": 0,
"last_successful_run": "2025-04-01 09:37:20.671072+00:00", "last_successful_run": null,
"last_failed_run": null
},
{
"model": "RepeatableTaskType",
"name": "Fetch MissingKids",
"callable": "fetcher.tasks.fetch_missing_kids",
"callable_args": [],
"callable_kwargs": [],
"enabled": false,
"queue": "default",
"repeat": null,
"at_front": false,
"timeout": null,
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-04-01T10:29:33+00:00",
"interval": 4,
"interval_unit": "hours",
"successful_runs": 0,
"failed_runs": 0,
"last_successful_run": null,
"last_failed_run": null
},
{
"model": "RepeatableTaskType",
"name": "Fetch MissingKids ALL",
"callable": "fetcher.tasks.fetch_missing_kids_all",
"callable_args": [],
"callable_kwargs": [],
"enabled": false,
"queue": "default",
"repeat": null,
"at_front": false,
"timeout": null,
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-04-01T10:29:33+00:00",
"interval": 1,
"interval_unit": "weeks",
"successful_runs": 0,
"failed_runs": 0,
"last_successful_run": null,
"last_failed_run": null "last_failed_run": null
} }
] ]

View File

@@ -2,101 +2,106 @@ version: '3.9'
services: services:
fetcher_selenium: fetcher_app_selenium:
image: fetcher_app_selenium
build: build:
context: ./app_selenium context: ./app_selenium
container_name: selenium_app container_name: fetcher_app_selenium
restart: unless-stopped # restart: unless-stopped
shm_size: 512mb shm_size: 512mb
environment: environment:
- SELENIUM_SLEEP_PER_PAGE=4 - SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE:-4}
- PATH_LOGS_PARAMETERIZATION="logs/log_app_selenium_{}.log" - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY:-logs}
ports: ports:
- 80 - 80
dns:
- 1.1.1.1
- 1.0.0.1
deploy:
resources:
limits:
cpus: '4'
memory: 4G
fetcher_urls_app: fetcher_app_urls:
image: fetcher_app_urls
build: build:
context: ./app_urls context: ./app_urls
container_name: urls_app container_name: fetcher_app_urls
restart: unless-stopped # restart: unless-stopped
environment: environment:
#- name=value # Initialization
- INITIALIZE_DB=${INITIALIZE_DB:-true}
- DJANGO_SUPERUSER_USERNAME=${DJANGO_SUPERUSER_USERNAME:-matitos}
- DJANGO_SUPERUSER_PASSWORD=${DJANGO_SUPERUSER_PASSWORD:-matitos}
- DJANGO_SUPERUSER_EMAIL=${DJANGO_SUPERUSER_EMAIL:-matitos@matitos.org}
# Django
- DJANGO_SECRET_KEY=${DJANGO_SECRET_KEY:-abc123456789qwerty}
- DJANGO_DEBUG=${DJANGO_DEBUG:-False}
- DJANGO_ALLOWED_HOSTS=${DJANGO_ALLOWED_HOSTS:-*} # host1,host2
# Database # Database
- DB_NAME=${DB_NAME:-matitos} - DB_NAME=${DB_NAME:-matitos}
- DB_USER=${DB_NAME:-supermatitos} - DB_USER=${DB_USER:-supermatitos}
- DB_PASSWORD=${DB_NAME:-supermatitos} - DB_PASSWORD=${DB_PASSWORD:-supermatitos}
- DB_HOST=${DB_NAME:-localhost} # db_postgres - DB_HOST=${DB_HOST:-fetcher_db}
- DB_PORT=${DB_NAME:-5432} - DB_PORT=${DB_PORT:-5432}
- REDIS_HOST=${REDIS_HOST:-localhost} - REDIS_HOST=${REDIS_HOST:-fetcher_redis}
- REDIS_PORT=${REDIS_PORT:-6379} - REDIS_PORT=${REDIS_PORT:-6379}
# Job timeout: 30 min # Job timeout: 30 min
- JOB_DEFAULT_TIMEOUT=${RQ_DEFAULT_TIMEOUT:-1800} - JOB_DEFAULT_TIMEOUT=${RQ_DEFAULT_TIMEOUT:-1800}
# Logs path # Logs path
- PATH_LOGS_PARAMETERIZATION="logs/log_app_fetcher_{}.log" - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY:-logs}
# Fetcher # Fetcher
- FETCHER_GNEWS_DECODE_SLEEP=2 - FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP:-2}
- FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=4 - FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP:-5}
- FETCHER_BETWEEN_SEARCHES_SLEEP=5 - FETCHER_BETWEEN_SEARCHES_SLEEP=${FETCHER_BETWEEN_SEARCHES_SLEEP:-1}
- FETCHER_URL_HOST_SLEEP=5 - FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP:-2}
# Selenium # Selenium
- SELENIUM_ENDPOINT="http://selenium_app:80" - SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT:-http://fetcher_app_selenium:80}
- ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA:-https://ollamamodel.matitos.org}
ports: ports:
- 80 - 8000:8000
depends_on:
- fetcher_db
- fetcher_redis
dns:
- 1.1.1.1
- 1.0.0.1
deploy:
resources:
limits:
cpus: '4'
memory: 4G
fetcher_db: fetcher_db:
image: postgres:17 image: postgres:17
container_name: db_postgres container_name: fetcher_db
restart: unless-stopped restart: unless-stopped
# Set shared memory limit when using docker-compose # Set shared memory limit when using docker-compose
shm_size: 128mb shm_size: 128mb
environment: environment:
POSTGRES_DB: ${DB_NAME:-matitos}
POSTGRES_PASSWORD: ${DB_PASSWORD:-supermatitos} POSTGRES_PASSWORD: ${DB_PASSWORD:-supermatitos}
POSTGRES_USER: ${DB_USERNAME:-supermatitos} POSTGRES_USER: ${DB_USER:-supermatitos}
POSTGRES_DB: ${DB_DATABASE_NAME:-matitos}
POSTGRES_INITDB_ARGS: '--data-checksums' POSTGRES_INITDB_ARGS: '--data-checksums'
#volumes: #volumes: # Persistent DB?
# - ${PATH_BASE:-.}/postgres:/var/lib/postgresql/data # - ${PATH_DB_DATA:-.}/postgres:/var/lib/postgresql/data
ports: ports:
- 5432:5432 - 5432 #:5432
fetcher_redis: fetcher_redis:
image: redis:alpine image: redis:alpine
container_name: db_redis container_name: fetcher_redis
restart: unless-stopped restart: unless-stopped
ports: ports:
- 6379:6379 - 6379 #:6379
#expose:
# - 6379
fetcher_adminer:
# http://localhost:8080/?pgsql=fetcher_db&username=supermatitos&db=matitos&ns=public
image: adminer
container_name: adminer
restart: unless-stopped
environment:
- ADMINER_DEFAULT_DB_DRIVER=pgsql
#- ADMINER_DEFAULT_DB_HOST
#- ADMINER_DEFAULT_DB_NAME
depends_on:
- fetcher_db
ports:
- 8080:8080
fetcher_dozzle: fetcher_dozzle:
container_name: dozzle container_name: fetcher_dozzle
image: amir20/dozzle:latest image: amir20/dozzle:latest
volumes: volumes:
- /var/run/docker.sock:/var/run/docker.sock:ro - /var/run/docker.sock:/var/run/docker.sock:ro
ports: ports:
- 8888:8080 - 8888:8080
environment: environment:
- DOZZLE_FILTER="name=matitos_" # Need container name matitos_ ? - DOZZLE_FILTER="name=fetcher_"
# django:
# Env: DB_HOST=matitos_db
# DJANGO_DB_NAME=${DB_DATABASE_NAME:-matitos}
# DJANGO_DB_USER=${DB_USERNAME:-supermatitos}
# DJANGO_DB_PASSWORD=${DB_PASSWORD:-supermatitos}
# DJANGO_DB_HOST=${DB_HOST:-localhost}
# DJANGO_DB_PORT=${DB_PORT:-5432}
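The compose file passes the DB_* and REDIS_* variables into fetcher_app_urls, so the Django settings are expected to read them from the environment. A minimal sketch of the matching DATABASES block (settings.py is not shown in this diff; names mirror the defaults above):

    import os

    DATABASES = {
        "default": {
            "ENGINE": "django.db.backends.postgresql",
            "NAME": os.getenv("DB_NAME", "matitos"),
            "USER": os.getenv("DB_USER", "supermatitos"),
            "PASSWORD": os.getenv("DB_PASSWORD", "supermatitos"),
            "HOST": os.getenv("DB_HOST", "fetcher_db"),
            "PORT": os.getenv("DB_PORT", "5432"),
        }
    }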