Dockerization, whitenoise serving static, refactor
1  .gitignore  vendored
@@ -2,3 +2,4 @@ __pycache__/
*.pyc
**/credentials.py
logs/
postgres/
@@ -1,363 +0,0 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install git+https://github.com/tasos-py/Search-Engines-Scraper.git\n",
    "import search_engines\n",
    "\n",
    "engine = search_engines.Bing()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results = engine.search('news: \"child abuse\"', pages=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "engine = search_engines.search_engines_dict[\"brave\"]()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "query = 'news: child abuse'\n",
    "r = engine.search(query, pages=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "r.__dict__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import newspaper\n",
    "newspaper.ArticleBinaryDataException"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "'''\n",
    "import newspaper\n",
    "\n",
    "url = 'https://www.missingkids.org/poster/USVA/VA25-0820/1'\n",
    "art_1 = newspaper.article(url)\n",
    "url = 'https://www.missingkids.org/poster/NCMC/2045193/1'\n",
    "art_2 = newspaper.article(url)\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import ollama\n",
    "\n",
    "#model = \"llama3.2:1b\"\n",
    "client = ollama.Client(\n",
    "    host = 'https://ollamamodel.matitos.org',\n",
    ")\n",
    "l = client.list()\n",
    "list_models = [m.get(\"model\") for m in l.model_dump().get(\"models\")]\n",
    "\n",
    "print(list_models)\n",
    "\n",
    "for m in list_models:\n",
    "    context_key = [ k for k in client.show(m).model_dump().get(\"modelinfo\").keys() if \"context_length\" in k]\n",
    "    if (len(context_key) != 1):\n",
    "        print(\"Problem!!!\")\n",
    "    print(m, client.show(m).model_dump().get(\"modelinfo\").get(context_key[0]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "text = \"...\"\n",
    "model = \"falcon3:1b\"\n",
    "\n",
    "msg_content = {\n",
    "    \"role\": \"user\", \n",
    "    \"content\": text,\n",
    "}\n",
    "response = client.chat(model=model, messages=[msg_content], stream=False)\n",
    "print(response[\"message\"][\"content\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import cv2\n",
    "import base64\n",
    "import numpy as np\n",
    "\n",
    "endpoint = \"http://192.168.2.64:12343/image\"\n",
    "\n",
    "prompt = \"Majestic mountain landscape with snow-capped peaks, autumn foliage in vibrant reds and oranges, a turquoise river winding through a valley, crisp and serene atmosphere, ultra-realistic style.\"\n",
    "prompt = \"A group of kids happily playing in a joy environment\"\n",
    "#prompt = \"A bitcoin behaving like a king, surrounded by small alternative coins. Detailed, geometric style\"\n",
    "\n",
    "json = {\n",
    "    \"prompt\": prompt,\n",
    "    \"num_inference_steps\": 10,\n",
    "    \"size\": \"512x512\",\n",
    "    \"seed\": 123456,\n",
    "}\n",
    "\n",
    "for inf_step in [1, 4, 10, 20, 25, 30, 35, 40, 45, 50, 60, 70, 80, 90, 100]:\n",
    "    json[\"num_inference_steps\"] = inf_step\n",
    "\n",
    "    %time r = requests.post(endpoint, json=json)\n",
    "    print(\"Status code\", r.status_code)\n",
    "\n",
    "    # Image\n",
    "    png_as_np = np.frombuffer(base64.b64decode(r.text), dtype=np.uint8)\n",
    "    image_bgr = cv2.imdecode(png_as_np, cv2.IMREAD_COLOR)\n",
    "\n",
    "    cv2.imwrite(\"sample_img_{}.png\".format(json[\"num_inference_steps\"]), image_bgr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install trafilatura\n",
    "import trafilatura\n",
    "from pprint import pprint\n",
    "\n",
    "url = \"https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n",
    "# url = \"https://www.missingkids.org/poster/USVA/VA25-0820/1\"\n",
    "url = \"https://www.bloomberg.com/news/articles/2025-03-12/eu-launches-metals-tariff-retaliation-on-26-billion-of-us-goods\"\n",
    "\n",
    "# Fetch\n",
    "doc = trafilatura.fetch_url(url)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Content & metadata\n",
    "metadata = trafilatura.extract_metadata(doc)\n",
    "content = trafilatura.extract(doc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pprint(metadata.as_dict())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install newspaper4k\n",
    "# !pip install langdetect \n",
    "import newspaper\n",
    "import langdetect\n",
    "langdetect.DetectorFactory.seed = 0\n",
    "\n",
    "# url = \"https://www.missingkids.org/poster/USVA/VA25-0820/1\"\n",
    "#url = \"https://www.waff.com/2025/03/11/colbert-heights-high-school-employee-arrested-child-abuse/\"\n",
    "\n",
    "#url = \"https://www.bloomberg.com/news/articles/2025-03-12/eu-launches-metals-tariff-retaliation-on-26-billion-of-us-goods\"\n",
    "\n",
    "url = \"https://apnews.com/article/canada-trump-us-tariffs-steel-2517a6a2baf0596cb1a43d3a7d1e7939\"\n",
    "url = \"https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n",
    "#url = \"https://www.ft.com/content/6d7c6915-4ceb-43fc-9896-590036b12a87\"\n",
    "#url = \"https://www.lanacion.com.ar/politica/milei-en-bahia-blanca-un-viaje-sorpresa-para-frenar-las-criticas-y-mostrar-cercania-nid12032025/\"\n",
    "#url = \"https://www.missingkids.org/poster/NCMC/2043547/1\"\n",
    "\n",
    "try:\n",
    "    article = newspaper.article(url)\n",
    "except newspaper.ArticleException as e:\n",
    "    print(\"ArticleException: {}\".format(str(e)))\n",
    "except Exception as e:\n",
    "    print(\"Err: {}\".format(str(e)))\n",
    "\n",
    "# url_photo = set([i for i in article.images if \"api.missingkids.org/photographs\" in i])\n",
    "# article.is_valid_url(), article.is_parsed, article.is_media_news(), article.is_valid_body()\n",
    "article.meta_data\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install news-please\n",
    "from newsplease import NewsPlease\n",
    "\n",
    "url = \"https://variety.com/2025/film/news/gene-hackman-death-suspicious-gas-leak-search-warrant-1236322610/\"\n",
    "url = \"https://www.bbc.com/news/articles/cewkkkvkzn9o\"\n",
    "url = \"https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n",
    "article = NewsPlease.from_url(url)\n",
    "print(article.title)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(article.maintext)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "matitos",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
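The deleted notebook above was scratch work comparing extraction libraries (search_engines, newspaper4k, news-please, trafilatura) and probing an Ollama endpoint. The trafilatura flow it exercised reduces to a few calls — a minimal sketch, with a placeholder URL; fetch_url, extract_metadata, and extract are trafilatura's documented API:

```python
# Minimal sketch of the trafilatura flow probed in the deleted notebook.
# The URL is a placeholder, not one used by the project.
import trafilatura

url = "https://example.com/some-article"
doc = trafilatura.fetch_url(url)  # returns None on network/HTTP failure
if doc is not None:
    metadata = trafilatura.extract_metadata(doc)  # title, author, date, ...
    content = trafilatura.extract(doc)            # main text, boilerplate stripped
    print(metadata.title if metadata else None)
    print(content[:200] if content else "no extractable body")
```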
@@ -5,6 +5,14 @@
- Fetch parsing URL host
- Fetch from RSS feed
- Fetch searching (Google search & news, DuckDuckGo, ...)
+ Sources -> Robustness to TooManyRequests blocks (see the retry sketch below)
  - Selenium based
  - Sites change their logic, request captchas, ...
  - Brave Search API
    - Free up to X requests per day. Needs a credit card on file (no charges)
  - Bing API
    - Subscription required
  - Yandex. No API?
- Process URLs -> Updates raw URLs
  - Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, and published date
  - Determines whether the content is a valid article
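For the TooManyRequests item above, a hedged sketch of one way to wrap a search call in exponential backoff. The function name and parameters are illustrative, not part of the codebase:

```python
import random
import time

def search_with_backoff(search_fn, query, retries=4, base_delay=5.0):
    # Retry a rate-limited search callable with exponential backoff + jitter.
    for attempt in range(retries):
        try:
            return search_fn(query)
        except Exception:  # e.g. a rate-limit / captcha response
            if attempt == retries - 1:
                raise
            delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
            time.sleep(delay)  # back off before hitting the engine again
```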
@@ -2,30 +2,29 @@ import logging
import os

# Get env var
path_logs_parameterization = os.getenv("PATH_LOGS_PARAMETERIZATION", "logs/log_app_selenium_{}.log")
logs_directory = os.getenv("PATH_LOGS_DIRECTORY", "logs")

# Directory of logs
directory = '/'.join(path_logs_parameterization.split("/")[:-1])
os.makedirs(directory, exist_ok=True)
os.makedirs(logs_directory, exist_ok=True)

logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
logger = logging.getLogger("selenium")
logger.setLevel(logging.DEBUG)

# To file log: DEBUG / INFO / WARNING / ERROR / CRITICAL
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("debug"), mode="a", maxBytes=10000000, backupCount=1)
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "debug.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)

# To file log: INFO / WARNING / ERROR / CRITICAL
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("info"), mode="a", maxBytes=10000000, backupCount=1)
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "info.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.INFO)
logger.addHandler(fh)

# To file log: WARNING / ERROR / CRITICAL
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("warning"), mode="a", maxBytes=10000000, backupCount=1)
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "warning.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.WARNING)
logger.addHandler(fh)
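The logger modules in this commit (this Selenium one and the fetcher one further below) share one pattern: a named logger with three rotating file handlers that filter by level. A hedged usage sketch — the fetcher code imports `from .logger import get_logger`, so a minimal get_logger could simply return the configured instance:

```python
import logging

def get_logger():
    # The handlers are attached at import time by the module above,
    # so callers just fetch the already-configured named logger.
    return logging.getLogger("selenium")

logger = get_logger()
logger.debug("lands in debug.log only")
logger.info("lands in debug.log and info.log")
logger.warning("lands in all three rotating files")
```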
@@ -28,7 +28,7 @@ class MissingKidsFetcher():
            logger.debug("Processing page: {}...".format(i))

            try:
                time.sleep(os.getenv("SELENIUM_SLEEP_PER_PAGE", 4)); #driver.implicitly_wait(3)
                time.sleep(int(os.getenv("SELENIUM_SLEEP_PER_PAGE", 4))) #driver.implicitly_wait(3)
                # Fetch poster URLs
                for element_type in ["a"]: # ["a", "p", "div"]:
                    for elem in driver.find_elements(By.TAG_NAME, element_type):
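The int() cast added above matters: os.getenv returns the default unchanged when the variable is unset, but always returns a string when it is set, and time.sleep rejects strings. A small helper makes that explicit — a sketch; the name env_int is not from the codebase:

```python
import os

def env_int(name, default):
    # os.getenv hands back a str whenever the variable is set in the
    # environment, so normalize both branches to int.
    return int(os.getenv(name, default))

# time.sleep(env_int("SELENIUM_SLEEP_PER_PAGE", 4))
```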
@@ -1,341 +0,0 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install psycopg[binary]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "db_postgres\n",
      "db_redis\n",
      "\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 2/0\n",
      " ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
      " ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
      " \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
      " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
      "\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 2/4\n",
      " ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
      " ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
      " \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
      " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
      "\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 4/4\u001b[0m\n",
      " \u001b[32m✔\u001b[0m Container db_redis \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
      " \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
      " \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
      " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
      "\u001b[?25h"
     ]
    }
   ],
   "source": [
    "!docker rm -f db_postgres db_redis; docker compose -f ../docker/docker-compose.yml up -d ; sleep 5\n",
    "!rm logs/*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "INSERT_TABLES = True\n",
    "INSERT_SAMPLE_DATA = False\n",
    "\n",
    "import psycopg\n",
    "connection_info = \"host={} port={} user={} password={} dbname={}\".format(\"localhost\", \"5432\", \"supermatitos\", \"supermatitos\", \"matitos\")\n",
    "\n",
    "from datetime import datetime, timezone\n",
    "import re\n",
    "from pprint import pprint"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "if INSERT_TABLES:\n",
    "    # Connect to an existing database\n",
    "    with psycopg.connect(connection_info) as conn:\n",
    "        # Open a cursor to perform database operations\n",
    "        with conn.cursor() as cur:\n",
    "            # Autocommit at end of transaction (Atomic insert of URLs and sources)\n",
    "            with conn.transaction() as tx:\n",
    "                # Create URLs table\n",
    "                c = cur.execute(\"\"\"\n",
    "                CREATE TYPE URL_STATUS AS ENUM ('raw', 'error', 'valid', 'unknown', 'invalid', 'duplicate');\n",
    "\n",
    "                CREATE TABLE URLS (\n",
    "                    id SERIAL PRIMARY KEY,\n",
    "                    url TEXT NOT NULL UNIQUE,\n",
    "                    ts_fetch TIMESTAMPTZ NOT NULL DEFAULT NOW(),\n",
    "                    status URL_STATUS NOT NULL DEFAULT 'raw' -- ,\n",
    "                    -- status_wendy WENDY_STATUS DEFAULT NULL,\n",
    "                    -- ts_wendy TIMESTAMPTZ DEFAULT NULL\n",
    "                );\n",
    "                CREATE INDEX idx_urls_status ON urls(status);\n",
    "                CREATE INDEX idx_urls_ts_fetch ON urls(ts_fetch);\n",
    "\n",
    "                CREATE TABLE URLS_DUPLICATE (\n",
    "                    id_url_canonical INTEGER REFERENCES URLS(id),\n",
    "                    id_url_duplicated INTEGER REFERENCES URLS(id),\n",
    "                    PRIMARY KEY (id_url_canonical, id_url_duplicated)\n",
    "                );\n",
    "\n",
    "                CREATE TYPE SEARCH_TYPE AS ENUM ('rss_feed', 'keyword_search', 'url_host');\n",
    "                CREATE TABLE SEARCH (\n",
    "                    id SMALLSERIAL PRIMARY KEY,\n",
    "                    search TEXT NOT NULL UNIQUE,\n",
    "                    type SEARCH_TYPE NOT NULL\n",
    "                    -- language_country CHAR(5), -- Language: ISO 639-1 Code. Country: ISO 3166 ALPHA-2. e.g.: en-us. Required for search\n",
    "                    -- UNIQUE(search, language_country)\n",
    "                );\n",
    "                CREATE INDEX idx_search_type ON SEARCH(type);\n",
    "\n",
    "                CREATE TABLE SOURCE (\n",
    "                    id SMALLSERIAL PRIMARY KEY,\n",
    "                    source TEXT NOT NULL UNIQUE\n",
    "                );\n",
    "\n",
    "                -- CREATE TABLE SEARCH_LANGUAGE (\n",
    "                --     language CHAR(2) NOT NULL, -- ISO 639-1 Code, e.g. \"en\"\n",
    "                --     country CHAR(2) NOT NULL, -- ISO 3166 ALPHA-2, e.g. \"us\"\n",
    "                --     PRIMARY KEY (language, country)\n",
    "                -- );\n",
    "\n",
    "                CREATE TABLE URLS_SOURCE_SEARCH (\n",
    "                    id_url INTEGER REFERENCES URLS(id),\n",
    "                    id_source SMALLINT REFERENCES SOURCE(id) ON UPDATE CASCADE ON DELETE RESTRICT,\n",
    "                    id_search SMALLINT REFERENCES SEARCH(id) ON UPDATE CASCADE ON DELETE RESTRICT,\n",
    "                    PRIMARY KEY(id_url, id_source, id_search)\n",
    "                );\n",
    "                CREATE INDEX idx_source ON URLS_SOURCE_SEARCH(id_source);\n",
    "                CREATE INDEX idx_search ON URLS_SOURCE_SEARCH(id_search);\n",
    "\n",
    "                CREATE TABLE STATUS_PATTERN_MATCHING (\n",
    "                    pattern TEXT PRIMARY KEY,\n",
    "                    priority SMALLINT NOT NULL,\n",
    "                    status URL_STATUS NOT NULL\n",
    "                );\n",
    "\n",
    "                CREATE TABLE URL_CONTENT (\n",
    "                    id_url INTEGER PRIMARY KEY REFERENCES URLS(id),\n",
    "                    date_published TIMESTAMPTZ DEFAULT NOW(),\n",
    "                    title TEXT,\n",
    "                    description TEXT,\n",
    "                    content TEXT,\n",
    "                    valid_content BOOLEAN,\n",
    "                    language CHAR(2), -- ISO 639-1 Code\n",
    "                    keywords TEXT[],\n",
    "                    tags TEXT[],\n",
    "                    authors TEXT[],\n",
    "                    image_main_url TEXT,\n",
    "                    images_url TEXT[],\n",
    "                    videos_url TEXT[],\n",
    "                    url_host TEXT, -- www.breitbart.com\n",
    "                    site_name TEXT -- Breitbart News\n",
    "                );\n",
    "                CREATE INDEX idx_tags ON URL_CONTENT USING GIN(tags);\n",
    "                CREATE INDEX idx_authors ON URL_CONTENT USING GIN(authors);\n",
    "                CREATE INDEX idx_date_published ON URL_CONTENT (date_published);\n",
    "                CREATE INDEX idx_valid_content ON URL_CONTENT (valid_content);\n",
    "                CREATE INDEX idx_language ON URL_CONTENT (language);\n",
    "                CREATE INDEX idx_url_host ON URL_CONTENT (url_host);\n",
    "                \"\"\")\n",
    "\n",
    "                ### Default insert values\n",
    "\n",
    "                # Feeds\n",
    "                cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC', 'rss_feed');\" )\n",
    "                # Websites of interest\n",
    "                cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/poster', 'url_host');\" )\n",
    "                cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/new-poster', 'url_host');\" )\n",
    "                cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('breitbart.com', 'url_host');\" )\n",
    "                # Search keywords\n",
    "                cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search');\" )\n",
    "                # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search', 'en-us');\" )\n",
    "                # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search', 'en-gb');\" )\n",
    "\n",
    "                # Status update based on pattern matching (with priority to apply in order). Regex test https://regex101.com/\n",
    "                # cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 75, 'valid');\".format(\".*{}.*\".format(re.escape(\"missingkids.org/poster/\"))) )\n",
    "                cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"youtube.com/\"))) )\n",
    "                cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"tiktok.com/\"))) )\n",
    "                cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"twitter.com/\"))) )\n",
    "                cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"reddit.com/\"))) )\n",
    "                cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"libreddit.de/\"))) )\n",
    "                cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"radio.foxnews.com/\"))) )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\t urls\n",
      "[]\n",
      "\t urls_duplicate\n",
      "[]\n",
      "\t urls_source_search\n",
      "[]\n",
      "\t source\n",
      "[]\n",
      "\t search\n",
      "[(1,\n",
      "  'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC',\n",
      "  'rss_feed'),\n",
      " (2, 'missingkids.org/poster', 'url_host'),\n",
      " (3, 'missingkids.org/new-poster', 'url_host'),\n",
      " (4, 'breitbart.com', 'url_host'),\n",
      " (5, 'child abuse', 'keyword_search')]\n",
      "\t status_pattern_matching\n",
      "[('.*youtube\\\\.com/.*', 50, 'invalid'),\n",
      " ('.*tiktok\\\\.com/.*', 50, 'invalid'),\n",
      " ('.*twitter\\\\.com/.*', 50, 'invalid'),\n",
      " ('.*reddit\\\\.com/.*', 50, 'invalid'),\n",
      " ('.*libreddit\\\\.de/.*', 50, 'invalid'),\n",
      " ('.*radio\\\\.foxnews\\\\.com/.*', 50, 'invalid')]\n",
      "\t url_content\n",
      "[]\n"
     ]
    }
   ],
   "source": [
    "# Connect to an existing database\n",
    "with psycopg.connect(connection_info) as conn:\n",
    "    # Open a cursor to perform database operations\n",
    "    with conn.cursor() as cur:\n",
    "        # Get tables\n",
    "        cur.execute(\"SELECT table_name FROM information_schema.tables WHERE table_schema='public';\")\n",
    "        tables = [t[0] for t in cur.fetchall()]\n",
    "\n",
    "        for t in tables:\n",
    "            print(\"\\t\", t)\n",
    "            pprint( cur.execute(\"SELECT * FROM {} LIMIT 50;\".format(t)).fetchall() )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(1,\n",
      "  'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC',\n",
      "  'rss_feed'),\n",
      " (2, 'missingkids.org/poster', 'url_host'),\n",
      " (3, 'missingkids.org/new-poster', 'url_host'),\n",
      " (4, 'breitbart.com', 'url_host'),\n",
      " (5, 'child abuse', 'keyword_search')]\n"
     ]
    }
   ],
   "source": [
    "# Connect to an existing database\n",
    "with psycopg.connect(connection_info) as conn:\n",
    "    # Open a cursor to perform database operations\n",
    "    with conn.cursor() as cur:\n",
    "        pprint( cur.execute(\"SELECT * FROM SEARCH;\").fetchall() )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[]\n"
     ]
    }
   ],
   "source": [
    "# Connect to an existing database\n",
    "with psycopg.connect(connection_info) as conn:\n",
    "    # Open a cursor to perform database operations\n",
    "    with conn.cursor() as cur:\n",
    "        pprint( cur.execute(\"SELECT * FROM URLS LIMIT 50;\").fetchall() )\n",
    "        #pprint( cur.execute(\"SELECT id_url, title, valid_content FROM URL_CONTENT LIMIT 10;\").fetchall() )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\n!docker rm -f db_redis; docker compose -f ../docker/docker-compose.yml up -d\\n\\n# Connect to an existing database\\nwith psycopg.connect(connection_info) as conn:\\n    # Open a cursor to perform database operations\\n    with conn.cursor() as cur:\\n        pprint( cur.execute(\"TRUNCATE URLS, URL_CONTENT, URLS_SOURCE_SEARCH, URLS_DUPLICATE;\") )\\n        # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES (\\'missingkids.org\\', \\'url_host\\');\" )\\n'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'''\n",
    "!docker rm -f db_redis; docker compose -f ../docker/docker-compose.yml up -d\n",
    "\n",
    "# Connect to an existing database\n",
    "with psycopg.connect(connection_info) as conn:\n",
    "    # Open a cursor to perform database operations\n",
    "    with conn.cursor() as cur:\n",
    "        pprint( cur.execute(\"TRUNCATE URLS, URL_CONTENT, URLS_SOURCE_SEARCH, URLS_DUPLICATE;\") )\n",
    "        # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('missingkids.org', 'url_host');\" )\n",
    "'''"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "matitos",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
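The STATUS_PATTERN_MATCHING rows seeded above suggest a classification step that walks the patterns by priority and stamps the first matching status onto a raw URL. A hedged sketch of that application step — table and column names come from the schema above; the function itself is not in the commit, and the assumption that higher priority is applied first is mine:

```python
import re
import psycopg

def classify_url(conn, url):
    # Walk patterns by priority (descending here -- the intended direction
    # is an assumption); first match wins. Returns None when no pattern
    # applies, leaving the URL in its current status.
    with conn.cursor() as cur:
        cur.execute(
            "SELECT pattern, status FROM STATUS_PATTERN_MATCHING ORDER BY priority DESC;"
        )
        for pattern, status in cur.fetchall():
            if re.match(pattern, url):
                return status
    return None

# classify_url(conn, "https://www.youtube.com/watch?v=x")  -> 'invalid'
```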
48  app_urls/Dockerfile  Normal file
@@ -0,0 +1,48 @@
FROM python:3.12

# Prevents Python from writing pyc files to disk
ENV PYTHONDONTWRITEBYTECODE=1
# Prevents Python from buffering stdout and stderr
ENV PYTHONUNBUFFERED=1

# User
RUN useradd -m -r appuser && \
    mkdir /opt/app && \
    chown -R appuser /opt/app

WORKDIR /opt/app

# Copy the Django project and install dependencies
COPY requirements.txt /opt/app/
# Install all dependencies
RUN pip install --no-cache-dir -r requirements.txt

COPY --chown=appuser:appuser . /opt/app/

RUN chmod -R 755 /opt/app
RUN chown -R appuser:appuser /opt/app
USER appuser

# Initialization script
RUN echo '#!/bin/bash' > /opt/app/initialize.sh && \
    echo 'if [ "${INITIALIZE_DB}" = false ]; then' >> /opt/app/initialize.sh && \
    echo 'echo "Initialization not required"' >> /opt/app/initialize.sh && \
    echo 'else' >> /opt/app/initialize.sh && \
    echo 'echo "Initializing database"' >> /opt/app/initialize.sh && \
    echo 'sleep 5' >> /opt/app/initialize.sh && \
    echo 'python db.py --initialize_tables --initialize_data' >> /opt/app/initialize.sh && \
    echo 'python manage.py makemigrations fetcher; python manage.py migrate --fake-initial' >> /opt/app/initialize.sh && \
    echo 'python manage.py createsuperuser --noinput' >> /opt/app/initialize.sh && \
    echo 'python manage.py collectstatic --no-input' >> /opt/app/initialize.sh && \
    echo 'python manage.py import --filename scheduled_tasks.json' >> /opt/app/initialize.sh && \
    echo 'fi' >> /opt/app/initialize.sh && \
    chmod +x /opt/app/initialize.sh

# Serving script
RUN echo '#!/bin/bash' > /opt/app/run.sh && \
    echo 'gunicorn core.wsgi:application --bind 0.0.0.0:8000 & python manage.py rqworker high default low' >> /opt/app/run.sh && \
    #echo 'python manage.py runserver & python manage.py rqworker high default low' >> /opt/app/run.sh && \
    chmod +x /opt/app/run.sh

# Run Django’s server & workers
CMD ["sh", "-c", "/opt/app/initialize.sh && /opt/app/run.sh"]
@@ -2,18 +2,9 @@
```
conda create -n matitos_urls python=3.12
conda activate matitos_urls
# Core
pip install django==5.1 psycopg[binary] django-redis django-tasks-scheduler
# Fetcher
pip install feedparser python-dateutil newspaper4k[all] lxml[html_clean] googlenewsdecoder gnews duckduckgo_search GoogleNews langdetect
# News visualization
pip install ollama
pip install -r requirements.txt
```

* Database
  * Database initialization -> 1-DB.ipynb

* From automated inspectdb
```
# 1) Inspect DB, generate models.py
@@ -74,60 +65,19 @@ class Meta:
    db_table = 'urls' # db_table = '{}_urls'.format(project_name)
```

* Database & initialization
  * Check initialize.sh on Dockerfile

* Environment variables
```
# Database
DB_NAME=${DB_NAME:-matitos}
DB_USER=${DB_USER:-supermatitos}
DB_PASSWORD=${DB_PASSWORD:-supermatitos}
DB_HOST=${DB_HOST:-localhost}
DB_PORT=${DB_PORT:-5432}
REDIS_HOST=${REDIS_HOST:-localhost}
REDIS_PORT=${REDIS_PORT:-6379}

# Job timeout: 30 min
JOB_DEFAULT_TIMEOUT=${RQ_DEFAULT_TIMEOUT:-1800}

# Logs path
PATH_LOGS_PARAMETERIZATION="logs/log_app_fetcher_{}.log"

# Fetcher
FETCHER_GNEWS_DECODE_SLEEP=2
FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=4
FETCHER_BETWEEN_SEARCHES_SLEEP=5
FETCHER_URL_HOST_SLEEP=5
FETCHER_LANGUAGE_DETECTION_MIN_CHAR=100

SELENIUM_ENDPOINT="http://selenium_app:80"
```
* In docker-compose.yml

* Deploy
```
# Migrations
python manage.py makemigrations api; python manage.py migrate --fake-initial
# Create user
python manage.py createsuperuser
# Check environment variables on docker-compose.yml

# 1) Server
python manage.py runserver
# Remove previous instances
docker compose down -v

# 2) Workers
python manage.py rqworker high default low

# Visualize DB
http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id
# Build & up
docker compose up -d --build
```

* Scheduled tasks
```
# Import tasks
python manage.py import --filename scheduled_tasks.json

# Modify using the admin panel, then save
# python manage.py export > scheduled_tasks.json
```

* Utils. TODO: To endpoint...
```
python manage.py rqstats
```
@@ -1,295 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Charts</title>
    <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
    <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
    <style>
        body {
            background-color: #333;
            color: #fff;
            font-family: Arial, sans-serif;
        }

        h2 {
            color: #fff;
            text-align: center;
            margin-bottom: 40px;
        }

        .chart-container {
            width: 45%;
            display: inline-block;
            margin: 20px;
            background-color: #444;
            border-radius: 10px;
            padding: 5px;
        }

        canvas {
            background-color: #2c2c2c;
            border-radius: 5px;
        }

        .container {
            display: flex;
            justify-content: center;
            flex-wrap: wrap;
        }

        .filter-container {
            text-align: center;
            margin-bottom: 20px;
        }

        select {
            padding: 8px;
            background-color: #555;
            color: white;
            border: 1px solid #444;
            border-radius: 5px;
        }
    </style>
</head>
<body>
    <h2>Data Visualizations</h2>

    <!-- Filter for Number of Days -->
    <div class="filter-container">
        <label for="daysFilter">Select Number of Days:</label>
        <select id="daysFilter">
            <option value="0.25">Last 6 Hours</option>
            <option value="1">Last 24 Hours</option>
            <option value="3">Last 3 Days</option>
            <option value="7" selected>Last 7 Days</option>
            <option value="30">Last 30 Days</option>
            <option value="90">Last 90 Days</option>
            <option value="365">Last 365 Days</option>
        </select>
    </div>

    <div class="container">
        <div class="chart-container">
            <canvas id="urlFetchDateChart"></canvas>
        </div>

        <div class="chart-container">
            <canvas id="urlStatusChart"></canvas>
        </div>

        <div class="chart-container">
            <canvas id="urlsPerSourceChart"></canvas>
        </div>

        <div class="chart-container">
            <canvas id="urlsPerSearchChart"></canvas>
        </div>
    </div>

    <script>
        $(document).ready(function () {
            // Fetch initial data (default 7 days)
            const defaultDays = 7;
            fetchDataAndRenderCharts(defaultDays);

            // Apply the filter automatically when the user changes the selection
            $('#daysFilter').change(function () {
                const selectedDays = $(this).val();
                fetchDataAndRenderCharts(selectedDays);
            });
        });

        function fetchDataAndRenderCharts(days) {
            // Fetch and render the URL Fetch Date chart
            $.getJSON(`/urls-by-fetch-date/?days=${days}`, function (data) {
                renderUrlFetchDateChart(data);
            });

            // Fetch and render the URL Status chart (with dynamic date filtering)
            $.getJSON(`/urls-per-status/?days=${days}`, function (data) {
                renderUrlStatusChart(data);
            });

            // Fetch and render the URLs per Source chart
            $.getJSON(`/urls-per-source/?days=${days}`, function (data) {
                renderUrlsPerSourceChart(data);
            });

            // Fetch and render the URLs per Search chart
            $.getJSON(`/urls-per-search/?days=${days}`, function (data) {
                renderUrlsPerSearchChart(data);
            });
        }

        function renderUrlFetchDateChart(data) {
            new Chart(document.getElementById("urlFetchDateChart"), {
                type: 'bar',
                data: {
                    labels: data.dates,
                    datasets: [{
                        label: 'URLs by Fetch Date',
                        data: data.counts,
                        backgroundColor: 'blue',
                    }]
                },
                options: {
                    responsive: true,
                    plugins: {
                        legend: {
                            labels: {
                                color: '#fff' // Change the legend text color to white
                            }
                        }
                    },
                    scales: {
                        x: {
                            ticks: {
                                color: "#fff" // Set x-axis ticks color
                            },
                            grid: {
                                color: "#444" // Set grid lines color
                            }
                        },
                        y: {
                            ticks: {
                                color: "#fff" // Set y-axis ticks color
                            },
                            grid: {
                                color: "#444" // Set grid lines color
                            }
                        }
                    }
                }
            });
        }

        function renderUrlStatusChart(data) {
            new Chart(document.getElementById("urlStatusChart"), {
                type: 'bar',
                data: {
                    labels: data.statuses,
                    datasets: [{
                        label: 'URLs by Status',
                        data: data.counts,
                        backgroundColor: 'green',
                    }]
                },
                options: {
                    responsive: true,
                    plugins: {
                        legend: {
                            labels: {
                                color: '#fff' // Change the legend text color to white
                            }
                        }
                    },
                    scales: {
                        x: {
                            ticks: {
                                color: "#fff" // Set x-axis ticks color
                            },
                            grid: {
                                color: "#444" // Set grid lines color
                            }
                        },
                        y: {
                            ticks: {
                                color: "#fff" // Set y-axis ticks color
                            },
                            grid: {
                                color: "#444" // Set grid lines color
                            }
                        }
                    }
                }
            });
        }

        function renderUrlsPerSourceChart(data) {
            new Chart(document.getElementById("urlsPerSourceChart"), {
                type: 'bar',
                data: {
                    labels: data.sources,
                    datasets: [{
                        label: 'URLs by Source',
                        data: data.counts,
                        backgroundColor: 'purple',
                    }]
                },
                options: {
                    responsive: true,
                    plugins: {
                        legend: {
                            labels: {
                                color: '#fff' // Change the legend text color to white
                            }
                        }
                    },
                    scales: {
                        x: {
                            ticks: {
                                color: "#fff" // Set x-axis ticks color
                            },
                            grid: {
                                color: "#444" // Set grid lines color
                            }
                        },
                        y: {
                            ticks: {
                                color: "#fff" // Set y-axis ticks color
                            },
                            grid: {
                                color: "#444" // Set grid lines color
                            }
                        }
                    }
                }
            });
        }

        function renderUrlsPerSearchChart(data) {
            new Chart(document.getElementById("urlsPerSearchChart"), {
                type: 'bar',
                data: {
                    labels: data.searches,
                    datasets: [{
                        label: 'URLs by Search',
                        data: data.counts,
                        backgroundColor: 'orange',
                    }]
                },
                options: {
                    responsive: true,
                    plugins: {
                        legend: {
                            labels: {
                                color: '#fff' // Change the legend text color to white
                            }
                        }
                    },
                    scales: {
                        x: {
                            ticks: {
                                color: "#fff" // Set x-axis ticks color
                            },
                            grid: {
                                color: "#444" // Set grid lines color
                            }
                        },
                        y: {
                            ticks: {
                                color: "#fff" // Set y-axis ticks color
                            },
                            grid: {
                                color: "#444" // Set grid lines color
                            }
                        }
                    }
                }
            });
        }
    </script>
</body>
</html>
@@ -20,12 +20,13 @@ BASE_DIR = Path(__file__).resolve().parent.parent
# Quick-start development settings - unsuitable for production

# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = 'django-insecure-54mqLbW5NlO8OlVDsT3fcbg3Vf6C8Fgcoj8H0hXv3Pr8bpgqvOuiaeqvGn34sGwt'
SECRET_KEY = os.getenv("DJANGO_SECRET_KEY", 'django-insecure-54mqLbW5NlO8OlVDsT3fcbg3Vf6C8Fgcoj8H0hXv3Pr8bpgqvOuiaeqvGn34sGwt')

# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True
DEBUG = (os.environ.get('DJANGO_DEBUG') == "True")
print("Django debug mode:", DEBUG)

ALLOWED_HOSTS = []
ALLOWED_HOSTS = os.environ.get('DJANGO_ALLOWED_HOSTS', "*").split(",")


# Application definition
@@ -38,11 +39,12 @@ INSTALLED_APPS = [
    'django.contrib.messages',
    'django.contrib.staticfiles',
    'scheduler',
    'api',
    'fetcher',
]

MIDDLEWARE = [
    'django.middleware.security.SecurityMiddleware',
    'whitenoise.middleware.WhiteNoiseMiddleware', # Serving static files
    'django.contrib.sessions.middleware.SessionMiddleware',
    'django.middleware.common.CommonMiddleware',
    'django.middleware.csrf.CsrfViewMiddleware',
@@ -51,6 +53,8 @@ MIDDLEWARE = [
    'django.middleware.clickjacking.XFrameOptionsMiddleware',
]

STATICFILES_STORAGE = 'whitenoise.storage.CompressedManifestStaticFilesStorage'

ROOT_URLCONF = 'core.urls'

TEMPLATES = [
@@ -121,7 +125,7 @@ SCHEDULER_QUEUES = {
    }
}
SCHEDULER_CONFIG = {
    'DEFAULT_TIMEOUT': os.environ.get("JOB_DEFAULT_TIMEOUT", 60*30), # 15 minutes
    'DEFAULT_TIMEOUT': os.environ.get("JOB_DEFAULT_TIMEOUT", 60*30), # 30 minutes
    'DEFAULT_RESULT_TTL': 60*60*12, # 12 hours
    'EXECUTIONS_IN_PAGE': 20,
    'SCHEDULER_INTERVAL': 10, # 10 seconds
@@ -158,7 +162,8 @@ USE_TZ = True

# Static files (CSS, JavaScript, Images)

STATIC_URL = 'static/'
STATIC_URL = '/static/'
STATIC_ROOT = os.path.join(BASE_DIR, 'static')
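One caveat on the STATICFILES_STORAGE line added above: the README pins django==5.1, and Django deprecated that setting in 4.2 and dropped it in 5.1, so it likely no longer takes effect. The equivalent WhiteNoise configuration uses the STORAGES dict — a sketch, not part of this commit:

```python
# Django 5.1 equivalent of the legacy STATICFILES_STORAGE setting;
# "default" is Django's stock file-storage backend.
STORAGES = {
    "default": {"BACKEND": "django.core.files.storage.FileSystemStorage"},
    "staticfiles": {
        "BACKEND": "whitenoise.storage.CompressedManifestStaticFilesStorage",
    },
}
```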

# Default primary key field type
@@ -20,5 +20,5 @@ from django.urls import path, include
urlpatterns = [
    path('admin/', admin.site.urls),
    path('scheduler/', include('scheduler.urls')),
    path('', include('api.urls')),
    path('', include('fetcher.urls')),
]
145  app_urls/db.py  Normal file
@@ -0,0 +1,145 @@
import argparse
import os
import psycopg
import re

connection_info = "host={} port={} dbname={} user={} password={} connect_timeout=60".format(
    os.environ.get("DB_HOST", "localhost"),
    os.environ.get("DB_PORT", "5432"),
    os.environ.get("DB_NAME", "matitos"),
    os.environ.get("DB_USER", "supermatitos"),
    os.environ.get("DB_PASSWORD", "supermatitos")
)

def initialize_tables():
    # Connect to an existing database
    with psycopg.connect(connection_info) as conn:
        # Open a cursor to perform database operations
        with conn.cursor() as cur:
            # Autocommit at end of transaction (Atomic creation of tables)
            with conn.transaction() as tx:
                # Create URLs table
                c = cur.execute("""
                    CREATE TYPE URL_STATUS AS ENUM ('raw', 'error', 'valid', 'unknown', 'invalid', 'duplicate');

                    CREATE TABLE URLS (
                        id SERIAL PRIMARY KEY,
                        url TEXT NOT NULL UNIQUE,
                        ts_fetch TIMESTAMPTZ NOT NULL DEFAULT NOW(),
                        status URL_STATUS NOT NULL DEFAULT 'raw' -- ,
                        -- status_wendy WENDY_STATUS DEFAULT NULL,
                        -- ts_wendy TIMESTAMPTZ DEFAULT NULL
                    );
                    CREATE INDEX idx_urls_status ON urls(status);
                    CREATE INDEX idx_urls_ts_fetch ON urls(ts_fetch);

                    CREATE TABLE URLS_DUPLICATE (
                        id_url_canonical INTEGER REFERENCES URLS(id),
                        id_url_duplicated INTEGER REFERENCES URLS(id),
                        PRIMARY KEY (id_url_canonical, id_url_duplicated)
                    );

                    CREATE TYPE SEARCH_TYPE AS ENUM ('rss_feed', 'keyword_search', 'url_host');
                    CREATE TABLE SEARCH (
                        id SMALLSERIAL PRIMARY KEY,
                        search TEXT NOT NULL UNIQUE,
                        type SEARCH_TYPE NOT NULL
                        -- language_country CHAR(5), -- Language: ISO 639-1 Code. Country: ISO 3166 ALPHA-2. e.g.: en-us. Required for search
                        -- UNIQUE(search, language_country)
                    );
                    CREATE INDEX idx_search_type ON SEARCH(type);

                    CREATE TABLE SOURCE (
                        id SMALLSERIAL PRIMARY KEY,
                        source TEXT NOT NULL UNIQUE
                    );

                    -- CREATE TABLE SEARCH_LANGUAGE (
                    --     language CHAR(2) NOT NULL, -- ISO 639-1 Code, e.g. "en"
                    --     country CHAR(2) NOT NULL, -- ISO 3166 ALPHA-2, e.g. "us"
                    --     PRIMARY KEY (language, country)
                    -- );

                    CREATE TABLE URLS_SOURCE_SEARCH (
                        id_url INTEGER REFERENCES URLS(id),
                        id_source SMALLINT REFERENCES SOURCE(id) ON UPDATE CASCADE ON DELETE RESTRICT,
                        id_search SMALLINT REFERENCES SEARCH(id) ON UPDATE CASCADE ON DELETE RESTRICT,
                        PRIMARY KEY(id_url, id_source, id_search)
                    );
                    CREATE INDEX idx_source ON URLS_SOURCE_SEARCH(id_source);
                    CREATE INDEX idx_search ON URLS_SOURCE_SEARCH(id_search);

                    CREATE TABLE STATUS_PATTERN_MATCHING (
                        pattern TEXT PRIMARY KEY,
                        priority SMALLINT NOT NULL,
                        status URL_STATUS NOT NULL
                    );

                    CREATE TABLE URL_CONTENT (
                        id_url INTEGER PRIMARY KEY REFERENCES URLS(id),
                        date_published TIMESTAMPTZ DEFAULT NOW(),
                        title TEXT,
                        description TEXT,
                        content TEXT,
                        valid_content BOOLEAN,
                        language CHAR(2), -- ISO 639-1 Code
                        keywords TEXT[],
                        tags TEXT[],
                        authors TEXT[],
                        image_main_url TEXT,
                        images_url TEXT[],
                        videos_url TEXT[],
                        url_host TEXT, -- www.breitbart.com
                        site_name TEXT -- Breitbart News
                    );
                    CREATE INDEX idx_tags ON URL_CONTENT USING GIN(tags);
                    CREATE INDEX idx_authors ON URL_CONTENT USING GIN(authors);
                    CREATE INDEX idx_date_published ON URL_CONTENT (date_published);
                    CREATE INDEX idx_valid_content ON URL_CONTENT (valid_content);
                    CREATE INDEX idx_language ON URL_CONTENT (language);
                    CREATE INDEX idx_url_host ON URL_CONTENT (url_host);
                """)

def initialize_data():
    # Connect to an existing database
    with psycopg.connect(connection_info) as conn:
        # Open a cursor to perform database operations
        with conn.cursor() as cur:
            # Autocommit at end of transaction (Atomic creation of data)
            with conn.transaction() as tx:
                # Feeds
                cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC', 'rss_feed');" )
                # Websites of interest
                cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/poster', 'url_host');" )
                cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/new-poster', 'url_host');" )
                cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('breitbart.com', 'url_host');" )
                # Search keywords
                cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search');" )
                # TODO: Language per search
                # cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search', 'en-us');" )
                # cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search', 'en-gb');" )

                # Status update based on pattern matching (with priority to apply in order). Regex test https://regex101.com/
                cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("youtube.com/"))) )
                cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("tiktok.com/"))) )
                cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("twitter.com/"))) )
                cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("reddit.com/"))) )
                cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("libreddit.de/"))) )
                cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("radio.foxnews.com/"))) )

def main(name):
    print('Hello, %s!' % name)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Database initialization')
    parser.add_argument('--initialize_tables', help='Create DB tables', action='store_true', default=False)
    parser.add_argument('--initialize_data', help='Insert data', action='store_true', default=False)
    args = parser.parse_args()

    if (args.initialize_tables):
        print("Initializing tables")
        initialize_tables()
    if (args.initialize_data):
        print("Initializing data")
        initialize_data()
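db.py's CREATE TYPE / CREATE TABLE block fails if it runs twice, which is presumably why the Dockerfile gates it behind INITIALIZE_DB. A hedged sketch of an existence probe that would make the call idempotent — to_regclass is a real Postgres function; the guard itself is not in the commit:

```python
import psycopg

def tables_already_initialized(connection_info):
    # to_regclass returns NULL when the relation does not exist, so this
    # is a cheap "has initialize_tables() already run?" probe.
    with psycopg.connect(connection_info) as conn:
        with conn.cursor() as cur:
            cur.execute("SELECT to_regclass('public.urls');")
            return cur.fetchone()[0] is not None

# if not tables_already_initialized(connection_info):
#     initialize_tables()
#     initialize_data()
```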
@@ -1,6 +1,6 @@
from django.apps import AppConfig


class ApiConfig(AppConfig):
class FetcherConfig(AppConfig):
    default_auto_field = 'django.db.models.BigAutoField'
    name = 'api'
    name = 'fetcher'
@@ -65,7 +65,7 @@ class Migration(migrations.Migration):
        migrations.CreateModel(
            name='UrlContent',
            fields=[
                ('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='api.urls')),
                ('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='fetcher.urls')),
                ('date_published', models.DateTimeField(blank=True, null=True)),
                ('title', models.TextField(blank=True, null=True)),
                ('description', models.TextField(blank=True, null=True)),
@@ -89,7 +89,7 @@ class Migration(migrations.Migration):
        migrations.CreateModel(
            name='UrlsDuplicate',
            fields=[
                ('id_url_canonical', models.OneToOneField(db_column='id_url_canonical', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='api.urls')),
                ('id_url_canonical', models.OneToOneField(db_column='id_url_canonical', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='fetcher.urls')),
            ],
            options={
                'db_table': 'urls_duplicate',
@@ -99,7 +99,7 @@ class Migration(migrations.Migration):
        migrations.CreateModel(
            name='UrlsSourceSearch',
            fields=[
                ('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='api.urls')),
                ('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='fetcher.urls')),
            ],
            options={
                'db_table': 'urls_source_search',
@@ -1,6 +1,8 @@
|
||||
import time
|
||||
import feedparser
|
||||
import os
|
||||
from django.utils import timezone
|
||||
from datetime import timedelta
|
||||
from ..models import Search, Source
|
||||
from .fetch_utils import decode_gnews_urls
|
||||
from .logger import get_logger
|
||||
@@ -9,6 +11,7 @@ logger = get_logger()
|
||||
from gnews import GNews
|
||||
from duckduckgo_search import DDGS
|
||||
from GoogleNews import GoogleNews
|
||||
from search_engines import Yahoo, Aol
|
||||
|
||||
###########################################################################
|
||||
###########################################################################
|
||||
@@ -42,11 +45,19 @@ class FetcherAbstract(ABC):
|
||||
return raw_urls
|
||||
|
||||
def fetch_articles(self, db_writer, obj_search):
|
||||
# Search
|
||||
keyword_search = "{}{}".format("site:" if obj_search.type == Search.TYPE_ENUM.URL_HOST else "", obj_search.search)
|
||||
# Source name
|
||||
source_name = self._get_name()
|
||||
|
||||
|
||||
# Search
|
||||
keyword_search = obj_search.search
|
||||
# URL Host search? -> site:${URL_HOST}
|
||||
if (obj_search.type == Search.TYPE_ENUM.URL_HOST):
|
||||
keyword_search = "{}{}".format("site:", keyword_search)
|
||||
# Keyword search & using a General search? -> ${SEARCH} news after:${LAST_WEEK}
|
||||
if ("general" in source_name) and (obj_search.type == Search.TYPE_ENUM.KEYWORD_SEARCH):
|
||||
start_date = timezone.now() - timedelta(days=7)
|
||||
keyword_search = "{}{}".format(keyword_search, "news after:{}-{}-{}".format(start_date.month, start_date.day, start_date.year))
|
||||
|
||||
logger.debug("Starting search: {} - {}".format(keyword_search, source_name))
|
||||
# Fetch
|
||||
raw_urls = self._fetch_raw_urls(keyword_search)
|
||||
@@ -165,11 +176,11 @@ class SearchGoogleGeneral(FetcherAbstract):
        self.language = args.get("language", "en")
        self.country = args.get("country", "US")
        self.period = args.get("period", "7d")
        self.max_pages = args.get("max_pages", 1)
        self.pages = args.get("pages", 1)

    def _get_name(self):
        # [source] [period] [language-country] [pages]
        return "google-general {} {}-{} pages={}".format(self.period, self.language, self.country, self.max_pages).replace("pages=None", "").strip()
        return "google-general {} {}-{} pages={}".format(self.period, self.language, self.country, self.pages).replace("pages=None", "").strip()

    def _fetch_raw_urls(self, keyword_search):
        try:
@@ -181,7 +192,7 @@ class SearchGoogleGeneral(FetcherAbstract):

            set_links = set()
            # Iterate pages
            for i in range(self.max_pages):
            for i in range(self.pages):
                # Sleep between pages fetch
                time.sleep(int(os.getenv("FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP", 4)))
                # Number of URLs fetched so far
@@ -253,7 +264,45 @@ class SearchGoogleNewsRSS(FetcherAbstract):
            urls = []

        return urls

class SearchYahooGeneral(FetcherAbstract):
    def __init__(self, args={}):
        super().__init__()
        # Parameters
        self.pages = args.get("pages", 2)

    def _get_name(self):
        # [source] [pages]
        return "yahoo-general pages={}".format(self.pages).replace("pages=None", "").strip()

    def _fetch_raw_urls(self, keyword_search):
        try:
            results = Yahoo().search(keyword_search, pages=self.pages)
            urls = results.links()
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []
        return urls

class SearchAOLGeneral(FetcherAbstract):
    def __init__(self, args={}):
        super().__init__()
        # Parameters
        self.pages = args.get("pages", 2)

    def _get_name(self):
        # [source] [pages]
        return "aol-general pages={}".format(self.pages).replace("pages=None", "").strip()

    def _fetch_raw_urls(self, keyword_search):
        try:
            results = Aol().search(keyword_search, pages=self.pages)
            urls = results.links()
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []
        return urls
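Both new classes are thin wrappers over the Search-Engines-Scraper API; a minimal usage sketch, with an illustrative query and page count:

```python
from search_engines import Yahoo

# Sketch of the calls SearchYahooGeneral makes; "site:example.com" is a
# hypothetical query, not a value from the repo.
engine = Yahoo()
results = engine.search("site:example.com", pages=2)
print(results.links())  # the raw URL list that _fetch_raw_urls returns
```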
###########################################################################

# List of instances
ListSearchInstances = [SearchGNews, SearchDuckDuckGoNews, SearchGoogleNews, SearchDuckDuckGoGeneral, SearchGoogleGeneral, SearchGoogleNewsRSS]
ListSearchInstances = [SearchGNews, SearchDuckDuckGoNews, SearchGoogleNews, SearchAOLGeneral, SearchYahooGeneral, SearchDuckDuckGoGeneral, SearchGoogleGeneral, SearchGoogleNewsRSS]
@@ -2,30 +2,29 @@ import logging
import os

# Get env var
path_logs_parameterization = os.getenv("PATH_LOGS_PARAMETERIZATION", "logs/log_app_fetcher_{}.log")
logs_directory = os.getenv("PATH_LOGS_DIRECTORY", "logs")

# Directory of logs
directory = '/'.join(path_logs_parameterization.split("/")[:-1])
os.makedirs(directory, exist_ok=True)
os.makedirs(logs_directory, exist_ok=True)

logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
logger = logging.getLogger("fetcher")
logger.setLevel(logging.DEBUG)

# To file log: DEBUG / INFO / WARNING / ERROR / CRITICAL
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("debug"), mode="a", maxBytes=10000000, backupCount=4)
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "debug.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)

# To file log: INFO / WARNING / ERROR / CRITICAL
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("info"), mode="a", maxBytes=10000000, backupCount=2)
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "info.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.INFO)
logger.addHandler(fh)

# To file log: WARNING / ERROR / CRITICAL
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("warning"), mode="a", maxBytes=10000000, backupCount=1)
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "warning.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.WARNING)
logger.addHandler(fh)
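As a sanity check on the handler split above, messages should fan out by level as follows (a sketch; the logger and file names come from the configuration just shown):

```python
logger.debug("lands in debug.log only")          # below the INFO and WARNING thresholds
logger.info("lands in debug.log and info.log")   # below the WARNING threshold
logger.warning("lands in all three log files")   # passes every handler's threshold
```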
|
||||
@@ -73,9 +73,6 @@ def process_missing_kids_urls_all(batch_size=None):
|
||||
logger.info("Task completed: {}".format(task))
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@job('default')
|
||||
def background_task(process_type: str):
|
||||
logger.info("Task triggered: {}".format(process_type))
|
||||
179
app_urls/fetcher/templates/charts.html
Normal file
@@ -0,0 +1,179 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Charts</title>
    <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
    <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
    <style>
        body {
            background-color: #333;
            color: #fff;
            font-family: Arial, sans-serif;
        }

        h2 {
            color: #fff;
            text-align: center;
            margin-bottom: 40px;
        }

        .chart-container {
            width: 45%;
            display: inline-block;
            margin: 20px;
            background-color: #444;
            border-radius: 10px;
            padding: 5px;
        }

        canvas {
            background-color: #2c2c2c;
            border-radius: 5px;
        }

        .container {
            display: flex;
            justify-content: center;
            flex-wrap: wrap;
        }

        .filter-container {
            text-align: center;
            margin-bottom: 20px;
        }

        select {
            padding: 8px;
            background-color: #555;
            color: white;
            border: 1px solid #444;
            border-radius: 5px;
        }
    </style>
</head>
<body>
    <h2>Data Visualizations</h2>

    <!-- Filter for Number of Days -->
    <div class="filter-container">
        <label for="daysFilter">Select Number of Days:</label>
        <select id="daysFilter">
            <option value="0.0625">Last 90 Minutes</option>
            <option value="0.25">Last 6 Hours</option>
            <option value="1">Last 24 Hours</option>
            <option value="7" selected>Last 7 Days</option>
            <option value="30">Last 30 Days</option>
            <option value="90">Last 90 Days</option>
            <option value="365">Last 365 Days</option>
        </select>
    </div>

    <div class="container">
        <div class="chart-container">
            <canvas id="urlFetchDateChart"></canvas>
        </div>

        <div class="chart-container">
            <canvas id="urlStatusChart"></canvas>
        </div>

        <div class="chart-container">
            <canvas id="urlsPerSourceChart"></canvas>
        </div>

        <div class="chart-container">
            <canvas id="urlsPerSearchChart"></canvas>
        </div>
    </div>

    <script>
        $(document).ready(function () {
            let chartInstances = {}; // Store chart instances

            // Fetch initial data (default 7 days)
            const defaultDays = 7;
            fetchDataAndRenderCharts(defaultDays);

            // Apply the filter automatically when the user changes the selection
            $('#daysFilter').on('change', function () {
                const selectedDays = $(this).val();
                fetchDataAndRenderCharts(selectedDays);
            });

            function fetchDataAndRenderCharts(days) {
                fetchAndRenderChart(`/urls-by-fetch-date/?days=${days}`, 'urlFetchDateChart', 'URLs by Fetch Date', 'bar');
                fetchAndRenderChart(`/urls-per-status/?days=${days}`, 'urlStatusChart', 'URLs by Status', 'bar');
                fetchAndRenderChart(`/urls-per-source/?days=${days}`, 'urlsPerSourceChart', 'URLs by Source', 'bar');
                fetchAndRenderChart(`/urls-per-search/?days=${days}`, 'urlsPerSearchChart', 'URLs by Search', 'bar');
            }

            // Color for each chart category
            const categoryColors = {
                'URLs by Fetch Date': '#4BC0C0',
                'URLs by Status': '#36A2EB',
                'URLs by Source': '#4BC0C0',
                'URLs by Search': '#36A2EB'
            };
            const maxLabelLength = 35; // Truncate X-axis labels to 35 characters
            function fetchAndRenderChart(url, canvasId, chartTitle, chartType) {
                $.getJSON(url, function (data) {
                    if (chartInstances[canvasId]) {
                        chartInstances[canvasId].destroy(); // Destroy previous chart
                    }

                    const ctx = document.getElementById(canvasId).getContext('2d');
                    chartInstances[canvasId] = new Chart(ctx, {
                        type: chartType,
                        data: {
                            labels: data.labels, // Ensure labels are passed as strings
                            datasets: [{
                                label: chartTitle,
                                data: data.values,
                                backgroundColor: categoryColors[chartTitle], // Assign the same color based on category
                            }]
                        },
                        options: {
                            responsive: true,
                            plugins: {
                                legend: {
                                    labels: { color: '#fff' }
                                }
                            },
                            scales: {
                                x: {
                                    ticks: {
                                        color: "#fff", // Set the color of x-axis ticks
                                        callback: function (value) {
                                            let label = data.labels[value];
                                            if (label.length > maxLabelLength) { return label.slice(0, maxLabelLength) + '...'; }
                                            return label;
                                        }
                                    },
                                    grid: {
                                        color: "#444" // Set the grid lines color
                                    }
                                },
                                y: {
                                    ticks: {
                                        color: "#fff" // Set the color of y-axis ticks
                                    },
                                    grid: {
                                        color: "#444" // Set the grid lines color
                                    }
                                }
                            }
                        }
                    });
                });
            }

        });
    </script>
</body>
</html>
@@ -113,11 +113,11 @@ input[type="checkbox"] {
}

/* Themed Toggle Button */
.theme-button, .home-button {
.theme-button, .home-button, .chart-button {
    background-color: var(--sidebar);
    border: 1px solid var(--sidebar);
    border-radius: 50%;
    width: 45px;
    width: 30px;
    height: 45px;
    font-size: 25px;
    display: flex;
@@ -127,10 +127,10 @@ input[type="checkbox"] {
    cursor: pointer;
}

.theme-button:hover, .home-button:hover {
.theme-button:hover, .home-button:hover, .chart-button:hover {
    transform: rotate(20deg);
}
.theme-button:active, .home-button:active {
.theme-button:active, .home-button:active, .chart-button:active {
    transform: scale(0.95);
}

@@ -235,6 +235,7 @@ input[type="checkbox"] {
<div class="button-container">
    <button id="homeButton" class="home-button">🏠</button>
    <button id="themeToggle" class="theme-button">🌙</button>
    <button id="chartButton" class="chart-button">📊</button>
</div>

<form method="GET" action="" id="filterForm">
@@ -477,6 +478,10 @@ input[type="checkbox"] {
document.getElementById("homeButton").addEventListener("click", function () {
    window.location.href = "./"; // Change this to your homepage URL if different
});
// Charts
document.getElementById("chartButton").addEventListener("click", function () {
    window.location.href = "./charts"; // Change this to your charts page URL if different
});

//////////////////////////////////////////////
// Timestamp to local timezone
@@ -508,26 +513,32 @@ input[type="checkbox"] {
    });
});

//////////////////////////////////////////////////////////////////////
// Function to update the form parameter before submitting
function updateFormParameter(section) {
    const checkboxes = document.querySelectorAll(`[name='${section}']`);
    const allChecked = Array.from(checkboxes).every(checkbox => checkbox.checked);

    // If all are checked, replace them with a hidden input with value "all"
    if (allChecked) {
        checkboxes.forEach(checkbox => checkbox.removeAttribute("name"));
        let hiddenInput = document.createElement("input");
        hiddenInput.type = "hidden";
        hiddenInput.name = section;
        hiddenInput.value = "all";
        document.getElementById("filterForm").appendChild(hiddenInput);
    } else {
        checkboxes.forEach(checkbox => checkbox.setAttribute("name", section));
        document.querySelectorAll(`input[name="${section}"][type="hidden"]`).forEach(hiddenInput => hiddenInput.remove());
    }
// Function to update the form parameters for all sections before submitting
function updateFormParameters() {
    // Get all distinct sections by selecting all checkboxes and extracting their "name" attributes
    const sections = new Set([...document.querySelectorAll("input[type='checkbox']")].map(cb => cb.name));

    // Submit form after changes
    sections.forEach(section => {
        if (!section) return; // Skip any checkboxes without a name

        const checkboxes = document.querySelectorAll(`[name='${section}']`);
        const allChecked = Array.from(checkboxes).every(checkbox => checkbox.checked);

        // If all checkboxes in a section are checked, remove them and add a hidden input
        if (allChecked) {
            checkboxes.forEach(checkbox => checkbox.removeAttribute("name"));
            let hiddenInput = document.createElement("input");
            hiddenInput.type = "hidden";
            hiddenInput.name = section;
            hiddenInput.value = "all";
            document.getElementById("filterForm").appendChild(hiddenInput);
        } else {
            checkboxes.forEach(checkbox => checkbox.setAttribute("name", section));
            document.querySelectorAll(`input[name="${section}"][type="hidden"]`).forEach(hiddenInput => hiddenInput.remove());
        }
    });

    // Submit the form after updating all sections
    document.getElementById("filterForm").submit();
}

@@ -537,7 +548,7 @@ input[type="checkbox"] {
    const checkboxes = document.querySelectorAll(`[name='${section}']`);
    const allChecked = Array.from(checkboxes).every(checkbox => checkbox.checked);
    checkboxes.forEach(cb => cb.checked = !allChecked);
    updateFormParameter(section);
    updateFormParameters();
}

// Attach event listeners to "Toggle All" buttons
@@ -552,14 +563,14 @@ input[type="checkbox"] {
// Automatically submit the form when any checkbox changes
document.querySelectorAll('input[type="checkbox"]').forEach(function(checkbox) {
    checkbox.addEventListener('change', function() {
        updateFormParameter(this.name);
        updateFormParameters();
    });
});
document.getElementById('perPageSelect').addEventListener('change', function() {
    document.getElementById('filterForm').submit();
    updateFormParameters();
});
document.getElementById('timeFilterSelect').addEventListener('change', function() {
    document.getElementById('filterForm').submit();
    updateFormParameters();
});


@@ -167,13 +167,14 @@
</script>
<body>

<!--
<div class="sidebar">
    <div class="button-container">
        <button id="homeButton" class="home-button">🏠</button>
        <button id="themeToggle" class="theme-button">🌙</button>
    </div>
</div>

-->

<!-- Main Content -->
<div class="container mt-4">
@@ -8,7 +8,7 @@ urlpatterns = [
    #
    path('task/<str:task>', views.trigger_task, name='trigger_task'),
    #
    path('charts/', views.charts, name='charts'),
    path('urls/charts/', views.charts, name='charts'),
    path('urls-by-fetch-date/', views.urls_by_fetch_date, name='urls_by_fetch_date'),
    path('urls-per-status/', views.urls_per_status, name='urls_per_status'),
    path('urls-per-source/', views.urls_per_source, name='urls_per_source'),
@@ -2,6 +2,7 @@ from .tasks import background_task
from django.core.paginator import Paginator
from django.shortcuts import render, get_object_or_404
from django.http import StreamingHttpResponse, JsonResponse, HttpResponse
from django.contrib.auth.decorators import login_required
import ollama
from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch
import os
@@ -29,17 +30,18 @@ def link_list(request):
        # URLs
        "http://localhost:8000/urls",
        # Charts
        "http://localhost:8000/charts",
        # API tasks
        "http://localhost:8000/urls/charts",
        # Fetcher tasks
    ] + [os.path.join(prefix, l) for l in links]
    # Json
    return JsonResponse({"links": list_links })

####################################################################################################
# @login_required(login_url='/admin')
def logs(request, log_type):
    # Capture output: python manage.py rqstats
    try:
        with open(os.getenv("PATH_LOGS_DEBUG", "logs/log_app_fetcher_{}.log".format(log_type)), "r") as f:
        with open( os.path.join( os.getenv("PATH_LOGS_DIRECTORY", "logs"), "{}.log".format(log_type) ), "r") as f:
            file_content = f.read()
    except Exception as e:
        file_content = "Error reading logs for log type: {}".format(log_type)
@@ -130,8 +132,9 @@ def charts(request):
    return render(request, 'charts.html')

def urls_by_fetch_date(request):
    # Get the date for 30 days ago
    start_date = timezone.now() - timedelta(days=30)
    # Get the filtering date parameter
    days = float(request.GET.get('days', 30)) # Default is 30 days
    start_date = timezone.now() - timedelta(days=days)

    # Count the number of URLs grouped by fetch date
    urls_data = Urls.objects.filter(ts_fetch__gte=start_date) \
@@ -141,8 +144,8 @@ def urls_by_fetch_date(request):

    # Format data to return as JSON
    data = {
        'dates': [item['ts_fetch__date'] for item in urls_data],
        'counts': [item['count'] for item in urls_data],
        'labels': [item['ts_fetch__date'] for item in urls_data],
        'values': [item['count'] for item in urls_data],
    }

    return JsonResponse(data)
@@ -160,38 +163,48 @@ def urls_per_status(request):

    # Format data for JSON
    data = {
        'statuses': [item['status'] for item in urls_data],
        'counts': [item['count'] for item in urls_data],
        'labels': [item['status'] for item in urls_data],
        'values': [item['count'] for item in urls_data],
    }

    return JsonResponse(data)

def urls_per_source(request):
    # Get the filtering date parameter
    days = float(request.GET.get('days', 30)) # Default is 30 days
    start_date = timezone.now() - timedelta(days=days)

    # Count the number of URLs grouped by source
    urls_data = UrlsSourceSearch.objects \
        .filter(id_url__ts_fetch__gte=start_date) \
        .values('id_source__source') \
        .annotate(count=Count('id_url')) \
        .order_by('id_source__source')

    # Format data for JSON
    data = {
        'sources': [item['id_source__source'] for item in urls_data],
        'counts': [item['count'] for item in urls_data],
        'labels': [item['id_source__source'] for item in urls_data],
        'values': [item['count'] for item in urls_data],
    }

    return JsonResponse(data)

def urls_per_search(request):
    # Get the filtering date parameter
    days = float(request.GET.get('days', 30)) # Default is 30 days
    start_date = timezone.now() - timedelta(days=days)

    # Count the number of URLs grouped by search
    urls_data = UrlsSourceSearch.objects \
        .filter(id_url__ts_fetch__gte=start_date) \
        .values('id_search__search') \
        .annotate(count=Count('id_url')) \
        .order_by('id_search__search')

    # Format data for JSON
    data = {
        'searches': [item['id_search__search'] for item in urls_data],
        'counts': [item['count'] for item in urls_data],
        'labels': [item['id_search__search'] for item in urls_data],
        'values': [item['count'] for item in urls_data],
    }

    return JsonResponse(data)
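All four endpoints now share one response shape, which is what fetchAndRenderChart in charts.html consumes; an illustrative payload with invented values:

```python
# Sketch of a response from e.g. /urls-per-source/?days=7; the label strings
# and counts below are made up for illustration.
example_payload = {
    "labels": ["google-general 7d en-US pages=1", "yahoo-general pages=2"],
    "values": [42, 17],
}
```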
17
app_urls/requirements.txt
Normal file
@@ -0,0 +1,17 @@
django==5.1
psycopg[binary]
django-redis
django-tasks-scheduler
gunicorn
whitenoise
feedparser
python-dateutil
newspaper4k[all]
lxml[html_clean]
googlenewsdecoder
gnews
GoogleNews
duckduckgo_search
git+https://github.com/tasos-py/Search-Engines-Scraper.git
langdetect
ollama
@@ -2,10 +2,10 @@
{
    "model": "RepeatableTaskType",
    "name": "Process error URLs",
    "callable": "api.tasks.process_error_urls",
    "callable": "fetcher.tasks.process_error_urls",
    "callable_args": [],
    "callable_kwargs": [],
    "enabled": true,
    "enabled": false,
    "queue": "low",
    "repeat": null,
    "at_front": false,
@@ -15,18 +15,39 @@
    "scheduled_time": "2025-04-01T12:36:21+00:00",
    "interval": 4,
    "interval_unit": "hours",
    "successful_runs": 15,
    "successful_runs": 0,
    "failed_runs": 0,
    "last_successful_run": "2025-04-01 08:37:06.722770+00:00",
    "last_successful_run": null,
    "last_failed_run": null
},
{
    "model": "RepeatableTaskType",
    "name": "Process raw URLs",
    "callable": "fetcher.tasks.process_raw_urls",
    "callable_args": [],
    "callable_kwargs": [],
    "enabled": false,
    "queue": "low",
    "repeat": null,
    "at_front": false,
    "timeout": null,
    "result_ttl": 86400,
    "cron_string": null,
    "scheduled_time": "2025-04-01T10:20:08+00:00",
    "interval": 10,
    "interval_unit": "minutes",
    "successful_runs": 0,
    "failed_runs": 0,
    "last_successful_run": null,
    "last_failed_run": null
},
{
    "model": "RepeatableTaskType",
    "name": "Process MissingKids URLs",
    "callable": "api.tasks.process_missing_kids_urls",
    "callable": "fetcher.tasks.process_missing_kids_urls",
    "callable_args": [],
    "callable_kwargs": [],
    "enabled": true,
    "enabled": false,
    "queue": "default",
    "repeat": null,
    "at_front": false,
@@ -34,20 +55,20 @@
    "result_ttl": 86400,
    "cron_string": null,
    "scheduled_time": "2025-04-01T10:37:50+00:00",
    "interval": 2,
    "interval": 4,
    "interval_unit": "hours",
    "successful_runs": 29,
    "successful_runs": 0,
    "failed_runs": 0,
    "last_successful_run": "2025-04-01 08:42:05.864064+00:00",
    "last_successful_run": null,
    "last_failed_run": null
},
{
    "model": "RepeatableTaskType",
    "name": "Process MissingKids URLs ALL",
    "callable": "api.tasks.process_missing_kids_urls_all",
    "callable": "fetcher.tasks.process_missing_kids_urls_all",
    "callable_args": [],
    "callable_kwargs": [],
    "enabled": true,
    "enabled": false,
    "queue": "default",
    "repeat": null,
    "at_front": false,
@@ -65,10 +86,10 @@
{
    "model": "RepeatableTaskType",
    "name": "Fetch Feeds",
    "callable": "api.tasks.fetch_feeds",
    "callable": "fetcher.tasks.fetch_feeds",
    "callable_args": [],
    "callable_kwargs": [],
    "enabled": true,
    "enabled": false,
    "queue": "default",
    "repeat": null,
    "at_front": false,
@@ -78,39 +99,18 @@
    "scheduled_time": "2025-04-01T10:18:56+00:00",
    "interval": 15,
    "interval_unit": "minutes",
    "successful_runs": 288,
    "successful_runs": 0,
    "failed_runs": 0,
    "last_successful_run": "2025-04-01 10:03:58.363856+00:00",
    "last_failed_run": null
},
{
    "model": "RepeatableTaskType",
    "name": "Process raw URLs",
    "callable": "api.tasks.process_raw_urls",
    "callable_args": [],
    "callable_kwargs": [],
    "enabled": true,
    "queue": "low",
    "repeat": null,
    "at_front": false,
    "timeout": null,
    "result_ttl": 86400,
    "cron_string": null,
    "scheduled_time": "2025-04-01T10:20:08+00:00",
    "interval": 15,
    "interval_unit": "minutes",
    "successful_runs": 78,
    "failed_runs": 0,
    "last_successful_run": "2025-04-01 10:05:08.394472+00:00",
    "last_successful_run": null,
    "last_failed_run": null
},
{
    "model": "RepeatableTaskType",
    "name": "Fetch Parser",
    "callable": "api.tasks.fetch_parser",
    "callable": "fetcher.tasks.fetch_parser",
    "callable_args": [],
    "callable_kwargs": [],
    "enabled": true,
    "enabled": false,
    "queue": "default",
    "repeat": null,
    "at_front": false,
@@ -120,18 +120,18 @@
    "scheduled_time": "2025-04-01T10:25:42+00:00",
    "interval": 1,
    "interval_unit": "hours",
    "successful_runs": 62,
    "successful_runs": 0,
    "failed_runs": 0,
    "last_successful_run": "2025-04-01 09:25:57.977051+00:00",
    "last_successful_run": null,
    "last_failed_run": null
},
{
    "model": "RepeatableTaskType",
    "name": "Fetch Search",
    "callable": "api.tasks.fetch_search",
    "callable": "fetcher.tasks.fetch_search",
    "callable_args": [],
    "callable_kwargs": [],
    "enabled": true,
    "enabled": false,
    "queue": "default",
    "repeat": null,
    "at_front": false,
@@ -141,9 +141,51 @@
    "scheduled_time": "2025-04-01T10:29:33+00:00",
    "interval": 1,
    "interval_unit": "hours",
    "successful_runs": 63,
    "successful_runs": 0,
    "failed_runs": 0,
    "last_successful_run": "2025-04-01 09:37:20.671072+00:00",
    "last_successful_run": null,
    "last_failed_run": null
},
{
    "model": "RepeatableTaskType",
    "name": "Fetch MissingKids",
    "callable": "fetcher.tasks.fetch_missing_kids",
    "callable_args": [],
    "callable_kwargs": [],
    "enabled": false,
    "queue": "default",
    "repeat": null,
    "at_front": false,
    "timeout": null,
    "result_ttl": 86400,
    "cron_string": null,
    "scheduled_time": "2025-04-01T10:29:33+00:00",
    "interval": 4,
    "interval_unit": "hours",
    "successful_runs": 0,
    "failed_runs": 0,
    "last_successful_run": null,
    "last_failed_run": null
},
{
    "model": "RepeatableTaskType",
    "name": "Fetch MissingKids ALL",
    "callable": "fetcher.tasks.fetch_missing_kids_all",
    "callable_args": [],
    "callable_kwargs": [],
    "enabled": false,
    "queue": "default",
    "repeat": null,
    "at_front": false,
    "timeout": null,
    "result_ttl": 86400,
    "cron_string": null,
    "scheduled_time": "2025-04-01T10:29:33+00:00",
    "interval": 1,
    "interval_unit": "weeks",
    "successful_runs": 0,
    "failed_runs": 0,
    "last_successful_run": null,
    "last_failed_run": null
}
]
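These fixtures can be loaded with Django's standard loaddata mechanism; a minimal sketch, where the fixture filename and settings module are assumptions to adjust to the repo's actual layout:

```python
import django
from django.core.management import call_command

django.setup()  # assumes DJANGO_SETTINGS_MODULE points at the app's settings
call_command("loaddata", "scheduler_tasks.json")  # hypothetical fixture path
```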
@@ -2,101 +2,106 @@ version: '3.9'

services:

  fetcher_selenium:
  fetcher_app_selenium:
    image: fetcher_app_selenium
    build:
      context: ./app_selenium
    container_name: selenium_app
    restart: unless-stopped
    container_name: fetcher_app_selenium
    # restart: unless-stopped
    shm_size: 512mb
    environment:
      - SELENIUM_SLEEP_PER_PAGE=4
      - PATH_LOGS_PARAMETERIZATION="logs/log_app_selenium_{}.log"
      - SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE:-4}
      - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY:-logs}
    ports:
      - 80
    dns:
      - 1.1.1.1
      - 1.0.0.1
    deploy:
      resources:
        limits:
          cpus: '4'
          memory: 4G

  fetcher_urls_app:
  fetcher_app_urls:
    image: fetcher_app_urls
    build:
      context: ./app_urls
    container_name: urls_app
    restart: unless-stopped
    container_name: fetcher_app_urls
    # restart: unless-stopped
    environment:
      #- name=value
      # Initialization
      - INITIALIZE_DB=${INITIALIZE_DB:-true}
      - DJANGO_SUPERUSER_USERNAME=${DJANGO_SUPERUSER_USERNAME:-matitos}
      - DJANGO_SUPERUSER_PASSWORD=${DJANGO_SUPERUSER_PASSWORD:-matitos}
      - DJANGO_SUPERUSER_EMAIL=${DJANGO_SUPERUSER_EMAIL:-matitos@matitos.org}
      # Django
      - DJANGO_SECRET_KEY=${DJANGO_SECRET_KEY:-abc123456789qwerty}
      - DJANGO_DEBUG=${DJANGO_DEBUG:-False}
      - DJANGO_ALLOWED_HOSTS=${DJANGO_ALLOWED_HOSTS:-*} # host1,host2
      # Database
      - DB_NAME=${DB_NAME:-matitos}
      - DB_USER=${DB_NAME:-supermatitos}
      - DB_PASSWORD=${DB_NAME:-supermatitos}
      - DB_HOST=${DB_NAME:-localhost} # db_postgres
      - DB_PORT=${DB_NAME:-5432}
      - REDIS_HOST=${REDIS_HOST:-localhost}
      - DB_USER=${DB_USER:-supermatitos}
      - DB_PASSWORD=${DB_PASSWORD:-supermatitos}
      - DB_HOST=${DB_HOST:-fetcher_db}
      - DB_PORT=${DB_PORT:-5432}
      - REDIS_HOST=${REDIS_HOST:-fetcher_redis}
      - REDIS_PORT=${REDIS_PORT:-6379}
      # Job timeout: 30 min
      - JOB_DEFAULT_TIMEOUT=${RQ_DEFAULT_TIMEOUT:-1800}
      # Logs path
      - PATH_LOGS_PARAMETERIZATION="logs/log_app_fetcher_{}.log"
      - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY:-logs}
      # Fetcher
      - FETCHER_GNEWS_DECODE_SLEEP=2
      - FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=4
      - FETCHER_BETWEEN_SEARCHES_SLEEP=5
      - FETCHER_URL_HOST_SLEEP=5
      - FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP:-2}
      - FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP:-5}
      - FETCHER_BETWEEN_SEARCHES_SLEEP=${FETCHER_BETWEEN_SEARCHES_SLEEP:-1}
      - FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP:-2}
      # Selenium
      - SELENIUM_ENDPOINT="http://selenium_app:80"
      - SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT:-http://fetcher_app_selenium:80}
      - ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA:-https://ollamamodel.matitos.org}
    ports:
      - 80
      - 8000:8000
    depends_on:
      - fetcher_db
      - fetcher_redis
    dns:
      - 1.1.1.1
      - 1.0.0.1
    deploy:
      resources:
        limits:
          cpus: '4'
          memory: 4G

  fetcher_db:
    image: postgres:17
    container_name: db_postgres
    container_name: fetcher_db
    restart: unless-stopped
    # Set shared memory limit when using docker-compose
    shm_size: 128mb
    environment:
      POSTGRES_DB: ${DB_NAME:-matitos}
      POSTGRES_PASSWORD: ${DB_PASSWORD:-supermatitos}
      POSTGRES_USER: ${DB_USERNAME:-supermatitos}
      POSTGRES_DB: ${DB_DATABASE_NAME:-matitos}
      POSTGRES_USER: ${DB_USER:-supermatitos}
      POSTGRES_INITDB_ARGS: '--data-checksums'
    #volumes:
    # - ${PATH_BASE:-.}/postgres:/var/lib/postgresql/data
    #volumes: # Persistent DB?
    # - ${PATH_DB_DATA:-.}/postgres:/var/lib/postgresql/data
    ports:
      - 5432:5432
      - 5432 #:5432

  fetcher_redis:
    image: redis:alpine
    container_name: db_redis
    container_name: fetcher_redis
    restart: unless-stopped
    ports:
      - 6379:6379
    #expose:
    # - 6379

  fetcher_adminer:
    # http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public
    image: adminer
    container_name: adminer
    restart: unless-stopped
    environment:
      - ADMINER_DEFAULT_DB_DRIVER=pgsql
      #- ADMINER_DEFAULT_DB_HOST
      #- ADMINER_DEFAULT_DB_NAME
    depends_on:
      - matitos_db
    ports:
      - 8080:8080
      - 6379 #:6379

  fetcher_dozzle:
    container_name: dozzle
    container_name: fetcher_dozzle
    image: amir20/dozzle:latest
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
    ports:
      - 8888:8080
    environment:
      - DOZZLE_FILTER="name=matitos_" # Need container name matitos_ ?


# django:
# Env: DB_HOST=matitos_db
# DJANGO_DB_NAME=${DB_DATABASE_NAME:-matitos}
# DJANGO_DB_USER=${DB_USERNAME:-supermatitos}
# DJANGO_DB_PASSWORD=${DB_PASSWORD:-supermatitos}
# DJANGO_DB_HOST=${DB_HOST:-localhost}
# DJANGO_DB_PORT=${DB_PORT:-5432}
      - DOZZLE_FILTER="name=fetcher_"