Dockerization, whitenoise serving static, refactor
1	.gitignore vendored
@@ -2,3 +2,4 @@ __pycache__/
 *.pyc
 **/credentials.py
 logs/
+postgres/
@@ -1,363 +0,0 @@
Deleted exploration notebook (kernel "matitos", Python 3.12.9). Its code cells, in order:

# !pip install git+https://github.com/tasos-py/Search-Engines-Scraper.git
import search_engines

engine = search_engines.Bing()

results = engine.search('news: "child abuse"', pages=2)

engine = search_engines.search_engines_dict["brave"]()

query = 'news: child abuse'
r = engine.search(query, pages=2)

r.__dict__

import newspaper
newspaper.ArticleBinaryDataException

'''
import newspaper

url = 'https://www.missingkids.org/poster/USVA/VA25-0820/1'
art_1 = newspaper.article(url)
url = 'https://www.missingkids.org/poster/NCMC/2045193/1'
art_2 = newspaper.article(url)
'''

import ollama

#model = "llama3.2:1b"
client = ollama.Client(
    host='https://ollamamodel.matitos.org',
)
l = client.list()
list_models = [m.get("model") for m in l.model_dump().get("models")]

print(list_models)

for m in list_models:
    context_key = [k for k in client.show(m).model_dump().get("modelinfo").keys() if "context_length" in k]
    if len(context_key) != 1:
        print("Problem!!!")
    print(m, client.show(m).model_dump().get("modelinfo").get(context_key[0]))

text = "..."
model = "falcon3:1b"

msg_content = {
    "role": "user",
    "content": text,
}
response = client.chat(model=model, messages=[msg_content], stream=False)
print(response["message"]["content"])

import requests
import cv2
import base64
import numpy as np

endpoint = "http://192.168.2.64:12343/image"

prompt = "Majestic mountain landscape with snow-capped peaks, autumn foliage in vibrant reds and oranges, a turquoise river winding through a valley, crisp and serene atmosphere, ultra-realistic style."
prompt = "A group of kids happily playing in a joy environment"
#prompt = "A bitcoin behaving like a king, surrounded by small alternative coins. Detailed, geometric style"

json = {
    "prompt": prompt,
    "num_inference_steps": 10,
    "size": "512x512",
    "seed": 123456,
}

for inf_step in [1, 4, 10, 20, 25, 30, 35, 40, 45, 50, 60, 70, 80, 90, 100]:
    json["num_inference_steps"] = inf_step

    %time r = requests.post(endpoint, json=json)
    print("Status code", r.status_code)

    # Image
    png_as_np = np.frombuffer(base64.b64decode(r.text), dtype=np.uint8)
    image_bgr = cv2.imdecode(png_as_np, cv2.IMREAD_COLOR)

    cv2.imwrite("sample_img_{}.png".format(json["num_inference_steps"]), image_bgr)

# !pip install trafilatura
import trafilatura
from pprint import pprint

url = "https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert"
# url = "https://www.missingkids.org/poster/USVA/VA25-0820/1"
url = "https://www.bloomberg.com/news/articles/2025-03-12/eu-launches-metals-tariff-retaliation-on-26-billion-of-us-goods"

# Fetch
doc = trafilatura.fetch_url(url)

# Content & metadata
metadata = trafilatura.extract_metadata(doc)
content = trafilatura.extract(doc)

pprint(metadata.as_dict())

print(content)

# !pip install newspaper4k
# !pip install langdetect
import newspaper
import langdetect
langdetect.DetectorFactory.seed = 0

# url = "https://www.missingkids.org/poster/USVA/VA25-0820/1"
#url = "https://www.waff.com/2025/03/11/colbert-heights-high-school-employee-arrested-child-abuse/"
#url = "https://www.bloomberg.com/news/articles/2025-03-12/eu-launches-metals-tariff-retaliation-on-26-billion-of-us-goods"
url = "https://apnews.com/article/canada-trump-us-tariffs-steel-2517a6a2baf0596cb1a43d3a7d1e7939"
url = "https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert"
#url = "https://www.ft.com/content/6d7c6915-4ceb-43fc-9896-590036b12a87"
#url = "https://www.lanacion.com.ar/politica/milei-en-bahia-blanca-un-viaje-sorpresa-para-frenar-las-criticas-y-mostrar-cercania-nid12032025/"
#url = "https://www.missingkids.org/poster/NCMC/2043547/1"

try:
    article = newspaper.article(url)
except newspaper.ArticleException as e:
    print("ArticleException: {}".format(str(e)))
except Exception as e:
    print("Err: {}".format(str(e)))

# url_photo = set([i for i in article.images if "api.missingkids.org/photographs" in i])
# article.is_valid_url(), article.is_parsed, article.is_media_news(), article.is_valid_body()
article.meta_data

# !pip install news-please
from newsplease import NewsPlease

url = "https://variety.com/2025/film/news/gene-hackman-death-suspicious-gas-leak-search-warrant-1236322610/"
url = "https://www.bbc.com/news/articles/cewkkkvkzn9o"
url = "https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert"
article = NewsPlease.from_url(url)
print(article.title)

print(article.maintext)
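The cells above trial several extraction libraries on the same URLs. A minimal sketch of how they could be chained as fallbacks; the function name and ordering here are illustrative assumptions, not code from the repo:

```
# Hypothetical fallback chain over the extractors tried above.
import newspaper            # newspaper4k
import trafilatura
from newsplease import NewsPlease

def extract_article_text(url):
    """Try newspaper4k first, then trafilatura, then news-please; return text or None."""
    try:
        return newspaper.article(url).text
    except Exception:
        pass
    doc = trafilatura.fetch_url(url)
    if doc:
        content = trafilatura.extract(doc)
        if content:
            return content
    try:
        return NewsPlease.from_url(url).maintext
    except Exception:
        return None
```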
@@ -5,6 +5,14 @@
 - Fetch parsing URL host
 - Fetch from RSS feed
 - Fetch searching (Google search & news, DuckDuckGo, ...)
+    - Sources -> Robustness to TooManyRequests block
+        - Selenium based
+            - Sites change their logic, request captcha, ...
+        - Brave Search API
+            - Free up to X requests per day; requires associating a credit card (no charges)
+        - Bing API
+            - Subscription required
+        - Yandex: no API?
 - Process URLs -> Updates raw URLs
 - Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date
 - Determines if it is a valid article content
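The robustness item above names the problem; a common mitigation is retrying with exponential backoff and jitter. A minimal sketch, assuming the caller wraps whichever search backend is in use (nothing here is repo API):

```
import random
import time

def fetch_with_backoff(fetch, max_retries=5, base_delay=2.0):
    """Retry fetch() on rate-limit errors, sleeping ~2s, 4s, 8s, ... plus jitter."""
    for attempt in range(max_retries):
        try:
            return fetch()
        except Exception:  # e.g. an HTTP 429 / TooManyRequests response
            if attempt == max_retries - 1:
                raise
            time.sleep(base_delay * (2 ** attempt) + random.uniform(0, 1))

# Usage sketch: fetch_with_backoff(lambda: engine.search('child abuse', pages=2))
```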
@@ -2,30 +2,29 @@ import logging
 import os
 
 # Get env var
-path_logs_parameterization = os.getenv("PATH_LOGS_PARAMETERIZATION", "logs/log_app_selenium_{}.log")
+logs_directory = os.getenv("PATH_LOGS_DIRECTORY", "logs")
 
 # Directory of logs
-directory = '/'.join(path_logs_parameterization.split("/")[:-1])
-os.makedirs(directory, exist_ok=True)
+os.makedirs(logs_directory, exist_ok=True)
 
 logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
-logger = logging.getLogger("news_fetcher")
+logger = logging.getLogger("selenium")
 logger.setLevel(logging.DEBUG)
 
 # To file log: DEBUG / INFO / WARNING / ERROR / CRITICAL
-fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("debug"), mode="a", maxBytes=10000000, backupCount=1)
+fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "debug.log"), mode="a", maxBytes=10000000, backupCount=1)
 fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
 fh.setLevel(logging.DEBUG)
 logger.addHandler(fh)
 
 # To file log: INFO / WARNING / ERROR / CRITICAL
-fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("info"), mode="a", maxBytes=10000000, backupCount=1)
+fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "info.log"), mode="a", maxBytes=10000000, backupCount=1)
 fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
 fh.setLevel(logging.INFO)
 logger.addHandler(fh)
 
 # To file log: WARNING / ERROR / CRITICAL
-fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("warning"), mode="a", maxBytes=10000000, backupCount=1)
+fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "warning.log"), mode="a", maxBytes=10000000, backupCount=1)
 fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
 fh.setLevel(logging.WARNING)
 logger.addHandler(fh)
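One detail worth noting: logging.handlers is a submodule that `import logging` alone does not load, so the module needs an `import logging.handlers` somewhere above the hunk shown (or another imported module must pull it in). A minimal usage sketch of the resulting three-file layout, assuming the handlers configured above and the default logs directory:

```
import logging
import logging.handlers  # needed for RotatingFileHandler; not loaded by `import logging`

logger = logging.getLogger("selenium")
logger.debug("with the handlers above: lands in logs/debug.log only")
logger.info("with the handlers above: lands in logs/debug.log and logs/info.log")
logger.warning("with the handlers above: lands in all three log files")
```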
@@ -28,7 +28,7 @@ class MissingKidsFetcher():
         logger.debug("Processing page: {}...".format(i))
 
         try:
-            time.sleep(os.getenv("SELENIUM_SLEEP_PER_PAGE", 4)); #driver.implicitly_wait(3)
+            time.sleep(int(os.getenv("SELENIUM_SLEEP_PER_PAGE", 4))) #driver.implicitly_wait(3)
             # Fetch poster URLs
             for element_type in ["a"]: # ["a", "p", "div"]:
                 for elem in driver.find_elements(By.TAG_NAME, element_type):
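The int() cast matters because os.getenv returns a string whenever the variable is set; the default is only passed through unchanged when it is not. A small illustration:

```
import os

os.environ["SELENIUM_SLEEP_PER_PAGE"] = "4"
value = os.getenv("SELENIUM_SLEEP_PER_PAGE", 4)
print(type(value))   # <class 'str'> -- time.sleep('4') would raise TypeError
print(int(value))    # 4 -- safe for time.sleep after the cast
```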
@@ -1,341 +0,0 @@
Deleted database-initialization notebook (kernel "matitos", Python 3.12.9), superseded by app_urls/db.py below. Recorded cell outputs (docker compose startup progress and table dumps) are omitted. Its code cells, in order:

# !pip install psycopg[binary]

!docker rm -f db_postgres db_redis; docker compose -f ../docker/docker-compose.yml up -d ; sleep 5
!rm logs/*

INSERT_TABLES = True
INSERT_SAMPLE_DATA = False

import psycopg
connection_info = "host={} port={} user={} password={} dbname={}".format("localhost", "5432", "supermatitos", "supermatitos", "matitos")

from datetime import datetime, timezone
import re
from pprint import pprint

if INSERT_TABLES:
    # Connect to an existing database
    with psycopg.connect(connection_info) as conn:
        # Open a cursor to perform database operations
        with conn.cursor() as cur:
            # Autocommit at end of transaction (atomic insert of URLs and sources)
            with conn.transaction() as tx:
                # Create the tables -- the DDL is identical to initialize_tables() in app_urls/db.py (added below)
                c = cur.execute("""CREATE TYPE URL_STATUS AS ENUM ('raw', 'error', 'valid', 'unknown', 'invalid', 'duplicate');
                    -- ... full schema as in app_urls/db.py: URLS, URLS_DUPLICATE, SEARCH, SOURCE,
                    -- URLS_SOURCE_SEARCH, STATUS_PATTERN_MATCHING, URL_CONTENT, plus their indexes
                """)

                ### Default insert values -- identical to initialize_data() in app_urls/db.py (added below)
                # Feeds, websites of interest, search keywords
                cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC', 'rss_feed');" )
                # ... remaining SEARCH inserts as in db.py

                # Status update based on pattern matching (with priority, applied in order). Regex test https://regex101.com/
                # cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 75, 'valid');".format(".*{}.*".format(re.escape("missingkids.org/poster/"))) )
                cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("youtube.com/"))) )
                # ... remaining STATUS_PATTERN_MATCHING inserts (tiktok.com/, twitter.com/, reddit.com/, libreddit.de/, radio.foxnews.com/) as in db.py

# Inspect every table
with psycopg.connect(connection_info) as conn:
    # Open a cursor to perform database operations
    with conn.cursor() as cur:
        # Get tables
        cur.execute("SELECT table_name FROM information_schema.tables WHERE table_schema='public';")
        tables = [t[0] for t in cur.fetchall()]

        for t in tables:
            print("\t", t)
            pprint( cur.execute("SELECT * FROM {} LIMIT 50;".format(t)).fetchall() )

with psycopg.connect(connection_info) as conn:
    with conn.cursor() as cur:
        pprint( cur.execute("SELECT * FROM SEARCH;").fetchall() )

with psycopg.connect(connection_info) as conn:
    with conn.cursor() as cur:
        pprint( cur.execute("SELECT * FROM URLS LIMIT 50;").fetchall() )
        #pprint( cur.execute("SELECT id_url, title, valid_content FROM URL_CONTENT LIMIT 10;").fetchall() )

'''
!docker rm -f db_redis; docker compose -f ../docker/docker-compose.yml up -d

# Connect to an existing database
with psycopg.connect(connection_info) as conn:
    # Open a cursor to perform database operations
    with conn.cursor() as cur:
        pprint( cur.execute("TRUNCATE URLS, URL_CONTENT, URLS_SOURCE_SEARCH, URLS_DUPLICATE;") )
        # cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('missingkids.org', 'url_host');" )
'''
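The conn.transaction() blocks in the notebook (and in db.py below) follow psycopg 3 semantics: the transaction commits when the block exits normally and rolls back if an exception escapes it. In miniature, reusing the notebook's local connection defaults:

```
import psycopg

conninfo = "host=localhost port=5432 user=supermatitos password=supermatitos dbname=matitos"
with psycopg.connect(conninfo) as conn:
    with conn.cursor() as cur:
        with conn.transaction():
            cur.execute("SELECT 1;")  # committed together with anything else in the block
```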
48	app_urls/Dockerfile	Normal file
@@ -0,0 +1,48 @@
FROM python:3.12

# Prevents Python from writing .pyc files to disk
ENV PYTHONDONTWRITEBYTECODE=1
# Prevents Python from buffering stdout and stderr
ENV PYTHONUNBUFFERED=1

# User
RUN useradd -m -r appuser && \
    mkdir /opt/app && \
    chown -R appuser /opt/app

WORKDIR /opt/app

# Copy the requirements and install all dependencies
COPY requirements.txt /opt/app/
RUN pip install --no-cache-dir -r requirements.txt

# Copy the Django project
COPY --chown=appuser:appuser . /opt/app/

RUN chmod -R 755 /opt/app
RUN chown -R appuser:appuser /opt/app
USER appuser

# Initialization script
RUN echo '#!/bin/bash' > /opt/app/initialize.sh && \
    echo 'if [ "${INITIALIZE_DB}" = false ]; then' >> /opt/app/initialize.sh && \
    echo 'echo "Initialization not required"' >> /opt/app/initialize.sh && \
    echo 'else' >> /opt/app/initialize.sh && \
    echo 'echo "Initializing database"' >> /opt/app/initialize.sh && \
    echo 'sleep 5' >> /opt/app/initialize.sh && \
    echo 'python db.py --initialize_tables --initialize_data' >> /opt/app/initialize.sh && \
    echo 'python manage.py makemigrations fetcher; python manage.py migrate --fake-initial' >> /opt/app/initialize.sh && \
    echo 'python manage.py createsuperuser --noinput' >> /opt/app/initialize.sh && \
    echo 'python manage.py collectstatic --no-input' >> /opt/app/initialize.sh && \
    echo 'python manage.py import --filename scheduled_tasks.json' >> /opt/app/initialize.sh && \
    echo 'fi' >> /opt/app/initialize.sh && \
    chmod +x /opt/app/initialize.sh

# Serving script
RUN echo '#!/bin/bash' > /opt/app/run.sh && \
    echo 'gunicorn core.wsgi:application --bind 0.0.0.0:8000 & python manage.py rqworker high default low' >> /opt/app/run.sh && \
    #echo 'python manage.py runserver & python manage.py rqworker high default low' >> /opt/app/run.sh && \
    chmod +x /opt/app/run.sh

# Run the initialization, then Django's server & workers
CMD ["sh", "-c", "/opt/app/initialize.sh && /opt/app/run.sh"]
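One assumption baked into initialize.sh: for Django's default user model, `manage.py createsuperuser --noinput` only succeeds when the superuser credentials are supplied through environment variables, presumably set in the docker-compose.yml that is not part of this diff. A quick preflight check, as a sketch:

```
import os

# Env vars Django reads in --noinput mode (default user model).
required = ("DJANGO_SUPERUSER_USERNAME", "DJANGO_SUPERUSER_EMAIL", "DJANGO_SUPERUSER_PASSWORD")
missing = [name for name in required if not os.environ.get(name)]
if missing:
    raise SystemExit("createsuperuser --noinput would fail; missing: " + ", ".join(missing))
```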
@@ -2,18 +2,9 @@
 ```
 conda create -n matitos_urls python=3.12
 conda activate matitos_urls
-# Core
-pip install django==5.1 psycopg[binary] django-redis django-tasks-scheduler
-# Fetcher
-pip install feedparser python-dateutil newspaper4k[all] lxml[html_clean] googlenewsdecoder gnews duckduckgo_search GoogleNews langdetect
-# News visualization
-pip install ollama
+pip install -r requirements.txt
 ```
 
-* Database
-    * Database initialization -> 1-DB.ipynb
-
 * From automated inspectdb
 ```
 # 1) Inspect DB, generate models.py
@@ -74,60 +65,19 @@ class Meta:
 db_table = 'urls' # db_table = '{}_urls'.format(project_name)
 ```
 
+* Database & initialization
+    * Check initialize.sh on Dockerfile
+
 * Environment variables
-```
-# Database
-DB_NAME=${DB_NAME:-matitos}
-DB_USER=${DB_NAME:-supermatitos}
-DB_PASSWORD=${DB_NAME:-supermatitos}
-DB_HOST=${DB_NAME:-localhost}
-DB_PORT=${DB_NAME:-5432}
-REDIS_HOST=${REDIS_HOST:-localhost}
-REDIS_PORT=${REDIS_PORT:-6379}
-
-# Job timeout: 30 min
-JOB_DEFAULT_TIMEOUT=${RQ_DEFAULT_TIMEOUT:-1800}
-
-# Logs path
-PATH_LOGS_PARAMETERIZATION="logs/log_app_fetcher_{}.log"
-
-# Fetcher
-FETCHER_GNEWS_DECODE_SLEEP=2
-FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=4
-FETCHER_BETWEEN_SEARCHES_SLEEP=5
-FETCHER_URL_HOST_SLEEP=5
-FETCHER_LANGUAGE_DETECTION_MIN_CHAR=100
-
-SELENIUM_ENDPOINT="http://selenium_app:80"
-```
+    * In docker-compose.yml
 
 * Deploy
 ```
-# Migrations
-python manage.py makemigrations api; python manage.py migrate --fake-initial
-# Create user
-python manage.py createsuperuser
-
-# 1) Server
-python manage.py runserver
-
-# 2) Workers
-python manage.py rqworker high default low
-
-# Visualize DB
-http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id
+# Check environment variables on docker-compose.yml
+
+# Remove previous instances
+docker compose down -v
+
+# Build & up
+docker compose up -d --build
 ```
-
-* Scheduled tasks
-```
-# Import tasks
-python manage.py import --filename scheduled_tasks.json
-
-# Modify using the admin panel, then save
-# python manage.py export > scheduled_tasks.json
-```
-
-* Utils. TODO: To endpoint...
-```
-python manage.py rqstats
-```
@@ -1,295 +0,0 @@
Deleted Chart.js dashboard page:
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Charts</title>
    <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
    <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
    <style>
        body { background-color: #333; color: #fff; font-family: Arial, sans-serif; }
        h2 { color: #fff; text-align: center; margin-bottom: 40px; }
        .chart-container { width: 45%; display: inline-block; margin: 20px; background-color: #444; border-radius: 10px; padding: 5px; }
        canvas { background-color: #2c2c2c; border-radius: 5px; }
        .container { display: flex; justify-content: center; flex-wrap: wrap; }
        .filter-container { text-align: center; margin-bottom: 20px; }
        select { padding: 8px; background-color: #555; color: white; border: 1px solid #444; border-radius: 5px; }
    </style>
</head>
<body>
    <h2>Data Visualizations</h2>

    <!-- Filter for Number of Days -->
    <div class="filter-container">
        <label for="daysFilter">Select Number of Days:</label>
        <select id="daysFilter">
            <option value="0.25">Last 6 Hours</option>
            <option value="1">Last 24 Hours</option>
            <option value="3">Last 3 Days</option>
            <option value="7" selected>Last 7 Days</option>
            <option value="30">Last 30 Days</option>
            <option value="90">Last 90 Days</option>
            <option value="365">Last 365 Days</option>
        </select>
    </div>

    <div class="container">
        <div class="chart-container"><canvas id="urlFetchDateChart"></canvas></div>
        <div class="chart-container"><canvas id="urlStatusChart"></canvas></div>
        <div class="chart-container"><canvas id="urlsPerSourceChart"></canvas></div>
        <div class="chart-container"><canvas id="urlsPerSearchChart"></canvas></div>
    </div>

    <script>
        $(document).ready(function () {
            // Fetch initial data (default 7 days)
            const defaultDays = 7;
            fetchDataAndRenderCharts(defaultDays);

            // Apply the filter automatically when the user changes the selection
            $('#daysFilter').change(function () {
                fetchDataAndRenderCharts($(this).val());
            });
        });

        function fetchDataAndRenderCharts(days) {
            $.getJSON(`/urls-by-fetch-date/?days=${days}`, renderUrlFetchDateChart);
            $.getJSON(`/urls-per-status/?days=${days}`, renderUrlStatusChart);
            $.getJSON(`/urls-per-source/?days=${days}`, renderUrlsPerSourceChart);
            $.getJSON(`/urls-per-search/?days=${days}`, renderUrlsPerSearchChart);
        }

        // Shared dark-theme options: white legend and tick text, dark grid lines
        const darkOptions = {
            responsive: true,
            plugins: { legend: { labels: { color: '#fff' } } },
            scales: {
                x: { ticks: { color: "#fff" }, grid: { color: "#444" } },
                y: { ticks: { color: "#fff" }, grid: { color: "#444" } }
            }
        };

        function renderBarChart(canvasId, labels, label, counts, color) {
            new Chart(document.getElementById(canvasId), {
                type: 'bar',
                data: { labels: labels, datasets: [{ label: label, data: counts, backgroundColor: color }] },
                options: darkOptions
            });
        }

        function renderUrlFetchDateChart(data) { renderBarChart("urlFetchDateChart", data.dates, 'URLs by Fetch Date', data.counts, 'blue'); }
        function renderUrlStatusChart(data) { renderBarChart("urlStatusChart", data.statuses, 'URLs by Status', data.counts, 'green'); }
        function renderUrlsPerSourceChart(data) { renderBarChart("urlsPerSourceChart", data.sources, 'URLs by Source', data.counts, 'purple'); }
        function renderUrlsPerSearchChart(data) { renderBarChart("urlsPerSearchChart", data.searches, 'URLs by Search', data.counts, 'orange'); }
    </script>
</body>
</html>
@@ -20,12 +20,13 @@ BASE_DIR = Path(__file__).resolve().parent.parent
 # Quick-start development settings - unsuitable for production
 
 # SECURITY WARNING: keep the secret key used in production secret!
-SECRET_KEY = 'django-insecure-54mqLbW5NlO8OlVDsT3fcbg3Vf6C8Fgcoj8H0hXv3Pr8bpgqvOuiaeqvGn34sGwt'
+SECRET_KEY = os.getenv("DJANGO_SECRET_KEY", 'django-insecure-54mqLbW5NlO8OlVDsT3fcbg3Vf6C8Fgcoj8H0hXv3Pr8bpgqvOuiaeqvGn34sGwt')
 
 # SECURITY WARNING: don't run with debug turned on in production!
-DEBUG = True
+DEBUG = (os.environ.get('DJANGO_DEBUG') == "True")
+print("Django debug mode:", DEBUG)
 
-ALLOWED_HOSTS = []
+ALLOWED_HOSTS = os.environ.get('DJANGO_ALLOWED_HOSTS', "*").split(",")
 
 
 # Application definition
@@ -38,11 +39,12 @@ INSTALLED_APPS = [
     'django.contrib.messages',
     'django.contrib.staticfiles',
     'scheduler',
-    'api',
+    'fetcher',
 ]
 
 MIDDLEWARE = [
     'django.middleware.security.SecurityMiddleware',
+    'whitenoise.middleware.WhiteNoiseMiddleware', # Serving static files
     'django.contrib.sessions.middleware.SessionMiddleware',
     'django.middleware.common.CommonMiddleware',
     'django.middleware.csrf.CsrfViewMiddleware',
@@ -51,6 +53,8 @@ MIDDLEWARE = [
     'django.middleware.clickjacking.XFrameOptionsMiddleware',
 ]
 
+STATICFILES_STORAGE = 'whitenoise.storage.CompressedManifestStaticFilesStorage'
+
 ROOT_URLCONF = 'core.urls'
 
 TEMPLATES = [
@@ -121,7 +125,7 @@ SCHEDULER_QUEUES = {
     }
 }
 SCHEDULER_CONFIG = {
-    'DEFAULT_TIMEOUT': os.environ.get("JOB_DEFAULT_TIMEOUT", 60*30), # 15 minutes
+    'DEFAULT_TIMEOUT': os.environ.get("JOB_DEFAULT_TIMEOUT", 60*30), # 30 minutes
     'DEFAULT_RESULT_TTL': 60*60*12, # 12 hours
     'EXECUTIONS_IN_PAGE': 20,
     'SCHEDULER_INTERVAL': 10, # 10 seconds
@@ -158,7 +162,8 @@ USE_TZ = True
 
 # Static files (CSS, JavaScript, Images)
 
-STATIC_URL = 'static/'
+STATIC_URL = '/static/'
+STATIC_ROOT = os.path.join(BASE_DIR, 'static')
 
 # Default primary key field type
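How the two WhiteNoise additions fit together: the middleware serves the files collected into STATIC_ROOT directly from the Django process (hence no separate static file server in the Dockerfile), and the compressed-manifest storage pre-compresses and fingerprints them during collectstatic. On Django 4.2+ the same storage can be declared through the STORAGES setting instead of the legacy STATICFILES_STORAGE; a sketch, not part of this commit:

```
# Equivalent declaration via STORAGES (Django >= 4.2); the "default" entry is
# Django's stock file storage, shown only to keep the dict complete.
STORAGES = {
    "default": {"BACKEND": "django.core.files.storage.FileSystemStorage"},
    "staticfiles": {"BACKEND": "whitenoise.storage.CompressedManifestStaticFilesStorage"},
}
```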
@@ -20,5 +20,5 @@ from django.urls import path, include
 urlpatterns = [
     path('admin/', admin.site.urls),
     path('scheduler/', include('scheduler.urls')),
-    path('', include('api.urls')),
+    path('', include('fetcher.urls')),
 ]
145
app_urls/db.py
Normal file
145
app_urls/db.py
Normal file
@@ -0,0 +1,145 @@
|
|||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import psycopg
|
||||||
|
import re
|
||||||
|
|
||||||
|
connection_info = "host={} port={} dbname={} user={} password={} connect_timeout=60".format(
|
||||||
|
os.environ.get("DB_HOST", "localhost"),
|
||||||
|
os.environ.get("DB_PORT", "5432"),
|
||||||
|
os.environ.get("DB_NAME", "matitos"),
|
||||||
|
os.environ.get("DB_USER", "supermatitos"),
|
||||||
|
os.environ.get("DB_PASSWORD", "supermatitos")
|
||||||
|
)
|
||||||
|
|
||||||
|
def initialize_tables():
|
||||||
|
# Connect to an existing database
|
||||||
|
with psycopg.connect(connection_info) as conn:
|
||||||
|
# Open a cursor to perform database operations
|
||||||
|
with conn.cursor() as cur:
|
||||||
|
# Autocommit at end of transaction (Atomic creation of tables)
|
||||||
|
with conn.transaction() as tx:
|
||||||
|
# Create URLs table
|
||||||
|
c = cur.execute("""
|
||||||
|
CREATE TYPE URL_STATUS AS ENUM ('raw', 'error', 'valid', 'unknown', 'invalid', 'duplicate');
|
||||||
|
|
||||||
|
CREATE TABLE URLS (
|
||||||
|
id SERIAL PRIMARY KEY,
|
||||||
|
url TEXT NOT NULL UNIQUE,
|
||||||
|
ts_fetch TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
|
status URL_STATUS NOT NULL DEFAULT 'raw' -- ,
|
||||||
|
-- status_wendy WENDY_STATUS DEFAULT NULL,
|
||||||
|
-- ts_wendy TIMESTAMPTZ DEFAULT NULL
|
||||||
|
);
|
||||||
|
CREATE INDEX idx_urls_status ON urls(status);
|
||||||
|
CREATE INDEX idx_urls_ts_fetch ON urls(ts_fetch);
|
||||||
|
|
||||||
|
CREATE TABLE URLS_DUPLICATE (
|
||||||
|
id_url_canonical INTEGER REFERENCES URLS(id),
|
||||||
|
id_url_duplicated INTEGER REFERENCES URLS(id),
|
||||||
|
PRIMARY KEY (id_url_canonical, id_url_duplicated)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TYPE SEARCH_TYPE AS ENUM ('rss_feed', 'keyword_search', 'url_host');
|
||||||
|
CREATE TABLE SEARCH (
|
||||||
|
id SMALLSERIAL PRIMARY KEY,
|
||||||
|
search TEXT NOT NULL UNIQUE,
|
||||||
|
type SEARCH_TYPE NOT NULL
|
||||||
|
-- language_country CHAR(5), -- Language: ISO 639-1 Code. Country: ISO 3166 ALPHA-2. e.g.: en-us. Required for search
|
||||||
|
-- UNIQUE(search, language_country)
|
||||||
|
);
|
||||||
|
CREATE INDEX idx_search_type ON SEARCH(type);
|
||||||
|
|
||||||
|
CREATE TABLE SOURCE (
|
||||||
|
id SMALLSERIAL PRIMARY KEY,
|
||||||
|
source TEXT NOT NULL UNIQUE
|
||||||
|
);
|
||||||
|
|
||||||
|
-- CREATE TABLE SEARCH_LANGUAGE (
|
||||||
|
-- language CHAR(2) NOT NULL, -- ISO 639-1 Code, e.g. "en"
|
||||||
|
-- country CHAR(2) NOT NULL, -- ISO 3166 ALPHA-2, e.g. "us"
|
||||||
|
-- PRIMARY KEY (language, country)
|
||||||
|
-- );
|
||||||
|
|
||||||
|
CREATE TABLE URLS_SOURCE_SEARCH (
|
||||||
|
id_url INTEGER REFERENCES URLS(id),
|
||||||
|
id_source SMALLINT REFERENCES SOURCE(id) ON UPDATE CASCADE ON DELETE RESTRICT,
|
||||||
|
id_search SMALLINT REFERENCES SEARCH(id) ON UPDATE CASCADE ON DELETE RESTRICT,
|
||||||
|
PRIMARY KEY(id_url, id_source, id_search)
|
||||||
|
);
|
||||||
|
CREATE INDEX idx_source ON URLS_SOURCE_SEARCH(id_source);
|
||||||
|
CREATE INDEX idx_search ON URLS_SOURCE_SEARCH(id_search);
|
||||||
|
|
||||||
|
CREATE TABLE STATUS_PATTERN_MATCHING (
|
||||||
|
pattern TEXT PRIMARY KEY,
|
||||||
|
priority SMALLINT NOT NULL,
|
||||||
|
status URL_STATUS NOT NULL
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
CREATE TABLE URL_CONTENT (
|
||||||
|
id_url INTEGER PRIMARY KEY REFERENCES URLS(id),
|
||||||
|
date_published TIMESTAMPTZ DEFAULT NOW(),
|
||||||
|
title TEXT,
|
||||||
|
description TEXT,
|
||||||
|
content TEXT,
|
||||||
|
valid_content BOOLEAN,
|
||||||
|
language CHAR(2), -- ISO 639-1 Code
|
||||||
|
keywords TEXT[],
|
||||||
|
tags TEXT[],
|
||||||
|
authors TEXT[],
|
||||||
|
image_main_url TEXT,
|
||||||
|
images_url TEXT[],
|
||||||
|
videos_url TEXT[],
|
||||||
|
url_host TEXT, -- www.breitbart.com
|
||||||
|
site_name TEXT -- Breitbart News
|
||||||
|
);
|
||||||
|
CREATE INDEX idx_tags ON URL_CONTENT USING GIN(tags);
|
||||||
|
CREATE INDEX idx_authors ON URL_CONTENT USING GIN(authors);
|
||||||
|
CREATE INDEX idx_date_published ON URL_CONTENT (date_published);
|
||||||
|
CREATE INDEX idx_valid_content ON URL_CONTENT (valid_content);
|
||||||
|
CREATE INDEX idx_language ON URL_CONTENT (language);
|
||||||
|
CREATE INDEX idx_url_host ON URL_CONTENT (url_host);
|
||||||
|
""")

def initialize_data():
    # Connect to an existing database
    with psycopg.connect(connection_info) as conn:
        # Open a cursor to perform database operations
        with conn.cursor() as cur:
            # Autocommit at end of transaction (Atomic creation of data)
            with conn.transaction() as tx:
                # Feeds
                cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC', 'rss_feed');" )
                # Websites of interest
                cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/poster', 'url_host');" )
                cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/new-poster', 'url_host');" )
                cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('breitbart.com', 'url_host');" )
                # Search keywords
                cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search');" )
                # TODO: Language per search
                # cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search', 'en-us');" )
                # cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search', 'en-gb');" )

                # Status update based on pattern matching (with priority to apply in order). Regex test https://regex101.com/
                cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("youtube.com/"))) )
                cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("tiktok.com/"))) )
                cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("twitter.com/"))) )
                cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("reddit.com/"))) )
                cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("libreddit.de/"))) )
                cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("radio.foxnews.com/"))) )


def main(name):
    print('Hello, %s!' % name)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Database initialization')
    parser.add_argument('--initialize_tables', help='Create DB tables', action='store_true', default=False)
    parser.add_argument('--initialize_data', help='Insert data', action='store_true', default=False)
    args = parser.parse_args()

    if (args.initialize_tables):
        print("Initializing tables")
        initialize_tables()
    if (args.initialize_data):
        print("Initializing data")
        initialize_data()
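
Assuming this script is saved as initialize_db.py (the file name is not visible in this diff), both steps run in a single invocation, tables first:

    python initialize_db.py --initialize_tables --initialize_data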
@@ -1,6 +1,6 @@
 from django.apps import AppConfig


-class ApiConfig(AppConfig):
+class FetcherConfig(AppConfig):
     default_auto_field = 'django.db.models.BigAutoField'
-    name = 'api'
+    name = 'fetcher'
@@ -65,7 +65,7 @@ class Migration(migrations.Migration):
         migrations.CreateModel(
             name='UrlContent',
             fields=[
-                ('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='api.urls')),
+                ('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='fetcher.urls')),
                 ('date_published', models.DateTimeField(blank=True, null=True)),
                 ('title', models.TextField(blank=True, null=True)),
                 ('description', models.TextField(blank=True, null=True)),
@@ -89,7 +89,7 @@ class Migration(migrations.Migration):
         migrations.CreateModel(
             name='UrlsDuplicate',
             fields=[
-                ('id_url_canonical', models.OneToOneField(db_column='id_url_canonical', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='api.urls')),
+                ('id_url_canonical', models.OneToOneField(db_column='id_url_canonical', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='fetcher.urls')),
             ],
             options={
                 'db_table': 'urls_duplicate',
@@ -99,7 +99,7 @@ class Migration(migrations.Migration):
         migrations.CreateModel(
             name='UrlsSourceSearch',
             fields=[
-                ('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='api.urls')),
+                ('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='fetcher.urls')),
             ],
             options={
                 'db_table': 'urls_source_search',
@@ -1,6 +1,8 @@
 import time
 import feedparser
 import os
+from django.utils import timezone
+from datetime import timedelta
 from ..models import Search, Source
 from .fetch_utils import decode_gnews_urls
 from .logger import get_logger
@@ -9,6 +11,7 @@ logger = get_logger()
 from gnews import GNews
 from duckduckgo_search import DDGS
 from GoogleNews import GoogleNews
+from search_engines import Yahoo, Aol


 ###########################################################################
 ###########################################################################
@@ -42,11 +45,19 @@ class FetcherAbstract(ABC):
         return raw_urls


     def fetch_articles(self, db_writer, obj_search):
-        # Search
-        keyword_search = "{}{}".format("site:" if obj_search.type == Search.TYPE_ENUM.URL_HOST else "", obj_search.search)
         # Source name
         source_name = self._get_name()

+        # Search
+        keyword_search = obj_search.search
+        # URL Host search? -> site:${URL_HOST}
+        if (obj_search.type == Search.TYPE_ENUM.URL_HOST):
+            keyword_search = "{}{}".format("site:", keyword_search)
+        # Keyword search & using a General search? -> ${SEARCH} news after:${LAST_WEEK}
+        if ("general" in source_name) and (obj_search.type == Search.TYPE_ENUM.KEYWORD_SEARCH):
+            start_date = timezone.now() - timedelta(days=7)
+            keyword_search = "{} {}".format(keyword_search, "news after:{}-{}-{}".format(start_date.month, start_date.day, start_date.year))

         logger.debug("Starting search: {} - {}".format(keyword_search, source_name))
         # Fetch
         raw_urls = self._fetch_raw_urls(keyword_search)
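
For reference, the refactored fetch_articles() now produces query strings like these (inputs taken from initialize_data() above; the exact date depends on the current week, so the value shown is illustrative):

# url_host search:                  'missingkids.org/poster' -> 'site:missingkids.org/poster'
# keyword search on a 'general'
# source, one-week lookback:        'child abuse' -> 'child abuse news after:3-25-2025'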
@@ -165,11 +176,11 @@ class SearchGoogleGeneral(FetcherAbstract):
         self.language = args.get("language", "en")
         self.country = args.get("country", "US")
         self.period = args.get("period", "7d")
-        self.max_pages = args.get("max_pages", 1)
+        self.pages = args.get("pages", 1)


     def _get_name(self):
         # [source] [period] [language-country] [pages]
-        return "google-general {} {}-{} pages={}".format(self.period, self.language, self.country, self.max_pages).replace("pages=None", "").strip()
+        return "google-general {} {}-{} pages={}".format(self.period, self.language, self.country, self.pages).replace("pages=None", "").strip()


     def _fetch_raw_urls(self, keyword_search):
         try:
@@ -181,7 +192,7 @@ class SearchGoogleGeneral(FetcherAbstract):

             set_links = set()
             # Iterate pages
-            for i in range(self.max_pages):
+            for i in range(self.pages):
                 # Sleep between pages fetch
                 time.sleep(int(os.getenv("FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP", 4)))
                 # Number of URLs fetched so far
@@ -253,7 +264,45 @@ class SearchGoogleNewsRSS(FetcherAbstract):
             urls = []

         return urls


+class SearchYahooGeneral(FetcherAbstract):
+    def __init__(self, args={}):
+        super().__init__()
+        # Parameters
+        self.pages = args.get("pages", 2)
+
+    def _get_name(self):
+        # [source] [pages]
+        return "yahoo-general pages={}".format(self.pages).replace("pages=None", "").strip()
+
+    def _fetch_raw_urls(self, keyword_search):
+        try:
+            results = Yahoo().search(keyword_search, pages=self.pages)
+            urls = results.links()
+        except Exception as e:
+            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
+            urls = []
+        return urls
+
+
+class SearchAOLGeneral(FetcherAbstract):
+    def __init__(self, args={}):
+        super().__init__()
+        # Parameters
+        self.pages = args.get("pages", 2)
+
+    def _get_name(self):
+        # [source] [pages]
+        return "aol-general pages={}".format(self.pages).replace("pages=None", "").strip()
+
+    def _fetch_raw_urls(self, keyword_search):
+        try:
+            results = Aol().search(keyword_search, pages=self.pages)
+            urls = results.links()
+        except Exception as e:
+            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
+            urls = []
+        return urls

 ###########################################################################
 ###########################################################################

 # List of instances
-ListSearchInstances = [SearchGNews, SearchDuckDuckGoNews, SearchGoogleNews, SearchDuckDuckGoGeneral, SearchGoogleGeneral, SearchGoogleNewsRSS]
+ListSearchInstances = [SearchGNews, SearchDuckDuckGoNews, SearchGoogleNews, SearchAOLGeneral, SearchYahooGeneral, SearchDuckDuckGoGeneral, SearchGoogleGeneral, SearchGoogleNewsRSS]
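
A quick smoke test for the two new backends (illustrative only; calling _fetch_raw_urls() directly assumes nothing else in FetcherAbstract needs to run first, which this diff does not show):

yahoo = SearchYahooGeneral({"pages": 1})
print(yahoo._get_name())                      # yahoo-general pages=1
print(yahoo._fetch_raw_urls("child abuse"))   # list of result URLs, or [] on error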
@@ -2,30 +2,29 @@ import logging
 import os

 # Get env var
-path_logs_parameterization = os.getenv("PATH_LOGS_PARAMETERIZATION", "logs/log_app_fetcher_{}.log")
+logs_directory = os.getenv("PATH_LOGS_DIRECTORY", "logs")

 # Directory of logs
-directory = '/'.join(path_logs_parameterization.split("/")[:-1])
-os.makedirs(directory, exist_ok=True)
+os.makedirs(logs_directory, exist_ok=True)

 logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
-logger = logging.getLogger("news_fetcher")
+logger = logging.getLogger("fetcher")
 logger.setLevel(logging.DEBUG)

 # To file log: INFO / WARNING / ERROR / CRITICAL
-fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("debug"), mode="a", maxBytes=10000000, backupCount=4)
+fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "debug.log"), mode="a", maxBytes=10000000, backupCount=1)
 fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
 fh.setLevel(logging.DEBUG)
 logger.addHandler(fh)

 # To file log: INFO / WARNING / ERROR
-fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("info"), mode="a", maxBytes=10000000, backupCount=2)
+fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "info.log"), mode="a", maxBytes=10000000, backupCount=1)
 fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
 fh.setLevel(logging.INFO)
 logger.addHandler(fh)

 # To file log: WARNING / ERROR / CRITICAL
-fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("warning"), mode="a", maxBytes=10000000, backupCount=1)
+fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "warning.log"), mode="a", maxBytes=10000000, backupCount=1)
 fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
 fh.setLevel(logging.WARNING)
 logger.addHandler(fh)
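
Net effect of the logger change: the three rotating handlers now write debug.log, info.log and warning.log under one configurable directory (PATH_LOGS_DIRECTORY, default logs/), the same variable the compose file sets below and the path the logs view reads from after this commit.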
@@ -73,9 +73,6 @@ def process_missing_kids_urls_all(batch_size=None):
     logger.info("Task completed: {}".format(task))
-
-
-


 @job('default')
 def background_task(process_type: str):
     logger.info("Task triggered: {}".format(process_type))
app_urls/fetcher/templates/charts.html (new file, 179 lines)
@@ -0,0 +1,179 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Charts</title>
    <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
    <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
    <style>
        body {
            background-color: #333;
            color: #fff;
            font-family: Arial, sans-serif;
        }

        h2 {
            color: #fff;
            text-align: center;
            margin-bottom: 40px;
        }

        .chart-container {
            width: 45%;
            display: inline-block;
            margin: 20px;
            background-color: #444;
            border-radius: 10px;
            padding: 5px;
        }

        canvas {
            background-color: #2c2c2c;
            border-radius: 5px;
        }

        .container {
            display: flex;
            justify-content: center;
            flex-wrap: wrap;
        }

        .filter-container {
            text-align: center;
            margin-bottom: 20px;
        }

        select {
            padding: 8px;
            background-color: #555;
            color: white;
            border: 1px solid #444;
            border-radius: 5px;
        }
    </style>
</head>
<body>
    <h2>Data Visualizations</h2>

    <!-- Filter for Number of Days -->
    <div class="filter-container">
        <label for="daysFilter">Select Number of Days:</label>
        <select id="daysFilter">
            <option value="0.0625">Last 90 Minutes</option>
            <option value="0.25">Last 6 Hours</option>
            <option value="1">Last 24 Hours</option>
            <option value="7" selected>Last 7 Days</option>
            <option value="30">Last 30 Days</option>
            <option value="90">Last 90 Days</option>
            <option value="365">Last 365 Days</option>
        </select>
    </div>

    <div class="container">
        <div class="chart-container">
            <canvas id="urlFetchDateChart"></canvas>
        </div>

        <div class="chart-container">
            <canvas id="urlStatusChart"></canvas>
        </div>

        <div class="chart-container">
            <canvas id="urlsPerSourceChart"></canvas>
        </div>

        <div class="chart-container">
            <canvas id="urlsPerSearchChart"></canvas>
        </div>
    </div>

    <script>
        $(document).ready(function () {
            let chartInstances = {}; // Store chart instances

            // Fetch initial data (default 7 days)
            const defaultDays = 7;
            fetchDataAndRenderCharts(defaultDays);

            // Apply the filter automatically when the user changes the selection
            $('#daysFilter').on('change', function () {
                const selectedDays = $(this).val();
                fetchDataAndRenderCharts(selectedDays);
            });

            function fetchDataAndRenderCharts(days) {
                fetchAndRenderChart(`/urls-by-fetch-date/?days=${days}`, 'urlFetchDateChart', 'URLs by Fetch Date', 'bar');
                fetchAndRenderChart(`/urls-per-status/?days=${days}`, 'urlStatusChart', 'URLs by Status', 'bar');
                fetchAndRenderChart(`/urls-per-source/?days=${days}`, 'urlsPerSourceChart', 'URLs by Source', 'bar');
                fetchAndRenderChart(`/urls-per-search/?days=${days}`, 'urlsPerSearchChart', 'URLs by Search', 'bar');
            }

            // One fixed color per chart category
            const categoryColors = {
                'URLs by Fetch Date': '#4BC0C0',
                'URLs by Status': '#36A2EB',
                'URLs by Source': '#4BC0C0',
                'URLs by Search': '#36A2EB'
            };
            const maxLabelLength = 35; // Truncate X-axis labels to 35 characters

            function fetchAndRenderChart(url, canvasId, chartTitle, chartType) {
                $.getJSON(url, function (data) {
                    if (chartInstances[canvasId]) {
                        chartInstances[canvasId].destroy(); // Destroy previous chart
                    }

                    const ctx = document.getElementById(canvasId).getContext('2d');
                    chartInstances[canvasId] = new Chart(ctx, {
                        type: chartType,
                        data: {
                            labels: data.labels, // Ensure labels are passed as strings
                            datasets: [{
                                label: chartTitle,
                                data: data.values,
                                backgroundColor: categoryColors[chartTitle] // Assign the color based on category
                            }]
                        },
                        options: {
                            responsive: true,
                            plugins: {
                                legend: {
                                    labels: { color: '#fff' }
                                }
                            },
                            scales: {
                                x: {
                                    ticks: {
                                        color: "#fff", // Set the color of x-axis ticks
                                        callback: function (value) {
                                            let label = data.labels[value];
                                            if (label.length > maxLabelLength) { return label.slice(0, maxLabelLength) + '...'; }
                                            return label;
                                        }
                                    },
                                    grid: {
                                        color: "#444" // Set the grid lines color
                                    }
                                },
                                y: {
                                    ticks: {
                                        color: "#fff" // Set the color of y-axis ticks
                                    },
                                    grid: {
                                        color: "#444" // Set the grid lines color
                                    }
                                }
                            }
                        }
                    });
                });
            }
        });
    </script>
</body>
</html>
@@ -113,11 +113,11 @@ input[type="checkbox"] {
 }

 /* Themed Toggle Button */
-.theme-button, .home-button {
+.theme-button, .home-button, .chart-button {
     background-color: var(--sidebar);
     border: 1px solid var(--sidebar);
     border-radius: 50%;
-    width: 45px;
+    width: 30px;
     height: 45px;
     font-size: 25px;
     display: flex;
@@ -127,10 +127,10 @@ input[type="checkbox"] {
     cursor: pointer;
 }

-.theme-button:hover, .home-button:hover {
+.theme-button:hover, .home-button:hover, .chart-button:hover {
     transform: rotate(20deg);
 }
-.theme-button:active, .home-button:active {
+.theme-button:active, .home-button:active, .chart-button:active {
     transform: scale(0.95);
 }
@@ -235,6 +235,7 @@ input[type="checkbox"] {
 <div class="button-container">
     <button id="homeButton" class="home-button">🏠</button>
     <button id="themeToggle" class="theme-button">🌙</button>
+    <button id="chartButton" class="chart-button">📊</button>
 </div>

 <form method="GET" action="" id="filterForm">
@@ -477,6 +478,10 @@ input[type="checkbox"] {
 document.getElementById("homeButton").addEventListener("click", function () {
     window.location.href = "./"; // Change this to your homepage URL if different
 });
+// Charts
+document.getElementById("chartButton").addEventListener("click", function () {
+    window.location.href = "./charts"; // Change this to your charts page URL if different
+});

 //////////////////////////////////////////////
 // Timestamp to local timezone
@@ -508,26 +513,32 @@ input[type="checkbox"] {
     });
 });

-//////////////////////////////////////////////////////////////////////
-// Function to update the form parameter before submitting
-function updateFormParameter(section) {
-    const checkboxes = document.querySelectorAll(`[name='${section}']`);
-    const allChecked = Array.from(checkboxes).every(checkbox => checkbox.checked);
-
-    // If all are checked, replace them with a hidden input with value "all"
-    if (allChecked) {
-        checkboxes.forEach(checkbox => checkbox.removeAttribute("name"));
-        let hiddenInput = document.createElement("input");
-        hiddenInput.type = "hidden";
-        hiddenInput.name = section;
-        hiddenInput.value = "all";
-        document.getElementById("filterForm").appendChild(hiddenInput);
-    } else {
-        checkboxes.forEach(checkbox => checkbox.setAttribute("name", section));
-        document.querySelectorAll(`input[name="${section}"][type="hidden"]`).forEach(hiddenInput => hiddenInput.remove());
-    }
-
-    // Submit form after changes
+// Function to update the form parameters for all sections before submitting
+function updateFormParameters() {
+    // Get all distinct sections by selecting all checkboxes and extracting their "name" attributes
+    const sections = new Set([...document.querySelectorAll("input[type='checkbox']")].map(cb => cb.name));
+
+    sections.forEach(section => {
+        if (!section) return; // Skip any checkboxes without a name
+
+        const checkboxes = document.querySelectorAll(`[name='${section}']`);
+        const allChecked = Array.from(checkboxes).every(checkbox => checkbox.checked);
+
+        // If all checkboxes in a section are checked, remove them and add a hidden input
+        if (allChecked) {
+            checkboxes.forEach(checkbox => checkbox.removeAttribute("name"));
+            let hiddenInput = document.createElement("input");
+            hiddenInput.type = "hidden";
+            hiddenInput.name = section;
+            hiddenInput.value = "all";
+            document.getElementById("filterForm").appendChild(hiddenInput);
+        } else {
+            checkboxes.forEach(checkbox => checkbox.setAttribute("name", section));
+            document.querySelectorAll(`input[name="${section}"][type="hidden"]`).forEach(hiddenInput => hiddenInput.remove());
+        }
+    });
+
+    // Submit the form after updating all sections
     document.getElementById("filterForm").submit();
 }

@@ -537,7 +548,7 @@ input[type="checkbox"] {
     const checkboxes = document.querySelectorAll(`[name='${section}']`);
     const allChecked = Array.from(checkboxes).every(checkbox => checkbox.checked);
     checkboxes.forEach(cb => cb.checked = !allChecked);
-    updateFormParameter(section);
+    updateFormParameters();
 }

 // Attach event listeners to "Toggle All" buttons
@@ -552,14 +563,14 @@ input[type="checkbox"] {
 // Automatically submit the form when any checkbox changes
 document.querySelectorAll('input[type="checkbox"]').forEach(function(checkbox) {
     checkbox.addEventListener('change', function() {
-        updateFormParameter(this.name);
+        updateFormParameters();
     });
 });
 document.getElementById('perPageSelect').addEventListener('change', function() {
-    document.getElementById('filterForm').submit();
+    updateFormParameters();
 });
 document.getElementById('timeFilterSelect').addEventListener('change', function() {
-    document.getElementById('filterForm').submit();
+    updateFormParameters();
 });
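
Design note: updateFormParameters() recomputes every section on each change, so a fully-checked section always collapses to a single hidden name=all parameter (presumably expanded server-side; that handler is not shown in this diff), and routing the per-page and time-filter selects through the same function means every control submits through one normalization path.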
@@ -167,13 +167,14 @@
 </script>
 <body>

+<!--
 <div class="sidebar">
     <div class="button-container">
         <button id="homeButton" class="home-button">🏠</button>
         <button id="themeToggle" class="theme-button">🌙</button>
     </div>
 </div>
+-->

 <!-- Main Content -->
 <div class="container mt-4">
@@ -8,7 +8,7 @@ urlpatterns = [
     #
     path('task/<str:task>', views.trigger_task, name='trigger_task'),
     #
-    path('charts/', views.charts, name='charts'),
+    path('urls/charts/', views.charts, name='charts'),
     path('urls-by-fetch-date/', views.urls_by_fetch_date, name='urls_by_fetch_date'),
     path('urls-per-status/', views.urls_per_status, name='urls_per_status'),
     path('urls-per-source/', views.urls_per_source, name='urls_per_source'),
@@ -2,6 +2,7 @@ from .tasks import background_task
 from django.core.paginator import Paginator
 from django.shortcuts import render, get_object_or_404
 from django.http import StreamingHttpResponse, JsonResponse, HttpResponse
+from django.contrib.auth.decorators import login_required
 import ollama
 from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch
 import os
@@ -29,17 +30,18 @@ def link_list(request):
         # URLs
         "http://localhost:8000/urls",
         # Charts
-        "http://localhost:8000/charts",
+        "http://localhost:8000/urls/charts",
-        # API tasks
+        # Fetcher tasks
     ] + [os.path.join(prefix, l) for l in links]
     # Json
     return JsonResponse({"links": list_links })


 ####################################################################################################
+# @login_required(login_url='/admin')
 def logs(request, log_type):
     # Capture output: python manage.py rqstats
     try:
-        with open(os.getenv("PATH_LOGS_DEBUG", "logs/log_app_fetcher_{}.log".format(log_type)), "r") as f:
+        with open( os.path.join( os.getenv("PATH_LOGS_DIRECTORY", "logs"), "{}.log".format(log_type) ), "r") as f:
             file_content = f.read()
     except Exception as e:
         file_content = "Error reading logs for log type: {}".format(log_type)
@@ -130,8 +132,9 @@ def charts(request):
     return render(request, 'charts.html')


 def urls_by_fetch_date(request):
-    # Get the date for 30 days ago
-    start_date = timezone.now() - timedelta(days=30)
+    # Get the filtering date parameter
+    days = float(request.GET.get('days', 30)) # Default is 30 days
+    start_date = timezone.now() - timedelta(days=days)

     # Count the number of URLs grouped by fetch date
     urls_data = Urls.objects.filter(ts_fetch__gte=start_date) \
@@ -141,8 +144,8 @@ def urls_by_fetch_date(request):

     # Format data to return as JSON
     data = {
-        'dates': [item['ts_fetch__date'] for item in urls_data],
-        'counts': [item['count'] for item in urls_data],
+        'labels': [item['ts_fetch__date'] for item in urls_data],
+        'values': [item['count'] for item in urls_data],
     }

     return JsonResponse(data)
@@ -160,38 +163,48 @@ def urls_per_status(request):

     # Format data for JSON
     data = {
-        'statuses': [item['status'] for item in urls_data],
-        'counts': [item['count'] for item in urls_data],
+        'labels': [item['status'] for item in urls_data],
+        'values': [item['count'] for item in urls_data],
     }

     return JsonResponse(data)


 def urls_per_source(request):
+    # Get the filtering date parameter
+    days = float(request.GET.get('days', 30)) # Default is 30 days
+    start_date = timezone.now() - timedelta(days=days)
+
     # Count the number of URLs grouped by source
     urls_data = UrlsSourceSearch.objects \
+        .filter(id_url__ts_fetch__gte=start_date) \
         .values('id_source__source') \
         .annotate(count=Count('id_url')) \
         .order_by('id_source__source')

     # Format data for JSON
     data = {
-        'sources': [item['id_source__source'] for item in urls_data],
-        'counts': [item['count'] for item in urls_data],
+        'labels': [item['id_source__source'] for item in urls_data],
+        'values': [item['count'] for item in urls_data],
     }

     return JsonResponse(data)


 def urls_per_search(request):
+    # Get the filtering date parameter
+    days = float(request.GET.get('days', 30)) # Default is 30 days
+    start_date = timezone.now() - timedelta(days=days)
+
     # Count the number of URLs grouped by search
     urls_data = UrlsSourceSearch.objects \
+        .filter(id_url__ts_fetch__gte=start_date) \
         .values('id_search__search') \
         .annotate(count=Count('id_url')) \
         .order_by('id_search__search')

     # Format data for JSON
     data = {
-        'searches': [item['id_search__search'] for item in urls_data],
-        'counts': [item['count'] for item in urls_data],
+        'labels': [item['id_search__search'] for item in urls_data],
+        'values': [item['count'] for item in urls_data],
     }

     return JsonResponse(data)
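
All four chart endpoints now share one response shape, which is exactly what fetchAndRenderChart() in charts.html consumes; for example (status names and counts here are illustrative):

# GET /urls-per-status/?days=7
# {"labels": ["fetched", "invalid", "error"], "values": [120, 34, 5]}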
app_urls/requirements.txt (new file, 17 lines)
@@ -0,0 +1,17 @@
django==5.1
psycopg[binary]
django-redis
django-tasks-scheduler
gunicorn
whitenoise
feedparser
python-dateutil
newspaper4k[all]
lxml[html_clean]
googlenewsdecoder
gnews
GoogleNews
duckduckgo_search
git+https://github.com/tasos-py/Search-Engines-Scraper.git
langdetect
ollama
@@ -2,10 +2,10 @@
 {
     "model": "RepeatableTaskType",
     "name": "Process error URLs",
-    "callable": "api.tasks.process_error_urls",
+    "callable": "fetcher.tasks.process_error_urls",
     "callable_args": [],
     "callable_kwargs": [],
-    "enabled": true,
+    "enabled": false,
     "queue": "low",
     "repeat": null,
     "at_front": false,
@@ -15,18 +15,39 @@
     "scheduled_time": "2025-04-01T12:36:21+00:00",
     "interval": 4,
     "interval_unit": "hours",
-    "successful_runs": 15,
+    "successful_runs": 0,
     "failed_runs": 0,
-    "last_successful_run": "2025-04-01 08:37:06.722770+00:00",
+    "last_successful_run": null,
+    "last_failed_run": null
+},
+{
+    "model": "RepeatableTaskType",
+    "name": "Process raw URLs",
+    "callable": "fetcher.tasks.process_raw_urls",
+    "callable_args": [],
+    "callable_kwargs": [],
+    "enabled": false,
+    "queue": "low",
+    "repeat": null,
+    "at_front": false,
+    "timeout": null,
+    "result_ttl": 86400,
+    "cron_string": null,
+    "scheduled_time": "2025-04-01T10:20:08+00:00",
+    "interval": 10,
+    "interval_unit": "minutes",
+    "successful_runs": 0,
+    "failed_runs": 0,
+    "last_successful_run": null,
     "last_failed_run": null
 },
 {
     "model": "RepeatableTaskType",
     "name": "Process MissingKids URLs",
-    "callable": "api.tasks.process_missing_kids_urls",
+    "callable": "fetcher.tasks.process_missing_kids_urls",
     "callable_args": [],
     "callable_kwargs": [],
-    "enabled": true,
+    "enabled": false,
     "queue": "default",
     "repeat": null,
     "at_front": false,
@@ -34,20 +55,20 @@
     "result_ttl": 86400,
     "cron_string": null,
     "scheduled_time": "2025-04-01T10:37:50+00:00",
-    "interval": 2,
+    "interval": 4,
     "interval_unit": "hours",
-    "successful_runs": 29,
+    "successful_runs": 0,
     "failed_runs": 0,
-    "last_successful_run": "2025-04-01 08:42:05.864064+00:00",
+    "last_successful_run": null,
     "last_failed_run": null
 },
 {
     "model": "RepeatableTaskType",
     "name": "Process MissingKids URLs ALL",
-    "callable": "api.tasks.process_missing_kids_urls_all",
+    "callable": "fetcher.tasks.process_missing_kids_urls_all",
     "callable_args": [],
     "callable_kwargs": [],
-    "enabled": true,
+    "enabled": false,
     "queue": "default",
     "repeat": null,
     "at_front": false,
@@ -65,10 +86,10 @@
 {
     "model": "RepeatableTaskType",
     "name": "Fetch Feeds",
-    "callable": "api.tasks.fetch_feeds",
+    "callable": "fetcher.tasks.fetch_feeds",
     "callable_args": [],
     "callable_kwargs": [],
-    "enabled": true,
+    "enabled": false,
     "queue": "default",
     "repeat": null,
     "at_front": false,
@@ -78,39 +99,18 @@
     "scheduled_time": "2025-04-01T10:18:56+00:00",
     "interval": 15,
     "interval_unit": "minutes",
-    "successful_runs": 288,
+    "successful_runs": 0,
     "failed_runs": 0,
-    "last_successful_run": "2025-04-01 10:03:58.363856+00:00",
+    "last_successful_run": null,
-    "last_failed_run": null
-},
-{
-    "model": "RepeatableTaskType",
-    "name": "Process raw URLs",
-    "callable": "api.tasks.process_raw_urls",
-    "callable_args": [],
-    "callable_kwargs": [],
-    "enabled": true,
-    "queue": "low",
-    "repeat": null,
-    "at_front": false,
-    "timeout": null,
-    "result_ttl": 86400,
-    "cron_string": null,
-    "scheduled_time": "2025-04-01T10:20:08+00:00",
-    "interval": 15,
-    "interval_unit": "minutes",
-    "successful_runs": 78,
-    "failed_runs": 0,
-    "last_successful_run": "2025-04-01 10:05:08.394472+00:00",
     "last_failed_run": null
 },
 {
     "model": "RepeatableTaskType",
     "name": "Fetch Parser",
-    "callable": "api.tasks.fetch_parser",
+    "callable": "fetcher.tasks.fetch_parser",
     "callable_args": [],
     "callable_kwargs": [],
-    "enabled": true,
+    "enabled": false,
     "queue": "default",
     "repeat": null,
     "at_front": false,
@@ -120,18 +120,18 @@
     "scheduled_time": "2025-04-01T10:25:42+00:00",
     "interval": 1,
     "interval_unit": "hours",
-    "successful_runs": 62,
+    "successful_runs": 0,
     "failed_runs": 0,
-    "last_successful_run": "2025-04-01 09:25:57.977051+00:00",
+    "last_successful_run": null,
     "last_failed_run": null
 },
 {
     "model": "RepeatableTaskType",
     "name": "Fetch Search",
-    "callable": "api.tasks.fetch_search",
+    "callable": "fetcher.tasks.fetch_search",
     "callable_args": [],
     "callable_kwargs": [],
-    "enabled": true,
+    "enabled": false,
     "queue": "default",
     "repeat": null,
     "at_front": false,
@@ -141,9 +141,51 @@
     "scheduled_time": "2025-04-01T10:29:33+00:00",
     "interval": 1,
     "interval_unit": "hours",
-    "successful_runs": 63,
+    "successful_runs": 0,
     "failed_runs": 0,
-    "last_successful_run": "2025-04-01 09:37:20.671072+00:00",
+    "last_successful_run": null,
+    "last_failed_run": null
+},
+{
+    "model": "RepeatableTaskType",
+    "name": "Fetch MissingKids",
+    "callable": "fetcher.tasks.fetch_missing_kids",
+    "callable_args": [],
+    "callable_kwargs": [],
+    "enabled": false,
+    "queue": "default",
+    "repeat": null,
+    "at_front": false,
+    "timeout": null,
+    "result_ttl": 86400,
+    "cron_string": null,
+    "scheduled_time": "2025-04-01T10:29:33+00:00",
+    "interval": 4,
+    "interval_unit": "hours",
+    "successful_runs": 0,
+    "failed_runs": 0,
+    "last_successful_run": null,
+    "last_failed_run": null
+},
+{
+    "model": "RepeatableTaskType",
+    "name": "Fetch MissingKids ALL",
+    "callable": "fetcher.tasks.fetch_missing_kids_all",
+    "callable_args": [],
+    "callable_kwargs": [],
+    "enabled": false,
+    "queue": "default",
+    "repeat": null,
+    "at_front": false,
+    "timeout": null,
+    "result_ttl": 86400,
+    "cron_string": null,
+    "scheduled_time": "2025-04-01T10:29:33+00:00",
+    "interval": 1,
+    "interval_unit": "weeks",
+    "successful_runs": 0,
+    "failed_runs": 0,
+    "last_successful_run": null,
     "last_failed_run": null
 }
 ]
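
Assuming this fixture lives at the conventional fixtures/ path inside the fetcher app (the file path is not shown in this diff), it loads with Django's standard command, e.g. python manage.py loaddata <fixture-name>; with every task now disabled ("enabled": false) and its run counters reset, loading it seeds the scheduler without immediately starting any work.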
@@ -2,101 +2,106 @@ version: '3.9'

 services:

-  fetcher_selenium:
+  fetcher_app_selenium:
+    image: fetcher_app_selenium
     build:
       context: ./app_selenium
-    container_name: selenium_app
+    container_name: fetcher_app_selenium
-    restart: unless-stopped
+    # restart: unless-stopped
     shm_size: 512mb
     environment:
-      - SELENIUM_SLEEP_PER_PAGE=4
+      - SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE:-4}
-      - PATH_LOGS_PARAMETERIZATION="logs/log_app_selenium_{}.log"
+      - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY:-logs}
     ports:
       - 80
+    dns:
+      - 1.1.1.1
+      - 1.0.0.1
+    deploy:
+      resources:
+        limits:
+          cpus: '4'
+          memory: 4G

-  fetcher_urls_app:
+  fetcher_app_urls:
+    image: fetcher_app_urls
     build:
       context: ./app_urls
-    container_name: urls_app
+    container_name: fetcher_app_urls
-    restart: unless-stopped
+    # restart: unless-stopped
     environment:
-      #- name=value
+      # Initialization
+      - INITIALIZE_DB=${INITIALIZE_DB:-true}
+      - DJANGO_SUPERUSER_USERNAME=${DJANGO_SUPERUSER_USERNAME:-matitos}
+      - DJANGO_SUPERUSER_PASSWORD=${DJANGO_SUPERUSER_PASSWORD:-matitos}
+      - DJANGO_SUPERUSER_EMAIL=${DJANGO_SUPERUSER_EMAIL:-matitos@matitos.org}
+      # Django
+      - DJANGO_SECRET_KEY=${DJANGO_SECRET_KEY:-abc123456789qwerty}
+      - DJANGO_DEBUG=${DJANGO_DEBUG:-False}
+      - DJANGO_ALLOWED_HOSTS=${DJANGO_ALLOWED_HOSTS:-*} # host1,host2
       # Database
       - DB_NAME=${DB_NAME:-matitos}
-      - DB_USER=${DB_NAME:-supermatitos}
+      - DB_USER=${DB_USER:-supermatitos}
-      - DB_PASSWORD=${DB_NAME:-supermatitos}
+      - DB_PASSWORD=${DB_PASSWORD:-supermatitos}
-      - DB_HOST=${DB_NAME:-localhost} # db_postgres
+      - DB_HOST=${DB_HOST:-fetcher_db}
-      - DB_PORT=${DB_NAME:-5432}
+      - DB_PORT=${DB_PORT:-5432}
-      - REDIS_HOST=${REDIS_HOST:-localhost}
+      - REDIS_HOST=${REDIS_HOST:-fetcher_redis}
       - REDIS_PORT=${REDIS_PORT:-6379}
       # Job timeout: 30 min
      - JOB_DEFAULT_TIMEOUT=${RQ_DEFAULT_TIMEOUT:-1800}
       # Logs path
-      - PATH_LOGS_PARAMETERIZATION="logs/log_app_fetcher_{}.log"
+      - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY:-logs}
       # Fetcher
-      - FETCHER_GNEWS_DECODE_SLEEP=2
+      - FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP:-2}
-      - FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=4
+      - FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP:-5}
-      - FETCHER_BETWEEN_SEARCHES_SLEEP=5
+      - FETCHER_BETWEEN_SEARCHES_SLEEP=${FETCHER_BETWEEN_SEARCHES_SLEEP:-1}
-      - FETCHER_URL_HOST_SLEEP=5
+      - FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP:-2}
       # Selenium
-      - SELENIUM_ENDPOINT="http://selenium_app:80"
+      - SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT:-http://fetcher_app_selenium:80}
+      - ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA:-https://ollamamodel.matitos.org}
     ports:
-      - 80
+      - 8000:8000
+    depends_on:
+      - fetcher_db
+      - fetcher_redis
+    dns:
+      - 1.1.1.1
+      - 1.0.0.1
+    deploy:
+      resources:
+        limits:
+          cpus: '4'
+          memory: 4G

   fetcher_db:
     image: postgres:17
-    container_name: db_postgres
+    container_name: fetcher_db
     restart: unless-stopped
     # Set shared memory limit when using docker-compose
     shm_size: 128mb
     environment:
+      POSTGRES_DB: ${DB_NAME:-matitos}
       POSTGRES_PASSWORD: ${DB_PASSWORD:-supermatitos}
-      POSTGRES_USER: ${DB_USERNAME:-supermatitos}
+      POSTGRES_USER: ${DB_USER:-supermatitos}
-      POSTGRES_DB: ${DB_DATABASE_NAME:-matitos}
       POSTGRES_INITDB_ARGS: '--data-checksums'
-    #volumes:
+    #volumes: # Persistent DB?
-    #  - ${PATH_BASE:-.}/postgres:/var/lib/postgresql/data
+    #  - ${PATH_DB_DATA:-.}/postgres:/var/lib/postgresql/data
     ports:
-      - 5432:5432
+      - 5432 #:5432

   fetcher_redis:
     image: redis:alpine
-    container_name: db_redis
+    container_name: fetcher_redis
     restart: unless-stopped
     ports:
-      - 6379:6379
+      - 6379 #:6379
-    #expose:
-    #  - 6379

-  fetcher_adminer:
-    # http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public
-    image: adminer
-    container_name: adminer
-    restart: unless-stopped
-    environment:
-      - ADMINER_DEFAULT_DB_DRIVER=pgsql
-      #- ADMINER_DEFAULT_DB_HOST
-      #- ADMINER_DEFAULT_DB_NAME
-    depends_on:
-      - matitos_db
-    ports:
-      - 8080:8080

   fetcher_dozzle:
-    container_name: dozzle
+    container_name: fetcher_dozzle
     image: amir20/dozzle:latest
     volumes:
       - /var/run/docker.sock:/var/run/docker.sock:ro
     ports:
       - 8888:8080
     environment:
-      - DOZZLE_FILTER="name=matitos_" # Need container name matitos_ ?
+      - DOZZLE_FILTER="name=fetcher_"

-# django:
-#   Env: DB_HOST=matitos_db
-#   DJANGO_DB_NAME=${DB_DATABASE_NAME:-matitos}
-#   DJANGO_DB_USER=${DB_USERNAME:-supermatitos}
-#   DJANGO_DB_PASSWORD=${DB_PASSWORD:-supermatitos}
-#   DJANGO_DB_HOST=${DB_HOST:-localhost}
-#   DJANGO_DB_PORT=${DB_PORT:-5432}
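
With the services fully parameterized, docker compose up --build starts the stack on defaults; any of the ${VAR:-default} values above can be overridden from the shell or a .env file beside the compose file, for example (illustrative values): DB_PASSWORD=change-me, DJANGO_DEBUG=True, PATH_LOGS_DIRECTORY=/var/log/fetcher.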