Compare commits
106 Commits
Commit SHAs:

cbc422df36 dc784dabec d8ef738d19 2f035a4222 e057568af0 7924857fe5 f44b784715 24510d26e2
ef51a96db6 079b2473f8 1fbc5beb6e 7886d16264 2ed86e31ec 892fb984d1 c17f09a94f e4a325d6b4
2fae0a3a9d 35f9260b94 b40611bd3e 346d7c9187 a21ff9c5d6 7b0d24309c 334062b0ec a9074f45b5
569e7d4676 4883b097db a0ced90d7c 883dfcd3bd 9b0a84c16a b08ea574d9 da078a6f0f 4ccff2bc02
ffb0f85475 c939624687 6fb14d5e72 260a505766 bae5329b1e b3d63f820e 1fbf4cf3d4 856a9e7562
4080154f2b 015f92a06b 3d09c1acff 02f756d3c2 6b5073d1b6 e3d6cf8000 30c586d49a 1502f09e22
54e41139bb b112da8bd0 cb621c9d6b 50e8666162 202e58776d 7a91fc1a87 b2b853b32f d5d80ade55
60f021fc2d 1dcf69ab08 b9ba0d8f3d 06ded0b37d a38e2bc5d1 5a33012a64 9d79a4e5c4 6612a50d13
6c88759e7b 623dfbf95a 0cb68a876b fdc3263785 da5dfe5314 0fa4482711 4985f09e56 0cf61026e8
f729bd1cb2 9083021674 8d72d3af0c 75de046dd9 7fdd93d35d 522c1cb8b3 e81a96f4bd dd8e71aaa3
8cf2b52325 a8b236bac0 15035c108d 4c0dd70bc3 b559f8cd8c 737483db9f d0ae91bf35 80f40e1a74
969e08e84a 68b56eafea e657c3bee1 8b689729bf f659d4adb3 03a2949b2b 490f01d66c a2cce62096
aa7aca3e66 d7df5b4ea4 ccfd0f9188 aa369d0458 f59d16b3fc b3f7cb255c b8fdcae5ec cf55c586f7
e5c574ba33 8ea3ec1bda
@@ -1,3 +1,7 @@
# AutoSSH DB
REMOTE_HOST=''
REMOTE_USERNAME=''

# Initialization
INITIALIZE_DB=true
DJANGO_SUPERUSER_USERNAME=matitos
@@ -18,13 +22,12 @@ PATH_LOGS_DIRECTORY=/opt/logs
DB_NAME=matitos
DB_PASSWORD=supermatitos
DB_USER=supermatitos
PATH_DB_DATA=.

# Database: Django
DB_HOST=fetcher_db
DB_PORT=5432
REDIS_HOST=fetcher_redis
REDIS_PORT=6379
REDIS_CACHE_HOST=fetcher_redis_cache
REDIS_CACHE_PORT=6379
REDIS_CELERY_HOST=fetcher_redis_celery
REDIS_CELERY_PORT=6379

# Job timeout: 30 min
JOB_DEFAULT_TIMEOUT=1800
@@ -40,18 +43,23 @@ FETCHER_ERROR_URL_CACHE_TIME=172800

# Selenium
SELENIUM_ENDPOINT=http://fetcher_app_selenium:80
ENDPOINT_OLLAMA=https://ollamamodel.matitos.org

# APP: Selenium
ARCH=amd64 # arm64, amd64
SELENIUM_SLEEP_PER_PAGE=4
PATH_LOGS_DIRECTORY=/opt/logs

# Deploy resources per App
DEPLOY_CPUS=2
DEPLOY_RAM=4G
DEPLOY_RAM=3G

# Ghost
GHOST_ADMIN_API_URL=https://news.matitos.org/ghost/api/admin/
GHOST_ADMIN_API_KEY=67fffe1b8a57a80001ecec5b:59f580020c196f92e05e208d288702082f8edad6366e2b2c8940b54e41cc355a
GHOST_ADMIN_API_KEY=
PEXELS_API_KEY=Y6clJkY32eihf34ukX4JsINYu9lzxh3xDdNq2HMAmGwXp0a0tt6vr6S9
# Ollama
ENDPOINT_OLLAMA=https://ollamamodelnpu.matitos.org
OLLAMA_MODEL_DEFAULT=qwen2.5-instruct:3b

# Telegram
TELEGRAM_INFO_BOT_TOKEN="..."
TELEGRAM_INFO_CHAT_ID="..."
TELEGRAM_WARNING_BOT_TOKEN="..."
TELEGRAM_WARNING_CHAT_ID="..."
.gitignore (vendored, 7 changed lines)
@@ -1,6 +1,11 @@
.env
__pycache__/
*.pyc
**/credentials.py
logs/
postgres/
docker_data/
docker_data/
**/*.pt
**/*.pth
**/*.tar
**/*.onnx
README.md (10 changed lines)
@@ -15,6 +15,8 @@
- TODO: Proxy / VPN?
- TooManyRequests, ...
- TODO: Search per locale (nl-NL, fr-FR, en-GB)
- Fetch keyword search for selenium sources

- URLs Processing -> Updates raw URLs
- Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date
@@ -52,6 +54,10 @@
* Dev mode
```
docker compose -f docker-compose-dev.yml down -v
docker compose -f docker-compose-dev.yml build --progress=plain
docker compose -f docker-compose-dev.yml up
docker compose -f docker-compose-dev.yml up --no-deps --build
```
* Prod mode
```
docker compose down -v
docker compose up -d --no-deps --build
```
app_cv/Demo.ipynb (new file, 157 lines)
@@ -0,0 +1,157 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import base64\n",
    "import json\n",
    "import requests\n",
    "import io\n",
    "import numpy as np\n",
    "import PIL.Image\n",
    "import cv2\n",
    "from pprint import pprint\n",
    "\n",
    "def process_image(path_img):\n",
    "    with open(path_img, \"rb\") as image_file:\n",
    "        encoded_string = base64.b64encode(image_file.read()).decode('utf-8')\n",
    "    response = requests.post(\n",
    "        'http://localhost:5000/process',\n",
    "        headers={'Content-Type': 'application/json'},\n",
    "        data=json.dumps({'image': encoded_string})\n",
    "    )\n",
    "    response_dict = response.json()\n",
    "    pprint(response_dict)\n",
    "    # Decode\n",
    "    image_bytes = base64.b64decode(response_dict.get(\"image_b64\"))\n",
    "    img_array = np.frombuffer(io.BytesIO(image_bytes).getvalue(), dtype=np.uint8)\n",
    "    img_bgr = cv2.imdecode(img_array, cv2.IMREAD_COLOR)\n",
    "    img_rgb = img_bgr[:, :, ::-1]\n",
    "    return img_rgb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "path_img = \"imgs/img_1p.jpg\"\n",
    "PIL.Image.fromarray( process_image(path_img) )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "path_img = \"imgs/img_nude.jpg\"\n",
    "PIL.Image.fromarray( process_image(path_img) )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "'''\n",
    "# !git clone https://github.com/wildchlamydia/mivolo\n",
    "# !pip install ultralytics yt_dlp pandas scipy timm==0.8.13.dev0\n",
    "# !pip install ./mivolo\n",
    "\n",
    "!python mivolo/demo.py \\\n",
    "  --input \"face_data/sample_image.jpg\" \\\n",
    "  --output \"output\" \\\n",
    "  --detector-weights \"mivolo/pretrained/yolov8x_person_face.pt\" \\\n",
    "  --checkpoint \"mivolo/pretrained/model_imdb_cross_person_4.22_99.46.pth.tar\" \\\n",
    "  --device \"cpu\" \\\n",
    "  --draw\n",
    "'''\n",
    "\n",
    "'''\n",
    "# !git clone https://github.com/Kartik-3004/facexformer.git\n",
    "# !pip install huggingface_hub torch torchvision torchaudio opencv-python facenet_pytorch\n",
    "from huggingface_hub import hf_hub_download\n",
    "hf_hub_download(repo_id=\"kartiknarayan/facexformer\", filename=\"ckpts/model.pt\", local_dir=\"./facexformer\")\n",
    "\n",
    "!python facexformer/inference.py \\\n",
    "  --model_path facexformer/ckpts/model.pt \\\n",
    "  --image_path face_data/sample_image.jpg \\\n",
    "  --results_path face_data \\\n",
    "  --task parsing\n",
    " x\n",
    "!python facexformer/inference.py \\\n",
    "  --model_path facexformer/ckpts/model.pt \\\n",
    "  --image_path face_data/face.png \\\n",
    "  --results_path face_data \\\n",
    "  --task landmarks\n",
    "\n",
    "!python facexformer/inference.py \\\n",
    "  --model_path facexformer/ckpts/model.pt \\\n",
    "  --image_path face_data/face.png \\\n",
    "  --results_path face_data \\\n",
    "  --task headpose\n",
    "\n",
    "!python facexformer/inference.py \\\n",
    "  --model_path facexformer/ckpts/model.pt \\\n",
    "  --image_path face_data/face.png \\\n",
    "  --results_path face_data \\\n",
    "  --task attributes\n",
    "\n",
    "!python facexformer/inference.py \\\n",
    "  --model_path facexformer/ckpts/model.pt \\\n",
    "  --image_path face_data/face.png \\\n",
    "  --results_path face_data \\\n",
    "  --task age_gender_race\n",
    "'''"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "matitos_cv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
app_cv/Dockerfile (new file, 27 lines)
@@ -0,0 +1,27 @@
FROM python:3.12

WORKDIR /app

# LibGL for OpenCV
RUN apt-get update && apt-get install libgl1 -y

# Download models
RUN mkdir models

# https://github.com/wildchlamydia/mivolo
RUN curl "https://drive.usercontent.google.com/download?id=11i8pKctxz3wVkDBlWKvhYIh7kpVFXSZ4&confirm=xxx" -o models/model_imdb_cross_person_4.22_99.46.pth.tar
RUN curl "https://drive.usercontent.google.com/download?id=1CGNCkZQNj5WkP3rLpENWAOgrBQkUWRdw&confirm=xxx" -o models/yolov8x_person_face.pt

# https://github.com/notAI-tech/NudeNet
# Upload to an accessible link: https://github.com/notAI-tech/NudeNet/releases/download/v3.4-weights/640m.onnx
RUN curl "https://drive.usercontent.google.com/download?id=1lHTrW1rmYoYnMSUlhLwqFCW61-w2hvKX&confirm=xxx" -o models/640m.onnx

COPY . .
RUN pip install --no-cache-dir -r requirements.txt
RUN pip freeze

# CMD ["gunicorn", "--bind", "0.0.0.0:5000", "--workers", "2", "app:app"]
CMD ["uvicorn", "--host", "0.0.0.0", "--port", "5000", "--workers", "1", "--log-level", "info", "app:app"]

# docker build -t fetcher_cv .
# docker run --rm -p 5000:5000 fetcher_cv
app_cv/Server.ipynb (new file, 36 lines)
@@ -0,0 +1,36 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!uvicorn app:app --workers 1 --log-level info --port 5001\n",
    "#!uvicorn app:app --reload --log-level debug --port 8000\n",
    "#!python app.py"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "matitos_cv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
app_cv/app.py (new file, 76 lines)
@@ -0,0 +1,76 @@
from fastapi import FastAPI
from nicegui import ui, events, run
import base64
import io
import numpy as np
import cv2
import traceback
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s', handlers=[logging.StreamHandler()])
from cv_processor import process

from pydantic import BaseModel

class Item(BaseModel):
    image: str # Base64

app = FastAPI()

# Define the NiceGUI UI components
@ui.page("/")
def main_page():
    async def handle_upload(e: events.UploadEventArguments) -> None:
        ui.notify('Processing...')
        # Read content -> image
        nparr = np.frombuffer(e.content.read(), np.uint8)
        img_np_bgr = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
        # Async process
        results = await run.io_bound(process, img_np_bgr)

        # Display
        with ui.dialog() as dialog:
            # Encode
            retval, buffer = cv2.imencode('.png', results.get("image"))
            img_buffer_encoded = base64.b64encode(buffer).decode('utf-8')
            img_encoded = "data:image/png;base64,{}".format(img_buffer_encoded)
            content = ui.image(img_encoded).props('fit=scale-down')
        dialog.open()

    ui.upload(on_upload=handle_upload, auto_upload=True, on_rejected=lambda: ui.notify('Rejected!')).props('accept=image').classes('max-w-full')

ui.run_with(app, title="CV")

@app.post('/process')
def process_image(item: Item):
    logging.info("POST /process")
    try:
        image_data = item.image
        if (image_data is None):
            return {"error": "No image data provided"}

        # Decode base64 string
        image_bytes = base64.b64decode(image_data)
        image_stream = io.BytesIO(image_bytes)
        # Convert bytes to NumPy array
        img_array = np.frombuffer(image_stream.getvalue(), dtype=np.uint8)
        # Decode image using OpenCV
        img_bgr = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
        # Valid image
        assert(img_bgr is not None)

        # Process the image
        results = process(img_bgr)

        # Encode processed image to base64
        _, buffer = cv2.imencode('.jpg', results.get("image"), [cv2.IMWRITE_JPEG_QUALITY, 100])
        processed_image_base64 = base64.b64encode(buffer).decode('utf-8')

        # Update image with base64 encoded
        results["image_b64"] = processed_image_base64
        # Pop image (not serializable)
        results.pop("image")
        return results

    except Exception as e:
        logging.warning("Exception: {}".format(traceback.format_exc()))
        return {"error": traceback.format_exc()}
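For reference, a minimal client sketch for the `/process` endpoint above; it mirrors what app_cv/Demo.ipynb does. The base URL and sample image path are assumptions.

```python
# Minimal /process client sketch (localhost:5000 and the image path are assumptions).
import base64
import requests

with open("imgs/img_1p.jpg", "rb") as f:
    payload = {"image": base64.b64encode(f.read()).decode("utf-8")}

resp = requests.post("http://localhost:5000/process", json=payload, timeout=60)
data = resp.json()
# Per the handler above, the response carries: any_minor_present, any_nude_detection,
# nudity_detections, age_predictions, image_b64 (or "error" on failure).
print(data.get("any_nude_detection"), data.get("age_predictions"))
```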
app_cv/cv_processor.py (new file, 125 lines)
@@ -0,0 +1,125 @@
import cv2
import numpy as np
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s', handlers=[logging.StreamHandler()])

# Age
from mivolo.predictor import Predictor
import argparse
# Nudity
from nudenet import NudeDetector


class CV():
    def __init__(self):
        args = argparse.ArgumentParser()
        args.add_argument("--device", type=str, default="cpu")
        args.add_argument("--checkpoint", default="models/model_imdb_cross_person_4.22_99.46.pth.tar")
        args.add_argument("--detector_weights", default="models/yolov8x_person_face.pt")
        args.add_argument("--with-persons", action="store_true", default=False, help="If set model will run with persons, if available")
        args.add_argument("--disable-faces", action="store_true", default=False, help="If set model will use only persons if available")
        args.add_argument("--draw", action="store_true", default=False, help="If set, resulted images will be drawn")
        args = args.parse_args([])
        # Initialize
        self.predictor_age = Predictor(args)

        # Initialize
        self.nude_detector = NudeDetector(model_path="models/640m.onnx", inference_resolution=640)
        # detector = NudeDetector(model_path="downloaded_640m.onnx path", inference_resolution=640)
        # https://github.com/notAI-tech/NudeNet?tab=readme-ov-file#available-models

        # All labels list
        self.nudity_all_labels = [
            "FEMALE_GENITALIA_COVERED",
            "FACE_FEMALE",
            "BUTTOCKS_EXPOSED",
            "FEMALE_BREAST_EXPOSED",
            "FEMALE_GENITALIA_EXPOSED",
            "MALE_BREAST_EXPOSED",
            "ANUS_EXPOSED",
            "FEET_EXPOSED",
            "BELLY_COVERED",
            "FEET_COVERED",
            "ARMPITS_COVERED",
            "ARMPITS_EXPOSED",
            "FACE_MALE",
            "BELLY_EXPOSED",
            "MALE_GENITALIA_EXPOSED",
            "ANUS_COVERED",
            "FEMALE_BREAST_COVERED",
            "BUTTOCKS_COVERED",
        ]
        # Classes of interest
        self.nudity_classes_of_interest = ["BUTTOCKS_EXPOSED", "FEMALE_BREAST_EXPOSED", "FEMALE_GENITALIA_EXPOSED", "ANUS_EXPOSED", "MALE_GENITALIA_EXPOSED"]

    def _censor(self, image_bgr, detections):
        # Copy original image
        image_bgr_censored = image_bgr.copy()

        for detection in detections:
            box = detection["box"]
            x, y, w, h = box[0], box[1], box[2], box[3]
            # Change these pixels to pure black
            image_bgr_censored[y : y + h, x : x + w] = (0, 0, 0)

        return image_bgr_censored

    def process_image(self, image_bgr):
        ###################################################################
        # Predict
        detected_objects, out_img = self.predictor_age.recognize(image_bgr)
        logging.debug("#persons: {}, #faces: {}".format(detected_objects.n_persons, detected_objects.n_faces))

        # Num faces and persons detected
        detected_objects.n_faces, detected_objects.n_persons
        # Association
        detected_objects.associate_faces_with_persons()

        # detected_objects.face_to_person_map
        # {2: 1, 3: 0}
        # detected_objects.ages
        # [None, None, 27.18, 23.77]
        age_predictions = [e for e in detected_objects.ages if e is not None]

        # Crops of faces & persons
        # crops = detected_objects.collect_crops(img)
        any_minor_present = any([ a < 18 for a in detected_objects.ages if a is not None ])
        ###################################################################

        ###################################################################
        # Predict
        nude_detections = self.nude_detector.detect(image_bgr)
        logging.debug("Nude detections: {}".format(nude_detections))
        # Filter by classes of interest
        nude_detections = [ detection for detection in nude_detections if detection["class"] in self.nudity_classes_of_interest ]
        # Nude detections present?
        any_nude_detection = len(nude_detections) > 0
        ###################################################################

        ###################################################################
        # Censor image
        censored_img_bgr = self._censor(image_bgr, nude_detections)
        # Plot age predictions on censored image
        output_image = detected_objects.plot(img=censored_img_bgr)
        ###################################################################

        results = {
            "any_minor_present": any_minor_present,
            "any_nude_detection": any_nude_detection,
            "nudity_detections": nude_detections,
            "age_predictions": age_predictions,
            "image": output_image,
        }
        return results

def process(img_bgr):
    try:
        logging.info("Processing image")
        # Process
        results = CV().process_image(img_bgr)
        logging.info("Returning results")
        return results

    except Exception as e:
        logging.warning("Error processing image: {}".format(str(e)))
        return {}
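As a quick sanity check, `cv_processor.process` can also be called directly, without the FastAPI layer. A minimal usage sketch (the image path is an assumption; note that `process()` builds a fresh `CV()`, and therefore reloads both models, on every call):

```python
# Local usage sketch for cv_processor.process (the image path is an assumption).
import cv2
from cv_processor import process

img_bgr = cv2.imread("imgs/img_1p.jpg")        # BGR image, as process() expects
results = process(img_bgr)                     # {} on failure, result dict otherwise
print(results.get("any_minor_present"), results.get("any_nude_detection"))
cv2.imwrite("censored.jpg", results["image"])  # censored + annotated output image
```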
app_cv/docker-compose.yml (new file, 23 lines)
@@ -0,0 +1,23 @@
services:
  matitos_cv:
    build:
      context: .
    image: fetcher_app_cv
    container_name: fetcher_app_cv
    restart: unless-stopped
    ports:
      - 5000
    environment:
      - DEBUG_MODE=0
    labels: # Reverse proxy sample
      - "traefik.enable=true"
      - "traefik.http.routers.cv.rule=Host(`cv.matitos.org`)"
      - "traefik.http.routers.cv.entrypoints=websecure"
      - "traefik.http.routers.cv.tls.certresolver=myresolvercd"
      - "traefik.http.services.cv.loadbalancer.server.port=5000"
    networks:
      - docker_default # Reverse proxy network

networks:
  docker_default:
    external: true
app_cv/imgs/img_1p.jpg (new binary file, 35 KiB; binary file not shown)
app_cv/imgs/img_nude.jpg (new binary file, 29 KiB; binary file not shown)
app_cv/requirements.txt (new file, 7 lines)
@@ -0,0 +1,7 @@
opencv-python
git+https://github.com/wildchlamydia/mivolo.git
nudenet>=3.4.2
torch==2.5
nicegui
fastapi
gunicorn
app_cv_face/ABC.ipynb (new file, 55 lines)
@@ -0,0 +1,55 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Warning: Binary output can mess up your terminal. Use \"--output -\" to tell \n",
      "Warning: curl to output it to your terminal anyway, or consider \"--output \n",
      "Warning: <FILE>\" to save to a file.\n"
     ]
    }
   ],
   "source": [
    "!curl https://api.missingkids.org/photographs/NCMC2049364c1.jpg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install deepface\n",
    "# !pip install tf-keras\n",
    "from deepface import DeepFace"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "matitos_cv_face",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
@@ -49,7 +49,7 @@ RUN if [ "${ARCH}" = "amd64" ] ; then \
    && apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false $toolDeps \
    && rm -rf /var/lib/apt/lists/* /tmp/*

RUN pip install --no-cache-dir selenium fastapi "uvicorn[standard]"
RUN pip install --no-cache-dir selenium fastapi "uvicorn[standard]" psutil

WORKDIR /opt/app
COPY . /opt/app/
@@ -1,5 +1,7 @@
from fastapi import FastAPI
from pydantic import BaseModel
from missing_kids import MissingKidsFetcher
from search import SearchFetcher
from logger import get_logger
logger = get_logger()

@@ -8,7 +10,44 @@ app = FastAPI()
@app.get("/get_missing_kids/")
def get_missing_kids(pages: int = -1):
    try:
        logger.info("Get missing kids, #pages={}".format(pages))
        res = {"list_urls": MissingKidsFetcher().get_missing_kids_urls(first_n_pages=pages)}
    except Exception as e:
        logger.warning("Exception: {}".format(str(e)), exc_info=True)
        res = {}
    return res

class BodyVerifyMissingKid(BaseModel):
    url: str

@app.post("/verify_missing_kid/")
def get_missing_kids(data: BodyVerifyMissingKid):
    try:
        logger.info("Verify missing kid, URL={}".format(data.url))
        res = MissingKidsFetcher().verify_missing_kid_url(data.url)
    except Exception as e:
        logger.warning("Exception: {}".format(str(e)), exc_info=True)
        res = {}
    return res

class BodyFetchSearch(BaseModel):
    search: str

@app.post("/fetch_search/")
def fetch_search(data: BodyFetchSearch):
    try:
        # Initialize
        search_fetcher, results = SearchFetcher(), {}
        # Iterate
        for source in search_fetcher.get_available_sources():
            logger.info("Fetch based search source={} search={}".format(source, data.search))
            # Fetch
            results[source] = SearchFetcher().search(source, data.search)
            # Empty?
            if (len(results[source]) == 0):
                results.pop(source)

    except Exception as e:
        logger.warning("Exception: {}".format(str(e)), exc_info=True)
        results = {}
    return results
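A small sketch of exercising the new app_selenium endpoints from Python. The base URL and the example poster URL are assumptions; the response shapes follow the handlers above.

```python
# Sketch of calling the app_selenium API (base URL and poster URL are assumptions).
import requests

BASE = "http://localhost:80"

# {"list_urls": [...]} or {} on error
print(requests.get(f"{BASE}/get_missing_kids/", params={"pages": 1}, timeout=300).json())

# {"status": "valid" | "invalid" | "duplicate" | "unknown", ...} or {} on error
print(requests.post(f"{BASE}/verify_missing_kid/",
                    json={"url": "https://www.missingkids.org/poster/NCMC/2049364/1"},
                    timeout=120).json())

# {source: [urls, ...], ...} with empty sources dropped
print(requests.post(f"{BASE}/fetch_search/", json={"search": "child abuse"}, timeout=900).json())
```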
@@ -8,10 +8,10 @@ logs_directory = os.getenv("PATH_LOGS_DIRECTORY", "logs")
os.makedirs(logs_directory, exist_ok=True)

logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("app_selenium")
logger.setLevel(logging.DEBUG)
logger = logging.getLogger("selenium")
logger.setLevel(logging.INFO)

# To file log: INFO / WARNING / ERROR / CRITICAL
# To file log: DEBUG / INFO / WARNING / ERROR / CRITICAL
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "debug.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.DEBUG)
@@ -1,27 +1,85 @@
from selenium import webdriver
from utils import get_webdriver, kill_process_tree
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time
import os

from logger import get_logger
logger = get_logger()

def get_webdriver():
    options = Options()
    options.add_argument('--headless') # Optional
    options.binary_location = '/opt/firefox/firefox'

    service = Service('/usr/local/bin/geckodriver')

    driver = webdriver.Firefox(options=options, service=service)
    return driver

class MissingKidsFetcher():
    def __init__(self) -> None:
        pass

    def verify_missing_kid_url(self, url):
        def load_finished(driver):
            # Find all <img> tags with src attributes. Extract src URLs
            image_urls = [img.get_attribute("src") for img in driver.find_elements(By.XPATH, "//img[@src]")]
            # If base64 image exists, loading finished
            finished = any(["data:image/png;base64" in i for i in image_urls])
            # logger.debug("Finished loading URL")
            return finished

        try:
            # Initialize
            logger.debug("Initializing driver")
            driver, service = get_webdriver()
            # Load URL
            logger.debug("Get URL: {}".format(url))
            driver.get(url)
            # Wait for 404?
            try:
                WebDriverWait(driver, 2).until(EC.title_contains("404"))
                logger.debug("WebDriverWait -> title contains 404")
            except TimeoutException:
                logger.debug("WebDriverWait timeout, no 404 appeared")

            if ("404" in driver.title):
                # Status invalid
                results = {"status": "invalid"}
            else:
                # Check until finished loading
                num_checks = 10
                while (not load_finished(driver)) and (num_checks>=0):
                    time.sleep(1)
                    num_checks -= 1

                # Find all <img> tags with src attributes. Extract src URLs
                image_urls = [img.get_attribute("src") for img in driver.find_elements(By.XPATH, "//img[@src]")]

                # Redirects to 404?
                if ("missingkids.org/404" in driver.current_url) or (any(["thumb-404.png" in i for i in image_urls])):
                    # Status invalid
                    results = {"status": "invalid"}
                # Redirection to valid URL? -> Duplicate
                elif (driver.current_url != url):
                    # Redirection (duplicate)
                    results = {"status": "duplicate", "redirection": driver.current_url}
                # Valid
                elif ("Have you seen this child?" in driver.title):
                    # Status valid
                    results = {"status": "valid"}
                else:
                    results = {"status": "unknown"}
        except Exception as e:
            logger.warning("Exception while verifying MissingKid URL {}\n{}".format(url, str(e)), exc_info=True)
            results = {}

        # Release memory
        try:
            driver.quit() #driver.close()
            time.sleep(1)
            # import atexit
            # atexit.register(driver.quit) # Will always be called on exit
        except Exception as e:
            logger.warning("Exception while closing/quitting driver: {}".format(str(e)), exc_info=True)
        kill_process_tree(service.process.pid)
        logger.info("Results: {} for URL: {}".format(str(results), url))
        return results


    def get_missing_kids_urls(self, first_n_pages=-1):
        logger.info("Get MissingKids, #pages: {}".format(first_n_pages))
        # Poster URL
@@ -30,7 +88,9 @@ class MissingKidsFetcher():
        set_urls = set()

        try:
            driver = get_webdriver()
            logger.debug("Initializing driver")
            driver, service = get_webdriver()
            logger.debug("Get URL: {}".format(url))
            # Go to URL
            driver.get(url)
            # Iterate
@@ -88,8 +148,12 @@ class MissingKidsFetcher():

        # Release memory
        try:
            driver.close()
            driver.quit() #driver.close()
            time.sleep(1)
            # import atexit
            # atexit.register(driver.quit) # Will always be called on exit
        except Exception as e:
            logger.warning("Exception while closing driver: {}".format(str(e)), exc_info=True)
            logger.warning("Exception while closing/quitting driver: {}".format(str(e)), exc_info=True)
        kill_process_tree(service.process.pid)

        return set_urls
app_selenium/search.py (new file, 115 lines)
@@ -0,0 +1,115 @@
from utils import get_webdriver, kill_process_tree
from selenium.webdriver.common.by import By
from urllib.parse import quote
import time
from logger import get_logger
logger = get_logger()

class SearchFetcher():
    def __init__(self):
        pass

    def get_available_sources(self, ):
        return ["foxnews", "breitbart", "zerohedge"]

    def search(self, source, search="child abuse"):
        try:
            if (source == "foxnews"):
                return self._search_foxnews(search)
            elif (source == "breitbart"):
                return self._search_breitbart(search)
            elif (source == "zerohedge"):
                return self._search_zerohedge(search)
            else:
                logger.warning("Search not implemented for source={} search={}".format(source, search))
                return []
        except Exception as e:
            logger.warning("Error searching for source={} search={}".format(source, search))
            return []

    def _search_foxnews(self, search):
        url_host = "foxnews.com"
        # URL search
        url_unquoted = "https://www.foxnews.com/search-results/search#q={}".format(search)
        url = quote(url_unquoted, safe=":/?=&#")

        # Initialize
        driver, service = get_webdriver()
        # Load URL
        driver.get(url)
        time.sleep(2)

        # Find the element with class "page"
        page_element = driver.find_element(By.CLASS_NAME, "page")
        # Find the articles
        articles = page_element.find_elements(By.CLASS_NAME, "article")
        # Extract URLs
        urls = [ art.find_element(By.CLASS_NAME, "m").find_element(By.TAG_NAME, "a").get_attribute("href") for art in articles ]

        # Remove duplicates, remove None
        urls = [u for u in set(urls) if u is not None]
        # Filter by URL host
        urls = [u for u in urls if url_host in u]

        driver.quit()
        kill_process_tree(service.process.pid)

        return urls

    def _search_breitbart(self, search):
        url_host = "breitbart.com"
        # URL search
        url_unquoted = "https://www.breitbart.com/search/?s={}".format(search.replace(" ", "+"))
        url = quote(url_unquoted, safe=":/?=&#")

        # Initialize
        driver, service = get_webdriver()
        # Load URL
        driver.get(url)
        time.sleep(4)

        # Find the element with class "page"
        page_element = driver.find_element(By.CLASS_NAME, "gsc-expansionArea")
        # Find the articles
        articles = page_element.find_elements(By.CLASS_NAME, "gs-title")
        # Extract URLs
        urls = [ art.get_attribute("href") for art in articles ]

        # Remove duplicates, remove None
        urls = [u for u in set(urls) if u is not None]
        # Filter by URL host
        urls = [u for u in urls if url_host in u]

        driver.quit()
        kill_process_tree(service.process.pid)

        return urls

    def _search_zerohedge(self, search):
        url_host = "zerohedge.com"
        # URL search
        url_unquoted = "https://www.zerohedge.com/search-content?qTitleBody={}".format(search.replace(" ", "+"))
        url = quote(url_unquoted, safe=":/?=&#")

        # Initialize
        driver, service = get_webdriver()
        # Load URL
        driver.get(url)
        time.sleep(2)

        # Find the element with class "page"
        page_element = driver.find_element(By.CLASS_NAME, "main-content")
        # Find the articles
        articles = page_element.find_elements(By.TAG_NAME, "a")
        # Extract URLs
        urls = [ art.get_attribute("href") for art in articles]

        # Remove duplicates, remove None
        urls = [u for u in set(urls) if u is not None]
        # Filter by URL host
        urls = [u for u in urls if url_host in u]

        driver.quit()
        kill_process_tree(service.process.pid)

        return urls
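Usage sketch for the new SearchFetcher, run inside the app_selenium container (it relies on the Firefox/geckodriver paths baked into the image; the search term mirrors the default):

```python
# Usage sketch for SearchFetcher (assumes the container's Firefox/geckodriver setup).
from search import SearchFetcher

fetcher = SearchFetcher()
for source in fetcher.get_available_sources():   # ["foxnews", "breitbart", "zerohedge"]
    urls = fetcher.search(source, "child abuse")  # [] on error or if nothing matched
    print(source, len(urls))
```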
app_selenium/utils.py (new file, 23 lines)
@@ -0,0 +1,23 @@
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
import psutil

def get_webdriver():
    options = Options()
    options.add_argument('--headless') # Optional
    options.binary_location = '/opt/firefox/firefox'

    service = Service('/usr/local/bin/geckodriver')

    driver = webdriver.Firefox(options=options, service=service)
    return driver, service

def kill_process_tree(pid):
    try:
        parent = psutil.Process(pid)
        for child in parent.children(recursive=True):
            child.kill()
        parent.kill()
    except psutil.NoSuchProcess:
        pass
@@ -5,6 +5,9 @@ ENV PYTHONDONTWRITEBYTECODE=1
#Prevents Python from buffering stdout and stderr
ENV PYTHONUNBUFFERED=1

# supervisor
RUN apt-get update && apt-get install -y supervisor

# User
RUN useradd -m -r appuser && \
    mkdir /opt/app && \
@@ -14,10 +17,11 @@ WORKDIR /opt/app

# Copy the Django project and install dependencies
COPY requirements.txt /opt/app/
# run this command to install all dependencies
# Install dependencies
RUN pip install --no-cache-dir -r requirements.txt

COPY --chown=appuser:appuser . /opt/app/
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf

RUN chmod -R 755 /opt
RUN chown -R appuser:appuser /opt
@@ -25,4 +29,4 @@ RUN chown -R appuser:appuser /opt
USER appuser

# Run Django’s server & workers
CMD ["sh", "-c", "/opt/app/initialize.sh && /opt/app/run.sh"]
CMD ["sh", "-c", "/opt/app/initialize.sh && /usr/bin/supervisord"]
@@ -73,6 +73,17 @@ class Meta:
* Environment variables
* In docker-compose.yml

* Tasks
```
python manage.py dumpdata \
  django_celery_beat.PeriodicTask \
  django_celery_beat.IntervalSchedule \
  django_celery_beat.CrontabSchedule \
  django_celery_beat.SolarSchedule \
  django_celery_beat.ClockedSchedule \
  --indent 2 > scheduled_tasks.json
```

* Deploy
```
# Check environments variables on .env file

@@ -0,0 +1,3 @@
from .celery import app as celery_app

__all__ = ('celery_app',)
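The dumpdata snippet above exports the beat schedule to a fixture; the matching re-import (a sketch, using Django's standard loaddata via call_command, with the file name as produced above) would be:

```python
# Sketch: re-import the exported schedule fixture inside a Django context.
from django.core.management import call_command

call_command("loaddata", "scheduled_tasks.json")
```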
app_urls/core/celery.py (new file, 14 lines)
@@ -0,0 +1,14 @@
# core/celery.py
import os
from celery import Celery

# Set default Django settings module
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')

app = Celery('core')

# Load config from Django settings, namespace CELERY
app.config_from_object('django.conf:settings', namespace='CELERY')

# Auto-discover tasks from all registered Django app configs
app.autodiscover_tasks()
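With core/celery.py in place and django_celery_beat enabled (see the settings diff below), periodic tasks live in the database. A minimal sketch of registering one programmatically; the task path "fetcher.tasks.some_task" and the schedule values are placeholders:

```python
# Sketch: register a periodic task with django_celery_beat
# ("fetcher.tasks.some_task" is a hypothetical task path).
from django_celery_beat.models import IntervalSchedule, PeriodicTask

schedule, _ = IntervalSchedule.objects.get_or_create(
    every=30, period=IntervalSchedule.MINUTES,
)
PeriodicTask.objects.get_or_create(
    name="fetch-missing-kids",
    defaults={"interval": schedule, "task": "fetcher.tasks.some_task"},
)
```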
@@ -12,15 +12,16 @@ https://docs.djangoproject.com/en/5.1/ref/settings/

from pathlib import Path
import os
# Queues and routing
from kombu import Queue

# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent


# Quick-start development settings - unsuitable for production

# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = os.getenv("DJANGO_SECRET_KEY", 'django-insecure-EtKpy7t84GvU4gBwX9z3xKPBXMS75IAV0dkzN7dXVUsMSqy6a5rjY6WNCw3CcRH5')
SECRET_KEY = os.environ.get("DJANGO_SECRET_KEY", 'django-insecure-EtKpy7t84GvU4gBwX9z3xKPBXMS75IAV0dkzN7dXVUsMSqy6a5rjY6WNCw3CcRH5')

# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = (os.environ.get('DJANGO_DEBUG') == "True")
@@ -37,7 +38,7 @@ INSTALLED_APPS = [
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
    'scheduler',
    'django_celery_beat',
    'fetcher',
]

@@ -96,9 +97,10 @@ DATABASES = {
CACHES = {
    "default": {
        "BACKEND": "django_redis.cache.RedisCache",
        "LOCATION": "redis://{}:{}".format(
            os.environ.get("REDIS_HOST", "localhost"),
            os.environ.get("REDIS_PORT", 6379)
        "LOCATION": "redis://{}:{}/{}".format(
            os.environ.get("REDIS_CACHE_HOST", "localhost"),
            os.environ.get("REDIS_CACHE_PORT", 6379),
            2 # DB for Caching
        ),
        "OPTIONS": {
            "MEMCACHE_MAX_KEY_LENGTH": 2048,
@@ -107,59 +109,23 @@ CACHES = {
    }
}

'''
from scheduler.types import SchedulerConfiguration, QueueConfiguration, Broker
from typing import Dict
# Celery configuration
CELERY_BROKER_URL = 'redis://{}:{}/{}'.format(os.environ.get("REDIS_CELERY_HOST", "localhost"), os.environ.get("REDIS_CELERY_PORT", 6379), 0)
CELERY_RESULT_BACKEND = 'redis://{}:{}/{}'.format(os.environ.get("REDIS_CELERY_HOST", "localhost"), os.environ.get("REDIS_CELERY_PORT", 6379), 1)
CELERY_ACCEPT_CONTENT = ['json']
CELERY_TASK_SERIALIZER = 'json'
CELERY_RESULT_EXPIRES = 3600 # Auto clean results after 1 hour
CELERY_ENABLE_UTC = True
CELERY_TIMEZONE = "UTC"

# https://django-tasks-scheduler.readthedocs.io/en/latest/configuration/
SCHEDULER_CONFIG = SchedulerConfiguration(
    DEFAULT_JOB_TIMEOUT = os.environ.get("JOB_DEFAULT_TIMEOUT", 60*30), # 30 minutes
    BROKER=Broker.REDIS,
# Celery Beat scheduler (required for django-celery-beat to work)
CELERY_BEAT_SCHEDULER = 'django_celery_beat.schedulers.DatabaseScheduler'

CELERY_TASK_QUEUES = (
    Queue('default'),
    Queue('low'),
)

SCHEDULER_QUEUES: Dict[str, QueueConfiguration] = {
    'default': QueueConfiguration(
        HOST = os.environ.get("REDIS_HOST", "localhost"),
        PORT = os.environ.get("REDIS_PORT", 6379),
        DB = os.environ.get("REDIS_DB", 0),
    ),
    'high': QueueConfiguration(
        HOST = os.environ.get("REDIS_HOST", "localhost"),
        PORT = os.environ.get("REDIS_PORT", 6379),
        DB = os.environ.get("REDIS_DB", 0),
    ),
    'low': QueueConfiguration(
        HOST = os.environ.get("REDIS_HOST", "localhost"),
        PORT = os.environ.get("REDIS_PORT", 6379),
        DB = os.environ.get("REDIS_DB", 0),
    ),
}
'''

SCHEDULER_QUEUES = {
    'default': {
        'HOST': os.environ.get("REDIS_HOST", "localhost"),
        'PORT': os.environ.get("REDIS_PORT", 6379),
        'DB': os.environ.get("REDIS_DB", 0),
    },
    'high': {
        'HOST': os.environ.get("REDIS_HOST", "localhost"),
        'PORT': os.environ.get("REDIS_PORT", 6379),
        'DB': os.environ.get("REDIS_DB", 0),
    },
    'low': {
        'HOST': os.environ.get("REDIS_HOST", "localhost"),
        'PORT': os.environ.get("REDIS_PORT", 6379),
        'DB': os.environ.get("REDIS_DB", 0),
    }
}
SCHEDULER_CONFIG = {
    'DEFAULT_TIMEOUT': os.environ.get("JOB_DEFAULT_TIMEOUT", 60*30), # 30 minutes
    'DEFAULT_RESULT_TTL': 60*60*12, # 12 hours
    'EXECUTIONS_IN_PAGE': 20,
    'SCHEDULER_INTERVAL': 10, # 10 seconds
}


# Password validation
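A minimal sketch of how a task would target the two queues declared in CELERY_TASK_QUEUES; the module path, task name, and argument values are placeholders:

```python
# Sketch: defining a Celery task and routing it to the 'low' queue
# (task name and import path are hypothetical).
from celery import shared_task

@shared_task(name="fetcher.tasks.process_missing_kids")
def process_missing_kids(batch_size=50):
    from fetcher.src.db_utils import DB_Handler  # import path is an assumption
    DB_Handler().process_missing_kids_urls(batch_size=batch_size)

# Enqueue on the 'low' queue declared in CELERY_TASK_QUEUES:
process_missing_kids.apply_async(kwargs={"batch_size": 50}, queue="low")
```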
@@ -19,6 +19,5 @@ from django.urls import path, include

urlpatterns = [
    path('admin/', admin.site.urls),
    path('scheduler/', include('scheduler.urls')),
    path('', include('fetcher.urls')),
]
@@ -4,8 +4,10 @@ from django.core.cache import cache
from django.db import IntegrityError
from django.utils import timezone
from datetime import timedelta
from .fetch_utils_url_processor import process_url, get_with_protocol
from .fetch_utils_url_processor import process_url, verify_missing_kid_url
from .utils import get_with_protocol
import re
import requests
import os
import traceback
from .logger import get_logger
@@ -15,7 +17,7 @@ class DB_Handler():
    def __init__(self):
        pass

    def insert_raw_urls(self, urls, obj_source, obj_search):
    def insert_raw_urls(self, urls, obj_source, obj_search):
        try:
            logger.debug("Inserting raw URLs")
            # Empty?
@@ -43,7 +45,6 @@ class DB_Handler():
                    UrlsSourceSearch.objects.get_or_create(id_url=obj_url, id_source=obj_source, id_search=obj_search)
                else:
                    # Add object to insert
                    # url_object_to_insert.append(Urls(url=url))
                    urls_to_insert.append(url)

            ### Insert URLs & (URL_id, source_id)
@@ -81,27 +82,67 @@ class DB_Handler():
        except Exception as e:
            logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))

    def _set_status(self, obj_url, status):
        # Update status if setting a new value
        if (obj_url.status != status):
            obj_url.status = status
            obj_url.save()

    def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error, paywall_bypass=False):

        def set_status(obj_url, status):
            # Update status if setting a new value
            if (obj_url.status != status):
                obj_url.status = status
                obj_url.save()
    def _set_duplicate_and_insert_canonical(self, obj_url, url_canonical):
        # Update status
        self._set_status(obj_url, Urls.STATUS_ENUM.DUPLICATE)
        # Get or create URL with canonical form
        obj_url_canonical, created = Urls.objects.get_or_create(url=url_canonical)
        # Get the source-search IDs associated to obj_url.id
        list_url_source_search = UrlsSourceSearch.objects.filter(id_url=obj_url)
        for obj_url_source_search in list_url_source_search:
            # Associate same sources to url_canonical (it might already exist)
            UrlsSourceSearch.objects.get_or_create(id_url=obj_url_canonical, id_source=obj_url_source_search.id_source, id_search=obj_url_source_search.id_search)
        # URLs duplciate association
        UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url)

    def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error, paywall_bypass=False, request_timeout=15):
        ##########################################################################
        # URL pattern: missingkids.org/poster OR missingkids.org/new-poster
        if ("missingkids.org" in obj_url.url) and ("poster" in obj_url.url):
            try:
                # Verify missing kid URL
                results = verify_missing_kid_url(obj_url.url)
            except Exception as e:
                if (raise_exception_on_error):
                    # Simply raise exception, handled in a different way
                    raise Exception("Error processing URL, raising exception as expected")
                else:
                    logger.debug("Error processing URL: {}\n{}\n{}".format(obj_url.url, str(e), traceback.format_exc()))
                    # Set status to error
                    self._set_status(obj_url, Urls.STATUS_ENUM.ERROR)
                    return

            if (results.get("status") == "valid"):
                self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
            elif (results.get("status") == "invalid"):
                self._set_status(obj_url, Urls.STATUS_ENUM.INVALID)
            elif (results.get("status") == "duplicate"):
                self._set_duplicate_and_insert_canonical(obj_url, results.get("redirection"))
            elif (results.get("status") == "unknown"):
                # Nothing to do, not sure about it...
                logger.info("Missing kid verification returned unknown for URL: {}".format(obj_url.url))
                self._set_status(obj_url, Urls.STATUS_ENUM.UNKNOWN)
            return
        ##########################################################################

        # Found a pattern match -> Override status
        if (status_pattern_match is not None):
            logger.debug("Pattern match, status '{}' for input URL: {}".format(status_pattern_match, obj_url.url))
            # Update status
            set_status(obj_url, status_pattern_match)
            self._set_status(obj_url, status_pattern_match)
            ##### Filter URL? -> Invalid (don't extract content)
            if (status_pattern_match == "invalid"):
                return


        try:
            # Extract URL content
            dict_url_data = process_url(obj_url.url, paywall_bypass)
            dict_url_data = process_url(obj_url.url, paywall_bypass, request_timeout)
        except Exception as e:
            if (raise_exception_on_error):
                # Simply raise exception, handled in a different way
@@ -112,19 +153,9 @@ class DB_Handler():
            dict_url_data = None

        ##### Canonical URL different? -> Duplicate
        if (dict_url_data is not None) and (dict_url_data.get("url_canonical") is not None) and (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
            # Update status
            set_status(obj_url, Urls.STATUS_ENUM.DUPLICATE)
            # Get or create URL with canonical form
            obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
            # Get the source-search IDs associated to obj_url.id
            list_url_source_search = UrlsSourceSearch.objects.filter(id_url=obj_url)
            for obj_url_source_search in list_url_source_search:
                # Associate same sources to url_canonical (it might already exist)
                UrlsSourceSearch.objects.get_or_create(id_url=obj_url_canonical, id_source=obj_url_source_search.id_source, id_search=obj_url_source_search.id_search)
            # URLs duplciate association
            UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url)

        if (dict_url_data is not None) and (dict_url_data.get("url_canonical") is not None) and (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
            # URL as duplicate, insert canonical URL
            self._set_duplicate_and_insert_canonical(obj_url, dict_url_data.get("url_canonical"))
            # Next URL
            return

@@ -133,20 +164,20 @@ class DB_Handler():
        # (dict_url_data is None) or (Exception while processing URL) ? -> Error status
        if (dict_url_data is None):
            # Update status
            set_status(obj_url, Urls.STATUS_ENUM.ERROR)
            self._set_status(obj_url, Urls.STATUS_ENUM.ERROR)
            # Next URL
            return

        # Invalid? e.g. binary data
        if (dict_url_data.get("override_status") == "invalid"):
            # Update status
            set_status(obj_url, Urls.STATUS_ENUM.INVALID)
            self._set_status(obj_url, Urls.STATUS_ENUM.INVALID)
            # Next URL
            return

        ##### Valid URL
        # Update status
        set_status(obj_url, Urls.STATUS_ENUM.VALID)
        self._set_status(obj_url, Urls.STATUS_ENUM.VALID)

        try:
            if (dict_url_data is not None):
@@ -244,14 +275,31 @@ class DB_Handler():
        except Exception as e:
            logger.warning("Exception processing error URLs: {}\n{}".format(e, traceback.format_exc()))

    def process_missing_kids_urls(self, batch_size=None):
    def process_missing_kids_urls(self, batch_size=None, process_status_only=None):
        try:
            logger.debug("Processing MissingKids URLs - batch_size={}".format(batch_size))
            logger.info("Processing MissingKids URLs - batch_size={} process_status_only={}".format(batch_size, process_status_only))

            if (process_status_only is None):
                filter = (Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.UNKNOWN) | Q(status=Urls.STATUS_ENUM.ERROR))
            else:
                if (process_status_only == "valid"):
                    filter = Q(status=Urls.STATUS_ENUM.VALID)
                elif (process_status_only == "invalid"):
                    filter = Q(status=Urls.STATUS_ENUM.INVALID)
                elif (process_status_only == "error"):
                    filter = Q(status=Urls.STATUS_ENUM.ERROR)
                elif (process_status_only == "unknown"):
                    filter = Q(status=Urls.STATUS_ENUM.UNKNOWN)
                elif (process_status_only == "raw"):
                    filter = Q(status=Urls.STATUS_ENUM.RAW)
                elif (process_status_only == "duplicate"):
                    filter = Q(status=Urls.STATUS_ENUM.DUPLICATE)
                else:
                    logger.info("Unknown status to filter: {}".format(process_status_only))

            # Get batch of URLs, %missingkids.org/poster% AND (status='valid' OR status='invalid')
            missingkids_urls = Urls.objects.order_by("-ts_fetch").filter(
                (Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster"))
                &
                (Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.ERROR))
                filter & (Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster"))
            )

            # Get batch size
@@ -261,14 +309,36 @@ class DB_Handler():
            # Per URL
            for obj_url in missingkids_urls:
                try:
                    # Process URL. If no exception -> Valid
                    self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True)
                    SELENIUM_BASED_MISSINGKID_VERIFICATION = False
                    if (SELENIUM_BASED_MISSINGKID_VERIFICATION):
                        # Missing kids fetching endpoint, verify URL
                        missingkids_fetch_endpoint = os.path.join(os.getenv("SELENIUM_ENDPOINT", "http://localhost:80"), "verify_missing_kid/")
                        data = {"url": obj_url.url}
                        # POST
                        r = requests.post(missingkids_fetch_endpoint, json=data, timeout=120)
                        # Jsonify
                        results = r.json()
                        logger.debug("Missingkids Selenium results for URL {}: {}".format(obj_url.url, str(results)))
                    else:
                        # Verify
                        results = verify_missing_kid_url(obj_url.url)
                        logger.debug("Missingkids verify results for URL {}: {}".format(obj_url.url, str(results)))

                    if (results.get("status") == "valid"):
                        self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
                    elif (results.get("status") == "invalid"):
                        self._set_status(obj_url, Urls.STATUS_ENUM.INVALID)
                    elif (results.get("status") == "duplicate"):
                        self._set_duplicate_and_insert_canonical(obj_url, results.get("redirection"))
                    elif (results.get("status") == "unknown"):
                        # Nothing to do, not sure about it...
                        logger.info("Missing kid verification returned unknown for URL: {}".format(obj_url.url))
                        pass

                except Exception as e:
                    # Raised exception -> Invalid (404 error)
                    obj_url.status = Urls.STATUS_ENUM.INVALID
                    obj_url.save()
                    logger.warning("Unknown error processing missing kids poster for URL: {}\n{}".format(obj_url.url, str(e)))

            logger.info("Verified status of #{} missingkids.org/poster URLs".format(len(missingkids_urls)))
            logger.info("Verified status of #{} missingkids.org/poster / missingkids.org/new-poster URLs".format(len(missingkids_urls)))
        except Exception as e:
            logger.warning("Exception processing MissingKids URLs: {}\n{}".format(e, traceback.format_exc()))
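Usage sketch for the extended process_missing_kids_urls signature shown above; the import path and argument values are assumptions:

```python
# Sketch: re-verify a batch of missingkids.org poster URLs, restricted to one status
# (import path is an assumption).
from fetcher.src.db_utils import DB_Handler

DB_Handler().process_missing_kids_urls(batch_size=50, process_status_only="error")
# process_status_only accepts None (valid/invalid/unknown/error) or one of:
# "valid", "invalid", "error", "unknown", "raw", "duplicate".
```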
@@ -1,6 +1,7 @@
from .db_utils import DB_Handler
from ..models import Search, Source
from .fetch_utils_url_processor import get_with_protocol, url_host_slowdown
from .fetch_utils_url_processor import url_host_slowdown
from .utils import get_with_protocol
import newspaper
import traceback
from .logger import get_logger
@@ -17,6 +18,9 @@ class FetchParser():
        url_host_clean = obj_search.search.replace("www.", "").replace("http://", "").replace("https://", "")
        # Ensure URL host in URL
        raw_urls = [u for u in raw_urls if url_host_clean in u]

        # Clean URL part after &quot;
        raw_urls = [u.split("&quot;")[0] for u in raw_urls]

        return raw_urls
@@ -54,6 +54,7 @@ class FetchSearcher():
        for SearchInstance in ListSearchInstances:
            # Sleep between requests, avoid too many requests...
            time.sleep(float(os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", 5)))
            # TODO: Random proxy / VPN
            SearchInstance(args).fetch_articles(db_writer, obj_search)

        # TODO: https://github.com/tasos-py/Search-Engines-Scraper/tree/master
@@ -1,15 +1,15 @@
|
||||
import time
|
||||
import feedparser
|
||||
import os
|
||||
from django.utils import timezone
|
||||
from datetime import timedelta
|
||||
from urllib.parse import unquote
|
||||
from ..models import Search, Source
|
||||
from .fetch_utils_gnews import decode_gnews_urls
|
||||
from .logger import get_logger
|
||||
logger = get_logger()
|
||||
|
||||
from furl import furl
|
||||
from gnews import GNews
|
||||
from duckduckgo_search import DDGS
|
||||
from ddgs import DDGS
|
||||
from GoogleNews import GoogleNews
|
||||
from search_engines import Yahoo, Aol
|
||||
|
||||
@@ -42,6 +42,9 @@ class FetcherAbstract(ABC):
|
||||
# Ensure URL host in URL
|
||||
raw_urls = [u for u in raw_urls if url_host_clean in u]
|
||||
|
||||
# Remove URL parameters, e.g. "?param=1234&h=yes"
|
||||
raw_urls = [ furl(u).remove(furl(u).args).url for u in raw_urls ]
|
||||
|
||||
return raw_urls
|
||||
|
||||
def fetch_articles(self, db_writer, obj_search):
|
||||
@@ -110,7 +113,7 @@ class SearchDuckDuckGoGeneral(FetcherAbstract):
|
||||
return "ddg-general {} results={}".format(self.region, self.max_results).replace("results=None", "").strip()
|
||||
|
||||
def _fetch_raw_urls(self, keyword_search):
|
||||
try:
|
||||
try:
|
||||
news = DDGS().text(keyword_search, region=self.region, timelimit=self.period, max_results=self.max_results)
|
||||
urls = [e.get("href") for e in news]
|
||||
except Exception as e:
|
||||
@@ -206,7 +209,10 @@ class SearchGoogleGeneral(FetcherAbstract):
|
||||
# Links
|
||||
for l in links:
|
||||
# 'link': 'https://uk.news.yahoo.com/leaving-neverland-2-michael-jackson-lawyer-channel-4-102017088.html&ved=2ahUKEwjl38eJm5aMAxVvqJUCHXgnGzwQxfQBegQICRAC&usg=AOvVaw1osa6b3o_xXfcNinMDpLoK'
|
||||
set_links.add( l.get("link").split("&ved=")[0] )
|
||||
url = l.get("link").split("&ved=")[0]
|
||||
# https://www.foxnews.com/politics%3Fparam%3D446dd5e1 -> https://www.foxnews.com/politics?param=446dd5e1
|
||||
url = unquote(url)
|
||||
set_links.add(url)
|
||||
# Finished?
|
||||
if (num_before == len(set_links)):
|
||||
break
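The `unquote()` call added above reverses percent-encoding in the scraped Google result links, as in the example URL from the code comment:

from urllib.parse import unquote

# %3F decodes to '?' and %3D decodes to '='
print(unquote("https://www.foxnews.com/politics%3Fparam%3D446dd5e1"))
# https://www.foxnews.com/politics?param=446dd5e1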
|
||||
|
||||
app_urls/fetcher/src/fetch_selenium.py (new file, +42)
@@ -0,0 +1,42 @@
|
||||
from .db_utils import DB_Handler
|
||||
from ..models import Search, Source
|
||||
import traceback
|
||||
import requests
|
||||
import os
|
||||
from .logger import get_logger
|
||||
logger = get_logger()
|
||||
|
||||
class FetchSeleniumSourceSearch():
|
||||
def __init__(self) -> None:
|
||||
logger.debug("Initializing Selenium Source Search")
|
||||
|
||||
def run(self):
|
||||
try:
|
||||
logger.debug("Starting FetchSeleniumSourceSearch.run()")
|
||||
|
||||
# Get keyword searches
|
||||
list_keyword_search = Search.objects.filter(type=Search.TYPE_ENUM.KEYWORD_SEARCH)
|
||||
logger.debug("Fetching news Selenium based for keyword searches: {}".format([e.search for e in list_keyword_search]))
|
||||
|
||||
# Run selenium search for each keyword search
|
||||
for obj_search in list_keyword_search:
|
||||
try:
|
||||
# Selenium fetching endpoint
|
||||
selenium_fetch_endpoint = os.path.join(os.getenv("SELENIUM_ENDPOINT", "http://localhost:80"), "fetch_search/")
|
||||
data = {"search": obj_search.search}
|
||||
# POST
|
||||
r = requests.post(selenium_fetch_endpoint, json=data, timeout=900)
|
||||
# Jsonify
|
||||
results = r.json()
|
||||
logger.debug("Selenium results for URL {}: {}".format(obj_search.search, str(results)))
|
||||
|
||||
for source, urls_fetched in results.items():
|
||||
# Get source object
|
||||
obj_source, created = Source.objects.get_or_create(source="selenium {}".format(source))
|
||||
|
||||
# Write to DB
|
||||
DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
|
||||
except Exception as e:
|
||||
logger.warning("Exception while fetching selenium search: {}\n{}".format(obj_search.search, str(e)))
|
||||
except Exception as e:
|
||||
logger.warning("Exception in FetchSeleniumSourceSearch.run(): {}\n{}".format(e, traceback.format_exc()))
|
||||
@@ -2,20 +2,13 @@ from django.core.cache import cache
|
||||
from .logger import get_logger
|
||||
logger = get_logger()
|
||||
import newspaper
|
||||
import requests
|
||||
import time
|
||||
import os
|
||||
from urllib.parse import unquote
|
||||
import langdetect
|
||||
langdetect.DetectorFactory.seed = 0
|
||||
|
||||
def get_with_protocol(url):
|
||||
# http:// -> https://
|
||||
url = url.replace("http://", "https://")
|
||||
# "" -> https://
|
||||
if not (url.startswith("https://")):
|
||||
url = "https://" + url
|
||||
return url
|
||||
|
||||
def get_url_host(url):
|
||||
# URL no protocol, first substring before '/'
|
||||
url_host = url.replace("https://", "").replace("http://", "").split("/")[0]
|
||||
@@ -38,8 +31,49 @@ def url_host_slowdown(url, url_host_slowdown_seconds):
|
||||
# About to process URL host, cache time
|
||||
cache.set("process_{}".format(url_host).encode("utf-8"), time.time(), timeout=60*5) # Expire after 5 minutes
|
||||
|
||||
def process_url(url, paywall_bypass=False):
|
||||
|
||||
def verify_missing_kid_url(url):
|
||||
# Sleep required? To avoid too many requests error
|
||||
url_host_slowdown(url, url_host_slowdown_seconds=float(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
|
||||
|
||||
# Request, get redirection
|
||||
r = requests.get(url, allow_redirects=True)
|
||||
# Redirection?
|
||||
if (url != r.url):
|
||||
url_redirection = r.url
|
||||
return {"status": "duplicate", "redirection": url_redirection}
|
||||
|
||||
# Sample URL: "https://www.missingkids.org/poster/NCMC/2058896/1"
|
||||
org_prefix, case_num = url.split("/")[-3], url.split("/")[-2]
|
||||
# Fill details to API endpoint
|
||||
base_url = "https://www.missingkids.org/bin/ncmecEndpoint?action=childDetail&orgPrefix={}&caseNum={}"
|
||||
url_endpoint = base_url.format(org_prefix, case_num)
|
||||
|
||||
# Cache timeout missingkids.org
|
||||
time.sleep(0.25)
|
||||
|
||||
# Request
|
||||
r = requests.get(url_endpoint)
|
||||
# Analyze status code and status result
|
||||
if (r.status_code == 200):
|
||||
r_json = r.json()
|
||||
# Valid poster
|
||||
if (r_json.get("status") == "success"):
|
||||
return {"status": "valid"}
|
||||
# Invalid poster
|
||||
elif (r_json.get("status") == "error"):
|
||||
return {"status": "invalid"}
|
||||
else:
|
||||
# ?
|
||||
logger.info("Unknown json status: {} when verifying missing kid: {}".format(str(r_json), url))
|
||||
return {"status": "unknown"}
|
||||
else:
|
||||
# Error status code
|
||||
logger.info("Unknown request status: {} when verifying missing kid: {}".format(r.status_code, url))
|
||||
return {"status": "unknown"}
|
||||
|
||||
def process_url(url, paywall_bypass=False, request_timeout=15):
|
||||
|
||||
if (paywall_bypass):
|
||||
# TODO: Implement self-hosted instance
|
||||
url_paywall_bypass_base = "https://marreta.pcdomanual.com"
|
||||
@@ -51,33 +85,74 @@ def process_url(url, paywall_bypass=False):
|
||||
try:
|
||||
# Sleep required? To avoid too many requests error (original URL, not paywall bypassing endpoint)
|
||||
url_host_slowdown(url, url_host_slowdown_seconds=float(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
|
||||
# User agent
|
||||
user_agent = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"
|
||||
|
||||
# Process
|
||||
article = newspaper.article(url_of_interest)
|
||||
if ("foxnews.com" in url_of_interest) or ("zerohedge" in url_of_interest):
|
||||
# Request
|
||||
r = requests.get(url, headers={"User-Agent": user_agent}, timeout=request_timeout)
|
||||
# Raise for error code
|
||||
r.raise_for_status()
|
||||
# Parse
|
||||
article = newspaper.Article(url=r.url).download(input_html=r.text).parse()
|
||||
else:
|
||||
# Config: Fake user agent
|
||||
config = newspaper.configuration.Configuration()
|
||||
config.headers = {'User-Agent': user_agent}
|
||||
config.request_timeout = request_timeout
|
||||
# Default mode
|
||||
article = newspaper.article(url_of_interest, config=config)
|
||||
|
||||
except newspaper.ArticleBinaryDataException:
|
||||
logger.warning("ArticleException for input URL {}".format(url))
|
||||
return {"override_status": "invalid"}
|
||||
except newspaper.ArticleException as e:
|
||||
|
||||
# Too many requests or blocked for some reason
|
||||
if ("Status code 403" in str(e.args)):
|
||||
# TODO: cool down and retry once?, proxy/VPN, ...
|
||||
logger.debug("TODO: process_url Implement code 403")
|
||||
|
||||
# Not found, either it doesn't exist or getting blocked...
|
||||
if ("Status code 404" in str(e.args)):
|
||||
# TODO: cool down and retry once?, proxy/VPN, ...
|
||||
logger.debug("TODO: process_url Implement code 404")
|
||||
|
||||
# Too many requests? Cool down...
|
||||
if ("Status code 429" in str(e.args)):
|
||||
# TODO: cool down and retry once?, proxy/VPN, ...
|
||||
logger.debug("TODO: process_url Implement code 429")
|
||||
|
||||
# Unavailable for legal reasons
|
||||
if ("Status code 451" in str(e.args)):
|
||||
# TODO: Bypass with VPN
|
||||
logger.debug("TODO: process_url Implement code 451")
|
||||
|
||||
# CloudFlare protection?
|
||||
if ("Website protected with Cloudflare" in str(e.args)):
|
||||
logger.debug("TODO: process_url Implement bypass CloudFlare")
|
||||
|
||||
# PerimeterX protection?
|
||||
if ("Website protected with PerimeterX" in str(e.args)):
|
||||
logger.debug("TODO: process_url Implement bypass PerimeterX")
|
||||
|
||||
logger.debug("ArticleException for input URL {}\n{}".format(url, str(e.args)))
|
||||
|
||||
# Try simple request, valid response but couldn't parse article? e.g. getting blocked? -> unknown
|
||||
time.sleep(0.25)
|
||||
r = requests.get(url_of_interest, timeout=request_timeout)
|
||||
if (r.status_code == 200):
|
||||
return {"override_status": "unknown"}
|
||||
else:
|
||||
# Another status code still... "error" or "unknown"
|
||||
return {"override_status": "unknown"}
|
||||
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
|
||||
return None
|
||||
|
||||
|
||||
# Not a valid URL?
|
||||
if (not article.is_valid_url()):
|
||||
logger.debug("Invalid URL found: {}".format(url))
|
||||
|
||||
@@ -1,24 +1,76 @@
|
||||
import ollama
|
||||
import os
|
||||
import requests
|
||||
import json
|
||||
from .logger import get_logger
|
||||
logger = get_logger()
|
||||
|
||||
class OllamaClient():
|
||||
def __init__(self):
|
||||
self.client = ollama.Client(host=os.getenv("ENDPOINT_OLLAMA", "https://ollamamodel.matitos.org"))
|
||||
self.host = os.getenv("ENDPOINT_OLLAMA", "https://ollamamodel.matitos.org")
|
||||
self.client = ollama.Client(host=self.host)
|
||||
self.options = {"temperature": 0, "seed": 13579}
|
||||
|
||||
def _get_default_model(self):
|
||||
return "llama3.2:3b"
|
||||
return os.getenv("OLLAMA_MODEL_DEFAULT", "llama3.2:3b")
|
||||
|
||||
def get_models(self):
|
||||
models = sorted([m.model for m in self.client.list().models])
|
||||
if (self._get_default_model() in models):
|
||||
return [self._get_default_model()] + [m for m in models if m != self._get_default_model()]
|
||||
else:
|
||||
return models
|
||||
try:
|
||||
# Get models
|
||||
models = sorted([m.model for m in self.client.list().models])
|
||||
# r = requests.get( os.path.join(endpoint, "models") )
|
||||
# r.json().get("models")
|
||||
|
||||
# Default within it?
|
||||
if (self._get_default_model() in models):
|
||||
return [self._get_default_model()] + [m for m in models if m != self._get_default_model()]
|
||||
else:
|
||||
return models
|
||||
except Exception as e:
|
||||
return []
|
||||
|
||||
def get_prompt(self):
|
||||
return ("Rewrite the text below into a clear and concise summary of one paragraph maximum, presenting the key points as if they are newly written insights. "
|
||||
def get_prompt(self, content):
|
||||
return "Provide, in one sentence each, the what, why, who, when, where, and a detailed summary of the content below:\n\n{}".format(content)
|
||||
return "First, provide a detailed summary of the content below in one paragraph. Second, specify in one sentence each the who, what, when, where and why of the story. Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states':\n\n{}".format(content)
|
||||
return "First, provide a summary of the content below in one paragraph. Second, specify the Who, What, When, Where and Why of the story:\n\n{}".format(content)
|
||||
# First, provide a summary of the content below in one paragraph. Second, specify the who, what, when, where and why of the story in one sentence each. Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states':
|
||||
'''
|
||||
return ("Rewrite the content below into a clear and concise summary of one paragraph maximum, presenting the key points as if they are newly written insights. "
|
||||
"Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states'. "
|
||||
"Write in a natural, standalone format that feels like an original explanation. "
|
||||
"Keep it brief, engaging, informative, in the style of a news article: \n"
|
||||
"Keep it brief, engaging, informative, in the style of a news article:\n\n{}".format(content)
|
||||
)
|
||||
|
||||
'''
|
||||
|
||||
def generate(self, model, prompt, format=None):
|
||||
try:
|
||||
# Generate response
|
||||
response = self.client.generate(model=model, prompt=prompt, format=format, options=self.options)
|
||||
# Extract response
|
||||
response = response.response
|
||||
# Json? -> Dict
|
||||
if (format == "json"):
|
||||
# Dict
|
||||
response = json.loads(response)
|
||||
# Force unload
|
||||
r = requests.post( os.path.join(self.host, "unload_model") )
|
||||
except Exception as e:
|
||||
logger.warning("Exception while generating LLM response: {}".format(str(e)))
|
||||
if (format == "json"):
|
||||
response = {}
|
||||
else:
|
||||
response = None
|
||||
# Text
|
||||
return response
|
||||
|
||||
def generate_stream(self, model, prompt):
|
||||
try:
|
||||
# Generate response
|
||||
response = self.client.generate(model=model, prompt=prompt, format="json", stream=True, options=self.options)
|
||||
# Streamed chunks
|
||||
for chunk in response:
|
||||
yield chunk.response
|
||||
# Force unload
|
||||
r = requests.post( os.path.join(self.host, "unload_model") )
|
||||
except Exception as e:
|
||||
logger.warning("Exception while generating LLM response: {}".format(str(e)))
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
import logging
|
||||
import os
|
||||
|
||||
# Set to warning
|
||||
logging.getLogger("urllib3").setLevel(logging.WARNING)
|
||||
logging.getLogger("newspaper").setLevel(logging.WARNING)
|
||||
|
||||
# Get env var
|
||||
logs_directory = os.getenv("PATH_LOGS_DIRECTORY", "logs")
|
||||
|
||||
@@ -11,7 +15,7 @@ logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(messa
|
||||
logger = logging.getLogger("fetcher")
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
# To file log: INFO / WARNING / ERROR / CRITICAL
|
||||
# To file log: DEBUG / INFO / WARNING / ERROR / CRITICAL
|
||||
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "debug.log"), mode="a", maxBytes=10000000, backupCount=1)
|
||||
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
|
||||
fh.setLevel(logging.DEBUG)
|
||||
|
||||
app_urls/fetcher/src/notifier.py (new file, +153)
@@ -0,0 +1,153 @@
|
||||
from django.utils import timezone
|
||||
from django.utils.timezone import now, timedelta
|
||||
from ..models import Urls, Source, Search, UrlContent, UrlsSourceSearch, UrlsDuplicate
|
||||
from django.db.models import Count
|
||||
import requests
|
||||
import os
|
||||
import traceback
|
||||
from .logger import get_logger
|
||||
logger = get_logger()
|
||||
|
||||
def notify_telegram_info(last_hours, channel="INFO"):
|
||||
try:
|
||||
start_date = timezone.now() - timedelta(hours=last_hours)
|
||||
|
||||
# Count the number of URLs grouped by status within the date range
|
||||
urls_data_status = Urls.objects.filter(ts_fetch__gte=start_date) \
|
||||
.values('status') \
|
||||
.annotate(count=Count('id')) \
|
||||
.order_by('status')
|
||||
|
||||
# Count the number of URLs grouped by source
|
||||
urls_data_source = UrlsSourceSearch.objects \
|
||||
.filter(id_url__ts_fetch__gte=start_date) \
|
||||
.values('id_source__source') \
|
||||
.annotate(count=Count('id_url')) \
|
||||
.order_by('id_source__source')
|
||||
|
||||
# Count the number of URLs grouped by search
|
||||
urls_data_search = UrlsSourceSearch.objects \
|
||||
.filter(id_url__ts_fetch__gte=start_date) \
|
||||
.values('id_search__search') \
|
||||
.annotate(count=Count('id_url')) \
|
||||
.order_by('id_search__search')
|
||||
|
||||
|
||||
bot_token = os.environ.get("TELEGRAM_{}_BOT_TOKEN".format(channel), "")
|
||||
chat_id = os.environ.get("TELEGRAM_{}_CHAT_ID".format(channel), "")
|
||||
|
||||
message = "During the last {} hours:\n".format(last_hours)
|
||||
|
||||
message += "\nURLs per status:\n"
|
||||
for o in urls_data_status:
|
||||
message += " {}: {}\n".format(o.get("status"), o.get("count"))
|
||||
message += "\nURLs per source:\n"
|
||||
for o in urls_data_source:
|
||||
message += " {}: {}\n".format(o.get("id_source__source"), o.get("count"))
|
||||
message += "\nURLs per search:\n"
|
||||
for o in urls_data_search:
|
||||
message += " {}: {}\n".format(o.get("id_search__search"), o.get("count"))
|
||||
|
||||
|
||||
url = f"https://api.telegram.org/bot{bot_token}/sendMessage"
|
||||
params = {
|
||||
"chat_id": chat_id,
|
||||
"text": message
|
||||
}
|
||||
|
||||
# POST
|
||||
response = requests.post(url, params=params)
|
||||
except Exception as e:
|
||||
logger.info("Exception while notifying status: {}\n{}".format(str(e), traceback.format_exc()))
|
||||
|
||||
|
||||
def notify_telegram_warning(last_hours, channel="WARNING"):
|
||||
try:
|
||||
# Message appending logic
|
||||
message = ""
|
||||
|
||||
start_date = timezone.now() - timedelta(hours=last_hours)
|
||||
|
||||
# Count the number of URLs grouped by status within the date range
|
||||
urls_data_status = Urls.objects.filter(ts_fetch__gte=start_date) \
|
||||
.values('status') \
|
||||
.annotate(count=Count('id')) \
|
||||
.order_by('status')
|
||||
|
||||
# Build dictionary
|
||||
urls_data_status_dict = {}
|
||||
for o in urls_data_status:
|
||||
# #STATUS
|
||||
urls_data_status_dict[o.get("status")] = o.get("count")
|
||||
# #TOTAL
|
||||
urls_data_status_dict["total"] = urls_data_status_dict.get("total", 0) + o.get("count")
|
||||
|
||||
MINIMUM_URLS_THRESHOLD = 10
|
||||
MINIMUM_PROCESSED_URLS_RATIO = 0.7
|
||||
|
||||
# Minimum amount of URLs
|
||||
if (urls_data_status_dict.get("total") < MINIMUM_URLS_THRESHOLD):
|
||||
message += "WARNING - Total #URLS during the last {} hours: {}\n".format(last_hours, urls_data_status_dict.get("total"))
|
||||
message += "\nURLs per status:\n"
|
||||
for o in urls_data_status:
|
||||
message += " {}: {}\n".format(o.get("status"), o.get("count"))
|
||||
|
||||
# Minimum ratio of processed raw urls
|
||||
if (urls_data_status_dict.get("total") > 0):
|
||||
if (urls_data_status_dict.get("raw", 0) / urls_data_status_dict.get("total") >= MINIMUM_PROCESSED_URLS_RATIO):
|
||||
message += "WARNING - Small ratio of processed raw URLs during the last {} hours: {}\n".format(last_hours, urls_data_status_dict.get("total"))
|
||||
message += "\nURLs per status:\n"
|
||||
for o in urls_data_status:
|
||||
message += " {}: {}\n".format(o.get("status"), o.get("count"))
|
||||
|
||||
|
||||
# Count the number of URLs grouped by source
|
||||
urls_data_source = UrlsSourceSearch.objects \
|
||||
.filter(id_url__ts_fetch__gte=start_date) \
|
||||
.values('id_source__source') \
|
||||
.annotate(count=Count('id_url')) \
|
||||
.order_by('id_source__source')
|
||||
|
||||
MINIMUM_SOURCES = 3
|
||||
if (len(urls_data_source) < MINIMUM_SOURCES):
|
||||
message += "WARNING - Very few sources found URLs during the last {} hours".format(last_hours)
|
||||
message += "\nURLs per source:\n"
|
||||
for o in urls_data_source:
|
||||
message += " {}: {}\n".format(o.get("id_source__source"), o.get("count"))
|
||||
|
||||
"""
|
||||
# TODO: URLs per search, key should be present for cnbc.com, foxnews.com, zerohedge.com, breitbart.com, child abuse, child neglect
|
||||
# Count the number of URLs grouped by search
|
||||
urls_data_search = UrlsSourceSearch.objects \
|
||||
.filter(id_url__ts_fetch__gte=start_date) \
|
||||
.values('id_search__search') \
|
||||
.annotate(count=Count('id_url')) \
|
||||
.order_by('id_search__search')
|
||||
|
||||
message += "\nURLs per search:\n"
|
||||
for o in urls_data_search:
|
||||
message += " {}: {}\n".format(o.get("id_search__search"), o.get("count"))
|
||||
"""
|
||||
|
||||
# Valid message body?
|
||||
if (message != ""):
|
||||
bot_token = os.environ.get("TELEGRAM_{}_BOT_TOKEN".format(channel), "")
|
||||
chat_id = os.environ.get("TELEGRAM_{}_CHAT_ID".format(channel), "")
|
||||
|
||||
url = f"https://api.telegram.org/bot{bot_token}/sendMessage"
|
||||
params = {
|
||||
"chat_id": chat_id,
|
||||
"text": message
|
||||
}
|
||||
|
||||
# POST
|
||||
response = requests.post(url, params=params)
|
||||
except Exception as e:
|
||||
logger.info("Exception while notifying status: {}\n{}".format(str(e)), traceback.format_exc())
|
||||
|
||||
|
||||
def notify_telegram(last_hours=12):
|
||||
# INFO
|
||||
notify_telegram_info(last_hours, channel="INFO")
|
||||
# WARNING
|
||||
notify_telegram_warning(last_hours, channel="WARNING")
|
||||
@@ -12,7 +12,8 @@ logger = get_logger()
|
||||
|
||||
class Publisher():
|
||||
def __init__(self):
|
||||
pass
|
||||
self.admin_api_url = os.getenv("GHOST_ADMIN_API_URL")
|
||||
self.admin_api_key = os.getenv("GHOST_ADMIN_API_KEY")
|
||||
|
||||
def _create_jwt(self, admin_api_key):
|
||||
id_, secret = admin_api_key.split(':')
|
||||
@@ -29,9 +30,7 @@ class Publisher():
|
||||
|
||||
def _create_ghost_post(self, post_data):
|
||||
# Get token
|
||||
jwt_token = self._create_jwt(os.getenv("GHOST_ADMIN_API_KEY"))
|
||||
# Get Admin API URL
|
||||
admin_api_url = os.getenv("GHOST_ADMIN_API_URL")
|
||||
jwt_token = self._create_jwt(self.admin_api_key)
|
||||
|
||||
headers = {
|
||||
'Authorization': f'Ghost {jwt_token}',
|
||||
@@ -41,7 +40,7 @@ class Publisher():
|
||||
post_data = {"posts": [post_data]}
|
||||
|
||||
response = requests.post(
|
||||
os.path.join(admin_api_url, "posts"),
|
||||
os.path.join(self.admin_api_url, "posts"),
|
||||
json=post_data,
|
||||
headers=headers,
|
||||
params={"source":"html"}
|
||||
@@ -53,6 +52,27 @@ class Publisher():
|
||||
else:
|
||||
logger.warning("Ghost - Failed to publish post: {} {}".format(response.status_code, response.text))
|
||||
return None
|
||||
|
||||
def _published_url_id(self, url_id):
|
||||
# Get token
|
||||
jwt_token = self._create_jwt(self.admin_api_key)
|
||||
|
||||
headers = {
|
||||
'Authorization': f'Ghost {jwt_token}',
|
||||
'Content-Type': 'application/json'
|
||||
}
|
||||
|
||||
# Query param filter by URL ID
|
||||
params = {"filter": "tags:hash-url-id-{}".format(url_id)}
|
||||
# Get posts using filter
|
||||
response = requests.get(os.path.join(self.admin_api_url, "posts"), params=params, headers=headers)
|
||||
# To JSON
|
||||
dict_response = response.json()
|
||||
|
||||
if (len(dict_response.get("posts")) > 0):
|
||||
return True
|
||||
else:
|
||||
return False
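For context, `_create_jwt()` above (only its first line appears in this hunk) follows the standard Ghost Admin API token scheme: the Admin API key splits into `id:secret`, and a short-lived HS256 JWT signed with the hex-decoded secret carries the key id in the `kid` header and an `/admin/` audience. A hedged sketch using PyJWT, which is already in the requirements; the exact audience string depends on the Ghost API version:

import jwt  # PyJWT
from datetime import datetime

def create_ghost_jwt(admin_api_key):
    id_, secret = admin_api_key.split(":")
    iat = int(datetime.now().timestamp())
    header = {"alg": "HS256", "typ": "JWT", "kid": id_}
    payload = {"iat": iat, "exp": iat + 5 * 60, "aud": "/admin/"}
    # The secret half of the Admin API key is hex-encoded
    return jwt.encode(payload, bytes.fromhex(secret), algorithm="HS256", headers=header)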
|
||||
|
||||
def _get_photo_url(self, query):
|
||||
# TODO: Get already used photos to skip. Use DB
|
||||
@@ -100,14 +120,56 @@ class Publisher():
|
||||
if (url_content.valid_content is False):
|
||||
logger.warning("Ghost - URL Content is not valid for URL ID: {} {}".format(url_id, url.url))
|
||||
return
|
||||
|
||||
# URL ID already published?
|
||||
if (self._published_url_id(url_id)):
|
||||
logger.info("Ghost - URL ID {} already published, skipping".format(url_id))
|
||||
return
|
||||
|
||||
model = "llama3.2:3b"
|
||||
prompt = "Rewrite the text below into a clear and concise summary, presenting the key points as if they are newly written insights. Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states'. Instead, write in a natural, standalone format that feels like an original explanation. Keep it brief, engaging, informative, in the style of a news article, and no longer than a paragraph:"
|
||||
###########################################
|
||||
client_llm = OllamaClient()
|
||||
# Model
|
||||
model = client_llm.get_models()[0]
|
||||
# Prompt
|
||||
prompt = client_llm.get_prompt(url_content.content)
|
||||
# Generate content
|
||||
generated_content_dict = client_llm.generate(model, prompt, format="json")
|
||||
logger.debug("Generated content: {}".format(generated_content_dict))
|
||||
|
||||
ollama_msg = {"role": "user", "content": "{}\n{}".format(prompt, url_content.content)}
|
||||
response = OllamaClient().client.chat(model=model, messages=[ollama_msg])
|
||||
###########################################
|
||||
# Get where description
|
||||
generated_content_where = generated_content_dict.get("where")
|
||||
# Prompt to extract address / location
|
||||
prompt = 'Only answer with the location or address which can be extracted from this description: "{}"'.format(generated_content_where)
|
||||
# LLM
|
||||
extracted_location = client_llm.generate(model, prompt, format=None)
|
||||
logger.debug("Estimated location: {}".format(extracted_location))
|
||||
# OSM API
|
||||
params = {
|
||||
'q': extracted_location,
|
||||
'format': 'json',
|
||||
'addressdetails': 1,
|
||||
'limit': 1
|
||||
}
|
||||
|
||||
article_summary = response["message"]["content"]
|
||||
response = requests.get('https://nominatim.openstreetmap.org/search', params=params, headers={'User-Agent': 'App'})
|
||||
list_data = response.json()
|
||||
if (len(list_data) > 0):
|
||||
data = list_data[0]
|
||||
location_url = "https://openstreetmap.org/{}/{}".format(data.get("osm_type"), data.get("osm_id"))
|
||||
else:
|
||||
location_url = None
|
||||
###########################################
|
||||
|
||||
# Parse generated content
|
||||
summary, five_w = "", ""
|
||||
for k, v in generated_content_dict.items():
|
||||
if ("summary" in k.lower()):
|
||||
summary = v if type(v) is str else "\n".join(v)
|
||||
else:
|
||||
five_w += "{}: {}\n".format(k.capitalize(), v if type(v) is str else ". ".join(v) )
|
||||
# Aggregate generated content
|
||||
generated_content = "{}\n\n{}".format(summary, five_w)
|
||||
|
||||
################################################################################################
|
||||
if (url_content.image_main_url is None) or (requests.get(url_content.image_main_url).status_code != 200):
|
||||
@@ -117,15 +179,24 @@ class Publisher():
|
||||
else:
|
||||
photo_url = url_content.image_main_url
|
||||
|
||||
# HTML: Generate content
|
||||
html_data = "".join([ "<p>{}</p>".format(t) for t in generated_content.split("\n") ])
|
||||
# HTML: Add location if available
|
||||
if (location_url is not None):
|
||||
html_data += '<p><a href="{}">Estimated location</a></p>'.format(location_url)
|
||||
# HTML: Add source
|
||||
html_data += '<p><a href="{}">Source: {}</a></p>'.format(url.url, url_content.url_host.replace("https://", ""))
|
||||
|
||||
post_data = {
|
||||
# "slug": "hey-short",
|
||||
"title": url_content.title,
|
||||
"html": "".join([ "<p>{}</p>".format(t) for t in article_summary.split("\n") ]) + '<a href="{}">Source</a>'.format(url.url),
|
||||
"html": html_data,
|
||||
#"meta_title": "",
|
||||
#"meta_description": "",
|
||||
"feature_image": photo_url,
|
||||
#"feature_image_caption": "",
|
||||
"status": "published",
|
||||
"tags": ["#url-id-{}".format(url_id)] # Hidden tag with associated URL ID
|
||||
}
|
||||
|
||||
# Publish post
|
||||
|
||||
app_urls/fetcher/src/utils.py (new file, +8)
@@ -0,0 +1,8 @@
|
||||
|
||||
def get_with_protocol(url):
|
||||
# http:// -> https://
|
||||
url = url.replace("http://", "https://")
|
||||
# "" -> https://
|
||||
if not (url.startswith("https://")):
|
||||
url = "https://" + url
|
||||
return url
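A quick illustration of `get_with_protocol()` above (inputs illustrative):

print(get_with_protocol("http://example.com/a"))   # https://example.com/a
print(get_with_protocol("example.com/a"))          # https://example.com/a
print(get_with_protocol("https://example.com/a"))  # unchanged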
|
||||
@@ -1,142 +1,91 @@
|
||||
from scheduler import job
|
||||
from celery import shared_task
|
||||
|
||||
from .src.fetch_feed import FetchFeeds
|
||||
from .src.fetch_parser import FetchParser
|
||||
from .src.fetch_search import FetchSearcher
|
||||
from .src.fetch_missing_kids import FetchMissingKids
|
||||
from .src.fetch_selenium import FetchSeleniumSourceSearch
|
||||
from .src.db_utils import DB_Handler
|
||||
from .src.publisher import Publisher
|
||||
from .src.notifier import notify_telegram
|
||||
|
||||
from .src.logger import get_logger
|
||||
logger = get_logger()
|
||||
|
||||
@job('default')
|
||||
def fetch_feeds():
|
||||
task = "Fetch Feeds"
|
||||
logger.info("Task triggered: {}".format(task))
|
||||
FetchFeeds().run()
|
||||
logger.info("Task completed: {}".format(task))
|
||||
|
||||
@job('default')
|
||||
def fetch_parser():
|
||||
task = "Fetch Parser"
|
||||
logger.info("Task triggered: {}".format(task))
|
||||
FetchParser().run()
|
||||
logger.info("Task completed: {}".format(task))
|
||||
|
||||
@job('default')
|
||||
def fetch_search():
|
||||
task = "Fetch Search"
|
||||
logger.info("Task triggered: {}".format(task))
|
||||
FetchSearcher().run()
|
||||
logger.info("Task completed: {}".format(task))
|
||||
|
||||
@job('default')
|
||||
def fetch_missing_kids(number_pages=5):
|
||||
task = "Fetch MissingKids"
|
||||
logger.info("Task triggered: {}".format(task))
|
||||
FetchMissingKids().run(number_pages)
|
||||
logger.info("Task completed: {}".format(task))
|
||||
|
||||
@job('default')
|
||||
def fetch_missing_kids_all(number_pages=-1):
|
||||
task = "Fetch MissingKids"
|
||||
logger.info("Task triggered: {}".format(task))
|
||||
FetchMissingKids().run(number_pages)
|
||||
logger.info("Task completed: {}".format(task))
|
||||
|
||||
@job('default')
|
||||
@shared_task(queue='light')
|
||||
def process_raw_urls(batch_size=100):
|
||||
task = "Process raw URLs"
|
||||
logger.info("Task triggered: {}".format(task))
|
||||
DB_Handler().process_raw_urls(batch_size=batch_size)
|
||||
logger.info("Task completed: {}".format(task))
|
||||
|
||||
@job('default')
|
||||
@shared_task(queue='default')
|
||||
def process_error_urls(batch_size=50):
|
||||
task = "Process error URLs"
|
||||
logger.info("Task triggered: {}".format(task))
|
||||
DB_Handler().process_error_urls(batch_size=batch_size)
|
||||
logger.info("Task completed: {}".format(task))
|
||||
|
||||
@job('default')
|
||||
def process_missing_kids_urls(batch_size=50):
|
||||
task = "Process Missing Kids URLs"
|
||||
|
||||
|
||||
@shared_task(queue='light')
|
||||
def fetch_feeds():
|
||||
task = "Fetch Feeds"
|
||||
logger.info("Task triggered: {}".format(task))
|
||||
DB_Handler().process_missing_kids_urls(batch_size=batch_size)
|
||||
FetchFeeds().run()
|
||||
logger.info("Task completed: {}".format(task))
|
||||
|
||||
@job('default')
|
||||
def process_missing_kids_urls_all(batch_size=None):
|
||||
task = "Process Missing Kids URLs ALL"
|
||||
|
||||
@shared_task(queue='default')
|
||||
def fetch_parser():
|
||||
task = "Fetch Parser"
|
||||
logger.info("Task triggered: {}".format(task))
|
||||
DB_Handler().process_missing_kids_urls(batch_size=batch_size)
|
||||
FetchParser().run()
|
||||
logger.info("Task completed: {}".format(task))
|
||||
|
||||
@job('default')
|
||||
def clean_old_url_content(older_than_days=60):
|
||||
@shared_task(queue='default')
|
||||
def fetch_search():
|
||||
task = "Fetch Search"
|
||||
logger.info("Task triggered: {}".format(task))
|
||||
FetchSearcher().run()
|
||||
logger.info("Task completed: {}".format(task))
|
||||
|
||||
|
||||
|
||||
@shared_task(queue='heavy')
|
||||
def fetch_selenium_search():
|
||||
task = "Fetch Selenium search"
|
||||
logger.info("Task triggered: {}".format(task))
|
||||
FetchSeleniumSourceSearch().run()
|
||||
logger.info("Task completed: {}".format(task))
|
||||
|
||||
@shared_task(queue='heavy')
|
||||
def fetch_missing_kids(number_pages=5):
|
||||
task = "Fetch MissingKids"
|
||||
logger.info("Task triggered: {}".format(task))
|
||||
FetchMissingKids().run(number_pages)
|
||||
logger.info("Task completed: {}".format(task))
|
||||
|
||||
@shared_task(queue='heavy')
|
||||
def process_missing_kids_urls(batch_size=None, process_status_only=None):
|
||||
task = "Process Missing Kids URLs - batch_size={} process_status_only={}".format(batch_size, process_status_only)
|
||||
logger.info("Task triggered: {}".format(task))
|
||||
DB_Handler().process_missing_kids_urls(batch_size=batch_size, process_status_only=process_status_only)
|
||||
logger.info("Task completed: {}".format(task))
|
||||
|
||||
|
||||
|
||||
@shared_task(queue='default')
|
||||
def clean_old_url_content(older_than_days=14):
|
||||
task = "Clean old URL content"
|
||||
logger.info("Task triggered: {}".format(task))
|
||||
DB_Handler().clean_old_url_content(older_than_days=older_than_days)
|
||||
logger.info("Task completed: {}".format(task))
|
||||
|
||||
|
||||
@job('default')
|
||||
def background_task(process_type: str):
|
||||
logger.info("Task triggered: {}".format(process_type))
|
||||
|
||||
try:
|
||||
if (process_type == "fetch_feeds"):
|
||||
FetchFeeds().run()
|
||||
elif (process_type == "fetch_parser"):
|
||||
FetchParser().run()
|
||||
elif (process_type == "fetch_search"):
|
||||
FetchSearcher().run()
|
||||
elif (process_type == "fetch_missingkids_all"):
|
||||
FetchMissingKids().run(number_pages=-1)
|
||||
|
||||
elif ("fetch_missingkids" in process_type):
|
||||
# number_pages encoded in URL
|
||||
try:
|
||||
number_pages = int(process_type.split("_")[-1])
|
||||
except Exception as e:
|
||||
number_pages = -1
|
||||
|
||||
FetchMissingKids().run(number_pages=number_pages)
|
||||
|
||||
elif ("process_" in process_type):
|
||||
# Batch size encoded in URL
|
||||
try:
|
||||
batch_size = int(process_type.split("_")[-1])
|
||||
except Exception as e:
|
||||
batch_size = None
|
||||
|
||||
# Task type
|
||||
if ("process_raw_urls" in process_type):
|
||||
DB_Handler().process_raw_urls(batch_size=batch_size)
|
||||
elif ("process_error_urls" in process_type):
|
||||
DB_Handler().process_error_urls(batch_size=batch_size)
|
||||
elif ("process_missing_kids_urls" in process_type):
|
||||
DB_Handler().process_missing_kids_urls(batch_size=batch_size)
|
||||
|
||||
elif ("clean_old_url_content" in process_type ):
|
||||
# Older than X days encoded in URL
|
||||
try:
|
||||
older_than_days = float(process_type.split("_")[-1])
|
||||
except Exception as e:
|
||||
older_than_days = None
|
||||
|
||||
DB_Handler().clean_old_url_content(older_than_days=older_than_days)
|
||||
|
||||
elif ("publish" in process_type):
|
||||
# Extract URL ID
|
||||
url_id = process_type.split("_")[-1]
|
||||
# Publish
|
||||
Publisher().publish(url_id)
|
||||
|
||||
else:
|
||||
logger.info("Task unknown!: {}".format(process_type))
|
||||
|
||||
logger.info("Task completed: {}".format(process_type))
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
@shared_task(queue='light')
|
||||
def notify_status():
|
||||
task = "Notify status"
|
||||
logger.info("Task triggered: {}".format(task))
|
||||
notify_telegram()
|
||||
logger.info("Task completed: {}".format(task))
|
||||
|
||||
@@ -369,7 +369,7 @@ input[type="checkbox"] {
|
||||
<tbody>
|
||||
{% for url in urls %}
|
||||
<tr>
|
||||
<td><a href="./{{ url.id }}" class="btn btn-primary btn-sm" target="_blank">{{ url.id }}</a></td>
|
||||
<td><a href="./{{ url.id }}" class="btn btn-primary btn-sm" target="_blank">{{ url.id }}</a> <a href="/task/publish_{{ url.id }}" target="_blank">[✍️]</a> </td>
|
||||
<td><a href="{{ url.url }}" target="_blank">{{ url.url }}</a></td>
|
||||
<td>
|
||||
{% if url.status == 'raw' %}
|
||||
|
||||
@@ -278,8 +278,7 @@
|
||||
|
||||
<!-- Input field with a default value -->
|
||||
<label for="custom-input-{{ url_item.id }}">Prompt:</label>
|
||||
<textarea id="custom-input-{{ url_item.id }}" class="form-control mb-2" rows="5">{{ prompt }}
|
||||
{{ url_item.url }}</textarea>
|
||||
<textarea id="custom-input-{{ url_item.id }}" class="form-control mb-2" rows="5">{{ prompt }}</textarea>
|
||||
|
||||
<div class="d-flex align-items-center">
|
||||
<!-- Fetch details button -->
|
||||
|
||||
@@ -7,8 +7,6 @@ urlpatterns = [
|
||||
path('logs/database', views.log_db, name='log_db'),
|
||||
path('logs/<str:log_type>', views.logs, name='logs'),
|
||||
#
|
||||
path('task/<str:task>', views.trigger_task, name='trigger_task'),
|
||||
#
|
||||
path('urls/charts/', views.charts, name='charts'),
|
||||
path('urls-by-fetch-date/', views.urls_by_fetch_date, name='urls_by_fetch_date'),
|
||||
path('urls-per-status/', views.urls_per_status, name='urls_per_status'),
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
from .views_base import link_list, logs, log_db, trigger_task
|
||||
from .views_base import link_list, logs, log_db #, trigger_task,
|
||||
|
||||
from django.core.paginator import Paginator
|
||||
from django.shortcuts import render, get_object_or_404
|
||||
from django.http import StreamingHttpResponse, JsonResponse, HttpResponse
|
||||
from django.http import StreamingHttpResponse, JsonResponse
|
||||
from django.db.models import Q, Count
|
||||
from django.utils import timezone
|
||||
from django.utils.timezone import now, timedelta
|
||||
@@ -14,16 +14,6 @@ import json
|
||||
####################################################################################################
|
||||
|
||||
def llm(request):
|
||||
|
||||
def stream_response(model, text):
|
||||
msg_content = {
|
||||
"role": "user",
|
||||
"content": text,
|
||||
}
|
||||
response = OllamaClient().client.chat(model=model, messages=[msg_content], stream=True)
|
||||
for chunk in response:
|
||||
yield chunk["message"]["content"] # Stream each chunk of text
|
||||
|
||||
if request.method == 'POST':
|
||||
try:
|
||||
body_data = json.loads(request.body)
|
||||
@@ -33,7 +23,7 @@ def llm(request):
|
||||
if message is None:
|
||||
return JsonResponse({'error': 'No message found in request'}, status=400)
|
||||
|
||||
return StreamingHttpResponse(stream_response(model, message), content_type="text/plain")
|
||||
return StreamingHttpResponse(OllamaClient().generate_stream(model, message), content_type="text/plain")
|
||||
except json.JSONDecodeError:
|
||||
return JsonResponse({'error': 'Invalid JSON'}, status=400)
|
||||
|
||||
@@ -55,13 +45,18 @@ def url_detail_view(request, id):
|
||||
url_content = {}
|
||||
|
||||
ollama = OllamaClient()
|
||||
try:
|
||||
# prompt_content = "{}\n{}\n{}".format(url_content.title, url_content.description, url_content.content)
|
||||
prompt_content = "{}".format(url_content.content)
|
||||
except Exception as e:
|
||||
prompt_content = ""
|
||||
|
||||
context = {
|
||||
'url_item': url_item,
|
||||
'sources': url_sources,
|
||||
'searches': url_searches,
|
||||
'models': ollama.get_models(),
|
||||
'prompt': ollama.get_prompt(),
|
||||
'prompt': ollama.get_prompt(prompt_content),
|
||||
'url_content': url_content,
|
||||
'url_canonical': url_canonical,
|
||||
}
|
||||
|
||||
@@ -1,26 +1,25 @@
|
||||
import os
|
||||
from .tasks import background_task
|
||||
from django.http import JsonResponse, HttpResponse
|
||||
from django.db import connection
|
||||
import os
|
||||
|
||||
####################################################################################################
|
||||
"""
|
||||
### from .tasks import background_task
|
||||
|
||||
def trigger_task(request, task):
|
||||
# Enqueue function in "default" queue
|
||||
background_task.delay(task)
|
||||
return JsonResponse({"message": "Task has been enqueued!", "task": task})
|
||||
"""
|
||||
|
||||
####################################################################################################
|
||||
def link_list(request):
|
||||
# Base URL path
|
||||
app_url = request.build_absolute_uri()
|
||||
# Tasks
|
||||
links_fetch = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all"]
|
||||
links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all", "clean_old_url_content_60"]
|
||||
# List of links
|
||||
list_links = \
|
||||
[ os.path.join(app_url, "admin"), os.path.join(app_url, "urls") ] + \
|
||||
[ os.path.join(app_url, "logs", log_type) for log_type in ["database", "debug", "info", "warning"] ] + \
|
||||
[ os.path.join(app_url, "task", l) for l in links_fetch + links_process ]
|
||||
[ os.path.join(app_url, "logs", log_type) for log_type in ["database", "debug", "info", "warning", "server", "beat", "worker_light", "worker_default", "worker_heavy"] ]
|
||||
|
||||
# Links tuple
|
||||
links = [(l, l) for l in list_links]
|
||||
@@ -32,6 +31,7 @@ def link_list(request):
|
||||
|
||||
return HttpResponse(html)
|
||||
|
||||
|
||||
####################################################################################################
|
||||
def logs(request, log_type):
|
||||
# Capture output: python manage.py rqstats
|
||||
@@ -68,4 +68,4 @@ def log_db(request):
|
||||
""").fetchall()
|
||||
return HttpResponse( "\n".join([str(e) for e in r]) )
|
||||
|
||||
####################################################################################################
|
||||
####################################################################################################
|
||||
|
||||
@@ -17,18 +17,21 @@
|
||||
"cnbc.com"
|
||||
],
|
||||
"keyword_search": [
|
||||
"child abuse"
|
||||
"child abuse",
|
||||
"child neglect"
|
||||
]
|
||||
},
|
||||
"REGEX_PATTERN_STATUS_PRIORITY": [
|
||||
[".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50],
|
||||
["https:\\/\\/x.com\\/.*", "invalid", 50],
|
||||
[".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75],
|
||||
[".*foxnews\\.com\\/(video|category)\\/.*", "invalid", 75],
|
||||
[".*radio.foxnews\\.com\\/.*", "invalid", 75],
|
||||
[".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75],
|
||||
[".*zerohedge\\.com\\/(user)\\/.*", "invalid", 75],
|
||||
[".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50],
|
||||
[".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50],
|
||||
[".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50]
|
||||
[".*zerohedge\\.com\\/(user|contributors)\\/.*", "invalid", 75],
|
||||
[".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50],
|
||||
[".*radio\\.foxnews\\.com\\/.*", "invalid", 75],
|
||||
[".*foxnews\\.com\\/(video|category|person|books|html-sitemap)\\/.*", "invalid", 75],
|
||||
[".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50],
|
||||
[".*foxnews\\.com\\/[^\\/]+\\/?$", "invalid", 25]
|
||||
]
|
||||
}
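A hedged sketch of how the `REGEX_PATTERN_STATUS_PRIORITY` triples above are presumably applied: each entry is `[pattern, status, priority]`, and the highest-priority matching pattern decides the pre-classification of a fetched URL. The helper below is illustrative only, not the fetcher's actual implementation, and uses a subset of the rules from the config:

import re

RULES = [
    [r".*(youtube|tiktok|twitter|reddit)\.com\/.*", "invalid", 50],
    [r".*foxnews\.com\/(video|category|person|books|html-sitemap)\/.*", "invalid", 75],
    [r".*foxnews\.com\/(lifestyle|opinion|sports|world)\/.*", "valid", 50],
]

def classify(url):
    # Pick the matching rule with the highest priority; None means no rule applies.
    matches = [(priority, status) for pattern, status, priority in RULES if re.match(pattern, url)]
    return max(matches)[1] if matches else None

print(classify("https://www.foxnews.com/video/123"))           # invalid (priority 75)
print(classify("https://www.foxnews.com/opinion/some-story"))  # valid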
|
||||
|
||||
@@ -1,65 +0,0 @@
|
||||
{
|
||||
"SEARCH": {
|
||||
"rss_feed": [
|
||||
],
|
||||
"url_host": [
|
||||
"johnpilger.com",
|
||||
"lapenseeecologique.com",
|
||||
"partage-le.com",
|
||||
"reflets.info",
|
||||
"rezo.net",
|
||||
"consortiumnews.com",
|
||||
"disclose.ngo/fr",
|
||||
"energieetenvironnement.com",
|
||||
"global-climat.com",
|
||||
"slashdot.org",
|
||||
"lesamisdebartleby.wordpress.com",
|
||||
"lundi.am",
|
||||
"lvsl.fr",
|
||||
"moderndiplomacy.eu",
|
||||
"mrmondialisation.org",
|
||||
"ourfiniteworld.com",
|
||||
"southfront.org",
|
||||
"simplicius76.substack.com",
|
||||
"smoothiex12.blogspot.com",
|
||||
"theintercept.com",
|
||||
"wikileaks.org",
|
||||
"contretemps.eu",
|
||||
"indianpunchline.com",
|
||||
"investigaction.net/fr",
|
||||
"notechmagazine.com",
|
||||
"terrestres.org",
|
||||
"truthdig.com",
|
||||
"tass.com",
|
||||
"bastamag.net",
|
||||
"counterpunch.org",
|
||||
"energy-daily.com",
|
||||
"fakirpresse.info",
|
||||
"geopoliticalmonitor.com",
|
||||
"huffingtonpost.fr",
|
||||
"legrandsoir.info",
|
||||
"les-crises.fr",
|
||||
"liberation.fr",
|
||||
"maitre-eolas.fr",
|
||||
"marianne.net",
|
||||
"mediapart.fr",
|
||||
"metaefficient.com",
|
||||
"monde-diplomatique.fr",
|
||||
"paulcraigroberts.org",
|
||||
"politis.fr",
|
||||
"reporterre.net",
|
||||
"rue89.com",
|
||||
"theguardian.com/international",
|
||||
"treehugger.com",
|
||||
"unz.com",
|
||||
"voltairenet.org",
|
||||
"wsws.org"
|
||||
],
|
||||
"keyword_search": [
|
||||
"society collapse"
|
||||
]
|
||||
},
|
||||
"REGEX_PATTERN_STATUS_PRIORITY": [
|
||||
[".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50]
|
||||
]
|
||||
}
|
||||
@@ -1,34 +0,0 @@
|
||||
{
|
||||
"SEARCH": {
|
||||
"rss_feed": [
|
||||
"https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC",
|
||||
"https://feeds.feedburner.com/breitbart",
|
||||
"https://feeds.feedburner.com/zerohedge/feed",
|
||||
"https://moxie.foxnews.com/google-publisher/latest.xml",
|
||||
"https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=15837362",
|
||||
"https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100727362"
|
||||
],
|
||||
"url_host": [
|
||||
"missingkids.org/poster",
|
||||
"missingkids.org/new-poster",
|
||||
"breitbart.com",
|
||||
"zerohedge.com",
|
||||
"foxnews.com",
|
||||
"cnbc.com"
|
||||
],
|
||||
"keyword_search": [
|
||||
"child abuse"
|
||||
]
|
||||
},
|
||||
"REGEX_PATTERN_STATUS_PRIORITY": [
|
||||
[".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50],
|
||||
[".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75],
|
||||
[".*foxnews\\.com\\/(video|category)\\/.*", "invalid", 75],
|
||||
[".*radio.foxnews\\.com\\/.*", "invalid", 75],
|
||||
[".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75],
|
||||
[".*zerohedge\\.com\\/(user)\\/.*", "invalid", 75],
|
||||
[".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50],
|
||||
[".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50],
|
||||
[".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50]
|
||||
]
|
||||
}
|
||||
@@ -29,13 +29,15 @@ def wait_connection():
|
||||
connected = True
|
||||
|
||||
except psycopg.OperationalError as e:
|
||||
print(str(e))
|
||||
# Connection not ready...
|
||||
# print(".", end="")
|
||||
time.sleep(2)
|
||||
time.sleep(15)
|
||||
except Exception as e:
|
||||
print(str(e))
|
||||
# Connection not ready...
|
||||
# print("e", end="")
|
||||
time.sleep(2)
|
||||
time.sleep(15)
|
||||
|
||||
print("DB connection ready")
|
||||
|
||||
@@ -57,7 +59,8 @@ def initialize_tables():
|
||||
ts_fetch TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
status URL_STATUS NOT NULL DEFAULT 'raw' -- ,
|
||||
-- status_wendy WENDY_STATUS DEFAULT NULL,
|
||||
-- ts_wendy TIMESTAMPTZ DEFAULT NULL
|
||||
-- ts_wendy TIMESTAMPTZ DEFAULT NULL,
|
||||
-- child_abuse BOOLEAN DEFAULT NULL,
|
||||
);
|
||||
CREATE INDEX idx_urls_status ON urls(status);
|
||||
CREATE INDEX idx_urls_ts_fetch ON urls(ts_fetch);
|
||||
@@ -212,6 +215,10 @@ def initialize_data():
|
||||
print(query)
|
||||
cur.execute(query)
|
||||
|
||||
# Connect to an existing database
|
||||
with psycopg.connect(connection_info) as conn:
|
||||
# Open a cursor to perform database operations
|
||||
with conn.cursor() as cur:
|
||||
# Feeds, URL host, keyword search
|
||||
for search_type, list_searches in data_json.get("SEARCH", {}).items():
|
||||
for search in list_searches:
|
||||
|
||||
@@ -6,7 +6,10 @@ else
|
||||
echo "Initializating database"
|
||||
python init_db.py --initialize_tables --initialize_data
|
||||
python manage.py makemigrations fetcher; python manage.py migrate --fake-initial
|
||||
python manage.py migrate django_celery_beat
|
||||
python manage.py createsuperuser --noinput
|
||||
python manage.py collectstatic --no-input
|
||||
python manage.py import --filename scheduled_tasks.json
|
||||
python manage.py loaddata scheduled_tasks.json
|
||||
#
|
||||
# python manage.py inspectdb # Debugging model
|
||||
fi
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
django==5.1
|
||||
django-tasks-scheduler==3.0.1
|
||||
django-celery-beat
|
||||
django-redis
|
||||
psycopg[binary]
|
||||
gunicorn
|
||||
@@ -13,8 +13,9 @@ lxml[html_clean]
|
||||
googlenewsdecoder
|
||||
gnews
|
||||
GoogleNews
|
||||
duckduckgo_search
|
||||
ddgs
|
||||
git+https://github.com/tasos-py/Search-Engines-Scraper.git
|
||||
furl
|
||||
langdetect
|
||||
ollama
|
||||
PyJWT
|
||||
PyJWT
|
||||
|
||||
@@ -1,8 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
if [ "${DJANGO_DEBUG}" = true ] | [ "${DJANGO_DEBUG}" == "True" ]; then
|
||||
echo "Running in DEBUG mode"
|
||||
gunicorn core.wsgi:application --reload --log-level debug --bind 0.0.0.0:8000 --timeout 600 & python manage.py rqworker high default low
|
||||
else
|
||||
gunicorn core.wsgi:application --bind 0.0.0.0:8000 --timeout 600 & python manage.py rqworker high default low
|
||||
fi
|
||||
@@ -1,212 +1,507 @@
|
||||
[
|
||||
{
|
||||
"model": "RepeatableTaskType",
|
||||
"name": "Process error URLs",
|
||||
"callable": "fetcher.tasks.process_error_urls",
|
||||
"callable_args": [],
|
||||
"callable_kwargs": [],
|
||||
"enabled": false,
|
||||
"queue": "low",
|
||||
"repeat": null,
|
||||
"at_front": false,
|
||||
"timeout": 1800,
|
||||
"result_ttl": 86400,
|
||||
"cron_string": null,
|
||||
"scheduled_time": "2025-01-01T00:00:00+00:00",
|
||||
"interval": 4,
|
||||
"interval_unit": "hours",
|
||||
"successful_runs": 0,
|
||||
"failed_runs": 0,
|
||||
"last_successful_run": null,
|
||||
"last_failed_run": null
|
||||
"model": "django_celery_beat.periodictask",
|
||||
"pk": 1,
|
||||
"fields": {
|
||||
"name": "celery.backend_cleanup",
|
||||
"task": "celery.backend_cleanup",
|
||||
"interval": null,
|
||||
"crontab": 1,
|
||||
"solar": null,
|
||||
"clocked": null,
|
||||
"args": "[]",
|
||||
"kwargs": "{}",
|
||||
"queue": null,
|
||||
"exchange": null,
|
||||
"routing_key": null,
|
||||
"headers": "{}",
|
||||
"priority": null,
|
||||
"expires": null,
|
||||
"expire_seconds": 43200,
|
||||
"one_off": false,
|
||||
"start_time": null,
|
||||
"enabled": true,
|
||||
"last_run_at": null,
|
||||
"total_run_count": 0,
|
||||
"date_changed": "2025-07-17T16:07:34.609Z",
|
||||
"description": ""
|
||||
}
|
||||
},
|
||||
{
|
||||
"model": "RepeatableTaskType",
|
||||
"name": "Process raw URLs",
|
||||
"callable": "fetcher.tasks.process_raw_urls",
|
||||
"callable_args": [],
|
||||
"callable_kwargs": [],
|
||||
"enabled": false,
|
||||
"queue": "low",
|
||||
"repeat": null,
|
||||
"at_front": false,
|
||||
"timeout": 1800,
|
||||
"result_ttl": 86400,
|
||||
"cron_string": null,
|
||||
"scheduled_time": "2025-01-01T00:00:00+00:00",
|
||||
"interval": 10,
|
||||
"interval_unit": "minutes",
|
||||
"successful_runs": 0,
|
||||
"failed_runs": 0,
|
||||
"last_successful_run": null,
|
||||
"last_failed_run": null
|
||||
"model": "django_celery_beat.periodictask",
|
||||
"pk": 2,
|
||||
"fields": {
|
||||
"name": "Process error URLs",
|
||||
"task": "fetcher.tasks.process_error_urls",
|
||||
"interval": 1,
|
||||
"crontab": null,
|
||||
"solar": null,
|
||||
"clocked": null,
|
||||
"args": "[]",
|
||||
"kwargs": "{}",
|
||||
"queue": null,
|
||||
"exchange": null,
|
||||
"routing_key": null,
|
||||
"headers": "{}",
|
||||
"priority": null,
|
||||
"expires": null,
|
||||
"expire_seconds": null,
|
||||
"one_off": false,
|
||||
"start_time": null,
|
||||
"enabled": true,
|
||||
"last_run_at": null,
|
||||
"total_run_count": 0,
|
||||
"date_changed": "2025-07-17T16:10:08.861Z",
|
||||
"description": ""
|
||||
}
|
||||
},
|
||||
{
|
||||
"model": "RepeatableTaskType",
|
||||
"name": "Process MissingKids URLs",
|
||||
"callable": "fetcher.tasks.process_missing_kids_urls",
|
||||
"callable_args": [],
|
||||
"callable_kwargs": [],
|
||||
"enabled": false,
|
||||
"queue": "default",
|
||||
"repeat": null,
|
||||
"at_front": false,
|
||||
"timeout": 1800,
|
||||
"result_ttl": 86400,
|
||||
"cron_string": null,
|
||||
"scheduled_time": "2025-01-01T00:00:00+00:00",
|
||||
"interval": 4,
|
||||
"interval_unit": "hours",
|
||||
"successful_runs": 0,
|
||||
"failed_runs": 0,
|
||||
"last_successful_run": null,
|
||||
"last_failed_run": null
|
||||
"model": "django_celery_beat.periodictask",
|
||||
"pk": 3,
|
||||
"fields": {
|
||||
"name": "Process raw URLs",
|
||||
"task": "fetcher.tasks.process_raw_urls",
|
||||
"interval": 2,
|
||||
"crontab": null,
|
||||
"solar": null,
|
||||
"clocked": null,
|
||||
"args": "[]",
|
||||
"kwargs": "{}",
|
||||
"queue": null,
|
||||
"exchange": null,
|
||||
"routing_key": null,
|
||||
"headers": "{}",
|
||||
"priority": null,
|
||||
"expires": null,
|
||||
"expire_seconds": null,
|
||||
"one_off": false,
|
||||
"start_time": null,
|
||||
"enabled": true,
|
||||
"last_run_at": "2025-07-17T16:20:36.751Z",
|
||||
"total_run_count": 1,
|
||||
"date_changed": "2025-07-17T16:21:17.099Z",
|
||||
"description": ""
|
||||
}
|
||||
},
|
||||
{
|
||||
"model": "RepeatableTaskType",
|
||||
"name": "Process MissingKids URLs ALL",
|
||||
"callable": "fetcher.tasks.process_missing_kids_urls_all",
|
||||
"callable_args": [],
|
||||
"callable_kwargs": [],
|
||||
"enabled": false,
|
||||
"queue": "default",
|
||||
"repeat": null,
|
||||
"at_front": false,
|
||||
"timeout": 7200,
|
||||
"result_ttl": 86400,
|
||||
"cron_string": null,
|
||||
"scheduled_time": "2025-01-01T00:00:00+00:00",
|
||||
"interval": 1,
|
||||
"interval_unit": "weeks",
|
||||
"successful_runs": 0,
|
||||
"failed_runs": 0,
|
||||
"last_successful_run": null,
|
||||
"last_failed_run": null
|
||||
"model": "django_celery_beat.periodictask",
|
||||
"pk": 4,
|
||||
"fields": {
|
||||
"name": "Process MissingKids URLs - batch=50",
|
||||
"task": "fetcher.tasks.process_missing_kids_urls",
|
||||
"interval": 3,
|
||||
"crontab": null,
|
||||
"solar": null,
|
||||
"clocked": null,
|
||||
"args": "[]",
|
||||
"kwargs": "{\"batch_size\": 50}",
|
||||
"queue": null,
|
||||
"exchange": null,
|
||||
"routing_key": null,
|
||||
"headers": "{}",
|
||||
"priority": null,
|
||||
"expires": null,
|
||||
"expire_seconds": null,
|
||||
"one_off": false,
|
||||
"start_time": null,
|
||||
"enabled": true,
|
||||
"last_run_at": null,
|
||||
"total_run_count": 0,
|
||||
"date_changed": "2025-07-17T16:12:44.533Z",
|
||||
"description": ""
|
||||
}
|
||||
},
|
||||
{
|
||||
"model": "RepeatableTaskType",
|
||||
"name": "Fetch Feeds",
|
||||
"callable": "fetcher.tasks.fetch_feeds",
|
||||
"callable_args": [],
|
||||
"callable_kwargs": [],
|
||||
"enabled": false,
|
||||
"queue": "default",
|
||||
"repeat": null,
|
||||
"at_front": false,
|
||||
"timeout": 1800,
|
||||
"result_ttl": 86400,
|
||||
"cron_string": null,
|
||||
"scheduled_time": "2025-01-01T00:00:00+00:00",
|
||||
"interval": 10,
|
||||
"interval_unit": "minutes",
|
||||
"successful_runs": 0,
|
||||
"failed_runs": 0,
|
||||
"last_successful_run": null,
|
||||
"last_failed_run": null
|
||||
"model": "django_celery_beat.periodictask",
|
||||
"pk": 5,
|
||||
"fields": {
|
||||
"name": "Process MissingKids URLs ALL - unknown",
|
||||
"task": "fetcher.tasks.process_missing_kids_urls",
|
||||
"interval": 4,
|
||||
"crontab": null,
|
||||
"solar": null,
|
||||
"clocked": null,
|
||||
"args": "[]",
|
||||
"kwargs": "{\"process_status_only\": \"unknown\"}",
|
||||
"queue": null,
|
||||
"exchange": null,
|
||||
"routing_key": null,
|
||||
"headers": "{}",
|
||||
"priority": null,
|
||||
"expires": null,
|
||||
"expire_seconds": null,
|
||||
"one_off": false,
|
||||
"start_time": null,
|
||||
"enabled": true,
|
||||
"last_run_at": null,
|
||||
"total_run_count": 0,
|
||||
"date_changed": "2025-07-17T16:16:38.258Z",
|
||||
"description": ""
|
||||
}
|
||||
},
|
||||
{
|
||||
"model": "RepeatableTaskType",
|
||||
"name": "Fetch Parser",
|
||||
"callable": "fetcher.tasks.fetch_parser",
|
||||
"callable_args": [],
|
||||
"callable_kwargs": [],
|
||||
"enabled": false,
|
||||
"queue": "default",
|
||||
"repeat": null,
|
||||
"at_front": false,
|
||||
"timeout": 3600,
|
||||
"result_ttl": 86400,
|
||||
"cron_string": null,
|
||||
"scheduled_time": "2025-01-01T00:00:00+00:00",
|
||||
"interval": 1,
|
||||
"interval_unit": "hours",
|
||||
"successful_runs": 0,
|
||||
"failed_runs": 0,
|
||||
"last_successful_run": null,
|
||||
"last_failed_run": null
|
||||
"model": "django_celery_beat.periodictask",
|
||||
"pk": 6,
|
||||
"fields": {
|
||||
"name": "Process MissingKids URLs ALL - valid",
|
||||
"task": "fetcher.tasks.process_missing_kids_urls",
|
||||
"interval": 5,
|
||||
"crontab": null,
|
||||
"solar": null,
|
||||
"clocked": null,
|
||||
"args": "[]",
|
||||
"kwargs": "{\"process_status_only\": \"valid\"}",
|
||||
"queue": null,
|
||||
"exchange": null,
|
||||
"routing_key": null,
|
||||
"headers": "{}",
|
||||
"priority": null,
|
||||
"expires": null,
|
||||
"expire_seconds": null,
|
||||
"one_off": false,
|
||||
"start_time": null,
|
||||
"enabled": true,
|
||||
"last_run_at": null,
|
||||
"total_run_count": 0,
|
||||
"date_changed": "2025-07-17T16:20:19.969Z",
|
||||
"description": ""
|
||||
}
|
||||
},
|
||||
{
|
||||
"model": "RepeatableTaskType",
|
||||
"name": "Fetch Search",
|
||||
"callable": "fetcher.tasks.fetch_search",
|
||||
"callable_args": [],
|
||||
"callable_kwargs": [],
|
||||
"enabled": false,
|
||||
"queue": "default",
|
||||
"repeat": null,
|
||||
"at_front": false,
|
||||
"timeout": 3600,
|
||||
"result_ttl": 86400,
|
||||
"cron_string": null,
|
||||
"scheduled_time": "2025-01-01T00:00:00+00:00",
|
||||
"interval": 1,
|
||||
"interval_unit": "hours",
|
||||
"successful_runs": 0,
|
||||
"failed_runs": 0,
|
||||
"last_successful_run": null,
|
||||
"last_failed_run": null
|
||||
"model": "django_celery_beat.periodictask",
|
||||
"pk": 7,
|
||||
"fields": {
|
||||
"name": "Process MissingKids URLs ALL - invalid",
|
||||
"task": "fetcher.tasks.process_missing_kids_urls",
|
||||
"interval": 6,
|
||||
"crontab": null,
|
||||
"solar": null,
|
||||
"clocked": null,
|
||||
"args": "[]",
|
||||
"kwargs": "{\"process_status_only\": \"invalid\"}",
|
||||
"queue": null,
|
||||
"exchange": null,
|
||||
"routing_key": null,
|
||||
"headers": "{}",
|
||||
"priority": null,
|
||||
"expires": null,
|
||||
"expire_seconds": null,
|
||||
"one_off": false,
|
||||
"start_time": null,
|
||||
"enabled": true,
|
||||
"last_run_at": null,
|
||||
"total_run_count": 0,
|
||||
"date_changed": "2025-07-17T16:21:30.809Z",
|
||||
"description": ""
|
||||
}
|
||||
},
|
||||
{
|
||||
"model": "RepeatableTaskType",
|
||||
"name": "Fetch MissingKids",
|
||||
"callable": "fetcher.tasks.fetch_missing_kids",
|
||||
"callable_args": [],
|
||||
"callable_kwargs": [],
|
||||
"enabled": false,
|
||||
"queue": "default",
|
||||
"repeat": null,
|
||||
"at_front": false,
|
||||
"timeout": 1800,
|
||||
"result_ttl": 86400,
|
||||
"cron_string": null,
|
||||
"scheduled_time": "2025-01-01T00:00:00+00:00",
|
||||
"interval": 4,
|
||||
"interval_unit": "hours",
|
||||
"successful_runs": 0,
|
||||
"failed_runs": 0,
|
||||
"last_successful_run": null,
|
||||
"last_failed_run": null
|
||||
"model": "django_celery_beat.periodictask",
|
||||
"pk": 8,
|
||||
"fields": {
|
||||
"name": "Fetch Feeds",
|
||||
"task": "fetcher.tasks.fetch_feeds",
|
||||
"interval": 2,
|
||||
"crontab": null,
|
||||
"solar": null,
|
||||
"clocked": null,
|
||||
"args": "[]",
|
||||
"kwargs": "{}",
|
||||
"queue": null,
|
||||
"exchange": null,
|
||||
"routing_key": null,
|
||||
"headers": "{}",
|
||||
"priority": null,
|
||||
"expires": null,
|
||||
"expire_seconds": null,
|
||||
"one_off": false,
|
||||
"start_time": null,
|
||||
"enabled": true,
|
||||
"last_run_at": null,
|
||||
"total_run_count": 0,
|
||||
"date_changed": "2025-07-17T16:22:15.615Z",
|
||||
"description": ""
|
||||
}
|
||||
},
|
||||
{
|
||||
"model": "RepeatableTaskType",
|
||||
"name": "Fetch MissingKids ALL",
|
||||
"callable": "fetcher.tasks.fetch_missing_kids_all",
|
||||
"callable_args": [],
|
||||
"callable_kwargs": [],
|
||||
"enabled": false,
|
||||
"queue": "default",
|
||||
"repeat": null,
|
||||
"at_front": false,
|
||||
"timeout": 7200,
|
||||
"result_ttl": 86400,
|
||||
"cron_string": null,
|
||||
"scheduled_time": "2025-01-01T00:00:00+00:00",
|
||||
"interval": 1,
|
||||
"interval_unit": "weeks",
|
||||
"successful_runs": 0,
|
||||
"failed_runs": 0,
|
||||
"last_successful_run": null,
|
||||
"last_failed_run": null
|
||||
"model": "django_celery_beat.periodictask",
|
||||
"pk": 9,
|
||||
"fields": {
|
||||
"name": "Fetch Parser",
|
||||
"task": "fetcher.tasks.fetch_parser",
|
||||
"interval": 7,
|
||||
"crontab": null,
|
||||
"solar": null,
|
||||
"clocked": null,
|
||||
"args": "[]",
|
||||
"kwargs": "{}",
|
||||
"queue": null,
|
||||
"exchange": null,
|
||||
"routing_key": null,
|
||||
"headers": "{}",
|
||||
"priority": null,
|
||||
"expires": null,
|
||||
"expire_seconds": null,
|
||||
"one_off": false,
|
||||
"start_time": null,
|
||||
"enabled": true,
|
||||
"last_run_at": null,
|
||||
"total_run_count": 0,
|
||||
"date_changed": "2025-07-17T16:22:40.215Z",
|
||||
"description": ""
|
||||
}
|
||||
},
|
||||
{
|
||||
"model": "RepeatableTaskType",
|
||||
"name": "Clean old URL content",
|
||||
"callable": "fetcher.tasks.clean_old_url_content",
|
||||
"callable_args": [],
|
||||
"callable_kwargs": [],
|
||||
"enabled": false,
|
||||
"queue": "default",
|
||||
"repeat": null,
|
||||
"at_front": false,
|
||||
"timeout": null,
|
||||
"result_ttl": 86400,
|
||||
"cron_string": null,
|
||||
"scheduled_time": "2025-01-01T00:00:00+00:00",
|
||||
"interval": 1,
|
||||
"interval_unit": "weeks",
|
||||
"successful_runs": 0,
|
||||
"failed_runs": 0,
|
||||
"last_successful_run": null,
|
||||
"last_failed_run": null
|
||||
"model": "django_celery_beat.periodictask",
|
||||
"pk": 10,
|
||||
"fields": {
|
||||
"name": "Fetch Search",
|
||||
"task": "fetcher.tasks.fetch_search",
|
||||
"interval": 8,
|
||||
"crontab": null,
|
||||
"solar": null,
|
||||
"clocked": null,
|
||||
"args": "[]",
|
||||
"kwargs": "{}",
|
||||
"queue": null,
|
||||
"exchange": null,
|
||||
"routing_key": null,
|
||||
"headers": "{}",
|
||||
"priority": null,
|
||||
"expires": null,
|
||||
"expire_seconds": null,
|
||||
"one_off": false,
|
||||
"start_time": null,
|
||||
"enabled": true,
|
||||
"last_run_at": null,
|
||||
"total_run_count": 0,
|
||||
"date_changed": "2025-07-17T16:23:00.329Z",
|
||||
"description": ""
|
||||
}
|
||||
},
|
||||
{
|
||||
"model": "django_celery_beat.periodictask",
|
||||
"pk": 11,
|
||||
"fields": {
|
||||
"name": "Fetch Selenium Search",
|
||||
"task": "fetcher.tasks.fetch_selenium_search",
|
||||
"interval": 3,
|
||||
"crontab": null,
|
||||
"solar": null,
|
||||
"clocked": null,
|
||||
"args": "[]",
|
||||
"kwargs": "{}",
|
||||
"queue": null,
|
||||
"exchange": null,
|
||||
"routing_key": null,
|
||||
"headers": "{}",
|
||||
"priority": null,
|
||||
"expires": null,
|
||||
"expire_seconds": null,
|
||||
"one_off": false,
|
||||
"start_time": null,
|
||||
"enabled": true,
|
||||
"last_run_at": null,
|
||||
"total_run_count": 0,
|
||||
"date_changed": "2025-07-17T16:24:08.315Z",
|
||||
"description": ""
|
||||
}
|
||||
},
|
||||
{
|
||||
"model": "django_celery_beat.periodictask",
|
||||
"pk": 12,
|
||||
"fields": {
|
||||
"name": "Fetch MissingKids - pages=5",
|
||||
"task": "fetcher.tasks.fetch_missing_kids",
|
||||
"interval": 4,
|
||||
"crontab": null,
|
||||
"solar": null,
|
||||
"clocked": null,
|
||||
"args": "[]",
|
||||
"kwargs": "{\"number_pages\": 5}",
|
||||
"queue": null,
|
||||
"exchange": null,
|
||||
"routing_key": null,
|
||||
"headers": "{}",
|
||||
"priority": null,
|
||||
"expires": null,
|
||||
"expire_seconds": null,
|
||||
"one_off": false,
|
||||
"start_time": null,
|
||||
"enabled": true,
|
||||
"last_run_at": null,
|
||||
"total_run_count": 0,
|
||||
"date_changed": "2025-07-17T16:25:02.494Z",
|
||||
"description": ""
|
||||
}
|
||||
},
|
||||
{
|
||||
"model": "django_celery_beat.periodictask",
|
||||
"pk": 13,
|
||||
"fields": {
|
||||
"name": "Fetch MissingKids - ALL",
|
||||
"task": "fetcher.tasks.fetch_missing_kids",
|
||||
"interval": 9,
|
||||
"crontab": null,
|
||||
"solar": null,
|
||||
"clocked": null,
|
||||
"args": "[]",
|
||||
"kwargs": "{\"number_pages\": -1}",
|
||||
"queue": null,
|
||||
"exchange": null,
|
||||
"routing_key": null,
|
||||
"headers": "{}",
|
||||
"priority": null,
|
||||
"expires": null,
|
||||
"expire_seconds": null,
|
||||
"one_off": false,
|
||||
"start_time": null,
|
||||
"enabled": true,
|
||||
"last_run_at": null,
|
||||
"total_run_count": 0,
|
||||
"date_changed": "2025-07-17T16:25:50.597Z",
|
||||
"description": ""
|
||||
}
|
||||
},
|
||||
{
|
||||
"model": "django_celery_beat.periodictask",
|
||||
"pk": 14,
|
||||
"fields": {
|
||||
"name": "Clean old URL content",
|
||||
"task": "fetcher.tasks.clean_old_url_content",
|
||||
"interval": 9,
|
||||
"crontab": null,
|
||||
"solar": null,
|
||||
"clocked": null,
|
||||
"args": "[]",
|
||||
"kwargs": "{}",
|
||||
"queue": null,
|
||||
"exchange": null,
|
||||
"routing_key": null,
|
||||
"headers": "{}",
|
||||
"priority": null,
|
||||
"expires": null,
|
||||
"expire_seconds": null,
|
||||
"one_off": false,
|
||||
"start_time": null,
|
||||
"enabled": true,
|
||||
"last_run_at": null,
|
||||
"total_run_count": 0,
|
||||
"date_changed": "2025-07-17T16:26:16.272Z",
|
||||
"description": ""
|
||||
}
|
||||
},
|
||||
{
|
||||
"model": "django_celery_beat.periodictask",
|
||||
"pk": 4,
|
||||
"fields": {
|
||||
"name": "Notify status",
|
||||
"task": "fetcher.tasks.notify_status",
|
||||
"interval": 4,
|
||||
"crontab": null,
|
||||
"solar": null,
|
||||
"clocked": null,
|
||||
"args": "[]",
|
||||
"kwargs": "{}",
|
||||
"queue": null,
|
||||
"exchange": null,
|
||||
"routing_key": null,
|
||||
"headers": "{}",
|
||||
"priority": null,
|
||||
"expires": null,
|
||||
"expire_seconds": null,
|
||||
"one_off": false,
|
||||
"start_time": null,
|
||||
"enabled": true,
|
||||
"last_run_at": null,
|
||||
"total_run_count": 0,
|
||||
"date_changed": "2025-07-17T16:12:44.533Z",
|
||||
"description": ""
|
||||
}
|
||||
},
|
||||
{
|
||||
"model": "django_celery_beat.intervalschedule",
|
||||
"pk": 1,
|
||||
"fields": {
|
||||
"every": 6,
|
||||
"period": "hours"
|
||||
}
|
||||
},
|
||||
{
|
||||
"model": "django_celery_beat.intervalschedule",
|
||||
"pk": 2,
|
||||
"fields": {
|
||||
"every": 10,
|
||||
"period": "minutes"
|
||||
}
|
||||
},
|
||||
{
|
||||
"model": "django_celery_beat.intervalschedule",
|
||||
"pk": 3,
|
||||
"fields": {
|
||||
"every": 1,
|
||||
"period": "days"
|
||||
}
|
||||
},
|
||||
{
|
||||
"model": "django_celery_beat.intervalschedule",
|
||||
"pk": 4,
|
||||
"fields": {
|
||||
"every": 12,
|
||||
"period": "hours"
|
||||
}
|
||||
},
|
||||
{
|
||||
"model": "django_celery_beat.intervalschedule",
|
||||
"pk": 5,
|
||||
"fields": {
|
||||
"every": 2,
|
||||
"period": "days"
|
||||
}
|
||||
},
|
||||
{
|
||||
"model": "django_celery_beat.intervalschedule",
|
||||
"pk": 6,
|
||||
"fields": {
|
||||
"every": 28,
|
||||
"period": "days"
|
||||
}
|
||||
},
|
||||
{
|
||||
"model": "django_celery_beat.intervalschedule",
|
||||
"pk": 7,
|
||||
"fields": {
|
||||
"every": 8,
|
||||
"period": "hours"
|
||||
}
|
||||
},
|
||||
{
|
||||
"model": "django_celery_beat.intervalschedule",
|
||||
"pk": 8,
|
||||
"fields": {
|
||||
"every": 4,
|
||||
"period": "hours"
|
||||
}
|
||||
},
|
||||
{
|
||||
"model": "django_celery_beat.intervalschedule",
|
||||
"pk": 9,
|
||||
"fields": {
|
||||
"every": 7,
|
||||
"period": "days"
|
||||
}
|
||||
},
|
||||
{
|
||||
"model": "django_celery_beat.crontabschedule",
|
||||
"pk": 1,
|
||||
"fields": {
|
||||
"minute": "0",
|
||||
"hour": "4",
|
||||
"day_of_month": "*",
|
||||
"month_of_year": "*",
|
||||
"day_of_week": "*",
|
||||
"timezone": "UTC"
|
||||
}
|
||||
}
|
||||
]
|
||||
]
|
||||
|
||||
67
app_urls/supervisord.conf
Normal file
@@ -0,0 +1,67 @@
|
||||
[supervisord]
|
||||
nodaemon=true
|
||||
|
||||
[program:server]
|
||||
command=gunicorn core.wsgi:application --bind 0.0.0.0:8000
|
||||
directory=/opt/app
|
||||
autostart=true
|
||||
autorestart=true
|
||||
; Unified log file
|
||||
stdout_logfile=/opt/logs/server.log
|
||||
stderr_logfile=/opt/logs/server.log
|
||||
redirect_stderr=true
|
||||
; Rotate when file reaches max size
|
||||
stdout_logfile_maxbytes=20MB
|
||||
stdout_logfile_backups=1
|
||||
|
||||
[program:beat]
|
||||
command=celery -A core beat -l info --scheduler django_celery_beat.schedulers:DatabaseScheduler --logfile=/opt/logs/beat.log
|
||||
directory=/opt/app
|
||||
autostart=true
|
||||
autorestart=true
|
||||
; Unified log file
|
||||
stdout_logfile=/opt/logs/beat.log
|
||||
stderr_logfile=/opt/logs/beat.log
|
||||
redirect_stderr=true
|
||||
; Rotate when file reaches max size
|
||||
stdout_logfile_maxbytes=20MB
|
||||
stdout_logfile_backups=1
|
||||
|
||||
[program:worker_default]
|
||||
command=celery -A core worker -l info --logfile=/opt/logs/worker_default.log --concurrency=1 -Q default -n default
|
||||
directory=/opt/app
|
||||
autostart=true
|
||||
autorestart=true
|
||||
; Unified log file
|
||||
stdout_logfile=/opt/logs/worker_default.log
|
||||
stderr_logfile=/opt/logs/worker_default.log
|
||||
redirect_stderr=true
|
||||
; Rotate when file reaches max size
|
||||
stdout_logfile_maxbytes=20MB
|
||||
stdout_logfile_backups=1
|
||||
|
||||
[program:worker_light]
|
||||
command=celery -A core worker -l info --logfile=/opt/logs/worker_light.log --concurrency=1 -Q light -n light
|
||||
directory=/opt/app
|
||||
autostart=true
|
||||
autorestart=true
|
||||
; Unified log file
|
||||
stdout_logfile=/opt/logs/worker_light.log
|
||||
stderr_logfile=/opt/logs/worker_light.log
|
||||
redirect_stderr=true
|
||||
; Rotate when file reaches max size
|
||||
stdout_logfile_maxbytes=20MB
|
||||
stdout_logfile_backups=1
|
||||
|
||||
[program:worker_heavy]
|
||||
command=celery -A core worker -l info --logfile=/opt/logs/worker_heavy.log --concurrency=1 -Q heavy -n heavy
|
||||
directory=/opt/app
|
||||
autostart=true
|
||||
autorestart=true
|
||||
; Unified log file
|
||||
stdout_logfile=/opt/logs/worker_heavy.log
|
||||
stderr_logfile=/opt/logs/worker_heavy.log
|
||||
redirect_stderr=true
|
||||
; Rotate when file reaches max size
|
||||
stdout_logfile_maxbytes=20MB
|
||||
stdout_logfile_backups=1
|
||||
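The supervisord file above starts one Celery worker per named queue (default, light, heavy), each with concurrency 1. Below is a minimal sketch of how tasks could be routed to those queues from the Celery app config; the broker URL matches the fetcher_redis_celery service used elsewhere in this diff, but the per-task route assignments are illustrative assumptions, not the repo's actual core/celery.py.

```python
# Hypothetical routing sketch; task names are taken from the fixtures in this diff,
# but which queue each task belongs to is an assumption.
from celery import Celery

app = Celery("core", broker="redis://fetcher_redis_celery:6379/0")

app.conf.task_routes = {
    # Long-running scraping job -> dedicated "heavy" worker
    "fetcher.tasks.fetch_missing_kids": {"queue": "heavy"},
    # Cheap notification job -> "light" worker
    "fetcher.tasks.notify_status": {"queue": "light"},
}
# Everything without an explicit route is consumed by the "default" worker.
app.conf.task_default_queue = "default"
```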
112
docker-compose-base.yml
Normal file
@@ -0,0 +1,112 @@
|
||||
services:
|
||||
|
||||
fetcher_app_selenium:
|
||||
image: fetcher_app_selenium
|
||||
build:
|
||||
context: ./app_selenium
|
||||
args:
|
||||
- ARCH=${ARCH} # arm64, amd64
|
||||
container_name: fetcher_app_selenium
|
||||
restart: unless-stopped
|
||||
shm_size: 512mb
|
||||
init: true # Reap zombie processes
|
||||
environment:
|
||||
- SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE}
|
||||
- PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
|
||||
ports:
|
||||
- 80
|
||||
dns:
|
||||
- 1.1.1.1
|
||||
- 1.0.0.1
|
||||
|
||||
fetcher_app_urls:
|
||||
image: fetcher_app_urls
|
||||
build:
|
||||
context: ./app_urls
|
||||
container_name: fetcher_app_urls
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
# Initialization
|
||||
- INITIALIZE_DB=${INITIALIZE_DB} # Related to DB persistence
|
||||
- DJANGO_SUPERUSER_USERNAME=${DJANGO_SUPERUSER_USERNAME}
|
||||
- DJANGO_SUPERUSER_PASSWORD=${DJANGO_SUPERUSER_PASSWORD}
|
||||
- DJANGO_SUPERUSER_EMAIL=${DJANGO_SUPERUSER_EMAIL}
|
||||
# Django
|
||||
- DJANGO_ALLOWED_HOSTS=${DJANGO_ALLOWED_HOSTS} # host1,host2
|
||||
- DJANGO_ALLOWED_ORIGINS=${DJANGO_ALLOWED_ORIGINS} # Reverse proxy
|
||||
- DJANGO_SECRET_KEY=${DJANGO_SECRET_KEY}
|
||||
- DJANGO_DEBUG=${DJANGO_DEBUG}
|
||||
- PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
|
||||
# Database
|
||||
- DB_NAME=${DB_NAME}
|
||||
- DB_USER=${DB_USER}
|
||||
- DB_PASSWORD=${DB_PASSWORD}
|
||||
- DB_HOST=${DB_HOST}
|
||||
- DB_PORT=${DB_PORT}
|
||||
- REDIS_CACHE_HOST=${REDIS_CACHE_HOST}
|
||||
- REDIS_CACHE_PORT=${REDIS_CACHE_PORT}
|
||||
- REDIS_CELERY_HOST=${REDIS_CELERY_HOST}
|
||||
- REDIS_CELERY_PORT=${REDIS_CELERY_PORT}
|
||||
# Job timeout: 30 min
|
||||
- JOB_DEFAULT_TIMEOUT=${JOB_DEFAULT_TIMEOUT}
|
||||
# Fetcher
|
||||
- FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP}
|
||||
- FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP}
|
||||
- FETCHER_BETWEEN_SEARCHES_SLEEP=${FETCHER_BETWEEN_SEARCHES_SLEEP} # Sleep time between each search
|
||||
- FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP} # Sleep time between requests to same URL host
|
||||
- FETCHER_LANGUAGE_DETECTION_MIN_CHAR=${FETCHER_LANGUAGE_DETECTION_MIN_CHAR} # Min number of characters to run language detection
|
||||
- FETCHER_INSERT_URL_CACHE_TIME=${FETCHER_INSERT_URL_CACHE_TIME} # Cache time: Insert raw URL
|
||||
- FETCHER_ERROR_URL_CACHE_TIME=${FETCHER_ERROR_URL_CACHE_TIME} # Cache time: Error on processing URL
|
||||
# Selenium
|
||||
- SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT}
|
||||
- ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA}
|
||||
# Ghost
|
||||
- GHOST_ADMIN_API_KEY=${GHOST_ADMIN_API_KEY}
|
||||
- GHOST_ADMIN_API_URL=${GHOST_ADMIN_API_URL}
|
||||
- PEXELS_API_KEY=${PEXELS_API_KEY}
|
||||
- OLLAMA_MODEL_DEFAULT=${OLLAMA_MODEL_DEFAULT}
|
||||
# Telegram
|
||||
- TELEGRAM_INFO_BOT_TOKEN=${TELEGRAM_INFO_BOT_TOKEN}
|
||||
- TELEGRAM_INFO_CHAT_ID=${TELEGRAM_INFO_CHAT_ID}
|
||||
- TELEGRAM_WARNING_BOT_TOKEN=${TELEGRAM_WARNING_BOT_TOKEN}
|
||||
- TELEGRAM_WARNING_CHAT_ID=${TELEGRAM_WARNING_CHAT_ID}
|
||||
########################
|
||||
ports:
|
||||
- 8000
|
||||
depends_on:
|
||||
- fetcher_db
|
||||
- fetcher_redis_cache
|
||||
- fetcher_redis_celery
|
||||
- fetcher_app_selenium
|
||||
dns:
|
||||
- 1.1.1.1
|
||||
- 1.0.0.1
|
||||
|
||||
fetcher_redis_cache:
|
||||
image: redis:alpine
|
||||
container_name: fetcher_redis_cache
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- 6379
|
||||
|
||||
fetcher_redis_celery:
|
||||
image: redis:alpine
|
||||
container_name: fetcher_redis_celery
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- 6379
|
||||
|
||||
fetcher_db:
|
||||
container_name: fetcher_db
|
||||
restart: unless-stopped
|
||||
|
||||
fetcher_flower:
|
||||
image: mher/flower
|
||||
container_name: fetcher_flower
|
||||
ports:
|
||||
- 5555
|
||||
environment:
|
||||
- CELERY_BROKER_URL=redis://fetcher_redis_celery:6379/0
|
||||
- FLOWER_UNAUTHENTICATED_API=true
|
||||
depends_on:
|
||||
- fetcher_redis_celery
|
||||
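The base file splits Redis into a cache instance (fetcher_redis_cache) and a Celery broker instance (fetcher_redis_celery) and hands their hosts and ports to the Django app through the REDIS_CACHE_* and REDIS_CELERY_* variables. A minimal settings sketch consuming those variables follows; this is assumed wiring, since the actual settings module is not part of this diff.

```python
# Illustrative Django settings fragment (assumption, not the repo's core/settings.py).
import os

# Page/result cache backed by the dedicated cache Redis.
CACHES = {
    "default": {
        "BACKEND": "django.core.cache.backends.redis.RedisCache",
        "LOCATION": "redis://{}:{}/0".format(
            os.environ.get("REDIS_CACHE_HOST", "fetcher_redis_cache"),
            os.environ.get("REDIS_CACHE_PORT", "6379"),
        ),
    }
}

# Celery broker backed by the second Redis, matching Flower's CELERY_BROKER_URL above.
CELERY_BROKER_URL = "redis://{}:{}/0".format(
    os.environ.get("REDIS_CELERY_HOST", "fetcher_redis_celery"),
    os.environ.get("REDIS_CELERY_PORT", "6379"),
)
```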
@@ -1,24 +1,9 @@
|
||||
version: '3.9'
|
||||
|
||||
services:
|
||||
|
||||
fetcher_app_selenium:
|
||||
image: fetcher_app_selenium
|
||||
build:
|
||||
context: ./app_selenium
|
||||
args:
|
||||
- ARCH=${ARCH} # arm64, amd64
|
||||
container_name: fetcher_app_selenium
|
||||
restart: unless-stopped
|
||||
shm_size: 512mb
|
||||
environment:
|
||||
- SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE}
|
||||
- PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
|
||||
ports:
|
||||
- 80:80
|
||||
dns:
|
||||
- 1.1.1.1
|
||||
- 1.0.0.1
|
||||
extends:
|
||||
file: docker-compose-base.yml
|
||||
service: fetcher_app_selenium
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
@@ -26,65 +11,11 @@ services:
|
||||
memory: ${DEPLOY_RAM}
|
||||
|
||||
fetcher_app_urls:
|
||||
image: fetcher_app_urls
|
||||
build:
|
||||
context: ./app_urls
|
||||
container_name: fetcher_app_urls
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
# Initialization
|
||||
- INITIALIZE_DB=${INITIALIZE_DB} # Related to DB persistence
|
||||
- DJANGO_SUPERUSER_USERNAME=${DJANGO_SUPERUSER_USERNAME}
|
||||
- DJANGO_SUPERUSER_PASSWORD=${DJANGO_SUPERUSER_PASSWORD}
|
||||
- DJANGO_SUPERUSER_EMAIL=${DJANGO_SUPERUSER_EMAIL}
|
||||
# Django
|
||||
- DJANGO_ALLOWED_HOSTS=${DJANGO_ALLOWED_HOSTS} # host1,host2
|
||||
- DJANGO_ALLOWED_ORIGINS=${DJANGO_ALLOWED_ORIGINS} # Reverse proxy
|
||||
- DJANGO_SECRET_KEY=${DJANGO_SECRET_KEY}
|
||||
- DJANGO_DEBUG=${DJANGO_DEBUG}
|
||||
- PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
|
||||
# Database
|
||||
- DB_NAME=${DB_NAME}
|
||||
- DB_USER=${DB_USER}
|
||||
- DB_PASSWORD=${DB_PASSWORD}
|
||||
- DB_HOST=${DB_HOST}
|
||||
- DB_PORT=${DB_PORT}
|
||||
- REDIS_HOST=${REDIS_HOST}
|
||||
- REDIS_PORT=${REDIS_PORT}
|
||||
# Job timeout: 30 min
|
||||
- JOB_DEFAULT_TIMEOUT=${JOB_DEFAULT_TIMEOUT}
|
||||
# Fetcher
|
||||
- FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP}
|
||||
- FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP}
|
||||
- FETCHER_BETWEEN_SEARCHES_SLEEP=${FETCHER_BETWEEN_SEARCHES_SLEEP} # Sleep time between each search
|
||||
- FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP} # Sleep time between requests to same URL host
|
||||
- FETCHER_LANGUAGE_DETECTION_MIN_CHAR=${FETCHER_LANGUAGE_DETECTION_MIN_CHAR} # Min number of characters to run language detection
|
||||
- FETCHER_INSERT_URL_CACHE_TIME=${FETCHER_INSERT_URL_CACHE_TIME} # Cache time: Insert raw URL
|
||||
- FETCHER_ERROR_URL_CACHE_TIME=${FETCHER_ERROR_URL_CACHE_TIME} # Cache time: Error on processing URL
|
||||
# Selenium
|
||||
- SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT}
|
||||
- ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA}
|
||||
# Ghost
|
||||
- GHOST_ADMIN_API_KEY=${GHOST_ADMIN_API_KEY}
|
||||
- GHOST_ADMIN_API_URL=${GHOST_ADMIN_API_URL}
|
||||
- PEXELS_API_KEY=${PEXELS_API_KEY}
|
||||
########################
|
||||
volumes: # Development mode
|
||||
- ./app_urls:/opt/app
|
||||
########################
|
||||
ports:
|
||||
- 8000:8000
|
||||
depends_on:
|
||||
- fetcher_db
|
||||
- fetcher_redis
|
||||
dns:
|
||||
- 1.1.1.1
|
||||
- 1.0.0.1
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: '${DEPLOY_CPUS}'
|
||||
memory: ${DEPLOY_RAM}
|
||||
extends:
|
||||
file: docker-compose-base.yml
|
||||
service: fetcher_app_urls
|
||||
#env_files:
|
||||
# - .env.dev
|
||||
#labels: # Reverse proxy sample
|
||||
# - "traefik.enable=true"
|
||||
# - "traefik.http.routers.fetcher.rule=Host(`urls.yourdomain.com`)"
|
||||
@@ -94,11 +25,21 @@ services:
|
||||
#networks:
|
||||
# - default # This network
|
||||
# - docker_default # Reverse proxy network
|
||||
ports:
|
||||
- 8005:8000
|
||||
## volumes: # Development mode
|
||||
## - ./app_urls:/opt/app
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: '${DEPLOY_CPUS}'
|
||||
memory: ${DEPLOY_RAM}
|
||||
|
||||
fetcher_db:
|
||||
extends:
|
||||
file: docker-compose-base.yml
|
||||
service: fetcher_db
|
||||
image: postgres:17
|
||||
container_name: fetcher_db
|
||||
restart: unless-stopped
|
||||
# Set shared memory limit when using docker-compose
|
||||
shm_size: 128mb
|
||||
environment:
|
||||
@@ -106,18 +47,28 @@ services:
|
||||
POSTGRES_PASSWORD: ${DB_PASSWORD}
|
||||
POSTGRES_USER: ${DB_USER}
|
||||
POSTGRES_INITDB_ARGS: '--data-checksums'
|
||||
#volumes: # Persistent DB?
|
||||
# - ${PATH_DB_DATA}/postgres:/var/lib/postgresql/data
|
||||
ports:
|
||||
- 5432 #:5432
|
||||
#volumes: # Persistent DB?
|
||||
# - ./postgres:/var/lib/postgresql/data
|
||||
|
||||
fetcher_redis:
|
||||
image: redis:alpine
|
||||
container_name: fetcher_redis
|
||||
restart: unless-stopped
|
||||
fetcher_redis_cache:
|
||||
extends:
|
||||
file: docker-compose-base.yml
|
||||
service: fetcher_redis_cache
|
||||
ports:
|
||||
- 6379 #:6379
|
||||
- 6379
|
||||
|
||||
#networks:
|
||||
# docker_default:
|
||||
# external: true
|
||||
fetcher_redis_celery:
|
||||
extends:
|
||||
file: docker-compose-base.yml
|
||||
service: fetcher_redis_celery
|
||||
ports:
|
||||
- 6379
|
||||
|
||||
fetcher_flower:
|
||||
extends:
|
||||
file: docker-compose-base.yml
|
||||
service: fetcher_flower
|
||||
ports:
|
||||
- 5555:5555
|
||||
|
||||
@@ -1,24 +1,9 @@
|
||||
version: '3.9'
|
||||
|
||||
services:
|
||||
|
||||
fetcher_app_selenium:
|
||||
image: fetcher_app_selenium
|
||||
build:
|
||||
context: ./app_selenium
|
||||
args:
|
||||
- ARCH=${ARCH} # arm64, amd64
|
||||
container_name: fetcher_app_selenium
|
||||
restart: unless-stopped
|
||||
shm_size: 512mb
|
||||
environment:
|
||||
- SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE}
|
||||
- PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
|
||||
ports:
|
||||
- 80
|
||||
dns:
|
||||
- 1.1.1.1
|
||||
- 1.0.0.1
|
||||
extends:
|
||||
file: docker-compose-base.yml
|
||||
service: fetcher_app_selenium
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
@@ -26,98 +11,59 @@ services:
|
||||
memory: ${DEPLOY_RAM}
|
||||
|
||||
fetcher_app_urls:
|
||||
image: fetcher_app_urls
|
||||
build:
|
||||
context: ./app_urls
|
||||
container_name: fetcher_app_urls
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
# Initialization
|
||||
- INITIALIZE_DB=${INITIALIZE_DB} # Related to DB persistence
|
||||
- DJANGO_SUPERUSER_USERNAME=${DJANGO_SUPERUSER_USERNAME}
|
||||
- DJANGO_SUPERUSER_PASSWORD=${DJANGO_SUPERUSER_PASSWORD}
|
||||
- DJANGO_SUPERUSER_EMAIL=${DJANGO_SUPERUSER_EMAIL}
|
||||
# Django
|
||||
- DJANGO_ALLOWED_HOSTS=${DJANGO_ALLOWED_HOSTS} # host1,host2
|
||||
- DJANGO_ALLOWED_ORIGINS=${DJANGO_ALLOWED_ORIGINS} # Reverse proxy
|
||||
- DJANGO_SECRET_KEY=${DJANGO_SECRET_KEY}
|
||||
- DJANGO_DEBUG=${DJANGO_DEBUG}
|
||||
- PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY}
|
||||
# Database
|
||||
- DB_NAME=${DB_NAME}
|
||||
- DB_USER=${DB_USER}
|
||||
- DB_PASSWORD=${DB_PASSWORD}
|
||||
- DB_HOST=${DB_HOST}
|
||||
- DB_PORT=${DB_PORT}
|
||||
- REDIS_HOST=${REDIS_HOST}
|
||||
- REDIS_PORT=${REDIS_PORT}
|
||||
# Job timeout: 30 min
|
||||
- JOB_DEFAULT_TIMEOUT=${JOB_DEFAULT_TIMEOUT}
|
||||
# Fetcher
|
||||
- FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP}
|
||||
- FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP}
|
||||
- FETCHER_BETWEEN_SEARCHES_SLEEP=${FETCHER_BETWEEN_SEARCHES_SLEEP} # Sleep time between each search
|
||||
- FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP} # Sleep time between requests to same URL host
|
||||
- FETCHER_LANGUAGE_DETECTION_MIN_CHAR=${FETCHER_LANGUAGE_DETECTION_MIN_CHAR} # Min number of characters to run language detection
|
||||
- FETCHER_INSERT_URL_CACHE_TIME=${FETCHER_INSERT_URL_CACHE_TIME} # Cache time: Insert raw URL
|
||||
- FETCHER_ERROR_URL_CACHE_TIME=${FETCHER_ERROR_URL_CACHE_TIME} # Cache time: Error on processing URL
|
||||
# Selenium
|
||||
- SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT}
|
||||
- ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA}
|
||||
# Ghost
|
||||
- GHOST_ADMIN_API_KEY=${GHOST_ADMIN_API_KEY}
|
||||
- GHOST_ADMIN_API_URL=${GHOST_ADMIN_API_URL}
|
||||
- PEXELS_API_KEY=${PEXELS_API_KEY}
|
||||
########################
|
||||
#volumes: # Development mode
|
||||
# - ./app_urls:/opt/app
|
||||
########################
|
||||
extends:
|
||||
file: docker-compose-base.yml
|
||||
service: fetcher_app_urls
|
||||
ports:
|
||||
- 8000 # :8000
|
||||
depends_on:
|
||||
- fetcher_db
|
||||
- fetcher_redis
|
||||
dns:
|
||||
- 1.1.1.1
|
||||
- 1.0.0.1
|
||||
- 8000:8000
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: '${DEPLOY_CPUS}'
|
||||
memory: ${DEPLOY_RAM}
|
||||
labels: # Reverse proxy sample
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.routers.fetcher.rule=Host(`${REVERSE_PROXY_URL}`)"
|
||||
- "traefik.http.routers.fetcher.entrypoints=websecure"
|
||||
- "traefik.http.routers.fetcher.tls.certresolver=myresolvercd"
|
||||
- "traefik.http.services.fetcher.loadbalancer.server.port=8000"
|
||||
networks:
|
||||
- default # This network
|
||||
- docker_default # Reverse proxy network
|
||||
|
||||
fetcher_db:
|
||||
image: postgres:17
|
||||
container_name: fetcher_db
|
||||
extends:
|
||||
file: docker-compose-base.yml
|
||||
service: fetcher_db
|
||||
image: alpine:latest
|
||||
restart: unless-stopped
|
||||
# Set shared memory limit when using docker-compose
|
||||
shm_size: 128mb
|
||||
environment:
|
||||
POSTGRES_DB: ${DB_NAME}
|
||||
POSTGRES_PASSWORD: ${DB_PASSWORD}
|
||||
POSTGRES_USER: ${DB_USER}
|
||||
POSTGRES_INITDB_ARGS: '--data-checksums'
|
||||
volumes: # Persistent DB?
|
||||
- ${PATH_DB_DATA}/postgres:/var/lib/postgresql/data
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
memory: 256M
|
||||
volumes:
|
||||
# REQUIREMENT: Add the fetcher's SSH public key to the DB machine's .ssh/authorized_keys
|
||||
- ~/.ssh:/root/.ssh:ro
|
||||
ports:
|
||||
- 5432 #:5432
|
||||
- 15885:15885
|
||||
- 5432:5432
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- |
|
||||
apk add --update openssh autossh
|
||||
# Monitor status on port 15885
|
||||
autossh -M 15885 -N -L 0.0.0.0:5432:127.0.0.1:5432 ${REMOTE_USERNAME}@${REMOTE_HOST}
|
||||
# autossh -M 15885 -N -o 'GatewayPorts yes' -L 0.0.0.0:5432:127.0.0.1:5432 ${REMOTE_USERNAME}@${REMOTE_HOST}
|
||||
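With the alpine/autossh container above standing in for fetcher_db, the app still reaches Postgres at fetcher_db:5432; autossh simply forwards that port to the remote host. A quick connectivity check is sketched below; it assumes the tunnel is up and the DB_* variables from .env are exported, and mirrors the psycopg usage in utils/DB-Dev.ipynb further down.

```python
# Hypothetical one-off check script (not part of the repo).
import os
import psycopg  # psycopg 3

conninfo = "host={} port={} user={} password={} dbname={}".format(
    os.environ.get("DB_HOST", "fetcher_db"),
    os.environ.get("DB_PORT", "5432"),
    os.environ.get("DB_USER"),
    os.environ.get("DB_PASSWORD"),
    os.environ.get("DB_NAME"),
)

with psycopg.connect(conninfo) as conn:
    # A trivial query round-trips through the forwarded port, confirming the tunnel works.
    print(conn.execute("SELECT 1").fetchone())
```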
|
||||
fetcher_redis:
|
||||
image: redis:alpine
|
||||
container_name: fetcher_redis
|
||||
restart: unless-stopped
|
||||
fetcher_redis_cache:
|
||||
extends:
|
||||
file: docker-compose-base.yml
|
||||
service: fetcher_redis_cache
|
||||
ports:
|
||||
- 6379 #:6379
|
||||
- 6379
|
||||
|
||||
networks:
|
||||
docker_default:
|
||||
external: true
|
||||
fetcher_redis_celery:
|
||||
extends:
|
||||
file: docker-compose-base.yml
|
||||
service: fetcher_redis_celery
|
||||
ports:
|
||||
- 6379
|
||||
|
||||
fetcher_flower:
|
||||
extends:
|
||||
file: docker-compose-base.yml
|
||||
service: fetcher_flower
|
||||
ports:
|
||||
- 5555:5555
|
||||
|
||||
79
utils/DB-Dev.ipynb
Normal file
@@ -0,0 +1,79 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#!pip install python-dotenv\n",
|
||||
"from dotenv import load_dotenv\n",
|
||||
"\n",
|
||||
"# Specify the path to your .env file (optional if in the current dir)\n",
|
||||
"load_dotenv(dotenv_path=\".env\", override=True)\n",
|
||||
"\n",
|
||||
"import os\n",
|
||||
"import psycopg\n",
|
||||
"from sshtunnel import SSHTunnelForwarder\n",
|
||||
"\n",
|
||||
"if (os.environ.get(\"SSH_TUNNEL_BASED\") == \"true\"):\n",
|
||||
" print(\"SSH tunnel: True\")\n",
|
||||
"else:\n",
|
||||
" print(\"SSH tunnel: False\")\n",
|
||||
"\n",
|
||||
"connect_info = \"host={} port={} user={} password={} dbname={}\".format(os.environ.get(\"DB_HOST\"), os.environ.get(\"DB_PORT\"), os.environ.get(\"DB_USER\"), os.environ.get(\"DB_PASSWORD\"), os.environ.get(\"DB_NAME\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"if (os.environ.get(\"SSH_TUNNEL_BASED\") == \"true\"):\n",
|
||||
" ssh_tunnel = SSHTunnelForwarder(\n",
|
||||
" (os.environ.get(\"REMOTE_HOST\"), int(os.environ.get(\"REMOTE_SSH_PORT\"))), \n",
|
||||
" ssh_username=os.environ.get(\"REMOTE_USERNAME\"), ssh_password=os.environ.get(\"REMOTE_PASSWORD\"), \n",
|
||||
" remote_bind_address=('localhost', int(os.environ.get(\"REMOTE_PORT\"))), local_bind_address=('localhost', int(os.environ.get(\"DB_PORT\"))) \n",
|
||||
" )\n",
|
||||
" ssh_tunnel.start()\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" with psycopg.connect(connect_info) as conn:\n",
|
||||
" if True:\n",
|
||||
" for t in conn.execute(\"\"\"\n",
|
||||
" SELECT * from URLS WHERE id IN (SELECT id_url FROM URLS_SOURCE_SEARCH INNER JOIN SEARCH ON URLS_SOURCE_SEARCH.id_search = SEARCH.id WHERE SEARCH.search LIKE '%child abuse%') LIMIT 5;\n",
|
||||
" \"\"\").fetchall():\n",
|
||||
" print(t)\n",
|
||||
" \n",
|
||||
"except Exception as e:\n",
|
||||
" print(\"Err:\", str(e))\n",
|
||||
"\n",
|
||||
"if (os.environ.get(\"SSH_TUNNEL_BASED\") == \"true\"):\n",
|
||||
" ssh_tunnel.stop()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "matitos_urls",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
164
utils/Ghost-Posts.ipynb
Normal file
@@ -0,0 +1,164 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import time\n",
|
||||
"import jwt\n",
|
||||
"import requests\n",
|
||||
"from datetime import datetime, timedelta, timezone\n",
|
||||
"\n",
|
||||
"admin_api_url = \"\" # .env -> GHOST_ADMIN_API_URL\n",
|
||||
"admin_api_key = \"\" # .env -> GHOST_ADMIN_API_KEY\n",
|
||||
"\n",
|
||||
"def _create_jwt(admin_api_key):\n",
|
||||
" id_, secret = admin_api_key.split(':')\n",
|
||||
" iat = int(time.time())\n",
|
||||
" exp = iat + 5 * 60 # 5 minutes\n",
|
||||
" header = {'alg': 'HS256', 'kid': id_}\n",
|
||||
" payload = {\n",
|
||||
" 'iat': iat,\n",
|
||||
" 'exp': exp,\n",
|
||||
" 'aud': '/v5/admin/' # Adjust depending on your Ghost version\n",
|
||||
" }\n",
|
||||
" token = jwt.encode(payload, bytes.fromhex(secret), algorithm='HS256', headers=header)\n",
|
||||
" return token\n",
|
||||
"\n",
|
||||
"# Get token\n",
|
||||
"jwt_token = _create_jwt(os.getenv(\"GHOST_ADMIN_API_KEY\"))\n",
|
||||
"\n",
|
||||
"headers = {\n",
|
||||
" 'Authorization': f'Ghost {jwt_token}',\n",
|
||||
" 'Content-Type': 'application/json'\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"DELETE_ALL_POSTS = False\n",
|
||||
"\n",
|
||||
"if DELETE_ALL_POSTS:\n",
|
||||
" while (True):\n",
|
||||
" # GET /admin/posts/\n",
|
||||
" response = requests.get(os.path.join(admin_api_url, \"posts\"), headers=headers)\n",
|
||||
" dict_response = response.json()\n",
|
||||
"\n",
|
||||
" if (len(dict_response.get(\"posts\")) == 0):\n",
|
||||
" break\n",
|
||||
"\n",
|
||||
" # Iterate posts\n",
|
||||
" for p in dict_response.get(\"posts\"):\n",
|
||||
" # Post ID\n",
|
||||
" post_id = p.get(\"id\")\n",
|
||||
"\n",
|
||||
" # DELETE /admin/posts/{id}/\n",
|
||||
" r = requests.delete(os.path.join(admin_api_url, \"posts\", \"{}\".format(post_id)), headers=headers)\n",
|
||||
" print(\"Post:\", post_id, \"Status:\", r.status_code, r.text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"PUBLISH_SAMPLE = False\n",
|
||||
"\n",
|
||||
"def _create_ghost_post(jwt_token, admin_api_url, post_data):\n",
|
||||
" # Get Admin API URL\n",
|
||||
" admin_api_url = os.getenv(\"GHOST_ADMIN_API_URL\")\n",
|
||||
"\n",
|
||||
" headers = {\n",
|
||||
" 'Authorization': f'Ghost {jwt_token}',\n",
|
||||
" 'Content-Type': 'application/json'\n",
|
||||
" }\n",
|
||||
" \n",
|
||||
" post_data = {\"posts\": [post_data]}\n",
|
||||
"\n",
|
||||
" response = requests.post(\n",
|
||||
" os.path.join(admin_api_url, \"posts\"),\n",
|
||||
" json=post_data,\n",
|
||||
" headers=headers,\n",
|
||||
" params={\"source\":\"html\"}\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" if response.status_code == 201:\n",
|
||||
" print(\"Ghost post published successfully\")\n",
|
||||
" return response.json()\n",
|
||||
" else:\n",
|
||||
" print(\"Ghost - Failed to publish post: {} {}\".format(response.status_code, response.text))\n",
|
||||
" return None\n",
|
||||
"\n",
|
||||
"if (PUBLISH_SAMPLE):\n",
|
||||
" url_id = 150\n",
|
||||
"\n",
|
||||
" post_data = {\n",
|
||||
" # \"slug\": \"hey-short\",\n",
|
||||
" \"title\": \"Hey there, sample title\",\n",
|
||||
" \"html\": \"<p>Hey there!</p>\",\n",
|
||||
" # \"feature_image\": photo_url,\n",
|
||||
" # \"feature_image_caption\": \"\",\n",
|
||||
" \"status\": \"published\",\n",
|
||||
" \"tags\": [\"#url-id-{}\".format(url_id)]\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" # Publish post\n",
|
||||
" payload = _create_ghost_post(jwt_token, admin_api_url, post_data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Filter by post title\n",
|
||||
"post_title = \"Funds raised for legal action over failure to stop grooming gangs\"\n",
|
||||
"# Filter by published date\n",
|
||||
"iso_time = (datetime.now(timezone.utc) - timedelta(hours=48)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z'\n",
|
||||
"# Parameter for filter\n",
|
||||
"params = {\"filter\": \"title:'{}'+published_at:>{}\".format(post_title, iso_time)}\n",
|
||||
"\n",
|
||||
"# Filter by URL ID\n",
|
||||
"url_id = 150\n",
|
||||
"# Parameter for filter\n",
|
||||
"params = {\"filter\": \"tags:hash-url-id-{}\".format(url_id)}\n",
|
||||
"\n",
|
||||
"# Get posts using filter\n",
|
||||
"response = requests.get(os.path.join(admin_api_url, \"posts\"), params=params, headers=headers)\n",
|
||||
"dict_response = response.json()\n",
|
||||
"\n",
|
||||
"len(dict_response.get(\"posts\"))"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "matitos_urls",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
215
utils/Newspapers.ipynb
Normal file
@@ -0,0 +1,215 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"url = \"https://onlinenewspapers.com/index.shtml\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'newspaper/0.9.3.1'"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\"\"\"\n",
|
||||
"import newspaper\n",
|
||||
"\n",
|
||||
"newspaper.Config().__dict__\n",
|
||||
"\n",
|
||||
" 'requests_params': {'timeout': 7,\n",
|
||||
" 'proxies': {},\n",
|
||||
" 'headers': {'User-Agent': 'newspaper/0.9.3.1'}},\n",
|
||||
"\"\"\"\n",
|
||||
"import newspaper\n",
|
||||
"newspaper.Config().browser_user_agent"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\"\"\"\n",
|
||||
" url (str): The url of the source (news website) to build. For example,\n",
|
||||
" `https://www.cnn.com`.\n",
|
||||
" dry (bool): If true, the source object will be constructed but not\n",
|
||||
" downloaded or parsed.\n",
|
||||
" only_homepage (bool): If true, the source object will only parse\n",
|
||||
" the homepage of the source.\n",
|
||||
" only_in_path (bool): If true, the source object will only\n",
|
||||
" parse the articles that are in the same path as the source's\n",
|
||||
" homepage. You can scrape a specific category this way.\n",
|
||||
" Defaults to False.\n",
|
||||
" input_html (str): The HTML of the source to parse. Use this to pass cached\n",
|
||||
" HTML to the source object.\n",
|
||||
" config (Configuration): A configuration object to use for the source.\n",
|
||||
" kwargs: Any other keyword arguments to pass to the Source constructor.\n",
|
||||
" If you omit the config object, you can add any configuration\n",
|
||||
" options here.\n",
|
||||
"\"\"\"\n",
|
||||
"\n",
|
||||
"url = \"https://www.lanacion.com.ar/deportes/\"\n",
|
||||
"\n",
|
||||
"newspaper_built = newspaper.build(url, only_in_path=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"newspaper_built.__dict__"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"newspaper_built.article_urls()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"\n",
|
||||
"url = \"https://www.lanacion.com.ar/\"\n",
|
||||
"#url = \"https://www.lanacion.com.ar/deportes/\"\n",
|
||||
"newspaper_built = newspaper.build(url)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\"\"\"\n",
|
||||
" url (str): The url of the source (news website) to build. For example,\n",
|
||||
" `https://www.cnn.com`.\n",
|
||||
" dry (bool): If true, the source object will be constructed but not\n",
|
||||
" downloaded or parsed.\n",
|
||||
" only_homepage (bool): If true, the source object will only parse\n",
|
||||
" the homepage of the source.\n",
|
||||
" only_in_path (bool): If true, the source object will only\n",
|
||||
" parse the articles that are in the same path as the source's\n",
|
||||
" homepage. You can scrape a specific category this way.\n",
|
||||
" Defaults to False.\n",
|
||||
" input_html (str): The HTML of the source to parse. Use this to pass cached\n",
|
||||
" HTML to the source object.\n",
|
||||
" config (Configuration): A configuration object to use for the source.\n",
|
||||
" kwargs: Any other keyword arguments to pass to the Source constructor.\n",
|
||||
" If you omit the config object, you can add any configuration\n",
|
||||
" options here.\n",
|
||||
"\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"cat = newspaper_built.categories[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"newspaper_built.categories_to_articles()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"newspaper_built.category_urls()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
" 'https://www.lanacion.com.ar/tema/futbol-argentino-tid57505/',\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"categories = newspaper_built.category_urls()\n",
|
||||
"url_of_interest = \"https://www.lanacion.com.ar/sabado/todo-para-ellos-nid21042025/\"\n",
|
||||
"\n",
|
||||
"potential_categories = []\n",
|
||||
"\n",
|
||||
"for c in categories:\n",
|
||||
" if (c in url_of_interest):\n",
|
||||
" print(c, url_of_interest)\n",
|
||||
" potential_categories.append(c)\n",
|
||||
"\n",
|
||||
"# Get longest length category"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"newspaper_built.article_urls()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "matitos_urls",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
@@ -11,6 +11,8 @@
|
||||
"from urllib.parse import urljoin\n",
|
||||
"import pandas as pd\n",
|
||||
"import os\n",
|
||||
"import json\n",
|
||||
"import csv\n",
|
||||
"\n",
|
||||
"headers = {\"User-Agent\": \"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36\"}"
|
||||
]
|
||||
@@ -68,6 +70,154 @@
|
||||
" # websites.append(href)\n",
|
||||
" return href\n",
|
||||
"\n",
|
||||
"def get_num_students_per_zipcode(soup):\n",
|
||||
" list_zipcode_students_percentage = []\n",
|
||||
"\n",
|
||||
" h3_tag = soup.find(\"h3\", string=\"In welk postcodegebied wonen de leerlingen van deze school?\")\n",
|
||||
" if h3_tag:\n",
|
||||
" dialog = h3_tag.find_parent(\"dialog\")\n",
|
||||
"\n",
|
||||
" if dialog:\n",
|
||||
" # print(dialog.prettify())\n",
|
||||
" table = dialog.find(\"table\")\n",
|
||||
" if table:\n",
|
||||
" rows = table.find_all(\"tr\")\n",
|
||||
" for row in rows:\n",
|
||||
" cells = row.find_all([\"th\", \"td\"])\n",
|
||||
" row_data = [cell.get_text(strip=True) for cell in cells]\n",
|
||||
" zipcode, num_students, percentage = row_data\n",
|
||||
" list_zipcode_students_percentage.append( (zipcode, num_students, percentage) )\n",
|
||||
" \n",
|
||||
" return list_zipcode_students_percentage\n",
|
||||
"\n",
|
||||
"def get_num_students_trend(soup):\n",
|
||||
" # Step 1: Locate the <aantal-leerlingen-trend-line-chart> tag\n",
|
||||
" trend_chart_tag = soup.find(\"aantal-leerlingen-trend-line-chart\")\n",
|
||||
"\n",
|
||||
" if trend_chart_tag:\n",
|
||||
" # Step 2: Extract the 'leerlingen-trend-data' attribute\n",
|
||||
" trend_data_attr = trend_chart_tag.get(\"leerlingen-trend-data\")\n",
|
||||
" \n",
|
||||
" if trend_data_attr:\n",
|
||||
" # Step 3: Parse the JSON string into a Python object\n",
|
||||
" trend_data = json.loads(trend_data_attr)\n",
|
||||
" #print(\"Extracted leerlingen-trend-data:\")\n",
|
||||
" #print(json.dumps(trend_data, indent=4)) # Pretty-print the JSON data\n",
|
||||
" return [ (e.get(\"key\"), e.get(\"aantal\") ) for e in trend_data]\n",
|
||||
"\n",
|
||||
"def get_num_students_per_age_and_group(soup):\n",
|
||||
" num_students_per_group, num_students_per_age = [], []\n",
|
||||
" ############################################################################\n",
|
||||
" # Step 1: Locate the <aantal-leerlingen-leeftijd-bar-chart> tag\n",
|
||||
" chart_tag = soup.find('aantal-leerlingen-leeftijd-bar-chart', attrs={'aantal-per-leeftijd': True})\n",
|
||||
" # Step 2: Extract the 'aantal-per-leeftijd' attribute\n",
|
||||
" raw_data = chart_tag['aantal-per-leeftijd']\n",
|
||||
"\n",
|
||||
" # Step 3: Parse the JSON data\n",
|
||||
" try:\n",
|
||||
" data = json.loads(raw_data)\n",
|
||||
" # Step 4: Print the extracted data\n",
|
||||
" # print(\"Aantal per Leeftijd:\")\n",
|
||||
" for entry in data:\n",
|
||||
" age = entry['key']\n",
|
||||
" num_students = entry['aantal']\n",
|
||||
" # school_data[\"num_students_age_{}\".format(age)] = num_students\n",
|
||||
" num_students_per_age.append( (age, num_students) )\n",
|
||||
" # print(f\"Age {entry['key']}: {entry['aantal']} leerlingen\")\n",
|
||||
" except json.JSONDecodeError as e:\n",
|
||||
" print(f\"Failed to parse JSON data: {e}\")\n",
|
||||
"\n",
|
||||
" ############################################################################\n",
|
||||
" # Step 1: Locate the <aantal-leerlingen-leerjaar-bar-chart> tag\n",
|
||||
" chart_tag = soup.find('aantal-leerlingen-leerjaar-bar-chart', attrs={'aantal-per-leerjaar': True})\n",
|
||||
"\n",
|
||||
" if not chart_tag:\n",
|
||||
" print(\"Could not find the 'aantal per leerjaar' section.\")\n",
|
||||
" else:\n",
|
||||
" # Step 2: Extract the 'aantal-per-leerjaar' attribute\n",
|
||||
" raw_data = chart_tag['aantal-per-leerjaar']\n",
|
||||
" \n",
|
||||
" # Step 3: Parse the JSON data\n",
|
||||
" try:\n",
|
||||
" data = json.loads(raw_data)\n",
|
||||
" # Step 4: Print the extracted data\n",
|
||||
" # print(\"Aantal per Leerjaar:\")\n",
|
||||
" for entry in data:\n",
|
||||
" group = entry['key']\n",
|
||||
" num_students = entry['aantal']\n",
|
||||
" # school_data[\"num_students_group_{}\".format(group)] = num_students\n",
|
||||
" num_students_per_group.append( (group, num_students) )\n",
|
||||
" # print(f\"Groep {entry['key']}: {entry['aantal']} leerlingen\")\n",
|
||||
" except json.JSONDecodeError as e:\n",
|
||||
" print(f\"Failed to parse JSON data: {e}\")\n",
|
||||
" ############################################################################\n",
|
||||
" return num_students_per_group, num_students_per_age\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def update_school_data(school_url, school_data):\n",
|
||||
" try:\n",
|
||||
" # Process school (request contact details)\n",
|
||||
" response = requests.get(os.path.join(school_url, \"contact/#inhoud\"), headers=headers)\n",
|
||||
" response.raise_for_status() # Raise an exception for HTTP errors\n",
|
||||
" # Parse the HTML content using BeautifulSoup\n",
|
||||
" soup_school = BeautifulSoup(response.text, 'html.parser')\n",
|
||||
"\n",
|
||||
" # School details\n",
|
||||
" school_details = soup_school.find(class_=\"school-details\")\n",
|
||||
" for category_idx, li_detail in enumerate(school_details.find_all(\"li\")):\n",
|
||||
" data = li_detail.find('span', class_='infotip-term')['data-dfn']\n",
|
||||
" text = li_detail.get_text(strip=True)\n",
|
||||
" # Set data\n",
|
||||
" school_data[\"category_{}\".format(category_idx)] = text\n",
|
||||
" school_data[\"category_{}_description\".format(category_idx)] = data\n",
|
||||
" \n",
|
||||
" school_address = soup_school.find(class_=\"school-adres\").get_text(strip=True)\n",
|
||||
" school_postcode_city = soup_school.find(class_=\"school-postcode-woonplaats\").get_text(strip=True)\n",
|
||||
" school_postcode = \"\".join(school_postcode_city.split(\" \")[:2])\n",
|
||||
" school_city = \" \".join(school_postcode_city.split(\" \")[2:])\n",
|
||||
"\n",
|
||||
" school_data[\"city\"] = school_city\n",
|
||||
" school_data[\"postcode\"] = school_postcode\n",
|
||||
" school_data[\"address\"] = school_address\n",
|
||||
"\n",
|
||||
" try:\n",
|
||||
" school_data[\"website\"] = find_website(soup_school) # soup_school.find(class_=\"button button-primary\").get('href')\n",
|
||||
" except Exception as e:\n",
|
||||
" pass\n",
|
||||
" try:\n",
|
||||
" school_data[\"phone\"] = soup_school.find('a', href=lambda href: href and href.startswith('tel:')).text\n",
|
||||
" except Exception as e:\n",
|
||||
" pass\n",
|
||||
" try:\n",
|
||||
" school_data[\"email\"] = extract_emails(soup_school)\n",
|
||||
" except Exception as e:\n",
|
||||
" pass\n",
|
||||
"\n",
|
||||
" # Process school main site\n",
|
||||
" response = requests.get(os.path.join(school_url), headers=headers)\n",
|
||||
" response.raise_for_status() # Raise an exception for HTTP errors\n",
|
||||
" # Parse the HTML content using BeautifulSoup\n",
|
||||
" soup_school = BeautifulSoup(response.text, 'html.parser')\n",
|
||||
"\n",
|
||||
" try:\n",
|
||||
" school_data[\"students_per_zipcode\"] = get_num_students_per_zipcode(soup_school)\n",
|
||||
" except Exception as e:\n",
|
||||
" pass\n",
|
||||
" try:\n",
|
||||
" school_data[\"students_per_year_trend\"] = get_num_students_trend(soup_school)\n",
|
||||
" except Exception as e:\n",
|
||||
" pass\n",
|
||||
"\n",
|
||||
" if (school_data.get(\"category\").lower() == \"basisscholen\"):\n",
|
||||
" try:\n",
|
||||
" num_students_per_group, num_students_per_age = get_num_students_per_age_and_group(soup_school)\n",
|
||||
" school_data[\"num_students_per_group\"] = num_students_per_group if len(num_students_per_group)>0 else None\n",
|
||||
" school_data[\"num_students_per_age\"] = num_students_per_age if len(num_students_per_age)>0 else None\n",
|
||||
" except Exception as e:\n",
|
||||
" pass\n",
|
||||
" \n",
|
||||
" except Exception as e:\n",
|
||||
" print(school_url, str(e))\n",
|
||||
"\n",
|
||||
"def main():\n",
|
||||
" list_urls = [\n",
|
||||
@@ -128,54 +278,26 @@
|
||||
" \"url\": school_url,\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" try:\n",
|
||||
" # Process school (request contact details)\n",
|
||||
" response = requests.get(os.path.join(school_url, \"contact/#inhoud\"), headers=headers)\n",
|
||||
" response.raise_for_status() # Raise an exception for HTTP errors\n",
|
||||
"\n",
|
||||
" # Parse the HTML content using BeautifulSoup\n",
|
||||
" soup_school = BeautifulSoup(response.text, 'html.parser')\n",
|
||||
"\n",
|
||||
" # School details\n",
|
||||
" school_details = soup_school.find(class_=\"school-details\")\n",
|
||||
" for category_idx, li_detail in enumerate(school_details.find_all(\"li\")):\n",
|
||||
" data = li_detail.find('span', class_='infotip-term')['data-dfn']\n",
|
||||
" text = li_detail.get_text(strip=True)\n",
|
||||
" # Set data\n",
|
||||
" school_data[\"category_{}\".format(category_idx)] = text\n",
|
||||
" school_data[\"category_{}_description\".format(category_idx)] = data\n",
|
||||
" \n",
|
||||
" school_address = soup_school.find(class_=\"school-adres\").get_text(strip=True)\n",
|
||||
" school_postcode_city = soup_school.find(class_=\"school-postcode-woonplaats\").get_text(strip=True)\n",
|
||||
" school_postcode = \"\".join(school_postcode_city.split(\" \")[:2])\n",
|
||||
" school_city = \" \".join(school_postcode_city.split(\" \")[2:])\n",
|
||||
"\n",
|
||||
" school_data[\"city\"] = school_city\n",
|
||||
" school_data[\"postcode\"] = school_postcode\n",
|
||||
" school_data[\"address\"] = school_address\n",
|
||||
"\n",
|
||||
" try:\n",
|
||||
" school_data[\"website\"] = find_website(soup_school) # soup_school.find(class_=\"button button-primary\").get('href')\n",
|
||||
" except Exception as e:\n",
|
||||
" pass\n",
|
||||
" try:\n",
|
||||
" school_data[\"phone\"] = soup_school.find('a', href=lambda href: href and href.startswith('tel:')).text\n",
|
||||
" except Exception as e:\n",
|
||||
" pass\n",
|
||||
" try:\n",
|
||||
" school_data[\"email\"] = extract_emails(soup_school)\n",
|
||||
" except Exception as e:\n",
|
||||
" pass\n",
|
||||
" \n",
|
||||
" except Exception as e:\n",
|
||||
" print(school_url, str(e))\n",
|
||||
" # assert False\n",
|
||||
" update_school_data(school_url, school_data)\n",
|
||||
"\n",
|
||||
" list_school_data_dicts.append(school_data)\n",
|
||||
"\n",
|
||||
" df = pd.DataFrame(list_school_data_dicts)\n",
|
||||
" df.to_csv(\"scholenopdekaart.csv\")\n",
|
||||
" # Save per processed school to track progress\n",
|
||||
" df = pd.DataFrame(list_school_data_dicts)\n",
|
||||
" df.to_csv(\"scholenopdekaart_tmp.csv\", encoding=\"utf-8\", quoting=csv.QUOTE_ALL)\n",
|
||||
"\n",
|
||||
" df = pd.DataFrame(list_school_data_dicts)\n",
|
||||
" df.to_csv(\"scholenopdekaart.csv\", encoding=\"utf-8\", quoting=csv.QUOTE_ALL)\n",
|
||||
" # Without extra columns\n",
|
||||
" df.drop(columns=[\"students_per_zipcode\", \"students_per_year_trend\", \"num_students_per_group\", \"num_students_per_age\"]).to_csv(\"scholenopdekaart_.csv\", encoding=\"utf-8\", quoting=csv.QUOTE_ALL)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\"\"\" # Issues with URL:\n",
|
||||
"https://scholenopdekaart.nl/middelbare-scholen/grave/1900/merletcollege-grave/\n",
|
||||
"https://scholenopdekaart.nl/middelbare-scholen/lent/4386/citadel-college-locatie-griftdijk/\n",
|
||||
@@ -211,15 +333,7 @@
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"'''\n",
|
||||
"school_url = \"https://scholenopdekaart.nl/basisscholen/aalden/9661/christelijke-basisschool-de-schutse/\"\n",
|
||||
"response = requests.get(os.path.join(school_url, \"contact/#inhoud\"), headers=headers)\n",
|
||||
"# Parse the HTML content using BeautifulSoup\n",
|
||||
"soup_school = BeautifulSoup(response.text, 'html.parser')\n",
|
||||
"soup_school\n",
|
||||
"'''"
|
||||
]
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
@@ -229,8 +343,9 @@
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"df = pd.read_csv(\"scholenopdekaart.csv\", index_col=0)\n",
|
||||
"df.loc[0, \"category_3\"]"
|
||||
"df = pd.read_csv(\"~/Downloads/scholenopdekaart.csv\", index_col=0)\n",
|
||||
"\n",
|
||||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -238,82 +353,102 @@
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import requests\n",
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"def to_dict(row):\n",
|
||||
" # Empty?\n",
|
||||
" if (pd.isna(row)):\n",
|
||||
" return {}\n",
|
||||
" # Evaluate, to dict\n",
|
||||
" dict_data = dict(eval(row))\n",
|
||||
" # Remove None values\n",
|
||||
" for k in list(dict_data.keys()):\n",
|
||||
" if dict_data[k] is None:\n",
|
||||
" del dict_data[k]\n",
|
||||
" # Prefix\n",
|
||||
" return {f\"{column}_{k}\": v for k, v in dict_data.items()}\n",
|
||||
"\n",
|
||||
"# Step 1: Fetch the webpage\n",
|
||||
"url = \"https://scholenopdekaart.nl/basisscholen/aagtekerke/25963/jhr-willem-versluijsschool/\"\n",
|
||||
"headers = {\n",
|
||||
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36\"\n",
|
||||
"}\n",
|
||||
"response = requests.get(url, headers=headers)\n",
|
||||
"\n",
|
||||
"# Check if the request was successful\n",
|
||||
"if response.status_code != 200:\n",
|
||||
" print(f\"Failed to retrieve the page. Status code: {response.status_code}\")\n",
|
||||
" exit()\n",
|
||||
"\n",
|
||||
"# Step 2: Parse the HTML content\n",
|
||||
"soup = BeautifulSoup(response.text, 'html.parser')"
|
||||
"for column in [\"students_per_year_trend\", \"num_students_per_group\", \"num_students_per_age\"]:\n",
|
||||
" print(column)\n",
|
||||
" # Convert the list of tuples into a dictionary per row\n",
|
||||
" df_dicts = df[column].apply(to_dict)\n",
|
||||
" # Expand into separate columns\n",
|
||||
" df_expanded = pd.json_normalize(df_dicts)\n",
|
||||
" # Sort\n",
|
||||
" df_expanded = df_expanded[sorted(df_expanded.columns)]\n",
|
||||
" # Combine with original columns\n",
|
||||
" df = pd.concat([df.drop(columns=[column]), df_expanded], axis=1)"
|
||||
]
|
||||
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Aantal per Leerjaar:\n",
"Groep 1: 29 leerlingen\n",
"Groep 2: 28 leerlingen\n",
"Groep 3: 30 leerlingen\n",
"Groep 4: 25 leerlingen\n",
"Groep 5: 19 leerlingen\n",
"Groep 6: 26 leerlingen\n",
"Groep 7: 22 leerlingen\n",
"Groep 8: 20 leerlingen\n"
]
}
],
"outputs": [],
"source": [
"import json\n",
"def to_dict(row):\n",
"    # Empty?\n",
"    if (pd.isna(row)):\n",
"        return {}\n",
"    # Evaluate, to dict\n",
"    data = eval(row)\n",
"    # Remove first useless data\n",
"    data = data[1:]\n",
"\n",
"# Step 1: Locate the <aantal-leerlingen-leerjaar-bar-chart> tag\n",
"chart_tag = soup.find('aantal-leerlingen-leerjaar-bar-chart', attrs={'aantal-per-leerjaar': True})\n",
"    # Generate dict\n",
"    dict_data = {}\n",
"    for (zipcode, num, percentage) in data:\n",
"        dict_data[f\"num_students_zipcode_{zipcode}\"] = num\n",
"        dict_data[f\"percentage_students_zipcode_{zipcode}\"] = percentage\n",
"\n",
"if not chart_tag:\n",
"    print(\"Could not find the 'aantal per leerjaar' section.\")\n",
"else:\n",
"    # Step 2: Extract the 'aantal-per-leerjaar' attribute\n",
"    raw_data = chart_tag['aantal-per-leerjaar']\n",
"    \n",
"    # Step 3: Parse the JSON data\n",
"    try:\n",
"        data = json.loads(raw_data)\n",
"        \n",
"        # Step 4: Print the extracted data\n",
"        print(\"Aantal per Leerjaar:\")\n",
"        for entry in data:\n",
"            print(f\"Groep {entry['key']}: {entry['aantal']} leerlingen\")\n",
"    except json.JSONDecodeError as e:\n",
"        print(f\"Failed to parse JSON data: {e}\")"
"    # Remove None values\n",
"    for k in list(dict_data.keys()):\n",
"        if dict_data[k] is None:\n",
"            del dict_data[k]\n",
"    return dict_data\n",
"\n",
"for column in [\"students_per_zipcode\"]:\n",
"    print(column)\n",
"    # Convert the list of tuples into a dictionary per row\n",
"    df_dicts = df[column].apply(to_dict)\n",
"    # Expand into separate columns\n",
"    df_expanded = pd.json_normalize(df_dicts)\n",
"    # Sort\n",
"    df_expanded = df_expanded[sorted(df_expanded.columns)]\n",
"    # Combine with original columns\n",
"    df = pd.concat([df.drop(columns=[column]), df_expanded], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.to_csv(\"schools_nl.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"list(df.columns)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "matitos_urls",
"display_name": "fetcher",
"language": "python",
"name": "python3"
},
@@ -327,7 +462,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
"version": "3.12.11"
}
},
"nbformat": 4,
182	utils/Summary.ipynb	Normal file
@@ -0,0 +1,182 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# docker exec -it ollama_npu bash\n",
"# rkllama pull\n",
"#\n",
"# c01zaut/Llama-3.2-3B-Instruct-rk3588-1.1.4\n",
"# Llama-3.2-3B-Instruct-rk3588-w8a8-opt-0-hybrid-ratio-0.0.rkllm\n",
"# Llama-3.2-3B-Instruct-rk3588-w8a8-opt-0-hybrid-ratio-0.5.rkllm\n",
"# Llama-3.2-3B-Instruct-rk3588-w8a8-opt-1-hybrid-ratio-0.0.rkllm\n",
"# Llama-3.2-3B-Instruct-rk3588-w8a8-opt-1-hybrid-ratio-0.5.rkllm\n",
"# Llama-3.2-3B-Instruct-rk3588-w8a8_g512-opt-1-hybrid-ratio-0.5.rkllm\n",
"#\n",
"# c01zaut/Qwen2.5-3B-Instruct-RK3588-1.1.4\n",
"# Qwen2.5-3B-Instruct-rk3588-w8a8-opt-1-hybrid-ratio-0.0.rkllm\n",
"# Qwen2.5-3B-Instruct-rk3588-w8a8-opt-1-hybrid-ratio-1.0.rkllm\n",
"# Qwen2.5-3B-Instruct-rk3588-w8a8_g256-opt-1-hybrid-ratio-1.0.rkllm\n",
"#"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"import ollama\n",
"import os\n",
"import requests\n",
"import json\n",
"from pprint import pprint\n",
"\n",
"# endpoint = \"https://ollamamodelnpu.matitos.org\"\n",
"endpoint = \"https://ollamamodel.matitos.org\"\n",
"model = \"qwen3:0.6b\"\n",
"model = \"qwen3:1.7b\"\n",
"client = ollama.Client(endpoint)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"r = requests.post( os.path.join(endpoint, \"unload_model\") )\n",
"r.status_code, r.json()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"r = requests.get( os.path.join(endpoint, \"models\") )\n",
"r.json().get(\"models\"), [ m.model for m in client.list().get(\"models\") ]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = \"llama3-instruct:3b\"\n",
"model = \"qwen2.5-instruct:3b\""
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"article_content = \"Kevin Sutherland's message to Rowan Lumsden told of his agony at what he believed were malicious rumours about his life. The best friend of tragic Kevin Sutherland has revealed a heartbreaking message sent in the last hours of his life. Rowan Lumsden, 35, says Kevin’s death would have been avoided if his request for anonymity in the Scottish Child Abuse Inquiry had been accepted. Mum-of-one Rowan told how her friend sent a 17-minute voice message that culminated as he stood on the Forth Road Bridge, where he is thought to have plunged to his death on December 19. The Daily Record has told how Kevin, 33, had ticked a box to say he approved of his testimony of historic abuse that he suffered to be published online. Kevin’s family later revealed an email sent to the inquiry, in which he begged for his real name to be redacted, suggesting he may take his own life if he was not given that protection. His appeal was dismissed by SCAI chair Lady Smith. Rowan told how Kevin left a harrowing final message, telling of his agony at what he believed to be malicious rumours that plagued his life. Rowan said: “I was asleep when the messages came in and it was devastating to hear his voice, knowing where he was and what was going to happen. I just wish I could have helped. “Kevin was pushed to the limit and he was so troubled about what people were saying about him. “He lived in fear his testimony would be used by people to make him out to be at fault or misconstrued and he bitterly regretted his decision to allow it to be made public. “I have no doubt that he would be alive today if he was allowed to to retract his on story from the record.” Rowan, 35, said Lady Smith’s decision was wrong “in so many ways”. She said: “He begged her to let him be anonymous and he said that he would take his life if she refused. “But she said, ‘No’. I cannot see any way that can be explained away. He just needed the time it took to get the right interventions to turn his mental health and his life around. “Lady Smith was the top person in the inquiry. She knew she was dealing with a hugely vulnerable person – as all victims are. She knew that he was having suicidal thoughts.” Kevin suffered trauma, including sexual abuse, in his childhood. In his final message to Rowan, in the hours before his suspected death, Kevin didn’t refer directly to the SCAI inquiry but stated: “It’s just coming from the absolute f****** heart and I just cannot cope with this life any more. “It’s just been so f****** unbelievably brutal. I kind of feel like, what’s the point? People have got their preconceived ideas and malicious gossip has served such a toxic contribution to this final decision that I’ve made. “That’s me on the bridge. End of the road, eh? End of the road to all the liars and doubters and gossip mongrels.” Kevin’s sister Melanie Watson, who recently revealed the text of Kevin’s final appeal for anonymity, said she was aware of his final messages to friends. She added: “He was very fixated with the fear that people would make false assumptions about him, based on reading his testimony on Google.” The inquiry’s handling of Kevin is now part of an independent inquiry. An SCAI spokesperson said: “SCAI has commissioned an independent review to consider all aspects of its interactions with Kevin.”\"\n",
"article_content = \"Child services visited a Bronx apartment while a 4-year-old girl was trapped inside with the corpses of her troubled mom and brother – but walked away after knocking, neighbors said. Lisa Cotton, 38, and her 8-year-old son, Nazir Millien, 8, had been dead for at least two weeks before relatives found them and the toddler inside the house of horrors Friday, one day after reps for the Administration for Children’s Services dropped the ball, neighbor Sabrina Coleson said. “They didn’t do s–t,” Coleson said Sunday. “They were here ringing people’s bells the day before the wellness check. They were here, but they didn’t do s–t. “One rang my bell and asked if I had any concerns for upstairs. And then a man opened his door and started yelling,” she said. “Lisa was a very cool girl. I never saw her son with her, only the girl. It’s terrible.” Concerned relatives finally checked on the family on Friday and found the 4-year-old, Promise, alone, starving and in horrid condition on her mother’s bed — as bugs crawled over her dead family. Cotton’s father, Hubert, 71, had sent his oldest granddaughter to check the apartment at East 231st Street — with the woman grabbing her young sibling and fleeing the putrid home to call police. ACS wasn’t the only city agency to leave Promise trapped in hellish conditions — neighbors said cops were also called to the apartment on Tuesday but left after not sensing the stench reported by others. Hubert Cotton said the toddler survived by “feeding herself with chocolate.” Law enforcement sources said Lisa Cotton had a history of erratic behavior, and had a pending ACS case for alleged child neglect before she was found dead. She was arrested in 2021 on child abandonment charges after police said she was caught swinging her then-infant daughter around in a stroller and lighting a wig on fire on White Plains Road, sources said. When cops arrived she was allegedly walking away, leaving Promise behind. The outcome of the case was not available because the file is sealed. One neighbor said the mom had “episodes” in the past. Sources said police believe Lisa Cotton, who suffered from asthma, may have died from cardiac arrest, while her son, who was born prematurely and had a feeding tube, may have starved to death. A spokesperson for ACS declined to comment on the case on Sunday other than to say the agency is “investigating this tragedy with the NYPD.”\"\n",
"\n",
"# prompt = \"Rewrite the content below into a clear and concise summary of one paragraph maximum, presenting the key points as if they are newly written insights. Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states'. Write in a natural, standalone format that feels like an original explanation. Keep it brief, engaging, informative, in the style of a news article:\\n\\n{}\".format(article_content)\n",
"# prompt = \"Provide a summary of the content below, presenting the key points as if they are newly written insights. Write in a natural, standalone format that feels like an original explanation. Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states'. Keep it brief, engaging, informative, in the style of a news article, and in one single paragraph:\\n\\n{}\".format(article_content)\n",
"# prompt = \"Provide a summary of the content below, writing in a natural and standalone format that feels like an original explanation. Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states'. Keep it brief, engaging, informative, in the style of a news article, and in one single paragraph:\\n\\n{}\".format(article_content)\n",
"\n",
"# in one sentence each\n",
"prompt = \"First, provide a summary of the content below in one paragraph. Second, specify the Who, What, When, Where and Why of the story:\\n\\n{}\".format(article_content)\n",
"# prompt = \"Provide the 5W (Who, What, When, Where, Why) and a detailed summary of the content below:\\n\\n{}\".format(article_content)\n",
"# Only answer with the location or address which can be extracted from this description\n",
"\n",
"prompt = \"Provide, in one sentence each, the who, what, when, where, why, and a detailed summary of the content below:\\n\\n{}\".format(article_content)"
]
},
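The prompts above ask for the 5W breakdown and summary in free text, while the next cell requests format="json" and, as its output shows, the small model can come back with an empty object. A hedged sketch follows, reusing the client, model and article_content defined above; it names the expected JSON keys explicitly (the key names are illustrative assumptions, not part of the original notebook) and falls back safely if parsing fails.

import json

# Illustrative key names; adjust as needed
json_prompt = (
    "Return a JSON object with the keys 'who', 'what', 'when', 'where', 'why' and 'summary', "
    "each a single sentence, for the content below:\n\n{}".format(article_content)
)

resp = client.generate(model=model, prompt=json_prompt, format="json", options={"temperature": 0})
try:
    summary_5w = json.loads(resp.response)
except json.JSONDecodeError:
    summary_5w = {}  # the model may still return malformed or empty JSON
print(summary_5w)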
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{}\n"
]
}
],
"source": [
"options = {\"temperature\": 0, \"seed\": 51029}\n",
"resp = client.generate(model=model, prompt=prompt, format=\"json\", options=options)\n",
"r = requests.post( os.path.join(endpoint, \"unload_model\") )\n",
"\n",
"response_dict = json.loads(resp.response)\n",
"pprint(response_dict)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'{\\n\\n\\n}'"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"resp.response"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\"<think>\\nOkay, let's tackle this query. The user wants a one-sentence summary for each element: who, what, when, where, why, and a detailed summary.\\n\\nFirst, the main event is the child services visiting a Bronx apartment with a 4-year-old trapped, but the neighbors say they knocked out the corpses. So for the first sentence, I need to include who (child services), what (visited the apartment), when (Friday), where (the apartment), why (neighbors said they didn't do it), and a summary. \\n\\nThen, for the second part, the user might want more details. Let me check the content. The summary needs to include the specific details like the family members, the days they were found dead, the agencies involved, and the outcomes. Also, mention the sources like ACS and the neighbors' statements. I need to make sure each sentence is concise and covers all the points without being too lengthy. Let me structure each sentence to fit the required format.\\n</think>\\n\\n**Who:** Child services in the Bronx, **What:** Visited an apartment containing a 4-year-old trapped with a dead mom and brother, **When:** Friday, **Where:** East 231st Street, **Why:** Neighbors reported the agency’s actions were inadequate, **Summary:** Child services visited a Bronx apartment with a 4-year-old trapped and dead, neighbors say they knocked out the corpses, and the incident is attributed to the agency’s failure to address the situation, with the family surviving by feeding themselves and the case being sealed.\""
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#resp = client.generate(model=model, prompt=prompt, format=\"json\")\n",
"resp = client.generate(model=model, prompt=prompt)\n",
"resp.response"
]
}
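The output above shows a qwen3-style reply that wraps its reasoning in a <think>...</think> block before the answer. Below is a small, purely illustrative post-processing sketch (not part of the notebook) that strips that block from resp.response before further use.

import re

raw = resp.response
# Drop the <think>...</think> reasoning block, keep only the final answer text
answer = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()
print(answer)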
],
"metadata": {
"kernelspec": {
"display_name": "matitos_urls",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}