Compare commits: main...app_read_o
1 commit: dc56cc3365

.env.sample
@@ -24,10 +24,8 @@ DB_PASSWORD=supermatitos
 DB_USER=supermatitos
 DB_HOST=fetcher_db
 DB_PORT=5432
-REDIS_CACHE_HOST=fetcher_redis_cache
-REDIS_CACHE_PORT=6379
-REDIS_CELERY_HOST=fetcher_redis_celery
-REDIS_CELERY_PORT=6379
+REDIS_HOST=fetcher_redis
+REDIS_PORT=6379

 # Job timeout: 30 min
 JOB_DEFAULT_TIMEOUT=1800
@@ -57,9 +55,3 @@ PEXELS_API_KEY=Y6clJkY32eihf34ukX4JsINYu9lzxh3xDdNq2HMAmGwXp0a0tt6vr6S9
 # Ollama
 ENDPOINT_OLLAMA=https://ollamamodelnpu.matitos.org
 OLLAMA_MODEL_DEFAULT=qwen2.5-instruct:3b
-
-# Telegram
-TELEGRAM_INFO_BOT_TOKEN="..."
-TELEGRAM_INFO_CHAT_ID="..."
-TELEGRAM_WARNING_BOT_TOKEN="..."
-TELEGRAM_WARNING_CHAT_ID="..."
@@ -97,10 +97,9 @@ DATABASES = {
 CACHES = {
     "default": {
         "BACKEND": "django_redis.cache.RedisCache",
-        "LOCATION": "redis://{}:{}/{}".format(
-            os.environ.get("REDIS_CACHE_HOST", "localhost"),
-            os.environ.get("REDIS_CACHE_PORT", 6379),
-            2  # DB for Caching
+        "LOCATION": "redis://{}:{}".format(
+            os.environ.get("REDIS_HOST", "localhost"),
+            os.environ.get("REDIS_PORT", 6379)
         ),
         "OPTIONS": {
             "MEMCACHE_MAX_KEY_LENGTH": 2048,
@@ -109,14 +108,13 @@ CACHES = {
     }
 }


 # Celery configuration
-CELERY_BROKER_URL = 'redis://{}:{}/{}'.format(os.environ.get("REDIS_CELERY_HOST", "localhost"), os.environ.get("REDIS_CELERY_PORT", 6379), 0)
-CELERY_RESULT_BACKEND = 'redis://{}:{}/{}'.format(os.environ.get("REDIS_CELERY_HOST", "localhost"), os.environ.get("REDIS_CELERY_PORT", 6379), 1)
+CELERY_BROKER_URL = 'redis://{}:{}/{}'.format(os.environ.get("REDIS_HOST", "localhost"), os.environ.get("REDIS_PORT", 6379), os.environ.get("REDIS_DB", 0))
+CELERY_RESULT_BACKEND = 'redis://{}:{}/{}'.format(os.environ.get("REDIS_HOST", "localhost"), os.environ.get("REDIS_PORT", 6379), os.environ.get("REDIS_DB_RESULTS", 1))
 CELERY_ACCEPT_CONTENT = ['json']
 CELERY_TASK_SERIALIZER = 'json'
-CELERY_RESULT_EXPIRES = 3600  # Auto clean results after 1 hour
-CELERY_ENABLE_UTC = True
-CELERY_TIMEZONE = "UTC"

 # Celery Beat scheduler (required for django-celery-beat to work)
 CELERY_BEAT_SCHEDULER = 'django_celery_beat.schedulers.DatabaseScheduler'
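Read together with the .env.sample hunk above, the cache and the Celery broker/result backend now share a single Redis service and differ only by database index. Below is a minimal sketch of how the new settings resolve at runtime, assuming only the variables that appear on the right-hand side of this diff (REDIS_HOST, REDIS_PORT, and the optional REDIS_DB / REDIS_DB_RESULTS); it is an illustration, not part of the change itself.

import os

# Shared Redis host/port (env names taken from the right-hand side of this diff)
redis_host = os.environ.get("REDIS_HOST", "localhost")
redis_port = os.environ.get("REDIS_PORT", 6379)

# Django cache location (no explicit DB index, so Redis DB 0)
cache_location = "redis://{}:{}".format(redis_host, redis_port)
# Celery broker and result backend pick their DB index from the environment
broker_url = "redis://{}:{}/{}".format(redis_host, redis_port, os.environ.get("REDIS_DB", 0))
result_backend = "redis://{}:{}/{}".format(redis_host, redis_port, os.environ.get("REDIS_DB_RESULTS", 1))

print(cache_location, broker_url, result_backend)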
@@ -4,8 +4,7 @@ from django.core.cache import cache
 from django.db import IntegrityError
 from django.utils import timezone
 from datetime import timedelta
-from .fetch_utils_url_processor import process_url, verify_missing_kid_url
-from .utils import get_with_protocol
+from .fetch_utils_url_processor import process_url, get_with_protocol, url_host_slowdown
 import re
 import requests
 import os
@@ -17,7 +16,7 @@ class DB_Handler():
     def __init__(self):
         pass

     def insert_raw_urls(self, urls, obj_source, obj_search):
         try:
             logger.debug("Inserting raw URLs")
             # Empty?
@@ -101,13 +100,15 @@ class DB_Handler():
             # URLs duplciate association
             UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url)

-    def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error, paywall_bypass=False, request_timeout=15):
+    def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error, paywall_bypass=False):
         ##########################################################################
         # URL pattern: missingkids.org/poster OR missingkids.org/new-poster
         if ("missingkids.org" in obj_url.url) and ("poster" in obj_url.url):
+            # Sleep required? To avoid too many requests error (original URL, not paywall bypassing endpoint)
+            url_host_slowdown(obj_url.url, url_host_slowdown_seconds=float(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
             try:
-                # Verify missing kid URL
-                results = verify_missing_kid_url(obj_url.url)
+                # Request
+                r = requests.get(obj_url.url, allow_redirects=True)
             except Exception as e:
                 if (raise_exception_on_error):
                     # Simply raise exception, handled in a different way
@@ -117,16 +118,20 @@ class DB_Handler():
                     # Set status to error
                     self._set_status(obj_url, Urls.STATUS_ENUM.ERROR)
                     return

-            if (results.get("status") == "valid"):
-                self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
-            elif (results.get("status") == "invalid"):
+            if (r.url != obj_url.url):
+                # Canonical
+                url_canonical = r.url
+                # Set duplicate, and insert new canonical form
+                self._set_duplicate_and_insert_canonical(obj_url, url_canonical)
+            elif (r.status_code == 200):
+                # Not enough to determine if it is valid. Need to wait to finish javascript, it might redirect to 404
+                # self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
+                self._set_status(obj_url, Urls.STATUS_ENUM.UNKNOWN)
+            elif (r.status_code == 404):
                 self._set_status(obj_url, Urls.STATUS_ENUM.INVALID)
-            elif (results.get("status") == "duplicate"):
-                self._set_duplicate_and_insert_canonical(obj_url, results.get("redirection"))
-            elif (results.get("status") == "unknown"):
-                # Nothing to do, not sure about it...
-                logger.info("Missing kid verification returned unknown for URL: {}".format(obj_url.url))
+            else:
+                logger.debug("Unknown request status: {} for missing kids request: {}".format(r.status_code, obj_url.url))
                 self._set_status(obj_url, Urls.STATUS_ENUM.UNKNOWN)
             return
             ##########################################################################
@@ -142,7 +147,7 @@ class DB_Handler():

         try:
             # Extract URL content
-            dict_url_data = process_url(obj_url.url, paywall_bypass, request_timeout)
+            dict_url_data = process_url(obj_url.url, paywall_bypass)
         except Exception as e:
             if (raise_exception_on_error):
                 # Simply raise exception, handled in a different way
@@ -309,20 +314,14 @@ class DB_Handler():
         # Per URL
         for obj_url in missingkids_urls:
             try:
-                SELENIUM_BASED_MISSINGKID_VERIFICATION = False
-                if (SELENIUM_BASED_MISSINGKID_VERIFICATION):
-                    # Missing kids fetching endpoint, verify URL
-                    missingkids_fetch_endpoint = os.path.join(os.getenv("SELENIUM_ENDPOINT", "http://localhost:80"), "verify_missing_kid/")
-                    data = {"url": obj_url.url}
-                    # POST
-                    r = requests.post(missingkids_fetch_endpoint, json=data, timeout=120)
-                    # Jsonify
-                    results = r.json()
-                    logger.debug("Missingkids Selenium results for URL {}: {}".format(obj_url.url, str(results)))
-                else:
-                    # Verify
-                    results = verify_missing_kid_url(obj_url.url)
-                    logger.debug("Missingkids verify results for URL {}: {}".format(obj_url.url, str(results)))
+                # Missing kids fetching endpoint, verify URL
+                missingkids_fetch_endpoint = os.path.join(os.getenv("SELENIUM_ENDPOINT", "http://localhost:80"), "verify_missing_kid/")
+                data = {"url": obj_url.url}
+                # POST
+                r = requests.post(missingkids_fetch_endpoint, json=data, timeout=120)
+                # Jsonify
+                results = r.json()
+                logger.debug("Selenium results for URL {}: {}".format(obj_url.url, str(results)))

                 if (results.get("status") == "valid"):
                     self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
@@ -1,7 +1,6 @@
 from .db_utils import DB_Handler
 from ..models import Search, Source
-from .fetch_utils_url_processor import url_host_slowdown
-from .utils import get_with_protocol
+from .fetch_utils_url_processor import get_with_protocol, url_host_slowdown
 import newspaper
 import traceback
 from .logger import get_logger
@@ -9,6 +9,14 @@ from urllib.parse import unquote
 import langdetect
 langdetect.DetectorFactory.seed = 0

+def get_with_protocol(url):
+    # http:// -> https://
+    url = url.replace("http://", "https://")
+    # "" -> https://
+    if not (url.startswith("https://")):
+        url = "https://" + url
+    return url
+
 def get_url_host(url):
     # URL no protocol, first substring before '/'
     url_host = url.replace("https://", "").replace("http://", "").split("/")[0]
@@ -31,48 +39,7 @@ def url_host_slowdown(url, url_host_slowdown_seconds):
     # About to process URL host, cache time
     cache.set("process_{}".format(url_host).encode("utf-8"), time.time(), timeout=60*5)  # Expire after 5 minutes

-def verify_missing_kid_url(url):
-    # Sleep required? To avoid too many requests error
-    url_host_slowdown(url, url_host_slowdown_seconds=float(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
-
-    # Request, get redirection
-    r = requests.get(url, allow_redirects=True)
-    # Redirection?
-    if (url != r.url):
-        url_redirection = r.url
-        return {"status": "duplicate", "redirection": url_redirection}
-
-    # Sample URL: "https://www.missingkids.org/poster/NCMC/2058896/1"
-    org_prefix, case_num = url.split("/")[-3], url.split("/")[-2]
-    # Fill details to API endpoint
-    base_url = "https://www.missingkids.org/bin/ncmecEndpoint?action=childDetail&orgPrefix={}&caseNum={}"
-    url_endpoint = base_url.format(org_prefix, case_num)
-
-    # Cache timeout missingkids.org
-    time.sleep(0.25)
-
-    # Request
-    r = requests.get(url_endpoint)
-    # Analyze status code and status result
-    if (r.status_code == 200):
-        r_json = r.json()
-        # Valid poster
-        if (r_json.get("status") == "success"):
-            return {"status": "valid"}
-        # Invalid poster
-        elif (r_json.get("status") == "error"):
-            return {"status": "invalid"}
-        else:
-            # ?
-            logger.info("Unknown json status: {} when verifying missing kid: {}".format(str(r_json), url))
-            return {"status": "unknown"}
-    else:
-        # Error status code
-        logger.info("Unknown request status: {} when verifying missing kid: {}".format(r.status_code, url))
-        return {"status": "unknown"}
-
-def process_url(url, paywall_bypass=False, request_timeout=15):
+def process_url(url, paywall_bypass=False):

     if (paywall_bypass):
         # TODO: Implement self-hosted instance
@@ -91,7 +58,7 @@ def process_url(url, paywall_bypass=False, request_timeout=15):
     # Process
     if ("foxnews.com" in url_of_interest) or ("zerohedge" in url_of_interest):
         # Request
-        r = requests.get(url, headers={"User-Agent": user_agent}, timeout=request_timeout)
+        r = requests.get(url, headers={"User-Agent": user_agent})
         # Raise for error code
         r.raise_for_status()
         # Parse
@@ -100,10 +67,8 @@ def process_url(url, paywall_bypass=False, request_timeout=15):
         # Config: Fake user agent
         config = newspaper.configuration.Configuration()
         config.headers = {'User-Agent': user_agent}
-        config.request_timeout = request_timeout
         # Default mode
         article = newspaper.article(url_of_interest, config=config)
-
     except newspaper.ArticleBinaryDataException:
         logger.warning("ArticleException for input URL {}".format(url))
         return {"override_status": "invalid"}
@@ -141,7 +106,7 @@ def process_url(url, paywall_bypass=False, request_timeout=15):

     # Try simple request, valid response but couldn't parse article? e.g. getting blocked? -> unknown
     time.sleep(0.25)
-    r = requests.get(url_of_interest, timeout=request_timeout)
+    r = requests.get(url_of_interest)
     if (r.status_code == 200):
         return {"override_status": "unknown"}
     else:
@@ -152,7 +117,7 @@ def process_url(url, paywall_bypass=False, request_timeout=15):
     except Exception as e:
         logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
         return None

     # Not a valid URL?
     if (not article.is_valid_url()):
         logger.debug("Invalid URL found: {}".format(url))
@@ -4,150 +4,54 @@ from ..models import Urls, Source, Search, UrlContent, UrlsSourceSearch, UrlsDup
 from django.db.models import Count
 import requests
 import os
-import traceback
-from .logger import get_logger
-logger = get_logger()
-
-def notify_telegram_info(last_hours, channel="INFO"):
-    try:
-        start_date = timezone.now() - timedelta(hours=last_hours)
-
-        # Count the number of URLs grouped by status within the date range
-        urls_data_status = Urls.objects.filter(ts_fetch__gte=start_date) \
-            .values('status') \
-            .annotate(count=Count('id')) \
-            .order_by('status')
-
-        # Count the number of URLs grouped by source
-        urls_data_source = UrlsSourceSearch.objects \
-            .filter(id_url__ts_fetch__gte=start_date) \
-            .values('id_source__source') \
-            .annotate(count=Count('id_url')) \
-            .order_by('id_source__source')
-
-        # Count the number of URLs grouped by search
-        urls_data_search = UrlsSourceSearch.objects \
-            .filter(id_url__ts_fetch__gte=start_date) \
-            .values('id_search__search') \
-            .annotate(count=Count('id_url')) \
-            .order_by('id_search__search')
-
-        bot_token = os.environ.get("TELEGRAM_{}_BOT_TOKEN".format(channel), "")
-        chat_id = os.environ.get("TELEGRAM_{}_CHAT_ID".format(channel), "")
-
-        message = "During the last {} hours:\n".format(last_hours)
-        message += "\nURLs per status:\n"
-        for o in urls_data_status:
-            message += " {}: {}\n".format(o.get("status"), o.get("count"))
-        message += "\nURLs per source:\n"
-        for o in urls_data_source:
-            message += " {}: {}\n".format(o.get("id_source__source"), o.get("count"))
-        message += "\nURLs per search:\n"
-        for o in urls_data_search:
-            message += " {}: {}\n".format(o.get("id_search__search"), o.get("count"))
-
-        url = f"https://api.telegram.org/bot{bot_token}/sendMessage"
-        params = {
-            "chat_id": chat_id,
-            "text": message
-        }
-        # POST
-        response = requests.post(url, params=params)
-    except Exception as e:
-        logger.info("Exception while notifying status: {}\n{}".format(str(e), traceback.format_exc()))
-
-def notify_telegram_warning(last_hours, channel="WARNING"):
-    try:
-        # Message appending logic
-        message = ""
-        start_date = timezone.now() - timedelta(hours=last_hours)
-
-        # Count the number of URLs grouped by status within the date range
-        urls_data_status = Urls.objects.filter(ts_fetch__gte=start_date) \
-            .values('status') \
-            .annotate(count=Count('id')) \
-            .order_by('status')
-
-        # Build dictionary
-        urls_data_status_dict = {}
-        for o in urls_data_status:
-            # #STATUS
-            urls_data_status_dict[o.get("status")] = o.get("count")
-            # #TOTAL
-            urls_data_status_dict["total"] = urls_data_status_dict.get("total", 0) + o.get("count")
-
-        MINIMUM_URLS_THRESHOLD = 10
-        MINIMUM_PROCESSED_URLS_RATIO = 0.7
-
-        # Minimum amount of URLs
-        if (urls_data_status_dict.get("total") < MINIMUM_URLS_THRESHOLD):
-            message += "WARNING - Total #URLS during the last {} hours: {}\n".format(last_hours, urls_data_status_dict.get("total"))
-            message += "\nURLs per status:\n"
-            for o in urls_data_status:
-                message += " {}: {}\n".format(o.get("status"), o.get("count"))
-
-        # Minimum ratio of processed raw urls
-        if (urls_data_status_dict.get("total") > 0):
-            if (urls_data_status_dict.get("raw", 0) / urls_data_status_dict.get("total") >= MINIMUM_PROCESSED_URLS_RATIO):
-                message += "WARNING - Small ratio of processed raw URLs during the last {} hours: {}\n".format(last_hours, urls_data_status_dict.get("total"))
-                message += "\nURLs per status:\n"
-                for o in urls_data_status:
-                    message += " {}: {}\n".format(o.get("status"), o.get("count"))
-
-        # Count the number of URLs grouped by source
-        urls_data_source = UrlsSourceSearch.objects \
-            .filter(id_url__ts_fetch__gte=start_date) \
-            .values('id_source__source') \
-            .annotate(count=Count('id_url')) \
-            .order_by('id_source__source')
-
-        MINIMUM_SOURCES = 3
-        if (len(urls_data_source) < MINIMUM_SOURCES):
-            message += "WARNING - Very few sources found URLs during the last {} hours".format(last_hours)
-            message += "\nURLs per source:\n"
-            for o in urls_data_source:
-                message += " {}: {}\n".format(o.get("id_source__source"), o.get("count"))
-
-        """
-        # TODO: URLs per search, key should be present for cnbc.com, foxnews.com, zerohedge.com, breitbart.com, child abuse, child neglect
-        # Count the number of URLs grouped by search
-        urls_data_search = UrlsSourceSearch.objects \
-            .filter(id_url__ts_fetch__gte=start_date) \
-            .values('id_search__search') \
-            .annotate(count=Count('id_url')) \
-            .order_by('id_search__search')
-
-        message += "\nURLs per search:\n"
-        for o in urls_data_search:
-            message += " {}: {}\n".format(o.get("id_search__search"), o.get("count"))
-        """
-
-        # Valid message body?
-        if (message != ""):
-            bot_token = os.environ.get("TELEGRAM_{}_BOT_TOKEN".format(channel), "")
-            chat_id = os.environ.get("TELEGRAM_{}_CHAT_ID".format(channel), "")
-
-            url = f"https://api.telegram.org/bot{bot_token}/sendMessage"
-            params = {
-                "chat_id": chat_id,
-                "text": message
-            }
-            # POST
-            response = requests.post(url, params=params)
-    except Exception as e:
-        logger.info("Exception while notifying status: {}\n{}".format(str(e)), traceback.format_exc())
-
-def notify_telegram(last_hours=12):
-    # INFO
-    notify_telegram_info(last_hours, channel="INFO")
-    # WARNING
-    notify_telegram_warning(last_hours, channel="WARNING")
+
+def notify_telegram(last_hours=24):
+    start_date = timezone.now() - timedelta(hours=last_hours)
+
+    # Count the number of URLs grouped by status within the date range
+    urls_data_status = Urls.objects.filter(ts_fetch__gte=start_date) \
+        .values('status') \
+        .annotate(count=Count('id')) \
+        .order_by('status')
+
+    # Count the number of URLs grouped by source
+    urls_data_source = UrlsSourceSearch.objects \
+        .filter(id_url__ts_fetch__gte=start_date) \
+        .values('id_source__source') \
+        .annotate(count=Count('id_url')) \
+        .order_by('id_source__source')
+
+    # Count the number of URLs grouped by search
+    urls_data_search = UrlsSourceSearch.objects \
+        .filter(id_url__ts_fetch__gte=start_date) \
+        .values('id_search__search') \
+        .annotate(count=Count('id_url')) \
+        .order_by('id_search__search')
+
+    bot_token = os.environ.get("TELEGRAM_BOT_TOKEN", "")
+    chat_id = os.environ.get("TELEGRAM_CHAT_ID", "")
+
+    message = "During the last {} hours:\n".format(last_hours)
+    message += "\nURLs per status:\n"
+    for o in urls_data_status:
+        message += " {}: {}\n".format(o.get("status"), o.get("count"))
+    message += "\nURLs per source:\n"
+    for o in urls_data_source:
+        message += " {}: {}\n".format(o.get("id_source__source"), o.get("count"))
+    message += "\nURLs per search:\n"
+    for o in urls_data_search:
+        message += " {}: {}\n".format(o.get("id_search__search"), o.get("count"))
+
+    url = f"https://api.telegram.org/bot{bot_token}/sendMessage"
+    params = {
+        "chat_id": chat_id,
+        "text": message
+    }
+    # POST
+    response = requests.post(url, params=params)
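Both sides of the hunk above deliver the report through the Telegram Bot API's sendMessage method; only the credential lookup changes (per-channel variables on the left, a single TELEGRAM_BOT_TOKEN / TELEGRAM_CHAT_ID pair on the right). A minimal standalone sketch of that call, using only the URL and parameters that appear in the diff:

import os
import requests

def send_telegram_message(text):
    # Single-bot credentials, as read on the right-hand side of this diff
    bot_token = os.environ.get("TELEGRAM_BOT_TOKEN", "")
    chat_id = os.environ.get("TELEGRAM_CHAT_ID", "")
    url = "https://api.telegram.org/bot{}/sendMessage".format(bot_token)
    # chat_id and text are accepted as request parameters
    return requests.post(url, params={"chat_id": chat_id, "text": text})

send_telegram_message("During the last 24 hours: ...")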
@@ -1,8 +0,0 @@
-
-def get_with_protocol(url):
-    # http:// -> https://
-    url = url.replace("http://", "https://")
-    # "" -> https://
-    if not (url.startswith("https://")):
-        url = "https://" + url
-    return url
@@ -13,30 +13,13 @@ from .src.logger import get_logger
 logger = get_logger()


-@shared_task(queue='light')
-def process_raw_urls(batch_size=100):
-    task = "Process raw URLs"
-    logger.info("Task triggered: {}".format(task))
-    DB_Handler().process_raw_urls(batch_size=batch_size)
-    logger.info("Task completed: {}".format(task))
-
 @shared_task(queue='default')
-def process_error_urls(batch_size=50):
-    task = "Process error URLs"
-    logger.info("Task triggered: {}".format(task))
-    DB_Handler().process_error_urls(batch_size=batch_size)
-    logger.info("Task completed: {}".format(task))
-
-
-@shared_task(queue='light')
 def fetch_feeds():
     task = "Fetch Feeds"
     logger.info("Task triggered: {}".format(task))
     FetchFeeds().run()
     logger.info("Task completed: {}".format(task))


 @shared_task(queue='default')
 def fetch_parser():
     task = "Fetch Parser"
@@ -51,31 +34,41 @@ def fetch_search():
     FetchSearcher().run()
     logger.info("Task completed: {}".format(task))

-@shared_task(queue='heavy')
+@shared_task(queue='low')
 def fetch_selenium_search():
     task = "Fetch Selenium search"
     logger.info("Task triggered: {}".format(task))
     FetchSeleniumSourceSearch().run()
     logger.info("Task completed: {}".format(task))

-@shared_task(queue='heavy')
+@shared_task(queue='low')
 def fetch_missing_kids(number_pages=5):
     task = "Fetch MissingKids"
     logger.info("Task triggered: {}".format(task))
     FetchMissingKids().run(number_pages)
     logger.info("Task completed: {}".format(task))

-@shared_task(queue='heavy')
+@shared_task(queue='default')
+def process_raw_urls(batch_size=100):
+    task = "Process raw URLs"
+    logger.info("Task triggered: {}".format(task))
+    DB_Handler().process_raw_urls(batch_size=batch_size)
+    logger.info("Task completed: {}".format(task))
+
+@shared_task(queue='default')
+def process_error_urls(batch_size=50):
+    task = "Process error URLs"
+    logger.info("Task triggered: {}".format(task))
+    DB_Handler().process_error_urls(batch_size=batch_size)
+    logger.info("Task completed: {}".format(task))
+
+@shared_task(queue='low')
 def process_missing_kids_urls(batch_size=None, process_status_only=None):
     task = "Process Missing Kids URLs - batch_size={} process_status_only={}".format(batch_size, process_status_only)
     logger.info("Task triggered: {}".format(task))
     DB_Handler().process_missing_kids_urls(batch_size=batch_size, process_status_only=process_status_only)
     logger.info("Task completed: {}".format(task))


 @shared_task(queue='default')
 def clean_old_url_content(older_than_days=14):
     task = "Clean old URL content"
@@ -83,7 +76,7 @@ def clean_old_url_content(older_than_days=14):
     DB_Handler().clean_old_url_content(older_than_days=older_than_days)
     logger.info("Task completed: {}".format(task))

-@shared_task(queue='light')
+@shared_task(queue='default')
 def notify_status():
     task = "Notify status"
     logger.info("Task triggered: {}".format(task))
@@ -16,10 +16,14 @@ def trigger_task(request, task):
 def link_list(request):
     # Base URL path
     app_url = request.build_absolute_uri()
+    # Tasks
+    links_fetch = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all", "fetch_selenium_search"]
+    links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_valid_all", "process_missing_kids_urls_invalid_all", "process_missing_kids_urls_unknown_all", "process_missing_kids_urls_all", "clean_old_url_content_60"]
     # List of links
     list_links = \
         [ os.path.join(app_url, "admin"), os.path.join(app_url, "urls") ] + \
-        [ os.path.join(app_url, "logs", log_type) for log_type in ["database", "debug", "info", "warning", "server", "beat", "worker_light", "worker_default", "worker_heavy"] ]
+        [ os.path.join(app_url, "logs", log_type) for log_type in ["database", "debug", "info", "warning", "server", "beat", "worker_default", "worker_low"] ] #+ \
+        #[ os.path.join(app_url, "task", l) for l in links_fetch + links_process ]

     # Links tuple
     links = [(l, l) for l in list_links]
@@ -17,20 +17,19 @@
             "cnbc.com"
         ],
         "keyword_search": [
-            "child abuse",
-            "child neglect"
+            "child abuse"
         ]
     },
     "REGEX_PATTERN_STATUS_PRIORITY": [
         [".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50],
         ["https:\\/\\/x.com\\/.*", "invalid", 50],
         [".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75],
+        [".*foxnews\\.com\\/(video|category|person|books|html-sitemap)\\/.*", "invalid", 75],
+        [".*radio\\.foxnews\\.com\\/.*", "invalid", 75],
         [".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75],
-        [".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50],
         [".*zerohedge\\.com\\/(user|contributors)\\/.*", "invalid", 75],
         [".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50],
-        [".*radio\\.foxnews\\.com\\/.*", "invalid", 75],
-        [".*foxnews\\.com\\/(video|category|person|books|html-sitemap)\\/.*", "invalid", 75],
+        [".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50],
         [".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50],
         [".*foxnews\\.com\\/[^\\/]+\\/?$", "invalid", 25]
     ]
@@ -4,12 +4,12 @@ if [ "${INITIALIZE_DB}" = false ]; then
     echo "Initialization not required"
 else
     echo "Initializating database"
-    python init_db.py --initialize_tables --initialize_data
+    # python init_db.py --initialize_tables --initialize_data
     python manage.py makemigrations fetcher; python manage.py migrate --fake-initial
-    python manage.py migrate django_celery_beat
+    # python manage.py migrate django_celery_beat
    python manage.py createsuperuser --noinput
    python manage.py collectstatic --no-input
-    python manage.py loaddata scheduled_tasks.json
+    # python manage.py loaddata scheduled_tasks.json
    #
    # python manage.py inspectdb # Debugging model
 fi
@@ -160,7 +160,7 @@
         "expire_seconds": null,
         "one_off": false,
         "start_time": null,
-        "enabled": true,
+        "enabled": false,
         "last_run_at": null,
         "total_run_count": 0,
         "date_changed": "2025-07-17T16:20:19.969Z",
@@ -188,7 +188,7 @@
         "expire_seconds": null,
         "one_off": false,
         "start_time": null,
-        "enabled": true,
+        "enabled": false,
         "last_run_at": null,
         "total_run_count": 0,
         "date_changed": "2025-07-17T16:21:30.809Z",
@@ -397,7 +397,7 @@
     "fields": {
         "name": "Notify status",
         "task": "fetcher.tasks.notify_status",
-        "interval": 4,
+        "interval": 3,
         "crontab": null,
         "solar": null,
         "clocked": null,
@@ -13,55 +13,3 @@ redirect_stderr=true
 ; Rotate when file reaches max size
 stdout_logfile_maxbytes=20MB
 stdout_logfile_backups=1
-
-[program:beat]
-command=celery -A core beat -l info --scheduler django_celery_beat.schedulers:DatabaseScheduler --logfile=/opt/logs/beat.log
-directory=/opt/app
-autostart=true
-autorestart=true
-; Unified log file
-stdout_logfile=/opt/logs/beat.log
-stderr_logfile=/opt/logs/beat.log
-redirect_stderr=true
-; Rotate when file reaches max size
-stdout_logfile_maxbytes=20MB
-stdout_logfile_backups=1
-
-[program:worker_default]
-command=celery -A core worker -l info --logfile=/opt/logs/worker_default.log --concurrency=1 -Q default -n default
-directory=/opt/app
-autostart=true
-autorestart=true
-; Unified log file
-stdout_logfile=/opt/logs/worker_default.log
-stderr_logfile=/opt/logs/worker_default.log
-redirect_stderr=true
-; Rotate when file reaches max size
-stdout_logfile_maxbytes=20MB
-stdout_logfile_backups=1
-
-[program:worker_light]
-command=celery -A core worker -l info --logfile=/opt/logs/worker_light.log --concurrency=1 -Q light -n light
-directory=/opt/app
-autostart=true
-autorestart=true
-; Unified log file
-stdout_logfile=/opt/logs/worker_light.log
-stderr_logfile=/opt/logs/worker_light.log
-redirect_stderr=true
-; Rotate when file reaches max size
-stdout_logfile_maxbytes=20MB
-stdout_logfile_backups=1
-
-[program:worker_heavy]
-command=celery -A core worker -l info --logfile=/opt/logs/worker_heavy.log --concurrency=1 -Q heavy -n heavy
-directory=/opt/app
-autostart=true
-autorestart=true
-; Unified log file
-stdout_logfile=/opt/logs/worker_heavy.log
-stderr_logfile=/opt/logs/worker_heavy.log
-redirect_stderr=true
-; Rotate when file reaches max size
-stdout_logfile_maxbytes=20MB
-stdout_logfile_backups=1
@@ -43,10 +43,8 @@ services:
       - DB_PASSWORD=${DB_PASSWORD}
       - DB_HOST=${DB_HOST}
       - DB_PORT=${DB_PORT}
-      - REDIS_CACHE_HOST=${REDIS_CACHE_HOST}
-      - REDIS_CACHE_PORT=${REDIS_CACHE_PORT}
-      - REDIS_CELERY_HOST=${REDIS_CELERY_HOST}
-      - REDIS_CELERY_PORT=${REDIS_CELERY_PORT}
+      - REDIS_HOST=${REDIS_HOST}
+      - REDIS_PORT=${REDIS_PORT}
       # Job timeout: 30 min
       - JOB_DEFAULT_TIMEOUT=${JOB_DEFAULT_TIMEOUT}
       # Fetcher
@@ -66,32 +64,22 @@ services:
       - PEXELS_API_KEY=${PEXELS_API_KEY}
       - OLLAMA_MODEL_DEFAULT=${OLLAMA_MODEL_DEFAULT}
       # Telegram
-      - TELEGRAM_INFO_BOT_TOKEN=${TELEGRAM_INFO_BOT_TOKEN}
-      - TELEGRAM_INFO_CHAT_ID=${TELEGRAM_INFO_CHAT_ID}
-      - TELEGRAM_WARNING_BOT_TOKEN=${TELEGRAM_WARNING_BOT_TOKEN}
-      - TELEGRAM_WARNING_CHAT_ID=${TELEGRAM_WARNING_CHAT_ID}
+      - TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN}
+      - TELEGRAM_CHAT_ID=${TELEGRAM_CHAT_ID}
     ########################
     ports:
       - 8000
     depends_on:
       - fetcher_db
-      - fetcher_redis_cache
-      - fetcher_redis_celery
-      - fetcher_app_selenium
+      - fetcher_redis
+      # - fetcher_app_selenium
     dns:
       - 1.1.1.1
      - 1.0.0.1

-  fetcher_redis_cache:
+  fetcher_redis:
     image: redis:alpine
-    container_name: fetcher_redis_cache
-    restart: unless-stopped
-    ports:
-      - 6379
-
-  fetcher_redis_celery:
-    image: redis:alpine
-    container_name: fetcher_redis_celery
+    container_name: fetcher_redis
     restart: unless-stopped
     ports:
       - 6379
@@ -106,7 +94,6 @@ services:
     ports:
       - 5555
     environment:
-      - CELERY_BROKER_URL=redis://fetcher_redis_celery:6379/0
+      - CELERY_BROKER_URL=redis://fetcher_redis:6379/0
-      - FLOWER_UNAUTHENTICATED_API=true
     depends_on:
-      - fetcher_redis_celery
+      - fetcher_redis
@@ -52,19 +52,12 @@ services:
     #volumes: # Persistent DB?
     #  - ./postgres:/var/lib/postgresql/data

-  fetcher_redis_cache:
+  fetcher_redis:
     extends:
       file: docker-compose-base.yml
-      service: fetcher_redis_cache
+      service: fetcher_redis
     ports:
-      - 6379
+      - 6379:6379

-  fetcher_redis_celery:
-    extends:
-      file: docker-compose-base.yml
-      service: fetcher_redis_celery
-    ports:
-      - 6379
-
   fetcher_flower:
     extends:
@@ -1,26 +1,34 @@
 services:

-  fetcher_app_selenium:
-    extends:
-      file: docker-compose-base.yml
-      service: fetcher_app_selenium
-    deploy:
-      resources:
-        limits:
-          cpus: '${DEPLOY_CPUS}'
-          memory: ${DEPLOY_RAM}
+  #fetcher_app_selenium:
+  #  extends:
+  #    file: docker-compose-base.yml
+  #    service: fetcher_app_selenium
+  #  deploy:
+  #    resources:
+  #      limits:
+  #        cpus: '${DEPLOY_CPUS}'
+  #        memory: ${DEPLOY_RAM}

   fetcher_app_urls:
     extends:
       file: docker-compose-base.yml
       service: fetcher_app_urls
     ports:
-      - 8000:8000
+      - 8067:8000
     deploy:
       resources:
         limits:
           cpus: '${DEPLOY_CPUS}'
           memory: ${DEPLOY_RAM}
+    labels: # Reverse proxy sample
+      - "traefik.enable=true"
+      - "traefik.http.routers.fetcher.rule=Host(`fetcher.matitos.org`)"
+      - "traefik.http.routers.fetcher.entrypoints=websecure"
+      - "traefik.http.routers.fetcher.tls.certresolver=myresolvercd"
+      - "traefik.http.services.fetcher.loadbalancer.server.port=8000"
+    networks:
+      - docker_default # Reverse proxy network

   fetcher_db:
     extends:
@@ -36,34 +44,36 @@ services:
       # REQUIREMENT: Add fetcher's SSH public key into the DB's .ssh/authorized_keys machine
       - ~/.ssh:/root/.ssh:ro
     ports:
-      - 15885:15885
+      - 15889:15889
       - 5432:5432
     command:
       - sh
       - -c
       - |
        apk add --update openssh autossh
-        # Monitor status on port 15885
-        autossh -M 15885 -N -L 0.0.0.0:5432:127.0.0.1:5432 ${REMOTE_USERNAME}@${REMOTE_HOST}
-        # autossh -M 15885 -N -o 'GatewayPorts yes' -L 0.0.0.0:5432:127.0.0.1:5432 ${REMOTE_USERNAME}@${REMOTE_HOST}
+        # Monitor status on port 15889
+        autossh -M 15889 -N -L 0.0.0.0:5432:127.0.0.1:5432 ${REMOTE_USERNAME}@${REMOTE_HOST}
+        # autossh -M 15889 -N -o 'GatewayPorts yes' -L 0.0.0.0:5432:127.0.0.1:5432 ${REMOTE_USERNAME}@${REMOTE_HOST}
+    networks:
+      - docker_default # Reverse proxy network

-  fetcher_redis_cache:
+  fetcher_redis:
     extends:
       file: docker-compose-base.yml
-      service: fetcher_redis_cache
+      service: fetcher_redis
     ports:
-      - 6379
+      - 6379:6379
+    networks:
+      - docker_default # Reverse proxy network

-  fetcher_redis_celery:
-    extends:
-      file: docker-compose-base.yml
-      service: fetcher_redis_celery
-    ports:
-      - 6379
+  #fetcher_flower:
+  #  extends:
+  #    file: docker-compose-base.yml
+  #    service: fetcher_flower
+  #  ports:
+  #    - 5555:5555

-  fetcher_flower:
-    extends:
-      file: docker-compose-base.yml
-      service: fetcher_flower
-    ports:
-      - 5555:5555
+networks:
+  docker_default:
+    external: true
@@ -14,6 +14,7 @@
     "import json\n",
     "import csv\n",
     "\n",
+    "\n",
     "headers = {\"User-Agent\": \"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36\"}"
    ]
   },
@@ -328,22 +329,13 @@
     " main()"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "import pandas as pd\n",
-    "\n",
-    "df = pd.read_csv(\"~/Downloads/scholenopdekaart.csv\", index_col=0)\n",
+    "df = pd.read_csv(\"scholenopdekaart.csv\", index_col=0)\n",
     "\n",
     "df.head()"
    ]
@@ -354,101 +346,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def to_dict(row):\n",
-    "    # Empty?\n",
-    "    if (pd.isna(row)):\n",
-    "        return {}\n",
-    "    # Evaluate, to dict\n",
-    "    dict_data = dict(eval(row))\n",
-    "    # Remove None values\n",
-    "    for k in list(dict_data.keys()):\n",
-    "        if dict_data[k] is None:\n",
-    "            del dict_data[k]\n",
-    "    # Prefix\n",
-    "    return {f\"{column}_{k}\": v for k, v in dict_data.items()}\n",
-    "\n",
-    "for column in [\"students_per_year_trend\", \"num_students_per_group\", \"num_students_per_age\"]:\n",
-    "    print(column)\n",
-    "    # Convert the list of tuples into a dictionary per row\n",
-    "    df_dicts = df[column].apply(to_dict)\n",
-    "    # Expand into separate columns\n",
-    "    df_expanded = pd.json_normalize(df_dicts)\n",
-    "    # Sort\n",
-    "    df_expanded = df_expanded[sorted(df_expanded.columns)]\n",
-    "    # Combine with original columns\n",
-    "    df = pd.concat([df.drop(columns=[column]), df_expanded], axis=1)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def to_dict(row):\n",
-    "    # Empty?\n",
-    "    if (pd.isna(row)):\n",
-    "        return {}\n",
-    "    # Evaluate, to dict\n",
-    "    data = eval(row)\n",
-    "    # Remove first useless data\n",
-    "    data = data[1:]\n",
-    "\n",
-    "    # Generate dict\n",
-    "    dict_data = {}\n",
-    "    for (zipcode, num, percentage) in data:\n",
-    "        dict_data[f\"num_students_zipcode_{zipcode}\"] = num\n",
-    "        dict_data[f\"percentage_students_zipcode_{zipcode}\"] = percentage\n",
-    "\n",
-    "    # Remove None values\n",
-    "    for k in list(dict_data.keys()):\n",
-    "        if dict_data[k] is None:\n",
-    "            del dict_data[k]\n",
-    "    return dict_data\n",
-    "\n",
-    "for column in [\"students_per_zipcode\"]:\n",
-    "    print(column)\n",
-    "    # Convert the list of tuples into a dictionary per row\n",
-    "    df_dicts = df[column].apply(to_dict)\n",
-    "    # Expand into separate columns\n",
-    "    df_expanded = pd.json_normalize(df_dicts)\n",
-    "    # Sort\n",
-    "    df_expanded = df_expanded[sorted(df_expanded.columns)]\n",
-    "    # Combine with original columns\n",
-    "    df = pd.concat([df.drop(columns=[column]), df_expanded], axis=1)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df.to_csv(\"schools_nl.csv\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "list(df.columns)"
+    "df.tail()"
    ]
   }
  ],
 "metadata": {
  "kernelspec": {
-   "display_name": "fetcher",
+   "display_name": "matitos_urls",
   "language": "python",
   "name": "python3"
  },
@@ -462,7 +366,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.12.11"
+   "version": "3.12.9"
  }
 },
 "nbformat": 4,