Views: update URL links list, fix job default timeout variable, add debug logging for URL content failures

Luciano Gervasoni
2025-04-04 20:11:22 +02:00
parent 3e84fc4508
commit e87f10e7d4
3 changed files with 47 additions and 51 deletions


@@ -151,26 +151,30 @@ class DB_Handler():
         # Update status
         set_status(obj_url, Urls.STATUS_ENUM.VALID)
-        # Create or update extracted URL data
-        UrlContent.objects.update_or_create(
-            id_url=obj_url,
-            defaults = {
-                "date_published" : dict_url_data.get("publish_date"),
-                "title" : dict_url_data.get("title"),
-                "description" : dict_url_data.get("description"),
-                "content" : dict_url_data.get("content"),
-                "valid_content" : dict_url_data.get("valid_content"),
-                "language" : dict_url_data.get("language"),
-                "keywords" : dict_url_data.get("keywords"),
-                "tags" : dict_url_data.get("tags"),
-                "authors" : dict_url_data.get("authors"),
-                "image_main_url" : dict_url_data.get("image_main_url"),
-                "images_url" : dict_url_data.get("images_url"),
-                "videos_url" : dict_url_data.get("videos_url"),
-                "url_host" : dict_url_data.get("url_host"),
-                "site_name" : dict_url_data.get("site_name"),
-            }
-        )
+        try:
+            # Create or update extracted URL data
+            UrlContent.objects.update_or_create(
+                id_url=obj_url,
+                defaults = {
+                    "date_published" : dict_url_data.get("publish_date"),
+                    "title" : dict_url_data.get("title"),
+                    "description" : dict_url_data.get("description"),
+                    "content" : dict_url_data.get("content"),
+                    "valid_content" : dict_url_data.get("valid_content"),
+                    "language" : dict_url_data.get("language"),
+                    "keywords" : dict_url_data.get("keywords"),
+                    "tags" : dict_url_data.get("tags"),
+                    "authors" : dict_url_data.get("authors"),
+                    "image_main_url" : dict_url_data.get("image_main_url"),
+                    "images_url" : dict_url_data.get("images_url"),
+                    "videos_url" : dict_url_data.get("videos_url"),
+                    "url_host" : dict_url_data.get("url_host"),
+                    "site_name" : dict_url_data.get("site_name"),
+                }
+            )
+        except Exception as e:
+            logger.debug("Error in update_or_create UrlContent: {}\ndict_url_data: {}\n{}\n{}".format(obj_url.url, dict_url_data, str(e), traceback.format_exc()))

     def process_raw_urls(self, batch_size):
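The new except branch references logger and traceback, which must exist at module scope for the error path to run. A minimal sketch of the assumed setup (the logger name is a guess; the actual configuration is not shown in this hunk):

import logging
import traceback  # used by traceback.format_exc() in the except branch

# Assumed module-level logger; the real setup is not part of this diff
logger = logging.getLogger(__name__)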


@@ -17,24 +17,16 @@ def trigger_task(request, task):
 ####################################################################################################
 def link_list(request):
-    prefix = "http://localhost:8000/task"
-    links = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all", "process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"]
-    list_links = [
-        # DB
-        "http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id&limit=500",
-        # Admin panel
-        "http://localhost:8000/admin",
-        # Logs
-        "http://localhost:8000/logs/debug",
-        "http://localhost:8000/logs/info",
-        "http://localhost:8000/logs/error",
-        # URLs
-        "http://localhost:8000/urls",
-        # Charts
-        "http://localhost:8000/urls/charts",
-        # Fetcher tasks
-    ] + [os.path.join(prefix, l) for l in links]
+    # Base URL path
+    app_url = request.build_absolute_uri()
+    # Tasks
+    links_fetch = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all"]
+    links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"]
+    # List of links
+    list_links = \
+        [ os.path.join(app_url, "admin"), os.path.join(app_url, "urls") ] + \
+        [ os.path.join(app_url, "logs", log_type) for log_type in ["debug", "info", "error"] ] + \
+        [ os.path.join(app_url, "task", l) for l in links_fetch + links_process ]
     # Json
     return JsonResponse({"links": list_links })
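For illustration, assuming the view is mounted at the site root so request.build_absolute_uri() returns http://localhost:8000/ (the actual routing is not part of this commit), the endpoint would respond with JSON shaped roughly like this (list truncated):

{
  "links": [
    "http://localhost:8000/admin",
    "http://localhost:8000/urls",
    "http://localhost:8000/logs/debug",
    "http://localhost:8000/logs/info",
    "http://localhost:8000/logs/error",
    "http://localhost:8000/task/fetch_feeds",
    "http://localhost:8000/task/process_raw_urls_50"
  ]
}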


@@ -50,7 +50,7 @@ services:
       - REDIS_HOST=${REDIS_HOST:-fetcher_redis}
       - REDIS_PORT=${REDIS_PORT:-6379}
       # Job timeout: 30 min
-      - JOB_DEFAULT_TIMEOUT=${RQ_DEFAULT_TIMEOUT:-1800}
+      - JOB_DEFAULT_TIMEOUT=${JOB_DEFAULT_TIMEOUT:-1800}
       # Fetcher
       - FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP-2}
       - FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP:-5}
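The fix makes the container variable inherit from a host variable of the same name (JOB_DEFAULT_TIMEOUT now falls back to 1800 instead of reading the unrelated RQ_DEFAULT_TIMEOUT). A minimal sketch of how the worker side might consume it, assuming an RQ queue (the queue name and Redis wiring here are illustrative, not from this commit):

import os

from redis import Redis
from rq import Queue

# Same 30-minute fallback as the compose file
job_timeout = int(os.environ.get("JOB_DEFAULT_TIMEOUT", "1800"))

redis_conn = Redis(
    host=os.environ.get("REDIS_HOST", "fetcher_redis"),
    port=int(os.environ.get("REDIS_PORT", "6379")),
)

# Jobs enqueued on this queue inherit the default timeout unless overridden per job
queue = Queue("default", connection=redis_conn, default_timeout=job_timeout)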
@@ -64,7 +64,7 @@ services:
     #   - ./app_urls:/opt/app
     ########################
     ports:
-      - 8000 # :8000
+      - 8000:8000
     depends_on:
       - fetcher_db
      - fetcher_redis
@@ -76,14 +76,14 @@ services:
         limits:
           cpus: '4'
           memory: 4G
-    labels: # Reverse proxy sample
-      - "traefik.enable=true"
-      - "traefik.http.routers.fetcher.rule=Host(`fetcher.matitos.org`)"
-      - "traefik.http.routers.fetcher.entrypoints=websecure"
-      - "traefik.http.routers.fetcher.tls.certresolver=myresolvercd"
-    networks:
-      - default # This network
-      - docker_default # Reverse proxy network
+    #labels: # Reverse proxy sample
+    #  - "traefik.enable=true"
+    #  - "traefik.http.routers.fetcher.rule=Host(`fetcher.matitos.org`)"
+    #  - "traefik.http.routers.fetcher.entrypoints=websecure"
+    #  - "traefik.http.routers.fetcher.tls.certresolver=myresolvercd"
+    #networks:
+    #  - default # This network
+    #  - docker_default # Reverse proxy network
   fetcher_db:
     image: postgres:17
@@ -108,6 +108,6 @@ services:
     ports:
       - 6379 #:6379
-networks:
-  docker_default:
-    external: true
+#networks:
+#  docker_default:
+#    external: true