From e87f10e7d442b83fb7fdece9bdeff3420b9d9771 Mon Sep 17 00:00:00 2001 From: Luciano Gervasoni Date: Fri, 4 Apr 2025 20:11:22 +0200 Subject: [PATCH] Views update URLs list, job timeout, url content fail debug --- app_urls/fetcher/src/db_utils.py | 44 +++++++++++++++++--------------- app_urls/fetcher/views.py | 28 ++++++++------------ docker-compose.yml | 26 +++++++++---------- 3 files changed, 47 insertions(+), 51 deletions(-) diff --git a/app_urls/fetcher/src/db_utils.py b/app_urls/fetcher/src/db_utils.py index 4bf23d4..324a01e 100644 --- a/app_urls/fetcher/src/db_utils.py +++ b/app_urls/fetcher/src/db_utils.py @@ -151,26 +151,30 @@ class DB_Handler(): # Update status set_status(obj_url, Urls.STATUS_ENUM.VALID) - # Create or update extracted URL data - UrlContent.objects.update_or_create( - id_url=obj_url, - defaults = { - "date_published" : dict_url_data.get("publish_date"), - "title" : dict_url_data.get("title"), - "description" : dict_url_data.get("description"), - "content" : dict_url_data.get("content"), - "valid_content" : dict_url_data.get("valid_content"), - "language" : dict_url_data.get("language"), - "keywords" : dict_url_data.get("keywords"), - "tags" : dict_url_data.get("tags"), - "authors" : dict_url_data.get("authors"), - "image_main_url" : dict_url_data.get("image_main_url"), - "images_url" : dict_url_data.get("images_url"), - "videos_url" : dict_url_data.get("videos_url"), - "url_host" : dict_url_data.get("url_host"), - "site_name" : dict_url_data.get("site_name"), - } - ) + try: + # Create or update extracted URL data + UrlContent.objects.update_or_create( + id_url=obj_url, + defaults = { + "date_published" : dict_url_data.get("publish_date"), + "title" : dict_url_data.get("title"), + "description" : dict_url_data.get("description"), + "content" : dict_url_data.get("content"), + "valid_content" : dict_url_data.get("valid_content"), + "language" : dict_url_data.get("language"), + "keywords" : dict_url_data.get("keywords"), + "tags" : dict_url_data.get("tags"), + "authors" : dict_url_data.get("authors"), + "image_main_url" : dict_url_data.get("image_main_url"), + "images_url" : dict_url_data.get("images_url"), + "videos_url" : dict_url_data.get("videos_url"), + "url_host" : dict_url_data.get("url_host"), + "site_name" : dict_url_data.get("site_name"), + } + ) + except Exception as e: + logger.debug("Error in update_or_create UrlContent: {}\ndict_url_data: {}\n{}\n{}".format(obj_url.url, dict_url_data, str(e), traceback.format_exc())) + def process_raw_urls(self, batch_size): diff --git a/app_urls/fetcher/views.py b/app_urls/fetcher/views.py index 739098c..bc9598b 100644 --- a/app_urls/fetcher/views.py +++ b/app_urls/fetcher/views.py @@ -17,24 +17,16 @@ def trigger_task(request, task): #################################################################################################### def link_list(request): - prefix = "http://localhost:8000/task" - links = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all", "process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"] - - list_links = [ - # DB - "http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id&limit=500", - # Admin panel - "http://localhost:8000/admin", - # Logs - "http://localhost:8000/logs/debug", - "http://localhost:8000/logs/info", - "http://localhost:8000/logs/error", - # URLs - "http://localhost:8000/urls", - # Charts - "http://localhost:8000/urls/charts", - # Fetcher tasks - ] + [os.path.join(prefix, l) for l in links] + # Base URL path + app_url = request.build_absolute_uri() + # Tasks + links_fetch = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all"] + links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"] + # List of links + list_links = \ + [ os.path.join(app_url, "admin"), os.path.join(app_url, "urls") ] + \ + [ os.path.join(app_url, "logs", log_type) for log_type in ["debug", "info", "error"] ] + \ + [ os.path.join(app_url, "task", l) for l in links_fetch + links_process ] # Json return JsonResponse({"links": list_links }) diff --git a/docker-compose.yml b/docker-compose.yml index ebff2c4..60e077f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -50,7 +50,7 @@ services: - REDIS_HOST=${REDIS_HOST:-fetcher_redis} - REDIS_PORT=${REDIS_PORT:-6379} # Job timeout: 30 min - - JOB_DEFAULT_TIMEOUT=${RQ_DEFAULT_TIMEOUT:-1800} + - JOB_DEFAULT_TIMEOUT=${JOB_DEFAULT_TIMEOUT:-1800} # Fetcher - FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP-2} - FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP:-5} @@ -64,7 +64,7 @@ services: # - ./app_urls:/opt/app ######################## ports: - - 8000 # :8000 + - 8000:8000 depends_on: - fetcher_db - fetcher_redis @@ -76,14 +76,14 @@ services: limits: cpus: '4' memory: 4G - labels: # Reverse proxy sample - - "traefik.enable=true" - - "traefik.http.routers.fetcher.rule=Host(`fetcher.matitos.org`)" - - "traefik.http.routers.fetcher.entrypoints=websecure" - - "traefik.http.routers.fetcher.tls.certresolver=myresolvercd" - networks: - - default # This network - - docker_default # Reverse proxy network + #labels: # Reverse proxy sample + # - "traefik.enable=true" + # - "traefik.http.routers.fetcher.rule=Host(`fetcher.matitos.org`)" + # - "traefik.http.routers.fetcher.entrypoints=websecure" + # - "traefik.http.routers.fetcher.tls.certresolver=myresolvercd" + #networks: + # - default # This network + # - docker_default # Reverse proxy network fetcher_db: image: postgres:17 @@ -108,6 +108,6 @@ services: ports: - 6379 #:6379 -networks: - docker_default: - external: true +#networks: +# docker_default: +# external: true