From ef51a96db6ad717f0a86dec6e221bd7b4af66c27 Mon Sep 17 00:00:00 2001 From: Luciano Gervasoni Date: Mon, 8 Sep 2025 16:20:39 +0200 Subject: [PATCH] Process missing kids url based on API endpoint, fix2 --- app_urls/fetcher/src/db_utils.py | 9 +-------- app_urls/fetcher/src/fetch_parser.py | 3 ++- app_urls/fetcher/src/utils.py | 8 ++++++++ 3 files changed, 11 insertions(+), 9 deletions(-) create mode 100644 app_urls/fetcher/src/utils.py diff --git a/app_urls/fetcher/src/db_utils.py b/app_urls/fetcher/src/db_utils.py index 6b5a5fd..d0ba365 100644 --- a/app_urls/fetcher/src/db_utils.py +++ b/app_urls/fetcher/src/db_utils.py @@ -5,6 +5,7 @@ from django.db import IntegrityError from django.utils import timezone from datetime import timedelta from .fetch_utils_url_processor import process_url, verify_missing_kid_url +from .utils import get_with_protocol import re import requests import os @@ -17,14 +18,6 @@ class DB_Handler(): pass def insert_raw_urls(self, urls, obj_source, obj_search): - def get_with_protocol(url): - # http:// -> https:// - url = url.replace("http://", "https://") - # "" -> https:// - if not (url.startswith("https://")): - url = "https://" + url - return url - try: logger.debug("Inserting raw URLs") # Empty? diff --git a/app_urls/fetcher/src/fetch_parser.py b/app_urls/fetcher/src/fetch_parser.py index 4ffb25e..45aebf5 100644 --- a/app_urls/fetcher/src/fetch_parser.py +++ b/app_urls/fetcher/src/fetch_parser.py @@ -1,6 +1,7 @@ from .db_utils import DB_Handler from ..models import Search, Source -from .fetch_utils_url_processor import get_with_protocol, url_host_slowdown +from .fetch_utils_url_processor import url_host_slowdown +from .utils import get_with_protocol import newspaper import traceback from .logger import get_logger diff --git a/app_urls/fetcher/src/utils.py b/app_urls/fetcher/src/utils.py new file mode 100644 index 0000000..3e565a0 --- /dev/null +++ b/app_urls/fetcher/src/utils.py @@ -0,0 +1,8 @@ + +def get_with_protocol(url): + # http:// -> https:// + url = url.replace("http://", "https://") + # "" -> https:// + if not (url.startswith("https://")): + url = "https://" + url + return url \ No newline at end of file