def get_with_protocol(url):
    """Return *url* with a leading ``https://`` scheme.

    - A leading ``http://`` is upgraded to ``https://``.
    - A URL that already starts with ``https://`` is returned unchanged.
    - Anything else (e.g. a bare host such as ``example.com``) gets
      ``https://`` prepended.

    Only the leading scheme is touched: an ``http://`` that appears later
    in the string (for example inside a query parameter) is left as-is, so
    embedded redirect/target URLs are not silently rewritten.

    Args:
        url: The URL string to normalize.

    Returns:
        The URL string starting with ``https://``.
    """
    secure_scheme = "https://"
    plain_scheme = "http://"

    # Already normalized: nothing to do.
    if url.startswith(secure_scheme):
        return url

    # http:// -> https:// (scheme prefix only, not embedded occurrences)
    if url.startswith(plain_scheme):
        return secure_scheme + url[len(plain_scheme):]

    # "" -> https://
    return secure_scheme + url