diff --git a/app_urls/fetcher/src/db_utils.py b/app_urls/fetcher/src/db_utils.py index 47e1853..f65df66 100644 --- a/app_urls/fetcher/src/db_utils.py +++ b/app_urls/fetcher/src/db_utils.py @@ -4,7 +4,7 @@ from django.core.cache import cache from django.db import IntegrityError from django.utils import timezone from datetime import timedelta -from .fetch_utils_url_processor import process_url, get_with_protocol +from .fetch_utils_url_processor import process_url, get_with_protocol, url_host_slowdown import re import requests import os @@ -104,6 +104,8 @@ class DB_Handler(): ########################################################################## # URL pattern: missingkids.org/poster OR missingkids.org/new-poster if ("missingkids.org" in obj_url.url) and ("poster" in obj_url.url): + # Sleep required? To avoid too many requests error (original URL, not paywall bypassing endpoint) + url_host_slowdown(obj_url.url, url_host_slowdown_seconds=float(os.getenv("FETCHER_URL_HOST_SLEEP", 5))) # Request r = requests.get(obj_url.url, allow_redirects=True) @@ -113,9 +115,9 @@ class DB_Handler(): # Set duplicate, and insert new canonical form self._set_duplicate_and_insert_canonical(obj_url, url_canonical) elif (r.status_code == 200): - self._set_status(self, obj_url, Urls.STATUS_ENUM.VALID) + self._set_status(obj_url, Urls.STATUS_ENUM.VALID) elif (r.status_code == 404): - self._set_status(self, obj_url, Urls.STATUS_ENUM.INVALID) + self._set_status(obj_url, Urls.STATUS_ENUM.INVALID) else: logger.info("Unknown request status: {} for missing kids request: {}".format(r.status_code, obj_url.url))