import time

from .db_utils import URL_DB_Writer
# NOTE(review): this imported name is rebound by the function of the same name
# defined below, so the imported object is never used here. The import is kept
# in case importing .url_utils is relied on elsewhere -- confirm and remove.
from .url_utils import get_missing_kid_status
from .logger import get_logger

logger = get_logger()

# Pause before every request, in seconds.
_REQUEST_DELAY_SECONDS = 0.75
# Timeout handed to requests.get, in seconds.
_REQUEST_TIMEOUT_SECONDS = 300
# HTTP status code -> URL status label. Anything else maps to "unknown".
_STATUS_BY_HTTP_CODE = {200: "valid", 404: "invalid"}


def get_missing_kid_status(url, return_canonical_url=False):
    """Request ``url`` and classify it from the HTTP status code.

    The function sleeps for a fixed delay before every request.

    Args:
        url: URL to request with ``requests.get``.
        return_canonical_url: When true, also return the URL reported by the
            response object (``r.url``), or ``url`` itself if the request
            raised.

    Returns:
        ``"valid"`` for HTTP 200, ``"invalid"`` for HTTP 404 and ``"unknown"``
        for any other status code or when the request raised an exception.
        If ``return_canonical_url`` is true, a ``(status, url)`` tuple is
        returned instead.
    """
    # Kept function-local, as in the original, so that importing this module
    # does not by itself import requests.
    import requests

    time.sleep(_REQUEST_DELAY_SECONDS)

    try:
        r = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
        status_code = r.status_code
        # URL as reported by the response object.
        url_canonical = r.url
    except Exception as e:
        # Best effort: a failed request is reported as "unknown", not raised.
        logger.warning("Exception on get URL status request: {}. {}".format(url, str(e)))
        status_code = None
        url_canonical = url

    status = _STATUS_BY_HTTP_CODE.get(status_code, "unknown")
    logger.debug("Missing Kid URL {} status: {}".format(url, status))

    if return_canonical_url:
        return status, url_canonical
    return status


class MissingKidsStatus:
    """Re-check the status of stored Missing Kids URLs and persist changes."""

    def __init__(self, db_connect_info, redis_connect_info, num_urls) -> None:
        """Store the connection settings and create the DB writer.

        Args:
            db_connect_info: Passed through to ``URL_DB_Writer``.
            redis_connect_info: Passed through to ``URL_DB_Writer``.
            num_urls: Limit passed to ``_get_missing_kids_urls`` on each run.
        """
        self.num_urls = num_urls
        self.db_connect_info = db_connect_info
        self.redis_connect_info = redis_connect_info
        self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info)

    def _flush_status_updates(self, dict_status_ids, dict_status_urls):
        """Log the pending changes and write them through the DB writer.

        Args:
            dict_status_ids: Mapping of new status -> list of URL ids.
            dict_status_urls: Mapping of new status -> list of URLs; only used
                for the log line.
        """
        logger.info("Updating status to Missing Kids URLs: {}".format(dict_status_urls))
        self.db_writer._update_urls_status(dict_status_ids)

    def update_missing_kids_status(self, *, skip_invalid_check=False, flush_every=20):
        """Re-check stored URLs and write changed statuses in batches.

        URLs are read with ``_get_missing_kids_urls(self.num_urls)``, which is
        iterated as ``(id, url, current_status)`` triples. Entries whose
        current status is ``"duplicate"`` are never requested. For the rest,
        the status is recomputed with ``get_missing_kid_status`` and, when it
        differs from the stored one, queued for update.

        Any ``Exception`` raised during the run is logged as a warning and
        not propagated; changes still queued at that point are not written.

        Args:
            skip_invalid_check: When true, URLs currently marked ``"invalid"``
                are skipped as well. Defaults to ``False``.
            flush_every: Number of queued status changes that triggers a write
                to the database. Defaults to ``20``.
        """
        try:
            logger.info("Starting updating status to Missing Kids URLs, limit #URLs: {}".format(self.num_urls))
            list_ids_and_urls = self.db_writer._get_missing_kids_urls(self.num_urls)

            # New status -> ids to update, and new status -> URLs (log only).
            dict_status_ids, dict_status_urls = {}, {}
            pending = 0

            for url_id, url, current_status in list_ids_and_urls:
                if current_status == "duplicate":
                    continue
                if skip_invalid_check and current_status == "invalid":
                    continue

                new_status = get_missing_kid_status(url)
                if current_status == new_status:
                    continue

                dict_status_ids.setdefault(new_status, []).append(url_id)
                dict_status_urls.setdefault(new_status, []).append(url)
                pending += 1

                # Only changed URLs count towards a batch, so an empty update
                # is never sent to the database.
                if pending >= flush_every:
                    self._flush_status_updates(dict_status_ids, dict_status_urls)
                    dict_status_ids, dict_status_urls = {}, {}
                    pending = 0

            # Write whatever is left over from the last partial batch.
            if pending > 0:
                self._flush_status_updates(dict_status_ids, dict_status_urls)

            logger.info("Finished updating status to Missing Kids URLs")
        except Exception as e:
            logger.warning("Exception in MissingKidsStatus.update_missing_kids_status(): {}".format(str(e)))