98 lines
3.6 KiB
Python
98 lines
3.6 KiB
Python
from .db_utils import URL_DB_Writer
|
|
from .url_utils import get_missing_kid_status
|
|
from .logger import get_logger
|
|
# Module-level logger used by get_missing_kid_status() and MissingKidsStatus below.
logger = get_logger()
|
|
|
|
|
|
def get_missing_kid_status(url, return_canonical_url=False):
    """Classify a URL from the HTTP status code of a GET request to it.

    The function pauses 0.75 seconds, issues the request with a 300 second
    timeout, and maps the response code to a status label:

    * 200 -> "valid"
    * 404 -> "invalid"
    * any other code, or a failed request -> "unknown"

    Args:
        url: Address to request.
        return_canonical_url: When truthy, also return the URL reported by
            the response (``response.url``); if the request failed this is
            the input ``url`` unchanged.

    Returns:
        The status label, or a ``(status, url)`` tuple when
        ``return_canonical_url`` is truthy.
    """
    # NOTE(review): this definition rebinds the name that the top of the file
    # also imports from .url_utils -- confirm which implementation is intended.
    import time
    import requests

    # Fixed pause before every request.
    time.sleep(0.75)

    # Values reported when the request cannot be completed.
    http_code, resolved_url = None, url
    try:
        response = requests.get(url, timeout=300)
        http_code, resolved_url = response.status_code, response.url
    except Exception as err:
        # Best effort: a failed request is logged and reported as "unknown".
        logger.warning("Exception on get URL status request: {}. {}".format(url, str(err)))

    # Only 200 and 404 are conclusive; everything else is "unknown".
    labels_by_code = {200: "valid", 404: "invalid"}
    status = labels_by_code.get(http_code, "unknown")

    logger.debug("Missing Kid URL {} status: {}".format(url, status))

    return (status, resolved_url) if return_canonical_url else status
|
|
|
|
class MissingKidsStatus():
    """Re-check stored Missing Kids URLs and persist status changes in batches.

    The URLs are read through a ``URL_DB_Writer``; each one is probed with
    ``get_missing_kid_status`` and every URL whose status differs from the
    stored one is written back through the same writer.
    """

    def __init__(self, db_connect_info, redis_connect_info, num_urls) -> None:
        """Store the connection settings and build the DB writer.

        Args:
            db_connect_info: Passed unchanged to ``URL_DB_Writer``.
            redis_connect_info: Passed unchanged to ``URL_DB_Writer``.
            num_urls: Limit handed to ``_get_missing_kids_urls`` on each run.
        """
        self.num_urls = num_urls
        self.db_connect_info = db_connect_info
        self.redis_connect_info = redis_connect_info
        self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info)

    def _flush_status_updates(self, ids_by_status, urls_by_status):
        """Log one pending batch and write it to the DB.

        Args:
            ids_by_status: Maps new status -> list of URL ids to update.
            urls_by_status: Same grouping with the URLs themselves; only
                used for the log line.
        """
        logger.info("Updating status to Missing Kids URLs: {}".format(urls_by_status))
        self.db_writer._update_urls_status(ids_by_status)

    def update_missing_kids_status(self, skip_invalid_check=False, flush_every=20):
        """Probe the stored URLs and write back every status that changed.

        Rows whose stored status is "duplicate" are never probed. Changes are
        grouped by new status and flushed to the DB every ``flush_every``
        changed URLs, with a final flush for the remainder. Any exception is
        logged as a warning and ends the run; it is not re-raised.

        Args:
            skip_invalid_check: When truthy, rows already stored as "invalid"
                are skipped instead of being probed again.
            flush_every: Number of changed URLs to accumulate before writing
                a batch. Values below 1 flush after every change.
        """
        try:
            logger.info("Starting updating status to Missing Kids URLs, limit #URLs: {}".format(self.num_urls))
            # Rows are unpacked below as (id, url, current_status).
            list_ids_and_urls = self.db_writer._get_missing_kids_urls(self.num_urls)

            # Pending batch: new status -> ids to update / URLs for the log.
            ids_by_status, urls_by_status = {}, {}
            pending = 0

            for (url_id, url, current_status) in list_ids_and_urls:
                if current_status == "duplicate":
                    continue
                if skip_invalid_check and current_status == "invalid":
                    continue

                new_status = get_missing_kid_status(url)
                if current_status == new_status:
                    # Nothing to write for unchanged rows.
                    continue

                ids_by_status.setdefault(new_status, []).append(url_id)
                urls_by_status.setdefault(new_status, []).append(url)
                pending += 1

                if pending >= flush_every:
                    self._flush_status_updates(ids_by_status, urls_by_status)
                    # Start fresh dicts rather than clearing, so the objects
                    # already handed to the writer are never mutated.
                    ids_by_status, urls_by_status = {}, {}
                    pending = 0

            # Flush whatever did not fill a complete batch.
            if pending > 0:
                self._flush_status_updates(ids_by_status, urls_by_status)

            logger.info("Finished updating status to Missing Kids URLs")
        except Exception as e:
            # Best effort: log and stop. Changes not yet flushed are dropped.
            logger.warning("Exception in MissingKidsStatus.update_missing_kids_status(): {}".format(str(e)))
|
|
|