Files
matitos_news/app_fetcher/src/missing_kids_status.py
Luciano Gervasoni 54ebd58070 Url content
2025-03-07 00:34:46 +01:00

69 lines
2.9 KiB
Python

import requests
from .db_utils import URL_DB_Writer
from .url_utils import get_missing_kid_status
import time
import logging
# Root logging layout: "<source file> | <level> | <timestamp> | <message>".
logging.basicConfig(
    format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s',
)
# Named logger used by everything in this module.
logger = logging.getLogger("news_fetcher")
class MissingKidsStatus():
    """Refresh the stored status of Missing Kids URLs.

    A run reads rows of ``(id, url, current_status)`` through
    ``URL_DB_Writer``, asks ``get_missing_kid_status`` for the status of each
    URL, and writes back only the rows whose status changed.
    """

    def __init__(self, db_connect_info, redis_connect_info, num_urls) -> None:
        """Keep the settings and build the DB writer used by every run.

        Args:
            db_connect_info: Passed unchanged to ``URL_DB_Writer``.
            redis_connect_info: Passed unchanged to ``URL_DB_Writer``.
            num_urls: Limit handed to ``_get_missing_kids_urls`` on each run.
        """
        self.num_urls = num_urls
        self.db_connect_info = db_connect_info
        self.redis_connect_info = redis_connect_info
        self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info)

    def _flush_status_updates(self, ids_by_status, urls_by_status) -> None:
        """Log and persist one batch of status changes.

        Args:
            ids_by_status: Dict mapping new status -> list of row IDs to update.
            urls_by_status: Same grouping with URLs; only used for the log line.
        """
        # Nothing changed in this batch: skip the log line and the DB call.
        if not ids_by_status:
            return
        logger.info("Updating status to Missing Kids URLs: {}".format(urls_by_status))
        self.db_writer._update_urls_status(ids_by_status)

    def update_missing_kids_status(self, *, skip_invalid_check=False, flush_every=20) -> None:
        """Re-check the fetched URLs and store every status that changed.

        Rows whose current status is ``"duplicate"`` are never re-checked.
        Changes are grouped by new status and written to the DB in batches.

        Args:
            skip_invalid_check: When true, rows currently marked ``"invalid"``
                are skipped as well instead of being re-checked.
            flush_every: Number of changed URLs to accumulate before writing a
                batch to the DB. Values below 1 flush after every change.

        Any exception raised during the run is logged as a warning (with
        traceback) and not propagated; changes not yet flushed are dropped.
        """
        try:
            logger.info("Starting updating status to Missing Kids URLs, limit #URLs: {}".format(self.num_urls))
            # Rows of (id, url, current_status)
            rows = self.db_writer._get_missing_kids_urls(self.num_urls)
            # Pending changes, grouped by new status
            ids_by_status, urls_by_status = {}, {}
            pending = 0

            for (url_id, url, current_status) in rows:
                # Duplicates are never re-checked
                if current_status == "duplicate":
                    continue
                # Optionally leave invalid URLs alone
                if skip_invalid_check and current_status == "invalid":
                    continue

                new_status = get_missing_kid_status(url)
                # Unchanged status: nothing to write
                if current_status == new_status:
                    continue

                ids_by_status.setdefault(new_status, []).append(url_id)
                urls_by_status.setdefault(new_status, []).append(url)
                pending += 1

                # Batch full: write it out and start a new one
                if pending >= flush_every:
                    self._flush_status_updates(ids_by_status, urls_by_status)
                    ids_by_status, urls_by_status = {}, {}
                    pending = 0

            # Write whatever is left over from the last partial batch
            self._flush_status_updates(ids_by_status, urls_by_status)
            logger.info("Finished updating status to Missing Kids URLs")
        except Exception as e:
            # Best-effort job: report the failure (with traceback) and return.
            logger.warning(
                "Exception in MissingKidsStatus.update_missing_kids_status(): {}".format(str(e)),
                exc_info=True,
            )