Refactoring fetcher, working feeds and raw url writer
This commit is contained in:
62
app_urls/api/obsolete_src/url_status.py
Normal file
62
app_urls/api/obsolete_src/url_status.py
Normal file
@@ -0,0 +1,62 @@
|
||||
from .db_utils import URL_DB_Writer
|
||||
from .url_utils import process_article
|
||||
from .logger import get_logger
|
||||
# Module-level logger obtained from the project's get_logger() helper.
logger = get_logger()
|
||||
|
||||
class UpdateErrorURLs():
    """Re-evaluate URLs stored with status 'error' and persist status changes.

    URLs are fetched through ``URL_DB_Writer``, re-processed with
    ``process_article`` and, when the resulting status differs from
    'error', written back to the DB in batches.
    """

    def __init__(self, db_connect_info, redis_connect_info, num_urls) -> None:
        """Store the settings and create the DB writer.

        Args:
            db_connect_info: Passed through to ``URL_DB_Writer``.
            redis_connect_info: Passed through to ``URL_DB_Writer``.
            num_urls: Limit handed to the DB writer when fetching the
                URLs with status 'error'.
        """
        self.num_urls = num_urls
        self.db_connect_info = db_connect_info
        self.redis_connect_info = redis_connect_info
        self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info)

    def _flush_status_updates(self, dict_status_ids, dict_status_urls):
        """Log the pending batch and write it to the DB.

        Args:
            dict_status_ids: Mapping new status -> list of URL ids to update.
            dict_status_urls: Mapping new status -> list of URLs; only used
                for the log line.
        """
        logger.info("Updating status to URLs with error: {}".format(dict_status_urls))
        self.db_writer._update_urls_status(dict_status_ids)

    def update_error_urls_status(self):
        """Re-process URLs with status 'error' and update those that changed.

        Any exception raised while fetching, processing or updating is
        logged as a warning and swallowed (best-effort job); updates
        collected but not yet flushed at that point are not written.
        """
        try:
            logger.info("Starting updating status to URLs with error, limit #URLs: {}".format(self.num_urls))
            # List of (id, url) pairs whose status is 'error'
            list_ids_and_urls = self.db_writer._get_error_urls(self.num_urls)
            # Status all fetched URLs currently have
            current_status = "error"

            # (pattern, priority, status) tuples used to override the status;
            # sorted in place by priority (index 1), highest first
            list_pattern_status_tuple = self.db_writer._get_pattern_status_list()
            list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True)

            # Number of pending changes that triggers a DB write
            flush_every = 20
            flush_current = 0
            # new status -> ids to update / new status -> URLs (for logging)
            dict_status_ids, dict_status_urls = {}, {}

            for url_id, url in list_ids_and_urls:
                # Only the third element (the status) is needed here
                _, _, new_status = process_article(url, list_pattern_status_tuple)

                if new_status != current_status:
                    dict_status_ids.setdefault(new_status, []).append(url_id)
                    dict_status_urls.setdefault(new_status, []).append(url)
                    flush_current += 1

                # Batch full? Write it and start a fresh one
                if flush_current == flush_every:
                    self._flush_status_updates(dict_status_ids, dict_status_urls)
                    flush_current = 0
                    dict_status_ids, dict_status_urls = {}, {}

            # Write whatever is left from the last, incomplete batch
            if flush_current > 0:
                self._flush_status_updates(dict_status_ids, dict_status_urls)

            logger.info("Finished updating status to URLs with error")
        except Exception as e:
            # Top-level boundary of the job: log and continue instead of crashing
            logger.warning("Exception in UpdateErrorURLs.update_error_urls_status(): {}".format(str(e)))
|
||||
Reference in New Issue
Block a user