matitos_news/app_urls/fetcher/tasks.py

from celery import shared_task

from .src.fetch_feed import FetchFeeds
from .src.fetch_parser import FetchParser
from .src.fetch_search import FetchSearcher
from .src.fetch_missing_kids import FetchMissingKids
from .src.fetch_selenium import FetchSeleniumSourceSearch
from .src.db_utils import DB_Handler
from .src.publisher import Publisher
from .src.logger import get_logger

logger = get_logger()


@shared_task(queue='default')
def fetch_feeds():
    task = "Fetch Feeds"
    logger.info("Task triggered: {}".format(task))
    FetchFeeds().run()
    logger.info("Task completed: {}".format(task))


@shared_task(queue='default')
def fetch_parser():
    task = "Fetch Parser"
    logger.info("Task triggered: {}".format(task))
    FetchParser().run()
    logger.info("Task completed: {}".format(task))


@shared_task(queue='default')
def fetch_search():
    task = "Fetch Search"
    logger.info("Task triggered: {}".format(task))
    FetchSearcher().run()
    logger.info("Task completed: {}".format(task))


@shared_task(queue='low')
def fetch_selenium_search():
    task = "Fetch Selenium search"
    logger.info("Task triggered: {}".format(task))
    FetchSeleniumSourceSearch().run()
    logger.info("Task completed: {}".format(task))


@shared_task(queue='low')
def fetch_missing_kids(number_pages=5):
    task = "Fetch MissingKids"
    logger.info("Task triggered: {}".format(task))
    FetchMissingKids().run(number_pages)
    logger.info("Task completed: {}".format(task))


@shared_task(queue='default')
def process_raw_urls(batch_size=100):
    task = "Process raw URLs"
    logger.info("Task triggered: {}".format(task))
    DB_Handler().process_raw_urls(batch_size=batch_size)
    logger.info("Task completed: {}".format(task))


@shared_task(queue='default')
def process_error_urls(batch_size=50):
    task = "Process error URLs"
    logger.info("Task triggered: {}".format(task))
    DB_Handler().process_error_urls(batch_size=batch_size)
    logger.info("Task completed: {}".format(task))


@shared_task(queue='low')
def process_missing_kids_urls(batch_size=None, process_status_only=None):
    task = "Process Missing Kids URLs - batch_size={} process_status_only={}".format(batch_size, process_status_only)
    logger.info("Task triggered: {}".format(task))
    DB_Handler().process_missing_kids_urls(batch_size=batch_size, process_status_only=process_status_only)
    logger.info("Task completed: {}".format(task))


@shared_task(queue='default')
def clean_old_url_content(older_than_days=14):
    task = "Clean old URL content"
    logger.info("Task triggered: {}".format(task))
    DB_Handler().clean_old_url_content(older_than_days=older_than_days)
    logger.info("Task completed: {}".format(task))
'''
@job('default')
def background_task(process_type: str):
    logger.info("Task triggered: {}".format(process_type))
    try:
        if (process_type == "fetch_feeds"):
            FetchFeeds().run()
        elif (process_type == "fetch_parser"):
            FetchParser().run()
        elif (process_type == "fetch_search"):
            FetchSearcher().run()
        elif (process_type == "fetch_selenium_search"):
            FetchSeleniumSourceSearch().run()
        elif (process_type == "fetch_missingkids_all"):
            FetchMissingKids().run(number_pages=-1)
        elif ("fetch_missingkids" in process_type):
            # number_pages encoded in URL
            try:
                number_pages = int(process_type.split("_")[-1])
            except Exception as e:
                number_pages = -1
            FetchMissingKids().run(number_pages=number_pages)
        elif ("process_" in process_type):
            # Batch size encoded in URL
            try:
                batch_size = int(process_type.split("_")[-1])
            except Exception as e:
                batch_size = None
            # Task type
            if ("process_raw_urls" in process_type):
                DB_Handler().process_raw_urls(batch_size=batch_size)
            elif ("process_error_urls" in process_type):
                DB_Handler().process_error_urls(batch_size=batch_size)
            elif ("process_missing_kids_urls" in process_type):
                if ("process_missing_kids_urls_valid" in process_type):
                    DB_Handler().process_missing_kids_urls(batch_size=batch_size, process_status_only="valid")
                elif ("process_missing_kids_urls_invalid" in process_type):
                    DB_Handler().process_missing_kids_urls(batch_size=batch_size, process_status_only="invalid")
                elif ("process_missing_kids_urls_unknown" in process_type):
                    DB_Handler().process_missing_kids_urls(batch_size=batch_size, process_status_only="unknown")
                else:
                    DB_Handler().process_missing_kids_urls(batch_size=batch_size)
        elif ("clean_old_url_content" in process_type):
            # Older than X days encoded in URL
            try:
                older_than_days = float(process_type.split("_")[-1])
            except Exception as e:
                older_than_days = None
            DB_Handler().clean_old_url_content(older_than_days=older_than_days)
        elif ("publish" in process_type):
            # Extract URL ID
            url_id = process_type.split("_")[-1]
            # Publish
            Publisher().publish(url_id)
        else:
            logger.info("Task unknown!: {}".format(process_type))
        logger.info("Task completed: {}".format(process_type))
    except Exception as e:
        logger.error(e)
'''
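
# Illustrative mapping, not part of the original file: example process_type
# strings the legacy dispatcher above accepted (trailing numbers/IDs are
# parsed off the string), next to the equivalent call through the active
# Celery tasks where one exists. The concrete values are made up.
#
#   "fetch_feeds"              -> FetchFeeds().run()
#                                 (Celery: fetch_feeds.delay())
#   "fetch_missingkids_10"     -> FetchMissingKids().run(number_pages=10)
#                                 (Celery: fetch_missing_kids.delay(number_pages=10))
#   "process_raw_urls_200"     -> DB_Handler().process_raw_urls(batch_size=200)
#                                 (Celery: process_raw_urls.delay(batch_size=200))
#   "clean_old_url_content_7"  -> DB_Handler().clean_old_url_content(older_than_days=7.0)
#                                 (Celery: clean_old_url_content.delay(older_than_days=7.0))
#   "publish_123"              -> Publisher().publish("123")
#                                 (no active Celery task for publishing)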