Valid content filter, language detect on min chars, fetch missingkids.org

This commit is contained in:
Luciano Gervasoni
2025-04-03 09:44:46 +02:00
parent 3b54e247e7
commit 5addfa5ba9
18 changed files with 533 additions and 66 deletions

View File

@@ -3,10 +3,8 @@ from scheduler import job
from .src.fetch_feed import FetchFeeds
from .src.fetch_parser import FetchParser
from .src.fetch_search import FetchSearcher
from .src.fetch_missing_kids import FetchMissingKids
from .src.db_utils import DB_Handler
'''
from src.missing_kids_fetch import MissingKidsFetch
'''
from .src.logger import get_logger
logger = get_logger()
@@ -32,7 +30,19 @@ def fetch_search():
FetchSearcher().run()
logger.info("Task completed: {}".format(task))
# TODO: fetch_missing_kids()
@job('default')
def fetch_missing_kids(number_pages=5):
    """Run the MissingKids fetcher, logging before and after the run.

    Registered through ``@job('default')``.

    Args:
        number_pages: Forwarded unchanged to ``FetchMissingKids.run``.
            Defaults to 5.
    """
    task_label = "Fetch MissingKids"
    started_msg = "Task triggered: {}".format(task_label)
    logger.info(started_msg)

    fetcher = FetchMissingKids()
    fetcher.run(number_pages)

    finished_msg = "Task completed: {}".format(task_label)
    logger.info(finished_msg)
@job('default')
def fetch_missing_kids_all(number_pages=-1):
    """Run the MissingKids fetcher with the "all" default, logging around it.

    Registered through ``@job('default')``.

    Args:
        number_pages: Forwarded unchanged to ``FetchMissingKids.run``.
            Defaults to -1, which presumably means "no page limit"
            (TODO confirm against ``FetchMissingKids.run``).
    """
    # The label names this variant and the effective page count so its log
    # lines can be told apart from those of the page-limited job.
    task = "Fetch MissingKids all (number_pages={})".format(number_pages)
    logger.info("Task triggered: {}".format(task))
    FetchMissingKids().run(number_pages)
    logger.info("Task completed: {}".format(task))
@job('default')
def process_raw_urls(batch_size=50):
@@ -77,8 +87,15 @@ def background_task(process_type: str):
FetchParser().run()
elif (process_type == "fetch_search"):
FetchSearcher().run()
#elif (process_type == "fetch_missingkids"):
# FetchMissingKids().run()
elif (process_type == "fetch_missingkids_all"):
FetchMissingKids().run(number_pages=-1)
elif ("fetch_missingkids" in process_type):
# number_pages encoded in URL
try:
number_pages = int(process_type.split("_")[-1])
except Exception as e:
number_pages = -1
FetchMissingKids().run(number_pages=number_pages)
elif ("process_" in process_type):
# Batch size encoded in URL
try:
@@ -95,14 +112,6 @@ def background_task(process_type: str):
else:
logger.info("Task unknown!: {}".format(process_type))
'''
# Selenium based
elif (process_type == "fetch_missing_kids_reduced"):
MissingKidsFetch(db_handler, num_pages=4).run()
elif (process_type == "fetch_missing_kids_full"):
MissingKidsFetch(db_handler, num_pages=100000).run()
'''
logger.info("Task completed: {}".format(process_type))
except Exception as e:
logger.error(e)