Refactoring fetcher, working feeds and raw url writer

This commit is contained in:
Luciano Gervasoni
2025-03-12 17:56:40 +01:00
parent e124dbc21a
commit 61c31ee9aa
24 changed files with 2085 additions and 194 deletions

View File

@@ -0,0 +1,36 @@
from .db_utils import DB_Handler
import requests
import json
from .logger import get_logger
logger = get_logger()
class MissingKidsFetch():
    """Fetch a list of URLs from the missing-kids HTTP endpoint and store them.

    The URL list is requested from ``missingkids_fetch_endpoint`` and passed to
    ``db_handler.write_batch`` together with a fixed source label.
    """

    def __init__(self, db_handler: DB_Handler, num_pages) -> None:
        """Store the DB handler and the number of pages to request.

        Args:
            db_handler: Object whose ``write_batch(urls, source)`` method
                receives the fetched URLs.
            num_pages: Value substituted into the endpoint's ``pages`` query
                parameter; it also selects the request timeout in ``run``.
        """
        logger.debug("Initializing News MissingKids")
        self.db_handler = db_handler
        self.num_pages = num_pages
        self.missingkids_fetch_endpoint = "http://selenium_app:80/get_missing_kids/?pages={}"

    def run(self):
        """Request the URL list and write it to the database.

        Any failure while requesting or decoding the response is logged and
        treated as an empty result, so ``write_batch`` is still called (with an
        empty list). Exceptions raised by the write itself are logged and not
        propagated.
        """
        try:
            logger.debug("Starting NewsMissingKids.run()")
            try:
                # Requests for more than 15 pages get a 1.5h timeout, others 5 min.
                if self.num_pages > 15:
                    timeout = 60 * 90
                else:
                    timeout = 60 * 5
                # Request
                r = requests.get(self.missingkids_fetch_endpoint.format(self.num_pages), timeout=timeout)
                # Decode; a response without "list_urls" yields an empty list.
                urls_fetched = json.loads(r.text).get("list_urls", [])
            except Exception as e:
                # The endpoint is an instance attribute: a bare name here would
                # raise NameError inside the handler and skip the fallback below.
                logger.warning("Timeout on request: {}. {}".format(self.missingkids_fetch_endpoint, str(e)))
                urls_fetched = []
            # URL fetching source
            source = "missingkids fetcher"
            # Write to DB
            self.db_handler.write_batch(urls_fetched, source)
        except Exception as e:
            logger.warning("Exception in NewsMissingKids.run(): {}".format(str(e)))