Selenium-based fetch of different sources

This commit is contained in:
Luciano Gervasoni
2025-07-08 18:18:26 +02:00
parent f729bd1cb2
commit 0cf61026e8
10 changed files with 235 additions and 31 deletions

View File

@@ -0,0 +1,42 @@
from .db_utils import DB_Handler
from ..models import Search, Source
import traceback
import requests
import os
from .logger import get_logger
logger = get_logger()
class FetchSeleniumSourceSearch():
    """Fetch URLs for every keyword search through the Selenium fetching service.

    For each ``Search`` object of type ``KEYWORD_SEARCH`` the search string is
    POSTed as JSON to the ``fetch_search/`` route of the service. The JSON
    response is iterated as a mapping ``source name -> fetched URLs``; each
    entry is handed to ``DB_Handler().insert_raw_urls`` together with a
    ``Source`` object named ``"selenium <source name>"``.
    """

    # Base URL used when the SELENIUM_ENDPOINT environment variable is unset or empty
    _DEFAULT_ENDPOINT = "http://localhost:80"
    # Route of the search fetcher, relative to the base URL
    _FETCH_SEARCH_ROUTE = "fetch_search/"
    # Seconds to wait for the service to answer one search request
    _REQUEST_TIMEOUT = 900

    def __init__(self) -> None:
        logger.debug("Initializing Selenium Source Search")

    @classmethod
    def _get_fetch_endpoint(cls) -> str:
        """Return the full URL of the Selenium ``fetch_search/`` route.

        The base URL is read from the ``SELENIUM_ENDPOINT`` environment
        variable, falling back to the default when it is unset or empty.
        The URL is assembled with plain string operations: ``os.path.join``
        is a filesystem helper whose separator depends on the platform, so it
        must not be used to build URLs.
        """
        base_url = os.getenv("SELENIUM_ENDPOINT") or cls._DEFAULT_ENDPOINT
        # Strip trailing slashes so exactly one "/" separates base and route
        return "{}/{}".format(base_url.rstrip("/"), cls._FETCH_SEARCH_ROUTE)

    def run(self):
        """Run the Selenium fetch for all keyword searches.

        Failures are logged and never propagated: an error on one search does
        not prevent the remaining searches from being processed.
        """
        try:
            logger.debug("Starting FetchSeleniumSourceSearch.run()")

            # Get keyword searches
            list_keyword_search = Search.objects.filter(type=Search.TYPE_ENUM.KEYWORD_SEARCH)
            logger.debug("Fetching news Selenium based for keyword searches: {}".format([e.search for e in list_keyword_search]))

            # Selenium fetching endpoint (same for every search, computed once)
            selenium_fetch_endpoint = self._get_fetch_endpoint()

            # Run selenium search for each keyword search
            for obj_search in list_keyword_search:
                try:
                    data = {"search": obj_search.search}
                    # POST
                    r = requests.post(selenium_fetch_endpoint, json=data, timeout=self._REQUEST_TIMEOUT)
                    # An HTTP error body must not be interpreted as fetch results
                    r.raise_for_status()
                    # Jsonify
                    results = r.json()
                    logger.debug("Selenium results for search {}: {}".format(obj_search.search, str(results)))

                    for source, urls_fetched in results.items():
                        # Get source object
                        obj_source, created = Source.objects.get_or_create(source="selenium {}".format(source))
                        # Write to DB
                        DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
                except Exception as e:
                    # Keep going with the next search; include the traceback for diagnosis
                    logger.warning("Exception while fetching selenium search: {}\n{}\n{}".format(obj_search.search, str(e), traceback.format_exc()))
        except Exception as e:
            logger.warning("Exception in FetchSeleniumSourceSearch.run(): {}\n{}".format(e, traceback.format_exc()))

View File

@@ -4,6 +4,7 @@ from .src.fetch_feed import FetchFeeds
from .src.fetch_parser import FetchParser
from .src.fetch_search import FetchSearcher
from .src.fetch_missing_kids import FetchMissingKids
from .src.fetch_selenium import FetchSeleniumSourceSearch
from .src.db_utils import DB_Handler
from .src.publisher import Publisher
@@ -32,14 +33,14 @@ def fetch_search():
logger.info("Task completed: {}".format(task))
@job('default')
def fetch_missing_kids(number_pages=5):
task = "Fetch MissingKids"
def fetch_selenium_search():
task = "Fetch Selenium search"
logger.info("Task triggered: {}".format(task))
FetchMissingKids().run(number_pages)
FetchSeleniumSourceSearch().run()
logger.info("Task completed: {}".format(task))
@job('default')
def fetch_missing_kids_all(number_pages=-1):
def fetch_missing_kids(number_pages=5):
task = "Fetch MissingKids"
logger.info("Task triggered: {}".format(task))
FetchMissingKids().run(number_pages)
@@ -85,6 +86,8 @@ def background_task(process_type: str):
FetchParser().run()
elif (process_type == "fetch_search"):
FetchSearcher().run()
elif (process_type == "fetch_selenium_search"):
FetchSeleniumSourceSearch().run()
elif (process_type == "fetch_missingkids_all"):
FetchMissingKids().run(number_pages=-1)

View File

@@ -14,8 +14,8 @@ def link_list(request):
# Base URL path
app_url = request.build_absolute_uri()
# Tasks
links_fetch = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all"]
links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all", "clean_old_url_content_60"]
links_fetch = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all", "fetch_selenium_search"]
links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_valid_all", "process_missing_kids_urls_invalid_all", "process_missing_kids_urls_unknown_all", "process_missing_kids_urls_all", "clean_old_url_content_60"]
# List of links
list_links = \
[ os.path.join(app_url, "admin"), os.path.join(app_url, "urls") ] + \