Selenium-based fetch of different sources
This commit is contained in:
42
app_urls/fetcher/src/fetch_selenium.py
Normal file
42
app_urls/fetcher/src/fetch_selenium.py
Normal file
@@ -0,0 +1,42 @@
|
||||
from .db_utils import DB_Handler
|
||||
from ..models import Search, Source
|
||||
import traceback
|
||||
import requests
|
||||
import os
|
||||
from .logger import get_logger
|
||||
logger = get_logger()
|
||||
|
||||
class FetchSeleniumSourceSearch():
    """Fetch URLs for the stored keyword searches via a Selenium fetching service.

    Each keyword search is POSTed to the ``fetch_search/`` route of the
    service whose base URL is read from the ``SELENIUM_ENDPOINT`` environment
    variable (default ``http://localhost:80``). The JSON response is iterated
    as a mapping of source name -> fetched URLs, and each entry is handed to
    ``DB_Handler.insert_raw_urls`` together with its ``Source`` and ``Search``.
    """

    def __init__(self) -> None:
        logger.debug("Initializing Selenium Source Search")

    def run(self):
        """Run the Selenium fetch for every keyword search.

        Exceptions are logged as warnings and never propagated. A failure on
        one search is caught per iteration, so the remaining searches are
        still attempted.
        """
        try:
            logger.debug("Starting FetchSeleniumSourceSearch.run()")

            # Get keyword searches
            list_keyword_search = Search.objects.filter(type=Search.TYPE_ENUM.KEYWORD_SEARCH)
            logger.debug("Fetching news Selenium based for keyword searches: {}".format([e.search for e in list_keyword_search]))

            # Selenium fetching endpoint. It is the same for every search, so
            # it is built once. Plain string joining is used because
            # os.path.join is a filesystem helper and would insert a
            # backslash on Windows.
            base_url = os.getenv("SELENIUM_ENDPOINT", "http://localhost:80")
            selenium_fetch_endpoint = "{}/fetch_search/".format(base_url.rstrip("/"))

            # Run selenium search for each keyword search
            for obj_search in list_keyword_search:
                try:
                    data = {"search": obj_search.search}
                    # POST
                    r = requests.post(selenium_fetch_endpoint, json=data, timeout=900)
                    # Fail on HTTP errors instead of treating an error payload
                    # as a {source: urls} mapping and writing it to the DB
                    r.raise_for_status()
                    # Jsonify
                    results = r.json()
                    logger.debug("Selenium results for URL {}: {}".format(obj_search.search, str(results)))

                    for source, urls_fetched in results.items():
                        # Get source object
                        obj_source, _ = Source.objects.get_or_create(source="selenium {}".format(source))

                        # Write to DB
                        DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
                except Exception as e:
                    # Best effort: log and continue with the next search
                    logger.warning("Exception while fetching selenium search: {}\n{}".format(obj_search.search, str(e)))
        except Exception as e:
            logger.warning("Exception in FetchSeleniumSourceSearch.run(): {}\n{}".format(e, traceback.format_exc()))
|
||||
@@ -4,6 +4,7 @@ from .src.fetch_feed import FetchFeeds
|
||||
from .src.fetch_parser import FetchParser
|
||||
from .src.fetch_search import FetchSearcher
|
||||
from .src.fetch_missing_kids import FetchMissingKids
|
||||
from .src.fetch_selenium import FetchSeleniumSourceSearch
|
||||
from .src.db_utils import DB_Handler
|
||||
from .src.publisher import Publisher
|
||||
|
||||
@@ -32,14 +33,14 @@ def fetch_search():
|
||||
logger.info("Task completed: {}".format(task))
|
||||
|
||||
@job('default')
|
||||
def fetch_missing_kids(number_pages=5):
|
||||
task = "Fetch MissingKids"
|
||||
def fetch_selenium_search():
|
||||
task = "Fetch Selenium search"
|
||||
logger.info("Task triggered: {}".format(task))
|
||||
FetchMissingKids().run(number_pages)
|
||||
FetchSeleniumSourceSearch().run()
|
||||
logger.info("Task completed: {}".format(task))
|
||||
|
||||
@job('default')
|
||||
def fetch_missing_kids_all(number_pages=-1):
|
||||
def fetch_missing_kids(number_pages=5):
|
||||
task = "Fetch MissingKids"
|
||||
logger.info("Task triggered: {}".format(task))
|
||||
FetchMissingKids().run(number_pages)
|
||||
@@ -85,6 +86,8 @@ def background_task(process_type: str):
|
||||
FetchParser().run()
|
||||
elif (process_type == "fetch_search"):
|
||||
FetchSearcher().run()
|
||||
elif (process_type == "fetch_selenium_search"):
|
||||
FetchSeleniumSourceSearch().run()
|
||||
elif (process_type == "fetch_missingkids_all"):
|
||||
FetchMissingKids().run(number_pages=-1)
|
||||
|
||||
|
||||
@@ -14,8 +14,8 @@ def link_list(request):
|
||||
# Base URL path
|
||||
app_url = request.build_absolute_uri()
|
||||
# Tasks
|
||||
links_fetch = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all"]
|
||||
links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all", "clean_old_url_content_60"]
|
||||
links_fetch = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all", "fetch_selenium_search"]
|
||||
links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_valid_all", "process_missing_kids_urls_invalid_all", "process_missing_kids_urls_unknown_all", "process_missing_kids_urls_all", "clean_old_url_content_60"]
|
||||
# List of links
|
||||
list_links = \
|
||||
[ os.path.join(app_url, "admin"), os.path.join(app_url, "urls") ] + \
|
||||
|
||||
Reference in New Issue
Block a user