from urllib.parse import quote
import time

from selenium.webdriver.common.by import By

from logger import get_logger
from utils import get_webdriver

logger = get_logger()


class SearchFetcher:
    """Scrape search-result article URLs from a fixed set of news sites.

    Each supported source is queried through a Selenium webdriver obtained
    from ``utils.get_webdriver``; results are returned as deduplicated lists
    of absolute article URLs belonging to the source's host.
    """

    def get_available_sources(self):
        """Return the source identifiers that ``search`` understands."""
        return ["foxnews", "breitbart", "zerohedge"]

    def search(self, source, search="child abuse"):
        """Search ``source`` for ``search`` and return a list of article URLs.

        Unknown sources and scraping failures both yield an empty list
        (logged as warnings) rather than raising, preserving the original
        best-effort contract.
        """
        handlers = {
            "foxnews": self._search_foxnews,
            "breitbart": self._search_breitbart,
            "zerohedge": self._search_zerohedge,
        }
        handler = handlers.get(source)
        if handler is None:
            logger.warning("Search not implemented for source={} search={}".format(source, search))
            return []
        try:
            return handler(search)
        except Exception as e:
            # Fix: the original swallowed the exception detail entirely;
            # include it so failures are diagnosable.
            logger.warning("Error searching for source={} search={}: {}".format(source, search, e))
            return []

    def _scrape(self, url, wait_seconds, container_class, url_host, collect):
        """Shared scraping pipeline used by all ``_search_*`` methods.

        Loads ``url``, sleeps ``wait_seconds`` for the page to render, locates
        the container element by ``container_class``, and applies ``collect``
        (container element -> list of hrefs, possibly containing ``None``).
        Returns the hrefs deduplicated, with ``None`` removed, filtered to
        those containing ``url_host``.

        The driver is always quit, fixing the original's webdriver leak.
        NOTE(review): assumes get_webdriver() returns a fresh driver per call
        (the original called it once per search) — confirm it is not shared.
        """
        driver = get_webdriver()
        try:
            driver.get(url)
            time.sleep(wait_seconds)  # crude wait for client-side rendering
            container = driver.find_element(By.CLASS_NAME, container_class)
            urls = collect(container)
        finally:
            driver.quit()
        return [u for u in set(urls) if u is not None and url_host in u]

    def _search_foxnews(self, search):
        """Scrape Fox News search results; the query travels in the URL fragment."""
        url = quote(
            "https://www.foxnews.com/search-results/search#q={}".format(search),
            safe=":/?=&#",
        )

        def collect(page):
            # Each result article wraps its link in an element of class "m".
            return [
                art.find_element(By.CLASS_NAME, "m").find_element(By.TAG_NAME, "a").get_attribute("href")
                for art in page.find_elements(By.CLASS_NAME, "article")
            ]

        return self._scrape(url, 2, "page", "foxnews.com", collect)

    def _search_breitbart(self, search):
        """Scrape Breitbart search results (Google CSE markup, slower to render)."""
        url = quote(
            "https://www.breitbart.com/search/?s={}".format(search.replace(" ", "+")),
            safe=":/?=&#",
        )

        def collect(page):
            return [el.get_attribute("href") for el in page.find_elements(By.CLASS_NAME, "gs-title")]

        return self._scrape(url, 4, "gsc-expansionArea", "breitbart.com", collect)

    def _search_zerohedge(self, search):
        """Scrape ZeroHedge search results; any anchor in the main content qualifies."""
        url = quote(
            "https://www.zerohedge.com/search-content?qTitleBody={}".format(search.replace(" ", "+")),
            safe=":/?=&#",
        )

        def collect(page):
            return [a.get_attribute("href") for a in page.find_elements(By.TAG_NAME, "a")]

        return self._scrape(url, 2, "main-content", "zerohedge.com", collect)