106 lines
3.6 KiB
Python
106 lines
3.6 KiB
Python
from utils import get_webdriver
|
|
from selenium.webdriver.common.by import By
|
|
from urllib.parse import quote
|
|
import time
|
|
from logger import get_logger
|
|
logger = get_logger()
|
|
|
|
class SearchFetcher:
    """Fetch article URLs from the search pages of supported news sites.

    Each per-site scraper drives a Selenium webdriver (obtained from
    ``get_webdriver()``) to the site's search results page, waits for
    client-side rendering, and extracts article links.
    """

    def __init__(self):
        # Stateless: the webdriver is acquired per-search via get_webdriver().
        pass

    def get_available_sources(self):
        """Return the source identifiers that ``search()`` understands."""
        return ["foxnews", "breitbart", "zerohedge"]

    def search(self, source, search="child abuse"):
        """Search *source* for *search* and return a list of article URLs.

        Best-effort: unknown sources and any scraping failure are logged
        and reported as an empty result rather than raised.

        :param source: one of :meth:`get_available_sources`.
        :param search: free-text query string.
        :returns: list of absolute article URLs (possibly empty).
        """
        # Dict dispatch instead of an if/elif chain; keeps the "unknown
        # source" warning outside the try so it can never be swallowed.
        handlers = {
            "foxnews": self._search_foxnews,
            "breitbart": self._search_breitbart,
            "zerohedge": self._search_zerohedge,
        }
        handler = handlers.get(source)
        if handler is None:
            logger.warning("Search not implemented for source={} search={}".format(source, search))
            return []
        try:
            return handler(search)
        except Exception as e:
            # Fix: the original bound `e` but never logged it, discarding
            # the failure cause. Include it so scraping errors are diagnosable.
            logger.warning("Error searching for source={} search={}: {}".format(source, search, e))
            return []

    @staticmethod
    def _finalize_urls(urls, url_host):
        """De-duplicate *urls*, drop ``None`` hrefs, keep only *url_host* links.

        Shared tail of all three scrapers (was copy-pasted three times).
        """
        return [u for u in set(urls) if u is not None and url_host in u]

    def _search_foxnews(self, search):
        """Scrape Fox News search results for *search*; return article URLs."""
        url_host = "foxnews.com"
        # Fragment-based search URL; quote() preserves URL-structural chars.
        url_unquoted = "https://www.foxnews.com/search-results/search#q={}".format(search)
        url = quote(url_unquoted, safe=":/?=&#")

        # NOTE(review): the driver is never quit/closed here — confirm
        # get_webdriver() manages the driver's lifetime elsewhere.
        driver = get_webdriver()
        driver.get(url)
        time.sleep(2)  # crude wait for client-side rendering

        # Results live under .page > .article; the link sits inside .m
        page_element = driver.find_element(By.CLASS_NAME, "page")
        articles = page_element.find_elements(By.CLASS_NAME, "article")
        urls = [
            art.find_element(By.CLASS_NAME, "m").find_element(By.TAG_NAME, "a").get_attribute("href")
            for art in articles
        ]
        return self._finalize_urls(urls, url_host)

    def _search_breitbart(self, search):
        """Scrape Breitbart search results for *search*; return article URLs."""
        url_host = "breitbart.com"
        # Query-string search; spaces become '+' per the site's search form.
        url_unquoted = "https://www.breitbart.com/search/?s={}".format(search.replace(" ", "+"))
        url = quote(url_unquoted, safe=":/?=&#")

        # NOTE(review): the driver is never quit/closed here — confirm
        # get_webdriver() manages the driver's lifetime elsewhere.
        driver = get_webdriver()
        driver.get(url)
        time.sleep(4)  # Google CSE widget renders slowly — longer wait

        # Results are rendered by a Google custom-search widget.
        page_element = driver.find_element(By.CLASS_NAME, "gsc-expansionArea")
        articles = page_element.find_elements(By.CLASS_NAME, "gs-title")
        urls = [art.get_attribute("href") for art in articles]
        return self._finalize_urls(urls, url_host)

    def _search_zerohedge(self, search):
        """Scrape ZeroHedge search results for *search*; return article URLs."""
        url_host = "zerohedge.com"
        url_unquoted = "https://www.zerohedge.com/search-content?qTitleBody={}".format(search.replace(" ", "+"))
        url = quote(url_unquoted, safe=":/?=&#")

        # NOTE(review): the driver is never quit/closed here — confirm
        # get_webdriver() manages the driver's lifetime elsewhere.
        driver = get_webdriver()
        driver.get(url)
        time.sleep(2)  # crude wait for client-side rendering

        # No per-result class to hook on: take every anchor in the main
        # content area and rely on the host filter to keep article links.
        page_element = driver.find_element(By.CLASS_NAME, "main-content")
        articles = page_element.find_elements(By.TAG_NAME, "a")
        urls = [art.get_attribute("href") for art in articles]
        return self._finalize_urls(urls, url_host)