106 lines
3.6 KiB
Python
106 lines
3.6 KiB
Python
from utils import get_webdriver
|
|
from selenium.webdriver.common.by import By
|
|
from urllib.parse import quote
|
|
import time
|
|
from logger import get_logger
|
|
logger = get_logger()
|
|
|
|
class SearchFetcher:
    """Fetch article URLs from the search pages of supported news sites.

    Each per-site scraper drives a Selenium webdriver (obtained from
    ``get_webdriver()``) to the site's search results page, waits for
    client-side rendering, and extracts article links.
    """

    def __init__(self):
        # Stateless: the webdriver is acquired per-search via get_webdriver().
        pass

    def get_available_sources(self):
        """Return the source identifiers that ``search()`` understands."""
        return ["foxnews", "breitbart", "zerohedge"]

    def search(self, source, search="child abuse"):
        """Search *source* for *search* and return a list of article URLs.

        Best-effort: unknown sources and any scraping failure are logged
        and reported as an empty result rather than raised.

        :param source: one of :meth:`get_available_sources`.
        :param search: free-text query string.
        :returns: list of absolute article URLs (possibly empty).
        """
        # Dict dispatch instead of an if/elif chain; keeps the "unknown
        # source" warning outside the try so it can never be swallowed.
        handlers = {
            "foxnews": self._search_foxnews,
            "breitbart": self._search_breitbart,
            "zerohedge": self._search_zerohedge,
        }
        handler = handlers.get(source)
        if handler is None:
            logger.warning("Search not implemented for source={} search={}".format(source, search))
            return []
        try:
            return handler(search)
        except Exception as e:
            # Fix: the original bound `e` but never logged it, discarding
            # the failure cause. Include it so scraping errors are diagnosable.
            logger.warning("Error searching for source={} search={}: {}".format(source, search, e))
            return []

    @staticmethod
    def _finalize_urls(urls, url_host):
        """De-duplicate *urls*, drop ``None`` hrefs, keep only *url_host* links.

        Shared tail of all three scrapers (was copy-pasted three times).
        """
        return [u for u in set(urls) if u is not None and url_host in u]

    def _search_foxnews(self, search):
        """Scrape Fox News search results for *search*; return article URLs."""
        url_host = "foxnews.com"
        # Fragment-based search URL; quote() preserves URL-structural chars.
        url_unquoted = "https://www.foxnews.com/search-results/search#q={}".format(search)
        url = quote(url_unquoted, safe=":/?=&#")

        # NOTE(review): the driver is never quit/closed here — confirm
        # get_webdriver() manages the driver's lifetime elsewhere.
        driver = get_webdriver()
        driver.get(url)
        time.sleep(2)  # crude wait for client-side rendering

        # Results live under .page > .article; the link sits inside .m
        page_element = driver.find_element(By.CLASS_NAME, "page")
        articles = page_element.find_elements(By.CLASS_NAME, "article")
        urls = [
            art.find_element(By.CLASS_NAME, "m").find_element(By.TAG_NAME, "a").get_attribute("href")
            for art in articles
        ]
        return self._finalize_urls(urls, url_host)

    def _search_breitbart(self, search):
        """Scrape Breitbart search results for *search*; return article URLs."""
        url_host = "breitbart.com"
        # Query-string search; spaces become '+' per the site's search form.
        url_unquoted = "https://www.breitbart.com/search/?s={}".format(search.replace(" ", "+"))
        url = quote(url_unquoted, safe=":/?=&#")

        # NOTE(review): the driver is never quit/closed here — confirm
        # get_webdriver() manages the driver's lifetime elsewhere.
        driver = get_webdriver()
        driver.get(url)
        time.sleep(4)  # Google CSE widget renders slowly — longer wait

        # Results are rendered by a Google custom-search widget.
        page_element = driver.find_element(By.CLASS_NAME, "gsc-expansionArea")
        articles = page_element.find_elements(By.CLASS_NAME, "gs-title")
        urls = [art.get_attribute("href") for art in articles]
        return self._finalize_urls(urls, url_host)

    def _search_zerohedge(self, search):
        """Scrape ZeroHedge search results for *search*; return article URLs."""
        url_host = "zerohedge.com"
        url_unquoted = "https://www.zerohedge.com/search-content?qTitleBody={}".format(search.replace(" ", "+"))
        url = quote(url_unquoted, safe=":/?=&#")

        # NOTE(review): the driver is never quit/closed here — confirm
        # get_webdriver() manages the driver's lifetime elsewhere.
        driver = get_webdriver()
        driver.get(url)
        time.sleep(2)  # crude wait for client-side rendering

        # No per-result class to hook on: take every anchor in the main
        # content area and rely on the host filter to keep article links.
        page_element = driver.find_element(By.CLASS_NAME, "main-content")
        articles = page_element.find_elements(By.TAG_NAME, "a")
        urls = [art.get_attribute("href") for art in articles]
        return self._finalize_urls(urls, url_host)