Selenium-based fetch of different sources
@@ -15,6 +15,8 @@
- TODO: Proxy / VPN?
- TooManyRequests, ...
- TODO: Search per locale (nl-NL, fr-FR, en-GB)
- Fetch keyword search for selenium sources

- URLs Processing -> Updates raw URLs
- Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date
@@ -52,12 +54,10 @@

* Dev mode

```
docker compose -f docker-compose-dev.yml down -v
docker compose -f docker-compose-dev.yml build --progress=plain
docker compose -f docker-compose-dev.yml up
docker compose -f docker-compose-dev.yml up --no-deps --build
```

* Prod mode

```
docker compose down -v
docker compose build --progress=plain
docker compose up -d
docker compose up -d --no-deps --build
```
@@ -1,6 +1,7 @@
from fastapi import FastAPI
from pydantic import BaseModel
from missing_kids import MissingKidsFetcher
+from search import SearchFetcher
from logger import get_logger
logger = get_logger()

@@ -12,17 +13,41 @@ def get_missing_kids(pages: int = -1):
        logger.info("Get missing kids, #pages={}".format(pages))
        res = {"list_urls": MissingKidsFetcher().get_missing_kids_urls(first_n_pages=pages)}
    except Exception as e:
        logger.warning("Exception: {}".format(str(e)), exc_info=True)
        res = {}
    return res


-class Body(BaseModel):
+class BodyVerifyMissingKid(BaseModel):
    url: str


@app.post("/verify_missing_kid/")
-def get_missing_kids(data: Body):
+def get_missing_kids(data: BodyVerifyMissingKid):
    try:
        logger.info("Verify missing kid, URL={}".format(data.url))
        res = MissingKidsFetcher().verify_missing_kid_url(data.url)
    except Exception as e:
        logger.warning("Exception: {}".format(str(e)), exc_info=True)
        res = {}
    return res

+
+class BodyFetchSearch(BaseModel):
+    search: str
+
+
+@app.post("/fetch_search/")
+def fetch_search(data: BodyFetchSearch):
+    try:
+        # Initialize
+        search_fetcher, results = SearchFetcher(), {}
+        # Iterate over the available sources
+        for source in search_fetcher.get_available_sources():
+            logger.info("Fetch based search source={} search={}".format(source, data.search))
+            # Fetch
+            results[source] = search_fetcher.search(source, data.search)
+            # Drop sources that returned no results
+            if len(results[source]) == 0:
+                results.pop(source)
+
+    except Exception as e:
+        logger.warning("Exception: {}".format(str(e)), exc_info=True)
+        results = {}
+    return results
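For reference, a minimal client sketch for the two endpoints above; the base URL is an assumption (it matches the SELENIUM_ENDPOINT default used by the fetcher further down), and the verify URL is a placeholder:

```
import requests

BASE = "http://localhost:80"  # assumed; SELENIUM_ENDPOINT default in fetch_selenium.py

# Verify a single URL (placeholder value, replace with a real poster URL)
r = requests.post(BASE + "/verify_missing_kid/", json={"url": "https://..."}, timeout=60)
print(r.json())

# Keyword search across all selenium sources; response maps source -> list of URLs
r = requests.post(BASE + "/fetch_search/", json={"search": "child abuse"}, timeout=900)
print(r.json())
```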
@@ -8,10 +8,10 @@ logs_directory = os.getenv("PATH_LOGS_DIRECTORY", "logs")
os.makedirs(logs_directory, exist_ok=True)

logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("app_selenium")
logger.setLevel(logging.DEBUG)
logger = logging.getLogger("selenium")
logger.setLevel(logging.INFO)

# To file log: INFO / WARNING / ERROR / CRITICAL
# To file log: DEBUG / INFO / WARNING / ERROR / CRITICAL
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "debug.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.DEBUG)
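The get_logger() accessor that the other modules import is outside this hunk; a minimal sketch of its assumed shape, returning the "app_selenium" logger configured above:

```
import logging

def get_logger():
    # Assumed accessor: the rest of the code only relies on it
    # returning a standard, already-configured logging.Logger.
    return logging.getLogger("app_selenium")
```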
@@ -1,7 +1,5 @@
-from selenium import webdriver
+from utils import get_webdriver
from selenium.webdriver.common.by import By
-from selenium.webdriver.firefox.options import Options
-from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
@@ -11,16 +9,6 @@ import os
from logger import get_logger
logger = get_logger()

-def get_webdriver():
-    options = Options()
-    options.add_argument('--headless')  # Optional
-    options.binary_location = '/opt/firefox/firefox'
-
-    service = Service('/usr/local/bin/geckodriver')
-
-    driver = webdriver.Firefox(options=options, service=service)
-    return driver
-
class MissingKidsFetcher():
    def __init__(self) -> None:
        pass
app_selenium/search.py (new file, 106 lines)
@@ -0,0 +1,106 @@
from utils import get_webdriver
from selenium.webdriver.common.by import By
from urllib.parse import quote
import time

from logger import get_logger
logger = get_logger()


class SearchFetcher():
    def __init__(self):
        pass

    def get_available_sources(self):
        return ["foxnews", "breitbart", "zerohedge"]

    def search(self, source, search="child abuse"):
        try:
            if source == "foxnews":
                return self._search_foxnews(search)
            elif source == "breitbart":
                return self._search_breitbart(search)
            elif source == "zerohedge":
                return self._search_zerohedge(search)
            else:
                logger.warning("Search not implemented for source={} search={}".format(source, search))
                return []
        except Exception as e:
            logger.warning("Error searching for source={} search={}: {}".format(source, search, str(e)))
            return []

    def _search_foxnews(self, search):
        url_host = "foxnews.com"
        # URL search
        url_unquoted = "https://www.foxnews.com/search-results/search#q={}".format(search)
        url = quote(url_unquoted, safe=":/?=&#")

        # Initialize
        driver = get_webdriver()
        # Load URL
        driver.get(url)
        time.sleep(2)

        # Find the search results container (class "page")
        page_element = driver.find_element(By.CLASS_NAME, "page")
        # Find the articles
        articles = page_element.find_elements(By.CLASS_NAME, "article")
        # Extract URLs
        urls = [art.find_element(By.CLASS_NAME, "m").find_element(By.TAG_NAME, "a").get_attribute("href") for art in articles]

        # Remove duplicates, remove None
        urls = [u for u in set(urls) if u is not None]
        # Filter by URL host
        urls = [u for u in urls if url_host in u]

        return urls

    def _search_breitbart(self, search):
        url_host = "breitbart.com"
        # URL search
        url_unquoted = "https://www.breitbart.com/search/?s={}".format(search.replace(" ", "+"))
        url = quote(url_unquoted, safe=":/?=&#")

        # Initialize
        driver = get_webdriver()
        # Load URL
        driver.get(url)
        time.sleep(4)

        # Find the Google custom search results container
        page_element = driver.find_element(By.CLASS_NAME, "gsc-expansionArea")
        # Find the article title links
        articles = page_element.find_elements(By.CLASS_NAME, "gs-title")
        # Extract URLs
        urls = [art.get_attribute("href") for art in articles]

        # Remove duplicates, remove None
        urls = [u for u in set(urls) if u is not None]
        # Filter by URL host
        urls = [u for u in urls if url_host in u]

        return urls

    def _search_zerohedge(self, search):
        url_host = "zerohedge.com"
        # URL search
        url_unquoted = "https://www.zerohedge.com/search-content?qTitleBody={}".format(search.replace(" ", "+"))
        url = quote(url_unquoted, safe=":/?=&#")

        # Initialize
        driver = get_webdriver()
        # Load URL
        driver.get(url)
        time.sleep(2)

        # Find the main content container
        page_element = driver.find_element(By.CLASS_NAME, "main-content")
        # Find the article links
        articles = page_element.find_elements(By.TAG_NAME, "a")
        # Extract URLs
        urls = [art.get_attribute("href") for art in articles]

        # Remove duplicates, remove None
        urls = [u for u in set(urls) if u is not None]
        # Filter by URL host
        urls = [u for u in urls if url_host in u]

        return urls
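A quick usage sketch of SearchFetcher, run inside the container (get_webdriver() expects the bundled Firefox and geckodriver paths):

```
from search import SearchFetcher

fetcher = SearchFetcher()
for source in fetcher.get_available_sources():
    urls = fetcher.search(source, "child abuse")
    print(source, len(urls), urls[:3])
```

Note that the _search_* helpers never call driver.quit(), so each search leaves a headless Firefox process behind in a long-running container.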
app_selenium/utils.py (new file, 13 lines)
@@ -0,0 +1,13 @@
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service


def get_webdriver():
    options = Options()
    options.add_argument('--headless')  # Optional
    options.binary_location = '/opt/firefox/firefox'

    service = Service('/usr/local/bin/geckodriver')

    driver = webdriver.Firefox(options=options, service=service)
    return driver
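A minimal smoke test for the helper, assuming the container image provides Firefox at /opt/firefox/firefox and geckodriver at /usr/local/bin/geckodriver as hard-coded above:

```
from utils import get_webdriver

driver = get_webdriver()
try:
    driver.get("https://example.com")
    print(driver.title)
finally:
    driver.quit()  # otherwise the headless Firefox process keeps running
```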
app_urls/fetcher/src/fetch_selenium.py (new file, 42 lines)
@@ -0,0 +1,42 @@
from .db_utils import DB_Handler
from ..models import Search, Source
import traceback
import requests
import os

from .logger import get_logger
logger = get_logger()


class FetchSeleniumSourceSearch():
    def __init__(self) -> None:
        logger.debug("Initializing Selenium Source Search")

    def run(self):
        try:
            logger.debug("Starting FetchSeleniumSourceSearch.run()")

            # Get keyword searches
            list_keyword_search = Search.objects.filter(type=Search.TYPE_ENUM.KEYWORD_SEARCH)
            logger.debug("Fetching news Selenium based for keyword searches: {}".format([e.search for e in list_keyword_search]))

            # Run selenium search for each keyword search
            for obj_search in list_keyword_search:
                try:
                    # Selenium fetching endpoint
                    selenium_fetch_endpoint = os.path.join(os.getenv("SELENIUM_ENDPOINT", "http://localhost:80"), "fetch_search/")
                    data = {"search": obj_search.search}
                    # POST
                    r = requests.post(selenium_fetch_endpoint, json=data, timeout=900)
                    # Jsonify
                    results = r.json()
                    logger.debug("Selenium results for search {}: {}".format(obj_search.search, str(results)))

                    for source, urls_fetched in results.items():
                        # Get or create the source object
                        obj_source, _ = Source.objects.get_or_create(source="selenium {}".format(source))

                        # Write to DB
                        DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
                except Exception as e:
                    logger.warning("Exception while fetching selenium search: {}\n{}".format(obj_search.search, str(e)))
        except Exception as e:
            logger.warning("Exception in FetchSeleniumSourceSearch.run(): {}\n{}".format(e, traceback.format_exc()))
@@ -4,6 +4,7 @@ from .src.fetch_feed import FetchFeeds
from .src.fetch_parser import FetchParser
from .src.fetch_search import FetchSearcher
from .src.fetch_missing_kids import FetchMissingKids
+from .src.fetch_selenium import FetchSeleniumSourceSearch
from .src.db_utils import DB_Handler
from .src.publisher import Publisher
@@ -32,14 +33,14 @@ def fetch_search():
    logger.info("Task completed: {}".format(task))


@job('default')
-def fetch_missing_kids(number_pages=5):
-    task = "Fetch MissingKids"
+def fetch_selenium_search():
+    task = "Fetch Selenium search"
    logger.info("Task triggered: {}".format(task))
-    FetchMissingKids().run(number_pages)
+    FetchSeleniumSourceSearch().run()
    logger.info("Task completed: {}".format(task))


@job('default')
-def fetch_missing_kids_all(number_pages=-1):
+def fetch_missing_kids(number_pages=5):
    task = "Fetch MissingKids"
    logger.info("Task triggered: {}".format(task))
    FetchMissingKids().run(number_pages)
@@ -85,6 +86,8 @@ def background_task(process_type: str):
        FetchParser().run()
    elif (process_type == "fetch_search"):
        FetchSearcher().run()
+    elif (process_type == "fetch_selenium_search"):
+        FetchSeleniumSourceSearch().run()
    elif (process_type == "fetch_missingkids_all"):
        FetchMissingKids().run(number_pages=-1)
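The same code path can also be exercised synchronously through the dispatcher, using the link name exposed by link_list() below:

```
# Sketch: the new process_type routes to the selenium-based fetch
background_task("fetch_selenium_search")  # runs FetchSeleniumSourceSearch().run()
```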
@@ -14,8 +14,8 @@ def link_list(request):
    # Base URL path
    app_url = request.build_absolute_uri()
    # Tasks
-    links_fetch = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all"]
-    links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all", "clean_old_url_content_60"]
+    links_fetch = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all", "fetch_selenium_search"]
+    links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_valid_all", "process_missing_kids_urls_invalid_all", "process_missing_kids_urls_unknown_all", "process_missing_kids_urls_all", "clean_old_url_content_60"]
    # List of links
    list_links = \
        [ os.path.join(app_url, "admin"), os.path.join(app_url, "urls") ] + \
@@ -212,6 +212,27 @@
        "last_successful_run": null,
        "last_failed_run": null
    },
+    {
+        "model": "RepeatableTaskType",
+        "name": "Fetch Selenium Search",
+        "callable": "fetcher.tasks.fetch_selenium_search",
+        "callable_args": [],
+        "callable_kwargs": [],
+        "enabled": false,
+        "queue": "default",
+        "repeat": null,
+        "at_front": false,
+        "timeout": 3600,
+        "result_ttl": 86400,
+        "cron_string": null,
+        "scheduled_time": "2025-01-01T00:00:00+00:00",
+        "interval": 1,
+        "interval_unit": "days",
+        "successful_runs": 0,
+        "failed_runs": 0,
+        "last_successful_run": null,
+        "last_failed_run": null
+    },
    {
        "model": "RepeatableTaskType",
        "name": "Fetch MissingKids",
@@ -236,9 +257,15 @@
    {
        "model": "RepeatableTaskType",
        "name": "Fetch MissingKids ALL",
-        "callable": "fetcher.tasks.fetch_missing_kids_all",
+        "callable": "fetcher.tasks.fetch_missing_kids",
        "callable_args": [],
-        "callable_kwargs": [],
+        "callable_kwargs": [
+            {
+                "arg_type": "int",
+                "key": "number_pages",
+                "val": "-1"
+            }
+        ],
        "enabled": false,
        "queue": "default",
        "repeat": null,