From 0cf61026e8659edce98b4d8c3ff50f940a0a87fb Mon Sep 17 00:00:00 2001
From: Luciano Gervasoni
Date: Tue, 8 Jul 2025 18:18:26 +0200
Subject: [PATCH] Selenium-based fetch of different sources

---
 README.md                              |   8 +-
 app_selenium/app.py                    |  31 +++++++-
 app_selenium/logger.py                 |   6 +-
 app_selenium/missing_kids.py           |  14 +---
 app_selenium/search.py                 | 106 +++++++++++++++++++++++++
 app_selenium/utils.py                  |  13 +++
 app_urls/fetcher/src/fetch_selenium.py |  42 ++++++++++
 app_urls/fetcher/tasks.py              |  11 ++-
 app_urls/fetcher/views_base.py         |   4 +-
 app_urls/scheduled_tasks.json          |  31 +++++++-
 10 files changed, 235 insertions(+), 31 deletions(-)
 create mode 100644 app_selenium/search.py
 create mode 100644 app_selenium/utils.py
 create mode 100644 app_urls/fetcher/src/fetch_selenium.py

diff --git a/README.md b/README.md
index 51a39c5..93e069e 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,8 @@
   - TODO: Proxy / VPN?
     - TooManyRequests, ...
   - TODO: Search per locale (nl-NL, fr-FR, en-GB)
+  - Fetch keyword search for selenium sources
+
 - URLs Processing -> Updates raw URLs
   - Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date
 
@@ -52,12 +54,10 @@
 * Dev mode
 ```
 docker compose -f docker-compose-dev.yml down -v
-docker compose -f docker-compose-dev.yml build --progress=plain
-docker compose -f docker-compose-dev.yml up
+docker compose -f docker-compose-dev.yml up --no-deps --build
 ```
 * Prod mode
 ```
 docker compose down -v
-docker compose build --progress=plain
-docker compose up -d
+docker compose up -d --no-deps --build
 ```
\ No newline at end of file
diff --git a/app_selenium/app.py b/app_selenium/app.py
index d051f65..55841b8 100644
--- a/app_selenium/app.py
+++ b/app_selenium/app.py
@@ -1,6 +1,7 @@
 from fastapi import FastAPI
 from pydantic import BaseModel
 from missing_kids import MissingKidsFetcher
+from search import SearchFetcher
 from logger import get_logger
 logger = get_logger()
 
@@ -12,17 +13,41 @@ def get_missing_kids(pages: int = -1):
         logger.info("Get missing kids, #pages={}".format(pages))
         res = {"list_urls": MissingKidsFetcher().get_missing_kids_urls(first_n_pages=pages)}
     except Exception as e:
+        logger.warning("Exception: {}".format(str(e)), exc_info=True)
         res = {}
     return res
 
-class Body(BaseModel):
+class BodyVerifyMissingKid(BaseModel):
     url: str
 
 @app.post("/verify_missing_kid/")
-def get_missing_kids(data: Body):
+def verify_missing_kid(data: BodyVerifyMissingKid):
     try:
         logger.info("Verify missing kid, URL={}".format(data.url))
         res = MissingKidsFetcher().verify_missing_kid_url(data.url)
     except Exception as e:
+        logger.warning("Exception: {}".format(str(e)), exc_info=True)
         res = {}
-    return res
\ No newline at end of file
+    return res
+
+class BodyFetchSearch(BaseModel):
+    search: str
+
+@app.post("/fetch_search/")
+def fetch_search(data: BodyFetchSearch):
+    try:
+        # Initialize
+        search_fetcher, results = SearchFetcher(), {}
+        # Iterate over the available sources
+        for source in search_fetcher.get_available_sources():
+            logger.info("Fetch Selenium-based search, source={} search={}".format(source, data.search))
+            # Fetch
+            results[source] = search_fetcher.search(source, data.search)
+            # Drop sources that returned no results
+            if not results[source]:
+                results.pop(source)
+
+    except Exception as e:
+        logger.warning("Exception: {}".format(str(e)), exc_info=True)
+        results = {}
+    return results
diff --git a/app_selenium/logger.py b/app_selenium/logger.py
index 37894cd..2136e98 100644
--- a/app_selenium/logger.py
+++ b/app_selenium/logger.py
@@ -8,10 +8,10 @@ logs_directory = os.getenv("PATH_LOGS_DIRECTORY", "logs")
 os.makedirs(logs_directory, exist_ok=True)
 
 logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
-logger = logging.getLogger("app_selenium")
-logger.setLevel(logging.DEBUG)
+logger = logging.getLogger("selenium")
+logger.setLevel(logging.INFO)
 
-# To file log: INFO / WARNING / ERROR / CRITICAL
+# To file log: DEBUG / INFO / WARNING / ERROR / CRITICAL
 fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "debug.log"), mode="a", maxBytes=10000000, backupCount=1)
 fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
 fh.setLevel(logging.DEBUG)
diff --git a/app_selenium/missing_kids.py b/app_selenium/missing_kids.py
index d373820..f6563de 100644
--- a/app_selenium/missing_kids.py
+++ b/app_selenium/missing_kids.py
@@ -1,7 +1,5 @@
-from selenium import webdriver
+from utils import get_webdriver
 from selenium.webdriver.common.by import By
-from selenium.webdriver.firefox.options import Options
-from selenium.webdriver.firefox.service import Service
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.common.exceptions import TimeoutException
@@ -11,16 +9,6 @@ import os
 from logger import get_logger
 logger = get_logger()
 
-def get_webdriver():
-    options = Options()
-    options.add_argument('--headless') # Optional
-    options.binary_location = '/opt/firefox/firefox'
-
-    service = Service('/usr/local/bin/geckodriver')
-
-    driver = webdriver.Firefox(options=options, service=service)
-    return driver
-
 class MissingKidsFetcher():
     def __init__(self) -> None:
         pass
diff --git a/app_selenium/search.py b/app_selenium/search.py
new file mode 100644
index 0000000..e1d946b
--- /dev/null
+++ b/app_selenium/search.py
@@ -0,0 +1,106 @@
+from utils import get_webdriver
+from selenium.webdriver.common.by import By
+from urllib.parse import quote
+import time
+from logger import get_logger
+logger = get_logger()
+
+class SearchFetcher():
+    def __init__(self):
+        pass
+
+    def get_available_sources(self):
+        return ["foxnews", "breitbart", "zerohedge"]
+
+    def search(self, source, search="child abuse"):
+        try:
+            if source == "foxnews":
+                return self._search_foxnews(search)
+            elif source == "breitbart":
+                return self._search_breitbart(search)
+            elif source == "zerohedge":
+                return self._search_zerohedge(search)
+            else:
+                logger.warning("Search not implemented for source={} search={}".format(source, search))
+                return []
+        except Exception as e:
+            logger.warning("Error searching for source={} search={}: {}".format(source, search, str(e)), exc_info=True)
+            return []
+
+    def _search_foxnews(self, search):
+        url_host = "foxnews.com"
+        # URL search
+        url_unquoted = "https://www.foxnews.com/search-results/search#q={}".format(search)
+        url = quote(url_unquoted, safe=":/?=&#")
+
+        # Initialize
+        driver = get_webdriver()
+        # Load URL
+        driver.get(url)
+        time.sleep(2)
+
+        # Find the element with class "page"
+        page_element = driver.find_element(By.CLASS_NAME, "page")
+        # Find the articles
+        articles = page_element.find_elements(By.CLASS_NAME, "article")
+        # Extract URLs
+        urls = [ art.find_element(By.CLASS_NAME, "m").find_element(By.TAG_NAME, "a").get_attribute("href") for art in articles ]
+
+        # Remove duplicates, remove None
+        urls = [u for u in set(urls) if u is not None]
+        # Filter by URL host
+        urls = [u for u in urls if url_host in u]
+
+        return urls
+
+    def _search_breitbart(self, search):
+        url_host = "breitbart.com"
+        # URL search
+        url_unquoted = "https://www.breitbart.com/search/?s={}".format(search.replace(" ", "+"))
+        url = quote(url_unquoted, safe=":/?=&#")
+
+        # Initialize
+        driver = get_webdriver()
+        # Load URL
+        driver.get(url)
+        time.sleep(4)
+
+        # Find the element with class "gsc-expansionArea"
+        page_element = driver.find_element(By.CLASS_NAME, "gsc-expansionArea")
+        # Find the articles
+        articles = page_element.find_elements(By.CLASS_NAME, "gs-title")
+        # Extract URLs
+        urls = [ art.get_attribute("href") for art in articles ]
+
+        # Remove duplicates, remove None
+        urls = [u for u in set(urls) if u is not None]
+        # Filter by URL host
+        urls = [u for u in urls if url_host in u]
+
+        return urls
+
+    def _search_zerohedge(self, search):
+        url_host = "zerohedge.com"
+        # URL search
+        url_unquoted = "https://www.zerohedge.com/search-content?qTitleBody={}".format(search.replace(" ", "+"))
+        url = quote(url_unquoted, safe=":/?=&#")
+
+        # Initialize
+        driver = get_webdriver()
+        # Load URL
+        driver.get(url)
+        time.sleep(2)
+
+        # Find the element with class "main-content"
+        page_element = driver.find_element(By.CLASS_NAME, "main-content")
+        # Find the result links
+        articles = page_element.find_elements(By.TAG_NAME, "a")
+        # Extract URLs
+        urls = [ art.get_attribute("href") for art in articles ]
+
+        # Remove duplicates, remove None
+        urls = [u for u in set(urls) if u is not None]
+        # Filter by URL host
+        urls = [u for u in urls if url_host in u]
+
+        return urls
\ No newline at end of file
diff --git a/app_selenium/utils.py b/app_selenium/utils.py
new file mode 100644
index 0000000..00736ca
--- /dev/null
+++ b/app_selenium/utils.py
@@ -0,0 +1,13 @@
+from selenium import webdriver
+from selenium.webdriver.firefox.options import Options
+from selenium.webdriver.firefox.service import Service
+
+def get_webdriver():
+    options = Options()
+    options.add_argument('--headless')  # Run headless (no visible browser UI)
+    options.binary_location = '/opt/firefox/firefox'
+
+    service = Service('/usr/local/bin/geckodriver')
+
+    driver = webdriver.Firefox(options=options, service=service)
+    return driver
\ No newline at end of file
diff --git a/app_urls/fetcher/src/fetch_selenium.py b/app_urls/fetcher/src/fetch_selenium.py
new file mode 100644
index 0000000..f849118
--- /dev/null
+++ b/app_urls/fetcher/src/fetch_selenium.py
@@ -0,0 +1,42 @@
+from .db_utils import DB_Handler
+from ..models import Search, Source
+import traceback
+import requests
+import os
+from .logger import get_logger
+logger = get_logger()
+
+class FetchSeleniumSourceSearch():
+    def __init__(self) -> None:
+        logger.debug("Initializing Selenium Source Search")
+
+    def run(self):
+        try:
+            logger.debug("Starting FetchSeleniumSourceSearch.run()")
+
+            # Get keyword searches
+            list_keyword_search = Search.objects.filter(type=Search.TYPE_ENUM.KEYWORD_SEARCH)
+            logger.debug("Fetching news (Selenium-based) for keyword searches: {}".format([e.search for e in list_keyword_search]))
+
+            # Run selenium search for each keyword search
+            for obj_search in list_keyword_search:
+                try:
+                    # Selenium fetching endpoint
+                    selenium_fetch_endpoint = os.getenv("SELENIUM_ENDPOINT", "http://localhost:80").rstrip("/") + "/fetch_search/"
+                    data = {"search": obj_search.search}
+                    # POST
+                    r = requests.post(selenium_fetch_endpoint, json=data, timeout=900)
+                    # Jsonify
+                    results = r.json()
+                    logger.debug("Selenium results for search {}: {}".format(obj_search.search, str(results)))
+
+                    for source, urls_fetched in results.items():
+                        # Get source object
+                        obj_source, _ = Source.objects.get_or_create(source="selenium {}".format(source))
+
+                        # Write to DB
+                        DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
+                except Exception as e:
+                    logger.warning("Exception while fetching selenium search: {}\n{}".format(obj_search.search, str(e)))
+        except Exception as e:
+            logger.warning("Exception in FetchSeleniumSourceSearch.run(): {}\n{}".format(e, traceback.format_exc()))
diff --git a/app_urls/fetcher/tasks.py b/app_urls/fetcher/tasks.py
index 7f0a3cd..4696b27 100644
--- a/app_urls/fetcher/tasks.py
+++ b/app_urls/fetcher/tasks.py
@@ -4,6 +4,7 @@ from .src.fetch_feed import FetchFeeds
 from .src.fetch_parser import FetchParser
 from .src.fetch_search import FetchSearcher
 from .src.fetch_missing_kids import FetchMissingKids
+from .src.fetch_selenium import FetchSeleniumSourceSearch
 from .src.db_utils import DB_Handler
 from .src.publisher import Publisher
 
@@ -32,14 +33,14 @@ def fetch_search():
     logger.info("Task completed: {}".format(task))
 
 @job('default')
-def fetch_missing_kids(number_pages=5):
-    task = "Fetch MissingKids"
+def fetch_selenium_search():
+    task = "Fetch Selenium search"
     logger.info("Task triggered: {}".format(task))
-    FetchMissingKids().run(number_pages)
+    FetchSeleniumSourceSearch().run()
     logger.info("Task completed: {}".format(task))
 
 @job('default')
-def fetch_missing_kids_all(number_pages=-1):
+def fetch_missing_kids(number_pages=5):
     task = "Fetch MissingKids"
     logger.info("Task triggered: {}".format(task))
     FetchMissingKids().run(number_pages)
@@ -85,6 +86,8 @@ def background_task(process_type: str):
         FetchParser().run()
     elif (process_type == "fetch_search"):
         FetchSearcher().run()
+    elif (process_type == "fetch_selenium_search"):
+        FetchSeleniumSourceSearch().run()
     elif (process_type == "fetch_missingkids_all"):
         FetchMissingKids().run(number_pages=-1)
diff --git a/app_urls/fetcher/views_base.py b/app_urls/fetcher/views_base.py
index c6eba73..52598db 100644
--- a/app_urls/fetcher/views_base.py
+++ b/app_urls/fetcher/views_base.py
@@ -14,8 +14,8 @@ def link_list(request):
     # Base URL path
     app_url = request.build_absolute_uri()
     # Tasks
-    links_fetch = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all"]
-    links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all", "clean_old_url_content_60"]
+    links_fetch = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all", "fetch_selenium_search"]
+    links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_valid_all", "process_missing_kids_urls_invalid_all", "process_missing_kids_urls_unknown_all", "process_missing_kids_urls_all", "clean_old_url_content_60"]
     # List of links
     list_links = \
         [ os.path.join(app_url, "admin"), os.path.join(app_url, "urls") ] + \
diff --git a/app_urls/scheduled_tasks.json b/app_urls/scheduled_tasks.json
index 68fb307..6453671 100644
--- a/app_urls/scheduled_tasks.json
+++ b/app_urls/scheduled_tasks.json
@@ -212,6 +212,27 @@
         "last_successful_run": null,
         "last_failed_run": null
     },
+    {
+        "model": "RepeatableTaskType",
+        "name": "Fetch Selenium Search",
+        "callable": "fetcher.tasks.fetch_selenium_search",
+        "callable_args": [],
+        "callable_kwargs": [],
+        "enabled": false,
+        "queue": "default",
+        "repeat": null,
+        "at_front": false,
+        "timeout": 3600,
+        "result_ttl": 86400,
+        "cron_string": null,
+        "scheduled_time": "2025-01-01T00:00:00+00:00",
+        "interval": 1,
+        "interval_unit": "days",
+        "successful_runs": 0,
+        "failed_runs": 0,
+        "last_successful_run": null,
+        "last_failed_run": null
+    },
     {
         "model": "RepeatableTaskType",
         "name": "Fetch MissingKids",
@@ -236,9 +257,15 @@
     {
         "model": "RepeatableTaskType",
         "name": "Fetch MissingKids ALL",
-        "callable": "fetcher.tasks.fetch_missing_kids_all",
+        "callable": "fetcher.tasks.fetch_missing_kids",
         "callable_args": [],
-        "callable_kwargs": [],
+        "callable_kwargs": [
+            {
+                "arg_type": "int",
+                "key": "number_pages",
+                "val": "-1"
+            }
+        ],
         "enabled": false,
         "queue": "default",
         "repeat": null,