Selenium-based fetch of different sources

Luciano Gervasoni
2025-07-08 18:18:26 +02:00
parent f729bd1cb2
commit 0cf61026e8
10 changed files with 235 additions and 31 deletions

View File

@@ -15,6 +15,8 @@
- TODO: Proxy / VPN?
  - TooManyRequests, ...
- TODO: Search per locale (nl-NL, fr-FR, en-GB)
- Fetch keyword search for Selenium sources
- URLs Processing -> Updates raw URLs
  - Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date
@@ -52,12 +54,10 @@
* Dev mode
```
docker compose -f docker-compose-dev.yml down -v
docker compose -f docker-compose-dev.yml build --progress=plain
docker compose -f docker-compose-dev.yml up
docker compose -f docker-compose-dev.yml up --no-deps --build
```
* Prod mode
```
docker compose down -v
docker compose build --progress=plain
docker compose up -d
docker compose up -d --no-deps --build
```

View File

@@ -1,6 +1,7 @@
from fastapi import FastAPI
from pydantic import BaseModel
from missing_kids import MissingKidsFetcher
from search import SearchFetcher
from logger import get_logger
logger = get_logger()
@@ -12,17 +13,41 @@ def get_missing_kids(pages: int = -1):
logger.info("Get missing kids, #pages={}".format(pages))
res = {"list_urls": MissingKidsFetcher().get_missing_kids_urls(first_n_pages=pages)}
except Exception as e:
logger.warning("Exception: {}".format(str(e)), exc_info=True)
res = {}
return res
class Body(BaseModel):
class BodyVerifyMissingKid(BaseModel):
    url: str

@app.post("/verify_missing_kid/")
def get_missing_kids(data: Body):
def verify_missing_kid(data: BodyVerifyMissingKid):
    try:
        logger.info("Verify missing kid, URL={}".format(data.url))
        res = MissingKidsFetcher().verify_missing_kid_url(data.url)
    except Exception as e:
        logger.warning("Exception: {}".format(str(e)), exc_info=True)
        res = {}
    return res
class BodyFetchSearch(BaseModel):
    search: str

@app.post("/fetch_search/")
def fetch_search(data: BodyFetchSearch):
    try:
        # Initialize the fetcher and the results dictionary
        search_fetcher, results = SearchFetcher(), {}
        # Iterate over the sources implemented by SearchFetcher
        for source in search_fetcher.get_available_sources():
            logger.info("Fetch Selenium-based search, source={} search={}".format(source, data.search))
            # Fetch the result URLs for this source (reuse the fetcher instead of re-instantiating it)
            results[source] = search_fetcher.search(source, data.search)
            # Drop sources that returned no URLs
            if (len(results[source]) == 0):
                results.pop(source)
    except Exception as e:
        logger.warning("Exception: {}".format(str(e)), exc_info=True)
        results = {}
    return results
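
For reference, a minimal client sketch against the new endpoint. The base URL is an assumption (it matches the `SELENIUM_ENDPOINT` default used by the fetcher further down); adjust it to wherever the selenium app is served:
```
import requests

# Assumed base URL; matches the SELENIUM_ENDPOINT default in fetch_selenium.py
SELENIUM_ENDPOINT = "http://localhost:80"

# POST a keyword search; the response maps each source name to its list of article URLs,
# with empty sources already dropped server-side
r = requests.post(SELENIUM_ENDPOINT + "/fetch_search/", json={"search": "child abuse"}, timeout=900)
print(r.json())  # e.g. {"foxnews": ["https://www.foxnews.com/...", ...]}
```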

View File

@@ -8,10 +8,10 @@ logs_directory = os.getenv("PATH_LOGS_DIRECTORY", "logs")
os.makedirs(logs_directory, exist_ok=True)
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("app_selenium")
logger.setLevel(logging.DEBUG)
logger = logging.getLogger("selenium")
logger.setLevel(logging.INFO)
# To file log: INFO / WARNING / ERROR / CRITICAL
# To file log: INFO / WARNING / ERROR / CRITICAL (messages below the logger level set above never reach the handler)
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "debug.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.DEBUG)
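
A small usage sketch of the shared logger, assuming `get_logger()` in this module returns the logger configured above; with the logger level at INFO, `debug()` calls are filtered before they ever reach the DEBUG-level file handler:
```
from logger import get_logger

logger = get_logger()
logger.info("Reaches both the console and logs/debug.log")
logger.debug("Dropped: the logger level is INFO, so handlers never see it")
```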

View File

@@ -1,7 +1,5 @@
from selenium import webdriver
from utils import get_webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
@@ -11,16 +9,6 @@ import os
from logger import get_logger
logger = get_logger()
def get_webdriver():
    options = Options()
    options.add_argument('--headless')  # Optional
    options.binary_location = '/opt/firefox/firefox'
    service = Service('/usr/local/bin/geckodriver')
    driver = webdriver.Firefox(options=options, service=service)
    return driver

class MissingKidsFetcher():
    def __init__(self) -> None:
        pass

app_selenium/search.py Normal file (+106)
View File

@@ -0,0 +1,106 @@
from utils import get_webdriver
from selenium.webdriver.common.by import By
from urllib.parse import quote
import time
from logger import get_logger
logger = get_logger()

class SearchFetcher():
    def __init__(self):
        pass

    def get_available_sources(self):
        return ["foxnews", "breitbart", "zerohedge"]

    def search(self, source, search="child abuse"):
        try:
            if (source == "foxnews"):
                return self._search_foxnews(search)
            elif (source == "breitbart"):
                return self._search_breitbart(search)
            elif (source == "zerohedge"):
                return self._search_zerohedge(search)
            else:
                logger.warning("Search not implemented for source={} search={}".format(source, search))
                return []
        except Exception as e:
            logger.warning("Error searching source={} search={}: {}".format(source, search, str(e)), exc_info=True)
            return []

    def _search_foxnews(self, search):
        url_host = "foxnews.com"
        # URL search
        url_unquoted = "https://www.foxnews.com/search-results/search#q={}".format(search)
        url = quote(url_unquoted, safe=":/?=&#")
        # Initialize
        driver = get_webdriver()
        try:
            # Load URL
            driver.get(url)
            time.sleep(2)
            # Find the element with class "page"
            page_element = driver.find_element(By.CLASS_NAME, "page")
            # Find the articles
            articles = page_element.find_elements(By.CLASS_NAME, "article")
            # Extract URLs
            urls = [art.find_element(By.CLASS_NAME, "m").find_element(By.TAG_NAME, "a").get_attribute("href") for art in articles]
        finally:
            # Release the browser process
            driver.quit()
        # Remove duplicates, remove None
        urls = [u for u in set(urls) if u is not None]
        # Filter by URL host
        urls = [u for u in urls if url_host in u]
        return urls

    def _search_breitbart(self, search):
        url_host = "breitbart.com"
        # URL search
        url_unquoted = "https://www.breitbart.com/search/?s={}".format(search.replace(" ", "+"))
        url = quote(url_unquoted, safe=":/?=&#")
        # Initialize
        driver = get_webdriver()
        try:
            # Load URL
            driver.get(url)
            time.sleep(4)
            # Find the search results container
            page_element = driver.find_element(By.CLASS_NAME, "gsc-expansionArea")
            # Find the articles
            articles = page_element.find_elements(By.CLASS_NAME, "gs-title")
            # Extract URLs
            urls = [art.get_attribute("href") for art in articles]
        finally:
            # Release the browser process
            driver.quit()
        # Remove duplicates, remove None
        urls = [u for u in set(urls) if u is not None]
        # Filter by URL host
        urls = [u for u in urls if url_host in u]
        return urls

    def _search_zerohedge(self, search):
        url_host = "zerohedge.com"
        # URL search
        url_unquoted = "https://www.zerohedge.com/search-content?qTitleBody={}".format(search.replace(" ", "+"))
        url = quote(url_unquoted, safe=":/?=&#")
        # Initialize
        driver = get_webdriver()
        try:
            # Load URL
            driver.get(url)
            time.sleep(2)
            # Find the main content container
            page_element = driver.find_element(By.CLASS_NAME, "main-content")
            # Find the articles
            articles = page_element.find_elements(By.TAG_NAME, "a")
            # Extract URLs
            urls = [art.get_attribute("href") for art in articles]
        finally:
            # Release the browser process
            driver.quit()
        # Remove duplicates, remove None
        urls = [u for u in set(urls) if u is not None]
        # Filter by URL host
        urls = [u for u in urls if url_host in u]
        return urls
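
A quick usage sketch for the new fetcher; it has to run where `get_webdriver()`'s Firefox/geckodriver paths exist, i.e. inside the selenium container:
```
from search import SearchFetcher

fetcher = SearchFetcher()
for source in fetcher.get_available_sources():
    urls = fetcher.search(source, "child abuse")
    print(source, len(urls), urls[:3])
```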

app_selenium/utils.py Normal file (+13)
View File

@@ -0,0 +1,13 @@
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service

def get_webdriver():
    # Headless Firefox, using the browser and geckodriver baked into the image
    options = Options()
    options.add_argument('--headless')  # Optional
    options.binary_location = '/opt/firefox/firefox'
    service = Service('/usr/local/bin/geckodriver')
    driver = webdriver.Firefox(options=options, service=service)
    return driver
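
The returned driver is the caller's responsibility to tear down; a minimal sketch (note it is `quit()`, not `close()`, that ends the session):
```
from utils import get_webdriver

driver = get_webdriver()
try:
    driver.get("https://example.com")
    print(driver.title)
finally:
    # quit() tears down the browser process and the geckodriver session;
    # close() would only close the current window
    driver.quit()
```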

View File

@@ -0,0 +1,42 @@
from .db_utils import DB_Handler
from ..models import Search, Source
import traceback
import requests
import os
from .logger import get_logger
logger = get_logger()

class FetchSeleniumSourceSearch():
    def __init__(self) -> None:
        logger.debug("Initializing Selenium Source Search")

    def run(self):
        try:
            logger.debug("Starting FetchSeleniumSourceSearch.run()")
            # Get keyword searches
            list_keyword_search = Search.objects.filter(type=Search.TYPE_ENUM.KEYWORD_SEARCH)
            logger.debug("Fetching news Selenium-based for keyword searches: {}".format([e.search for e in list_keyword_search]))
            # Run selenium search for each keyword search
            for obj_search in list_keyword_search:
                try:
                    # Selenium fetching endpoint (plain string concatenation; os.path.join is meant for filesystem paths, not URLs)
                    selenium_fetch_endpoint = os.getenv("SELENIUM_ENDPOINT", "http://localhost:80").rstrip("/") + "/fetch_search/"
                    data = {"search": obj_search.search}
                    # POST
                    r = requests.post(selenium_fetch_endpoint, json=data, timeout=900)
                    # Jsonify
                    results = r.json()
                    logger.debug("Selenium results for search {}: {}".format(obj_search.search, str(results)))
                    for source, urls_fetched in results.items():
                        # Get source object
                        obj_source, created = Source.objects.get_or_create(source="selenium {}".format(source))
                        # Write to DB
                        DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
                except Exception as e:
                    logger.warning("Exception while fetching selenium search: {}\n{}".format(obj_search.search, str(e)))
        except Exception as e:
            logger.warning("Exception in FetchSeleniumSourceSearch.run(): {}\n{}".format(e, traceback.format_exc()))

View File

@@ -4,6 +4,7 @@ from .src.fetch_feed import FetchFeeds
from .src.fetch_parser import FetchParser
from .src.fetch_search import FetchSearcher
from .src.fetch_missing_kids import FetchMissingKids
from .src.fetch_selenium import FetchSeleniumSourceSearch
from .src.db_utils import DB_Handler
from .src.publisher import Publisher
@@ -32,14 +33,14 @@ def fetch_search():
logger.info("Task completed: {}".format(task))
@job('default')
def fetch_missing_kids(number_pages=5):
task = "Fetch MissingKids"
def fetch_selenium_search():
task = "Fetch Selenium search"
logger.info("Task triggered: {}".format(task))
FetchMissingKids().run(number_pages)
FetchSeleniumSourceSearch().run()
logger.info("Task completed: {}".format(task))
@job('default')
def fetch_missing_kids_all(number_pages=-1):
def fetch_missing_kids(number_pages=5):
task = "Fetch MissingKids"
logger.info("Task triggered: {}".format(task))
FetchMissingKids().run(number_pages)
@@ -85,6 +86,8 @@ def background_task(process_type: str):
        FetchParser().run()
    elif (process_type == "fetch_search"):
        FetchSearcher().run()
    elif (process_type == "fetch_selenium_search"):
        FetchSeleniumSourceSearch().run()
    elif (process_type == "fetch_missingkids_all"):
        FetchMissingKids().run(number_pages=-1)
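
Because the task is declared with the django-rq `@job('default')` decorator, it can also be enqueued programmatically; a minimal sketch, assuming the same Redis/queue configuration as the other tasks:
```
from fetcher.tasks import fetch_selenium_search

# Push the job onto the "default" queue; an rq worker executes it asynchronously
fetch_selenium_search.delay()
```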

View File

@@ -14,8 +14,8 @@ def link_list(request):
    # Base URL path
    app_url = request.build_absolute_uri()
    # Tasks
    links_fetch = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all"]
    links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all", "clean_old_url_content_60"]
    links_fetch = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all", "fetch_selenium_search"]
    links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_valid_all", "process_missing_kids_urls_invalid_all", "process_missing_kids_urls_unknown_all", "process_missing_kids_urls_all", "clean_old_url_content_60"]
    # List of links
    list_links = \
        [ os.path.join(app_url, "admin"), os.path.join(app_url, "urls") ] + \

View File

@@ -212,6 +212,27 @@
"last_successful_run": null,
"last_failed_run": null
},
{
"model": "RepeatableTaskType",
"name": "Fetch Selenium Search",
"callable": "fetcher.tasks.fetch_selenium_search",
"callable_args": [],
"callable_kwargs": [],
"enabled": false,
"queue": "default",
"repeat": null,
"at_front": false,
"timeout": 3600,
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-01-01T00:00:00+00:00",
"interval": 1,
"interval_unit": "days",
"successful_runs": 0,
"failed_runs": 0,
"last_successful_run": null,
"last_failed_run": null
},
{
"model": "RepeatableTaskType",
"name": "Fetch MissingKids",
@@ -236,9 +257,15 @@
    {
        "model": "RepeatableTaskType",
        "name": "Fetch MissingKids ALL",
        "callable": "fetcher.tasks.fetch_missing_kids_all",
        "callable": "fetcher.tasks.fetch_missing_kids",
        "callable_args": [],
        "callable_kwargs": [],
        "callable_kwargs": [
            {
                "arg_type": "int",
                "key": "number_pages",
                "val": "-1"
            }
        ],
        "enabled": false,
        "queue": "default",
        "repeat": null,