Selenium-based fetch of different sources

Luciano Gervasoni
2025-07-08 18:18:26 +02:00
parent f729bd1cb2
commit 0cf61026e8
10 changed files with 235 additions and 31 deletions

View File

@@ -15,6 +15,8 @@
- TODO: Proxy / VPN?
  - TooManyRequests, ...
- TODO: Search per locale (nl-NL, fr-FR, en-GB)
- Fetch keyword search for Selenium sources
- URLs Processing -> Updates raw URLs
  - Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date
@@ -52,12 +54,10 @@
* Dev mode
```
docker compose -f docker-compose-dev.yml down -v
docker compose -f docker-compose-dev.yml build --progress=plain
docker compose -f docker-compose-dev.yml up
docker compose -f docker-compose-dev.yml up --no-deps --build
```
* Prod mode
```
docker compose down -v
docker compose build --progress=plain
docker compose up -d
docker compose up -d --no-deps --build
```

View File

@@ -1,6 +1,7 @@
from fastapi import FastAPI
from pydantic import BaseModel
from missing_kids import MissingKidsFetcher
from search import SearchFetcher
from logger import get_logger
logger = get_logger()
@@ -12,17 +13,41 @@ def get_missing_kids(pages: int = -1):
logger.info("Get missing kids, #pages={}".format(pages))
res = {"list_urls": MissingKidsFetcher().get_missing_kids_urls(first_n_pages=pages)}
except Exception as e:
logger.warning("Exception: {}".format(str(e)), exc_info=True)
res = {}
return res
class Body(BaseModel):
class BodyVerifyMissingKid(BaseModel):
    url: str

@app.post("/verify_missing_kid/")
def get_missing_kids(data: Body):
def verify_missing_kid(data: BodyVerifyMissingKid):
    try:
        logger.info("Verify missing kid, URL={}".format(data.url))
        res = MissingKidsFetcher().verify_missing_kid_url(data.url)
    except Exception as e:
        logger.warning("Exception: {}".format(str(e)), exc_info=True)
        res = {}
    return res
class BodyFetchSearch(BaseModel):
    search: str

@app.post("/fetch_search/")
def fetch_search(data: BodyFetchSearch):
    try:
        # Initialize the fetcher and the results dictionary
        search_fetcher, results = SearchFetcher(), {}
        # Iterate over the sources implemented by SearchFetcher
        for source in search_fetcher.get_available_sources():
            logger.info("Fetch Selenium-based search, source={} search={}".format(source, data.search))
            # Fetch the result URLs for this source (reuse the fetcher instead of re-instantiating it)
            results[source] = search_fetcher.search(source, data.search)
            # Drop sources that returned no URLs
            if (len(results[source]) == 0):
                results.pop(source)
    except Exception as e:
        logger.warning("Exception: {}".format(str(e)), exc_info=True)
        results = {}
    return results
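
For reference, a minimal client sketch against the new endpoint. The base URL is an assumption (it matches the `SELENIUM_ENDPOINT` default used by the fetcher further down); adjust it to wherever the selenium app is served:
```
import requests

# Assumed base URL; matches the SELENIUM_ENDPOINT default in fetch_selenium.py
SELENIUM_ENDPOINT = "http://localhost:80"

# POST a keyword search; the response maps each source name to its list of article URLs,
# with empty sources already dropped server-side
r = requests.post(SELENIUM_ENDPOINT + "/fetch_search/", json={"search": "child abuse"}, timeout=900)
print(r.json())  # e.g. {"foxnews": ["https://www.foxnews.com/...", ...]}
```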

View File

@@ -8,10 +8,10 @@ logs_directory = os.getenv("PATH_LOGS_DIRECTORY", "logs")
os.makedirs(logs_directory, exist_ok=True)
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("app_selenium")
logger.setLevel(logging.DEBUG)
logger = logging.getLogger("selenium")
logger.setLevel(logging.INFO)
# To file log: INFO / WARNING / ERROR / CRITICAL
# To file log: INFO / WARNING / ERROR / CRITICAL (messages below the logger level set above never reach the handler)
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "debug.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.DEBUG)
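
A small usage sketch of the shared logger, assuming `get_logger()` in this module returns the logger configured above; with the logger level at INFO, `debug()` calls are filtered before they ever reach the DEBUG-level file handler:
```
from logger import get_logger

logger = get_logger()
logger.info("Reaches both the console and logs/debug.log")
logger.debug("Dropped: the logger level is INFO, so handlers never see it")
```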

View File

@@ -1,7 +1,5 @@
from selenium import webdriver
from utils import get_webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
@@ -11,16 +9,6 @@ import os
from logger import get_logger
logger = get_logger()
def get_webdriver():
    options = Options()
    options.add_argument('--headless')  # Optional
    options.binary_location = '/opt/firefox/firefox'
    service = Service('/usr/local/bin/geckodriver')
    driver = webdriver.Firefox(options=options, service=service)
    return driver

class MissingKidsFetcher():
    def __init__(self) -> None:
        pass

app_selenium/search.py Normal file (+106)
View File

@@ -0,0 +1,106 @@
from utils import get_webdriver
from selenium.webdriver.common.by import By
from urllib.parse import quote
import time
from logger import get_logger
logger = get_logger()

class SearchFetcher():
    def __init__(self):
        pass

    def get_available_sources(self):
        return ["foxnews", "breitbart", "zerohedge"]

    def search(self, source, search="child abuse"):
        try:
            if (source == "foxnews"):
                return self._search_foxnews(search)
            elif (source == "breitbart"):
                return self._search_breitbart(search)
            elif (source == "zerohedge"):
                return self._search_zerohedge(search)
            else:
                logger.warning("Search not implemented for source={} search={}".format(source, search))
                return []
        except Exception as e:
            logger.warning("Error searching source={} search={}: {}".format(source, search, str(e)), exc_info=True)
            return []

    def _search_foxnews(self, search):
        url_host = "foxnews.com"
        # URL search
        url_unquoted = "https://www.foxnews.com/search-results/search#q={}".format(search)
        url = quote(url_unquoted, safe=":/?=&#")
        # Initialize
        driver = get_webdriver()
        try:
            # Load URL
            driver.get(url)
            time.sleep(2)
            # Find the element with class "page"
            page_element = driver.find_element(By.CLASS_NAME, "page")
            # Find the articles
            articles = page_element.find_elements(By.CLASS_NAME, "article")
            # Extract URLs
            urls = [art.find_element(By.CLASS_NAME, "m").find_element(By.TAG_NAME, "a").get_attribute("href") for art in articles]
        finally:
            # Release the browser process
            driver.quit()
        # Remove duplicates, remove None
        urls = [u for u in set(urls) if u is not None]
        # Filter by URL host
        urls = [u for u in urls if url_host in u]
        return urls

    def _search_breitbart(self, search):
        url_host = "breitbart.com"
        # URL search
        url_unquoted = "https://www.breitbart.com/search/?s={}".format(search.replace(" ", "+"))
        url = quote(url_unquoted, safe=":/?=&#")
        # Initialize
        driver = get_webdriver()
        try:
            # Load URL
            driver.get(url)
            time.sleep(4)
            # Find the search results container
            page_element = driver.find_element(By.CLASS_NAME, "gsc-expansionArea")
            # Find the articles
            articles = page_element.find_elements(By.CLASS_NAME, "gs-title")
            # Extract URLs
            urls = [art.get_attribute("href") for art in articles]
        finally:
            # Release the browser process
            driver.quit()
        # Remove duplicates, remove None
        urls = [u for u in set(urls) if u is not None]
        # Filter by URL host
        urls = [u for u in urls if url_host in u]
        return urls

    def _search_zerohedge(self, search):
        url_host = "zerohedge.com"
        # URL search
        url_unquoted = "https://www.zerohedge.com/search-content?qTitleBody={}".format(search.replace(" ", "+"))
        url = quote(url_unquoted, safe=":/?=&#")
        # Initialize
        driver = get_webdriver()
        try:
            # Load URL
            driver.get(url)
            time.sleep(2)
            # Find the main content container
            page_element = driver.find_element(By.CLASS_NAME, "main-content")
            # Find the articles
            articles = page_element.find_elements(By.TAG_NAME, "a")
            # Extract URLs
            urls = [art.get_attribute("href") for art in articles]
        finally:
            # Release the browser process
            driver.quit()
        # Remove duplicates, remove None
        urls = [u for u in set(urls) if u is not None]
        # Filter by URL host
        urls = [u for u in urls if url_host in u]
        return urls
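
A quick usage sketch for the new fetcher; it has to run where `get_webdriver()`'s Firefox/geckodriver paths exist, i.e. inside the selenium container:
```
from search import SearchFetcher

fetcher = SearchFetcher()
for source in fetcher.get_available_sources():
    urls = fetcher.search(source, "child abuse")
    print(source, len(urls), urls[:3])
```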

app_selenium/utils.py Normal file (+13)
View File

@@ -0,0 +1,13 @@
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service

def get_webdriver():
    # Headless Firefox, using the browser and geckodriver baked into the image
    options = Options()
    options.add_argument('--headless')  # Optional
    options.binary_location = '/opt/firefox/firefox'
    service = Service('/usr/local/bin/geckodriver')
    driver = webdriver.Firefox(options=options, service=service)
    return driver
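
The returned driver is the caller's responsibility to tear down; a minimal sketch (note it is `quit()`, not `close()`, that ends the session):
```
from utils import get_webdriver

driver = get_webdriver()
try:
    driver.get("https://example.com")
    print(driver.title)
finally:
    # quit() tears down the browser process and the geckodriver session;
    # close() would only close the current window
    driver.quit()
```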

View File

@@ -0,0 +1,42 @@
from .db_utils import DB_Handler
from ..models import Search, Source
import traceback
import requests
import os
from .logger import get_logger
logger = get_logger()

class FetchSeleniumSourceSearch():
    def __init__(self) -> None:
        logger.debug("Initializing Selenium Source Search")

    def run(self):
        try:
            logger.debug("Starting FetchSeleniumSourceSearch.run()")
            # Get keyword searches
            list_keyword_search = Search.objects.filter(type=Search.TYPE_ENUM.KEYWORD_SEARCH)
            logger.debug("Fetching news Selenium-based for keyword searches: {}".format([e.search for e in list_keyword_search]))
            # Run selenium search for each keyword search
            for obj_search in list_keyword_search:
                try:
                    # Selenium fetching endpoint (plain string concatenation; os.path.join is meant for filesystem paths, not URLs)
                    selenium_fetch_endpoint = os.getenv("SELENIUM_ENDPOINT", "http://localhost:80").rstrip("/") + "/fetch_search/"
                    data = {"search": obj_search.search}
                    # POST
                    r = requests.post(selenium_fetch_endpoint, json=data, timeout=900)
                    # Jsonify
                    results = r.json()
                    logger.debug("Selenium results for search {}: {}".format(obj_search.search, str(results)))
                    for source, urls_fetched in results.items():
                        # Get source object
                        obj_source, created = Source.objects.get_or_create(source="selenium {}".format(source))
                        # Write to DB
                        DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
                except Exception as e:
                    logger.warning("Exception while fetching selenium search: {}\n{}".format(obj_search.search, str(e)))
        except Exception as e:
            logger.warning("Exception in FetchSeleniumSourceSearch.run(): {}\n{}".format(e, traceback.format_exc()))

View File

@@ -4,6 +4,7 @@ from .src.fetch_feed import FetchFeeds
from .src.fetch_parser import FetchParser
from .src.fetch_search import FetchSearcher
from .src.fetch_missing_kids import FetchMissingKids
from .src.fetch_selenium import FetchSeleniumSourceSearch
from .src.db_utils import DB_Handler
from .src.publisher import Publisher
@@ -32,14 +33,14 @@ def fetch_search():
logger.info("Task completed: {}".format(task))
@job('default')
def fetch_missing_kids(number_pages=5):
task = "Fetch MissingKids"
def fetch_selenium_search():
task = "Fetch Selenium search"
logger.info("Task triggered: {}".format(task))
FetchMissingKids().run(number_pages)
FetchSeleniumSourceSearch().run()
logger.info("Task completed: {}".format(task))
@job('default')
def fetch_missing_kids_all(number_pages=-1):
def fetch_missing_kids(number_pages=5):
task = "Fetch MissingKids"
logger.info("Task triggered: {}".format(task))
FetchMissingKids().run(number_pages)
@@ -85,6 +86,8 @@ def background_task(process_type: str):
        FetchParser().run()
    elif (process_type == "fetch_search"):
        FetchSearcher().run()
    elif (process_type == "fetch_selenium_search"):
        FetchSeleniumSourceSearch().run()
    elif (process_type == "fetch_missingkids_all"):
        FetchMissingKids().run(number_pages=-1)
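
Because the task is declared with the django-rq `@job('default')` decorator, it can also be enqueued programmatically; a minimal sketch, assuming the same Redis/queue configuration as the other tasks:
```
from fetcher.tasks import fetch_selenium_search

# Push the job onto the "default" queue; an rq worker executes it asynchronously
fetch_selenium_search.delay()
```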

View File

@@ -14,8 +14,8 @@ def link_list(request):
    # Base URL path
    app_url = request.build_absolute_uri()
    # Tasks
    links_fetch = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all"]
    links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all", "clean_old_url_content_60"]
    links_fetch = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all", "fetch_selenium_search"]
    links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_valid_all", "process_missing_kids_urls_invalid_all", "process_missing_kids_urls_unknown_all", "process_missing_kids_urls_all", "clean_old_url_content_60"]
    # List of links
    list_links = \
        [ os.path.join(app_url, "admin"), os.path.join(app_url, "urls") ] + \

View File

@@ -212,6 +212,27 @@
"last_successful_run": null,
"last_failed_run": null
},
{
"model": "RepeatableTaskType",
"name": "Fetch Selenium Search",
"callable": "fetcher.tasks.fetch_selenium_search",
"callable_args": [],
"callable_kwargs": [],
"enabled": false,
"queue": "default",
"repeat": null,
"at_front": false,
"timeout": 3600,
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-01-01T00:00:00+00:00",
"interval": 1,
"interval_unit": "days",
"successful_runs": 0,
"failed_runs": 0,
"last_successful_run": null,
"last_failed_run": null
},
{
"model": "RepeatableTaskType",
"name": "Fetch MissingKids",
@@ -236,9 +257,15 @@
    {
        "model": "RepeatableTaskType",
        "name": "Fetch MissingKids ALL",
        "callable": "fetcher.tasks.fetch_missing_kids_all",
        "callable": "fetcher.tasks.fetch_missing_kids",
        "callable_args": [],
        "callable_kwargs": [],
        "callable_kwargs": [
            {
                "arg_type": "int",
                "key": "number_pages",
                "val": "-1"
            }
        ],
        "enabled": false,
        "queue": "default",
        "repeat": null,