From 68b56eafea54dabae1deeadb4f0b23f80fcfc6b6 Mon Sep 17 00:00:00 2001 From: Luciano Gervasoni Date: Thu, 3 Jul 2025 13:35:40 +0200 Subject: [PATCH] furl remove parameters on search results --- app_urls/fetcher/src/fetch_search_instances.py | 4 ++++ app_urls/requirements.txt | 1 + 2 files changed, 5 insertions(+) diff --git a/app_urls/fetcher/src/fetch_search_instances.py b/app_urls/fetcher/src/fetch_search_instances.py index ba87104..5c41e22 100644 --- a/app_urls/fetcher/src/fetch_search_instances.py +++ b/app_urls/fetcher/src/fetch_search_instances.py @@ -6,6 +6,7 @@ from .fetch_utils_gnews import decode_gnews_urls from .logger import get_logger logger = get_logger() +from furl import furl from gnews import GNews from duckduckgo_search import DDGS from GoogleNews import GoogleNews @@ -40,6 +41,9 @@ class FetcherAbstract(ABC): # Ensure URL host in URL raw_urls = [u for u in raw_urls if url_host_clean in u] + # Remove URL parameters, e.g. "?param=1234&h=yes" + raw_urls = [ furl(u).remove(furl(u).args).url for u in raw_urls ] + return raw_urls def fetch_articles(self, db_writer, obj_search): diff --git a/app_urls/requirements.txt b/app_urls/requirements.txt index fc0fe9b..69df09e 100644 --- a/app_urls/requirements.txt +++ b/app_urls/requirements.txt @@ -15,6 +15,7 @@ gnews GoogleNews duckduckgo_search git+https://github.com/tasos-py/Search-Engines-Scraper.git +furl langdetect ollama PyJWT \ No newline at end of file