diff --git a/app_urls/fetcher/src/fetch_search_instances.py b/app_urls/fetcher/src/fetch_search_instances.py index ba87104..5c41e22 100644 --- a/app_urls/fetcher/src/fetch_search_instances.py +++ b/app_urls/fetcher/src/fetch_search_instances.py @@ -6,6 +6,7 @@ from .fetch_utils_gnews import decode_gnews_urls from .logger import get_logger logger = get_logger() +from furl import furl from gnews import GNews from duckduckgo_search import DDGS from GoogleNews import GoogleNews @@ -40,6 +41,9 @@ class FetcherAbstract(ABC): # Ensure URL host in URL raw_urls = [u for u in raw_urls if url_host_clean in u] + # Remove URL parameters, e.g. "?param=1234&h=yes" + raw_urls = [ furl(u).remove(furl(u).args).url for u in raw_urls ] + return raw_urls def fetch_articles(self, db_writer, obj_search): diff --git a/app_urls/requirements.txt b/app_urls/requirements.txt index fc0fe9b..69df09e 100644 --- a/app_urls/requirements.txt +++ b/app_urls/requirements.txt @@ -15,6 +15,7 @@ gnews GoogleNews duckduckgo_search git+https://github.com/tasos-py/Search-Engines-Scraper.git +furl langdetect ollama PyJWT \ No newline at end of file