Use furl to remove URL parameters from search results

This commit is contained in:
Luciano Gervasoni
2025-07-03 13:35:40 +02:00
parent e657c3bee1
commit 68b56eafea
2 changed files with 5 additions and 0 deletions

View File

@@ -6,6 +6,7 @@ from .fetch_utils_gnews import decode_gnews_urls
from .logger import get_logger
logger = get_logger()
from furl import furl
from gnews import GNews
from duckduckgo_search import DDGS
from GoogleNews import GoogleNews
@@ -40,6 +41,9 @@ class FetcherAbstract(ABC):
# Keep only URLs that contain the expected host
raw_urls = [u for u in raw_urls if url_host_clean in u]
# Remove URL parameters, e.g. "?param=1234&h=yes"
raw_urls = [ furl(u).remove(furl(u).args).url for u in raw_urls ]
return raw_urls
def fetch_articles(self, db_writer, obj_search):

View File

@@ -15,6 +15,7 @@ gnews
GoogleNews
duckduckgo_search
git+https://github.com/tasos-py/Search-Engines-Scraper.git
furl
langdetect
ollama
PyJWT