Use furl to remove query parameters from search-result URLs
This commit is contained in:
@@ -6,6 +6,7 @@ from .fetch_utils_gnews import decode_gnews_urls
|
||||
from .logger import get_logger
|
||||
logger = get_logger()
|
||||
|
||||
from furl import furl
|
||||
from gnews import GNews
|
||||
from duckduckgo_search import DDGS
|
||||
from GoogleNews import GoogleNews
|
||||
@@ -40,6 +41,9 @@ class FetcherAbstract(ABC):
|
||||
# Keep only URLs that contain the expected host
|
||||
raw_urls = [u for u in raw_urls if url_host_clean in u]
|
||||
|
||||
# Remove URL query parameters, e.g. "?param=1234&h=yes"
|
||||
raw_urls = [ furl(u).remove(furl(u).args).url for u in raw_urls ]
|
||||
|
||||
return raw_urls
|
||||
|
||||
def fetch_articles(self, db_writer, obj_search):
|
||||
|
||||
@@ -15,6 +15,7 @@ gnews
|
||||
GoogleNews
|
||||
duckduckgo_search
|
||||
git+https://github.com/tasos-py/Search-Engines-Scraper.git
|
||||
furl
|
||||
langdetect
|
||||
ollama
|
||||
PyJWT
|
||||
Reference in New Issue
Block a user