Use furl to strip URL query parameters from search results
This commit is contained in:
@@ -6,6 +6,7 @@ from .fetch_utils_gnews import decode_gnews_urls
|
|||||||
from .logger import get_logger
|
from .logger import get_logger
|
||||||
logger = get_logger()
|
logger = get_logger()
|
||||||
|
|
||||||
|
from furl import furl
|
||||||
from gnews import GNews
|
from gnews import GNews
|
||||||
from duckduckgo_search import DDGS
|
from duckduckgo_search import DDGS
|
||||||
from GoogleNews import GoogleNews
|
from GoogleNews import GoogleNews
|
||||||
@@ -40,6 +41,9 @@ class FetcherAbstract(ABC):
|
|||||||
# Ensure URL host in URL
|
# Ensure URL host in URL
|
||||||
raw_urls = [u for u in raw_urls if url_host_clean in u]
|
raw_urls = [u for u in raw_urls if url_host_clean in u]
|
||||||
|
|
||||||
|
# Remove URL parameters, e.g. "?param=1234&h=yes"
|
||||||
|
raw_urls = [ furl(u).remove(furl(u).args).url for u in raw_urls ]
|
||||||
|
|
||||||
return raw_urls
|
return raw_urls
|
||||||
|
|
||||||
def fetch_articles(self, db_writer, obj_search):
|
def fetch_articles(self, db_writer, obj_search):
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ gnews
|
|||||||
GoogleNews
|
GoogleNews
|
||||||
duckduckgo_search
|
duckduckgo_search
|
||||||
git+https://github.com/tasos-py/Search-Engines-Scraper.git
|
git+https://github.com/tasos-py/Search-Engines-Scraper.git
|
||||||
|
furl
|
||||||
langdetect
|
langdetect
|
||||||
ollama
|
ollama
|
||||||
PyJWT
|
PyJWT
|
||||||
Reference in New Issue
Block a user