Better language detection, fetch parser error handling, Google News RSS search

This commit is contained in:
Luciano Gervasoni
2025-03-31 17:44:53 +02:00
parent b3f896b35a
commit 077219fcb6
6 changed files with 201 additions and 140 deletions

View File

@@ -28,11 +28,18 @@ class FetchParser():
# Make sure no requests made for the last X seconds
url_host_slowdown(url_host_protocol, url_host_slowdown_seconds=5)
# Source object
url_host_built = newspaper.build(url_host_protocol)
# Get articles URL list
urls_fetched = url_host_built.article_urls()
try:
# Source object
url_host_built = newspaper.build(url_host_protocol)
# Get articles URL list
urls_fetched = url_host_built.article_urls()
except newspaper.exceptions.ArticleException as e:
logger.debug("ArticleException while parsing input URL {}\n{}".format(url_host_protocol, str(e.args)))
urls_fetched = []
except Exception as e:
logger.warning("Exception while parsing input URL {}\n{}".format(url_host_protocol, str(e)))
urls_fetched = []
# Write to DB
DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
except Exception as e:
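A minimal standalone sketch of the new guard around newspaper.build(), mirroring the try/except added above; the fetch_source_urls wrapper name is illustrative, and newspaper.exceptions.ArticleException is assumed to match the newspaper version used in this repo:

import logging
import newspaper

logger = logging.getLogger(__name__)

def fetch_source_urls(url_host_protocol):
    # Build the Source object and collect its article URLs,
    # falling back to an empty list instead of aborting the whole run.
    try:
        url_host_built = newspaper.build(url_host_protocol)
        urls_fetched = url_host_built.article_urls()
    except newspaper.exceptions.ArticleException as e:
        # Known parse failure for this host: log at debug level only
        logger.debug("ArticleException while parsing input URL {}\n{}".format(url_host_protocol, str(e.args)))
        urls_fetched = []
    except Exception as e:
        # Anything unexpected: warn, but keep the fetch pipeline alive
        logger.warning("Exception while parsing input URL {}\n{}".format(url_host_protocol, str(e)))
        urls_fetched = []
    return urls_fetched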

View File

@@ -3,10 +3,36 @@ from ..models import Search, Source
from django.db.models import Q
import traceback
import time
from .fetch_search_utils import search_gnews, search_ddg, search_googlenews_general, search_googlenews_news
from .fetch_search_utils import search_gnews, search_ddg, search_googlenews_general, search_googlenews_news, search_googlenews_rss
from .logger import get_logger
logger = get_logger()
'''
from abc import ABC, abstractmethod
# Generic fetcher (fetches articles, writes to DB)
class FetcherAbstract(ABC):
@abstractmethod
def _fetch_raw_urls_list(self):
pass
def fetch_articles(self, db_writer):
logger.debug("Starting fetch() for {}".format(self.name))
# Fetch articles
list_news = self._fetch()
logger.info("Found #{} articles for search: {}".format(len(list_news), self.name))
# Write to DB
db_writer.write_batch(list_news, self.name)
self._fetch_raw_urls_list()
raw_urls, source = search_googlenews_rss(keyword_search, language="en", country="US")
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
'''
class FetchSearcher():
def __init__(self) -> None:
logger.debug("Initializing Fetcher Searcher")
@@ -18,6 +44,16 @@ class FetchSearcher():
obj_source, created = Source.objects.get_or_create(source=source)
return obj_source
def _post_process_urls(self, raw_urls, obj_search):
# URL-host based search? Make sure results belong to that site
if (obj_search.type == Search.TYPE_ENUM.URL_HOST):
# Get clean URL host
url_host_clean = obj_search.search.replace("www.", "").replace("http://", "").replace("https://", "")
# Ensure URL host in URL
raw_urls = [u for u in raw_urls if url_host_clean in u]
return raw_urls
def run(self):
try:
logger.debug("Starting FetchSearcher.run()")
@@ -33,49 +69,55 @@ class FetchSearcher():
# TODO: intitle: "child abuse"
# Search
keyword_search = "{}{}".format("site:" if obj_search.type is Search.TYPE_ENUM.URL_HOST else "", obj_search.search)
keyword_search = "{}{}".format("site:" if obj_search.type == Search.TYPE_ENUM.URL_HOST else "", obj_search.search)
logger.debug("Starting keyword search: {}".format(keyword_search))
logger.debug("Search type: {}".format(obj_search.type))
# news.google.com/rss
time.sleep(5)
raw_urls, source = search_googlenews_rss(keyword_search, language="en", country="US")
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
# DDG News
time.sleep(5)
raw_urls, source = search_ddg(keyword_search, category="news", timelimit="d", max_results=None, region = "wt-wt")
raw_urls, source = search_ddg(keyword_search, category="news", timelimit="d", max_results=None, region = "en-US")
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
# GNews
time.sleep(5)
raw_urls, source = search_gnews(keyword_search, language="en", country="US")
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
# DDG Text
# DDG Text (day, 20 results)
time.sleep(5)
raw_urls, source = search_ddg(keyword_search, category="text", timelimit="d", max_results=None, region = "wt-wt")
raw_urls, source = search_ddg(keyword_search, category="text", timelimit="d", max_results=20, region = "en-US")
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
# GoogleNews news
time.sleep(5)
raw_urls, source = search_googlenews_news(keyword_search, period="1d", language="en", country="US")
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
# GoogleNews general
time.sleep(5)
raw_urls, source = search_googlenews_general(keyword_search, period="1d", language="en", country="US", max_pages=5)
raw_urls, source = search_googlenews_general(keyword_search, period="1d", language="en", country="US", max_pages=2)
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
# TODO:
# SearxNG
"""
period = "day"
for searx_instance in get_searxng_instances():
dict_params_news = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "news", "period": period}
dict_params_general = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "general", "period": period}
# Append thread
FetcherSearxNews(**dict_params_news).fetch_articles(self.db_handler)
FetcherSearxNews(**dict_params_general).fetch_articles(self.db_handler)
"""
# TODO: https://github.com/tasos-py/Search-Engines-Scraper/tree/master
except Exception as e:
logger.warning("Exception in FetchSearcher.run(): {}\n{}".format(e, traceback.format_exc()))

View File

@@ -2,6 +2,9 @@ from django.core.cache import cache
import traceback
import random
import time
import feedparser
import urllib
import dateutil
from .logger import get_logger
logger = get_logger()
@@ -33,7 +36,7 @@ def decode_gnews_urls(encoded_urls, interval=2):
# Cache decoded URL
cache.set("gnews_decode_{}".format(url), decoded_url, timeout=60*60*12)
else:
logger.warning("Error decoding news.google.com, URL {}\nMessage: {}".format(url, decoded_url["message"]))
logger.warning("Error decoding news.google.com, URL {}\nMessage: {}".format(url, str(decoded_url)))
except Exception as e:
logger.warning("Error decoding news.google.com, URL: {}\n{}".format(url, traceback.format_exc()))
return list_decoded_urls
@@ -124,7 +127,7 @@ def search_googlenews_general(keyword_search, period="1d", language="en", countr
# Iterate pages
for i in range(max_pages):
time.sleep(random.uniform(1, 2.5))
time.sleep(random.uniform(2, 4.5))
num_before = len(set_links)
# Get page
@@ -148,4 +151,47 @@ def search_googlenews_general(keyword_search, period="1d", language="en", countr
return urls, source
###########################################################################
###########################################################################
def search_googlenews_rss(keyword_search, language="en", country="US"):
# [source] [category] [period] [language-country] [max_results]
source = "googlenews-rss {}-{}".format(language, country).replace("None", "").strip()
logger.debug("Searching: {} --- Source:{}".format(keyword_search, source))
# https://news.google.com/rss/search?q={}&hl=en-US&gl=US&ceid=US:en
try:
# Search URL with parameters filled
search_url = "https://news.google.com/rss/search?q={}&hl={}&gl={}&ceid={}:{}".format(keyword_search, "{}-{}".format(language, country.upper()), country.upper(), country.upper(), language)
# Escape spaces in the query (quoting the whole URL would also encode the scheme and delimiters)
search_url = search_url.replace(" ", "+") # urllib.parse.quote(search_url) # Issue: https%3A//news.google.com/rss/search%3Fq%3Dbreitbart.com%26hl%3Den-US%26gl%3DUS%26ceid%3DUS%3Aen
# Initialize
encoded_urls = []
# Fetch feeds
feeds = feedparser.parse(search_url)
# Parse
for f in feeds.get("entries", []):
# Encoded URL
encoded_url = f.get("link", None)
'''
# Available publish date?
publish_date_parsed = f.get("published_parsed")
if (publish_date_parsed is None):
publish_date = f.get("published", None)
if (publish_date is not None):
publish_date_parsed = dateutil.parser.parse(publish_date)
# Published date
urls_publish_date.append(publish_date_parsed)
'''
# Append
encoded_urls.append(encoded_url)
# Decode
urls = decode_gnews_urls(encoded_urls)
except Exception as e:
logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc()))
urls = []
return urls, source
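For reference, a short sketch of the query URL that search_googlenews_rss() builds, following the news.google.com/rss convention shown in the comment above (the keyword is illustrative):

keyword_search = "site:example.com"   # illustrative query
language, country = "en", "US"
search_url = "https://news.google.com/rss/search?q={}&hl={}&gl={}&ceid={}:{}".format(
    keyword_search, "{}-{}".format(language, country.upper()),
    country.upper(), country.upper(), language)
search_url = search_url.replace(" ", "+")
# -> https://news.google.com/rss/search?q=site:example.com&hl=en-US&gl=US&ceid=US:en
# feedparser.parse(search_url) then yields entries whose "link" fields are
# news.google.com redirect URLs, decoded afterwards by decode_gnews_urls().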

View File

@@ -4,9 +4,8 @@ logger = get_logger()
import newspaper
import time
from urllib.parse import unquote
# pip install langdetect
#import langdetect
#langdetect.DetectorFactory.seed = 0
import langdetect
langdetect.DetectorFactory.seed = 0
def get_with_protocol(url):
# http:// -> https://
@@ -76,7 +75,8 @@ def process_url(url):
"url_host": article.source_url,
"site_name": article.meta_site_name,
"publish_date": article.publish_date,
"language": article.meta_lang, # langdetect.detect(article.text)
# article.meta_lang -> Not always reliable
"language": langdetect.detect("\n".join([article.title, article.meta_description, article.text]) ),
"title": article.title,
"description": article.meta_description,
"content": article.text,