Better lang detect, fetch parser handling, rss google news search
@@ -28,11 +28,18 @@ class FetchParser():

             # Make sure no requests made for the last X seconds
             url_host_slowdown(url_host_protocol, url_host_slowdown_seconds=5)
-            # Source object
-            url_host_built = newspaper.build(url_host_protocol)
-            # Get articles URL list
-            urls_fetched = url_host_built.article_urls()
+            try:
+                # Source object
+                url_host_built = newspaper.build(url_host_protocol)
+                # Get articles URL list
+                urls_fetched = url_host_built.article_urls()
+            except newspaper.exceptions.ArticleException as e:
+                logger.debug("ArticleException while parsing input URL {}\n{}".format(url_host_protocol, str(e.args)))
+                urls_fetched = []
+            except Exception as e:
+                logger.warning("Exception while parsing input URL {}\n{}".format(url_host_protocol, str(e)))
+                urls_fetched = []

             # Write to DB
             DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
         except Exception as e:
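The url_host_slowdown() helper called above is not part of this diff; the comment only states its intent (no request to the same host within the last X seconds). A minimal sketch of such a throttle, assuming Django's cache (which this project already uses elsewhere); the key name and timeout are illustrative, not the project's actual implementation:

    import time
    from django.core.cache import cache

    def url_host_slowdown(url_host_protocol, url_host_slowdown_seconds=5):
        # Sleep until at least url_host_slowdown_seconds have passed since the last request to this host
        cache_key = "url_host_slowdown_{}".format(url_host_protocol)  # hypothetical key name
        last_request = cache.get(cache_key)
        if last_request is not None:
            elapsed = time.time() - last_request
            if elapsed < url_host_slowdown_seconds:
                time.sleep(url_host_slowdown_seconds - elapsed)
        cache.set(cache_key, time.time(), timeout=url_host_slowdown_seconds * 2)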
@@ -3,10 +3,36 @@ from ..models import Search, Source
 from django.db.models import Q
 import traceback
 import time
-from .fetch_search_utils import search_gnews, search_ddg, search_googlenews_general, search_googlenews_news
+from .fetch_search_utils import search_gnews, search_ddg, search_googlenews_general, search_googlenews_news, search_googlenews_rss
 from .logger import get_logger
 logger = get_logger()

+'''
+from abc import ABC, abstractmethod
+
+# Generic fetcher (fetches articles, writes to DB)
+class FetcherAbstract(ABC):
+    @abstractmethod
+    def _fetch_raw_urls_list(self):
+        pass
+
+    def fetch_articles(self, db_writer):
+        logger.debug("Starting fetch() for {}".format(self.name))
+        # Fetch articles
+        list_news = self._fetch()
+        logger.info("Found #{} articles for search: {}".format(len(list_news), self.name))
+        # Write to DB
+        db_writer.write_batch(list_news, self.name)
+
+
+        self._fetch_raw_urls_list()
+        raw_urls, source = search_googlenews_rss(keyword_search, language="en", country="US")
+        raw_urls = self._post_process_urls(raw_urls, obj_search)
+        # Write to DB
+        DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
+'''


 class FetchSearcher():
     def __init__(self) -> None:
         logger.debug("Initializing Fetcher Searcher")
@@ -18,6 +44,16 @@ class FetchSearcher():
         obj_source, created = Source.objects.get_or_create(source=source)
         return obj_source
+
+    def _post_process_urls(self, raw_urls, obj_search):
+        # Searching URL Host based? Make sure results belong to that site
+        if (obj_search.type == Search.TYPE_ENUM.URL_HOST):
+            # Get clean URL host
+            url_host_clean = obj_search.search.replace("www.", "").replace("http://", "").replace("https://", "")
+            # Ensure URL host in URL
+            raw_urls = [u for u in raw_urls if url_host_clean in u]
+
+        return raw_urls

     def run(self):
         try:
             logger.debug("Starting FetchSearcher.run()")
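As a worked example of the _post_process_urls() filter added above (values are hypothetical): for a URL_HOST search stored as "https://www.example.com", url_host_clean becomes "example.com", so only result URLs containing that string survive.

    raw_urls = ["https://example.com/story-1", "https://other-site.net/story-2"]  # hypothetical results
    url_host_clean = "https://www.example.com".replace("www.", "").replace("http://", "").replace("https://", "")
    filtered = [u for u in raw_urls if url_host_clean in u]
    # url_host_clean == "example.com"; filtered == ["https://example.com/story-1"]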
@@ -33,49 +69,55 @@ class FetchSearcher():
             # TODO: intitle: "child abuse"

             # Search
-            keyword_search = "{}{}".format("site:" if obj_search.type is Search.TYPE_ENUM.URL_HOST else "", obj_search.search)
+            keyword_search = "{}{}".format("site:" if obj_search.type == Search.TYPE_ENUM.URL_HOST else "", obj_search.search)

             logger.debug("Starting keyword search: {}".format(keyword_search))
             logger.debug("Search type: {}".format(obj_search.type))

+            # news.google.com/rss
+            time.sleep(5)
+            raw_urls, source = search_googlenews_rss(keyword_search, language="en", country="US")
+            raw_urls = self._post_process_urls(raw_urls, obj_search)
+            # Write to DB
+            DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)


             # DDG News
             time.sleep(5)
-            raw_urls, source = search_ddg(keyword_search, category="news", timelimit="d", max_results=None, region = "wt-wt")
+            raw_urls, source = search_ddg(keyword_search, category="news", timelimit="d", max_results=None, region = "en-US")
             raw_urls = self._post_process_urls(raw_urls, obj_search)
             # Write to DB
             DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)

             # GNews
             time.sleep(5)
             raw_urls, source = search_gnews(keyword_search, language="en", country="US")
             raw_urls = self._post_process_urls(raw_urls, obj_search)
             # Write to DB
             DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)

-            # DDG Text
+            # DDG Text (week, 20 results)
             time.sleep(5)
-            raw_urls, source = search_ddg(keyword_search, category="text", timelimit="d", max_results=None, region = "wt-wt")
+            raw_urls, source = search_ddg(keyword_search, category="text", timelimit="d", max_results=20, region = "en-US")
             raw_urls = self._post_process_urls(raw_urls, obj_search)
             # Write to DB
             DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)

             # GoogleNews news
             time.sleep(5)
             raw_urls, source = search_googlenews_news(keyword_search, period="1d", language="en", country="US")
             raw_urls = self._post_process_urls(raw_urls, obj_search)
             # Write to DB
             DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)

             # GoogleNews general
             time.sleep(5)
-            raw_urls, source = search_googlenews_general(keyword_search, period="1d", language="en", country="US", max_pages=5)
+            raw_urls, source = search_googlenews_general(keyword_search, period="1d", language="en", country="US", max_pages=2)
             raw_urls = self._post_process_urls(raw_urls, obj_search)
             # Write to DB
             DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)


             # TODO:
             # SearxNG
             """
             period = "day"
             for searx_instance in get_searxng_instances():
                 dict_params_news = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "news", "period": period}
                 dict_params_general = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "general", "period": period}
                 # Append thread
                 FetcherSearxNews(**dict_params_news).fetch_articles(self.db_handler)
                 FetcherSearxNews(**dict_params_general).fetch_articles(self.db_handler)"
             """
             # TODO: https://github.com/tasos-py/Search-Engines-Scraper/tree/master
         except Exception as e:
             logger.warning("Exception in FetchSearcher.run(): {}\n{}".format(e, traceback.format_exc()))
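Each engine block above repeats the same sequence: sleep 5 s, search, _post_process_urls(), insert_raw_urls(). Purely as an illustration of that shared flow (this helper is hypothetical and not part of the commit), the repetition could be expressed as:

    import time

    def run_search_backends(engine_calls, post_process, insert, pause_seconds=5):
        # engine_calls: zero-argument callables returning (raw_urls, source)
        for call in engine_calls:
            time.sleep(pause_seconds)
            raw_urls, source = call()
            insert(post_process(raw_urls), source)

A call site inside FetchSearcher.run() could then pass functools.partial(search_googlenews_rss, keyword_search, language="en", country="US") and the other engine calls as engine_calls, with post_process and insert wrapping self._post_process_urls() and DB_Handler().insert_raw_urls() respectively.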
@@ -2,6 +2,9 @@ from django.core.cache import cache
 import traceback
 import random
 import time
+import feedparser
+import urllib
+import dateutil
 from .logger import get_logger
 logger = get_logger()

@@ -33,7 +36,7 @@ def decode_gnews_urls(encoded_urls, interval=2):
                 # Cache decoded URL
                 cache.set("gnews_decode_{}".format(url), decoded_url, timeout=60*60*12)
             else:
-                logger.warning("Error decoding news.google.com, URL {}\nMessage: {}".format(url, decoded_url["message"]))
+                logger.warning("Error decoding news.google.com, URL {}\nMessage: {}".format(url, str(decoded_url)))
         except Exception as e:
             logger.warning("Error decoding news.google.com, URL: {}\n{}".format(url, traceback.format_exc()))
     return list_decoded_urls
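The warning now logs str(decoded_url) instead of indexing a "message" key, presumably so the log call itself cannot raise a KeyError when the decoder's response lacks that key. For the caching line, a minimal illustration of the round-trip with Django's cache (both URLs are hypothetical):

    from django.core.cache import cache

    encoded = "https://news.google.com/rss/articles/ABC123"  # hypothetical encoded URL
    decoded_url = "https://example.com/some-article"         # hypothetical decoded result

    # Keep the decoded value for 12 hours so later runs can skip the slow decode step
    cache.set("gnews_decode_{}".format(encoded), decoded_url, timeout=60*60*12)
    cache.get("gnews_decode_{}".format(encoded))              # -> "https://example.com/some-article"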
@@ -124,7 +127,7 @@ def search_googlenews_general(keyword_search, period="1d", language="en", countr

     # Iterate pages
     for i in range(max_pages):
-        time.sleep(random.uniform(1, 2.5))
+        time.sleep(random.uniform(2, 4.5))
         num_before = len(set_links)

         # Get page
@@ -148,4 +151,47 @@ def search_googlenews_general(keyword_search, period="1d", language="en", countr

     return urls, source

+###########################################################################
+###########################################################################
+
+def search_googlenews_rss(keyword_search, language="en", country="US"):
+    # [source] [category] [period] [language-country] [max_results]
+    source = "googlenews-rss {}-{}".format(language, country).replace("None", "").strip()
+    logger.debug("Searching: {} --- Source:{}".format(keyword_search, source))
+
+    # https://news.google.com/rss/search?q={}&hl=en-US&gl=US&ceid=US:en
+
+    try:
+        # Search URL with parameters filled
+        search_url = "https://news.google.com/rss/search?q={}&hl={}&gl={}&ceid={}:{}".format(keyword_search, "{}-{}".format(language, country.upper()), country.upper(), country.upper(), language)
+        # Control characters
+        search_url = search_url.replace(" ", "+") # urllib.parse.quote(search_url) # Issue: https%3A//news.google.com/rss/search%3Fq%3Dbreitbart.com%26hl%3Den-US%26gl%3DUS%26ceid%3DUS%3Aen
+        # Initialize
+        encoded_urls = []
+        # Fetch feeds
+        feeds = feedparser.parse(search_url)
+        # Parse
+        for f in feeds.get("entries", []):
+            # Encoded URL
+            encoded_url = f.get("link", None)
+            '''
+            # Available publish date?
+            publish_date_parsed = f.get("published_parsed")
+            if (publish_date_parsed is None):
+                publish_date = f.get("published", None)
+                if (publish_date is not None):
+                    publish_date_parsed = dateutil.parser.parse(publish_date)
+
+            # Published date
+            urls_publish_date.append(publish_date_parsed)'
+            '''
+            # Append
+            encoded_urls.append(encoded_url)
+
+        # Decode
+        urls = decode_gnews_urls(encoded_urls)
+
+    except Exception as e:
+        logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc()))
+        urls = []
+
+    return urls, source
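A usage sketch of the new RSS search, with hypothetical query values; the resulting source string and feed URL follow directly from the format strings above:

    urls, source = search_googlenews_rss("site:example.com", language="en", country="US")
    # source == "googlenews-rss en-US"
    # Feed requested: https://news.google.com/rss/search?q=site:example.com&hl=en-US&gl=US&ceid=US:en
    # urls holds the decoded article URLs returned by decode_gnews_urls()

Note that the commit adds import dateutil even though the publish-date block that would use dateutil.parser.parse() stays commented out; if that block is ever enabled, dateutil.parser must be imported explicitly, since import dateutil alone does not expose the parser submodule.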
@@ -4,9 +4,8 @@ logger = get_logger()
 import newspaper
 import time
 from urllib.parse import unquote
-# pip install langdetect
-#import langdetect
-#langdetect.DetectorFactory.seed = 0
+import langdetect
+langdetect.DetectorFactory.seed = 0

 def get_with_protocol(url):
     # http:// -> https://
@@ -76,7 +75,8 @@ def process_url(url):
         "url_host": article.source_url,
         "site_name": article.meta_site_name,
         "publish_date": article.publish_date,
-        "language": article.meta_lang, # langdetect.detect(article.text)
+        # article.meta_lang -> Not always reliable
+        "language": langdetect.detect("\n".join([article.title, article.meta_description, article.text]) ),
         "title": article.title,
         "description": article.meta_description,
         "content": article.text,
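Seeding DetectorFactory makes langdetect deterministic, but detect() raises LangDetectException on empty or feature-less input, and "\n".join() fails if title, meta_description, or text is None. A possible guard around the detection above (not part of the commit; the fallback to article.meta_lang is an assumption):

    import langdetect
    from langdetect.lang_detect_exception import LangDetectException

    langdetect.DetectorFactory.seed = 0

    def detect_article_language(article):
        # Concatenate whatever text fields are present, then fall back to the page's declared meta language
        text = "\n".join(filter(None, [article.title, article.meta_description, article.text])).strip()
        if not text:
            return article.meta_lang
        try:
            return langdetect.detect(text)
        except LangDetectException:
            return article.meta_lang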