Working fetch search, refactoring DB towards source search

Luciano Gervasoni
2025-03-20 11:42:33 +01:00
parent 83f76232b2
commit 05e17266f1
14 changed files with 558 additions and 120 deletions

View File

@@ -12,7 +12,6 @@ logger = get_logger()
class DB_Handler():
def __init__(self):
logger.debug("Initializing URL DB Handler")
# Inserting raw URL, cache time: 1 day
self._cache_timeout_insert_url = 86400
# Processing error URL, cache time: 2 days
@@ -37,16 +36,15 @@ class DB_Handler():
else:
return cache.get(cache_key) is not None
def insert_raw_urls(self, urls, source):
def clean_protocol(url):
# http:// -> https://
url = url.replace("http://", "https://")
# "" -> https://
if not (url.startswith("https://")):
url = "https://" + url
return url
def _clean_protocol(self, url):
# http:// -> https://
url = url.replace("http://", "https://")
# "" -> https://
if not (url.startswith("https://")):
url = "https://" + url
return url
def insert_raw_urls(self, urls, source):
try:
logger.debug("Inserting raw URLs")
# Empty?
@@ -55,7 +53,7 @@ class DB_Handler():
return
# Default protocol https://
urls_clean = [clean_protocol(url) for url in urls]
urls_clean = [self._clean_protocol(url) for url in urls]
# Get the source (create if not exists)
source_obj, created = Source.objects.get_or_create(source=source)
@@ -90,7 +88,7 @@ class DB_Handler():
UrlsSource.objects.bulk_create([UrlsSource(id_source=source_obj, id_url=url_obj) for url_obj in bulk_created_urls], ignore_conflicts=True)
except IntegrityError as e:
### Fallback to one-by-one insert
logger.debug("bulk_create exception while inserting raw URLs, falling back to non-bulk method")
logger.debug("bulk_create exception while inserting raw URLs (fails if duplicated URL), falling back to non-bulk method")
# One by one
for url in urls_to_insert:
# URL
@@ -177,9 +175,16 @@ class DB_Handler():
set_status(obj_url, Urls.STATUS_ENUM.ERROR)
# Next URL
return
# Invalid? e.g. binary data
if (dict_url_data.get("override_status") == "invalid"):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.INVALID)
# Next URL
return
##### Canonical URL different? -> Duplicate
if (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
if (dict_url_data.get("url_canonical") is not None) and(dict_url_data.get("url") != dict_url_data.get("url_canonical")):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.DUPLICATE)
@@ -194,6 +199,10 @@ class DB_Handler():
# URLs duplicate association
obj_urls_duplicate, created = UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url)
# TODO: return obj_url_canonical so as to directly process the recently inserted URL
# Wherever this function is called, add:
# self._process_single_url(obj_url_canonical, status_pattern_match, raise_exception_on_error)
# Next URL
return
@@ -273,6 +282,7 @@ class DB_Handler():
for obj_url in error_urls:
# URL ID cached? -> Tried to process recently already, skip
if (self._is_cached_key("error_{}".format(obj_url.id), hash_encoded=False)):
logger.debug("Already cached URL ID: {}".format(obj_url.id))
num_urls_skipped += 1
continue
@@ -299,7 +309,7 @@ class DB_Handler():
missingkids_urls = Urls.objects.order_by("-ts_fetch").filter(
(Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster"))
&
(Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID))
(Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.ERROR))
)[:batch_size]
# Per URL
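Aside: the extracted _clean_protocol helper normalizes every incoming URL to HTTPS before insertion. A minimal standalone sketch of the same behavior, with example URLs invented for illustration:

def clean_protocol(url: str) -> str:
    # http:// -> https://
    url = url.replace("http://", "https://")
    # bare host/path -> https://
    if not url.startswith("https://"):
        url = "https://" + url
    return url

assert clean_protocol("http://example.org/a") == "https://example.org/a"
assert clean_protocol("example.org/a") == "https://example.org/a"
assert clean_protocol("https://example.org/a") == "https://example.org/a"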

View File

@@ -1,5 +1,5 @@
from .db_utils import DB_Handler
from ..models import Feed
from ..models import Search
import feedparser
import dateutil
import traceback
@@ -15,7 +15,7 @@ class FetchFeeds():
logger.debug("Starting FetchFeeds.run()")
# Get feeds
list_url_feeds = list(Feed.objects.values_list('rss_feed', flat=True))
list_url_feeds = list(Search.objects.filter(type=Search.TYPE_ENUM.RSS_FEED).values_list('search', flat=True))
logger.debug("Fetching from feeds: {}".format(list_url_feeds))
# Process via RSS feeds
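Note: both this change and the fetch_parser change below read from a single Search model instead of per-type tables. The queries assume a model with a free-text search field and a type enum covering at least RSS_FEED, URL_HOST and KEYWORD_SEARCH; a hypothetical sketch consistent with the queries in this commit (field types assumed):

from django.db import models

class Search(models.Model):
    # Hypothetical shape inferred from the queries in this commit
    class TYPE_ENUM(models.TextChoices):
        RSS_FEED = "rss_feed"
        URL_HOST = "url_host"
        KEYWORD_SEARCH = "keyword_search"

    search = models.TextField()
    type = models.CharField(max_length=32, choices=TYPE_ENUM.choices)

With that shape, Search.objects.filter(type=Search.TYPE_ENUM.RSS_FEED).values_list('search', flat=True) returns a flat list of feed URLs for FetchFeeds to iterate.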

View File

@@ -1,5 +1,5 @@
from .db_utils import DB_Handler
from ..models import WebsiteOfInterest
from ..models import Search
import newspaper
import traceback
from .logger import get_logger
@@ -14,7 +14,7 @@ class FetchParser():
logger.debug("Starting FetchParser.run() for {}")
# Get URL hosts
list_url_host = list(WebsiteOfInterest.objects.values_list('url_host', flat=True))
list_url_host = list(Search.objects.filter(type=Search.TYPE_ENUM.URL_HOST).values_list('search', flat=True))
logger.debug("Fetching news by parsing URL hosts: {}".format(list_url_host))
# Process newspaper4k build method
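As a rough illustration of the newspaper4k build step mentioned above (not part of this diff; the function name and source label are invented, and newspaper.build plus the .articles attribute are assumed from the newspaper4k API), a URL host can be expanded into candidate article URLs and handed to DB_Handler:

import newspaper
from .db_utils import DB_Handler

def fetch_host_articles(url_host: str) -> None:
    # Build a newspaper source for the host and collect its article URLs
    built = newspaper.build("https://" + url_host, memoize_articles=False)
    raw_urls = [article.url for article in built.articles]
    # Hand the raw URLs to the shared handler with a descriptive source label
    DB_Handler().insert_raw_urls(raw_urls, "parser {}".format(url_host))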

View File

@@ -0,0 +1,75 @@
from .db_utils import DB_Handler
from ..models import Search
import traceback
import time
from .fetch_search_utils import search_gnews, search_ddg, search_googlenews_general, search_googlenews_news
from .logger import get_logger
logger = get_logger()
class FetchSearcher():
def __init__(self) -> None:
logger.debug("Initializing Fetcher Searcher")
def run(self):
try:
logger.debug("Starting FetchSearcher.run()")
# Get keyword searches of interest
list_keyword_search = list(Search.objects.filter(type=Search.TYPE_ENUM.KEYWORD_SEARCH).values_list('search', flat=True))
# Get URL host of interest
list_url_host = list(Search.objects.filter(type=Search.TYPE_ENUM.URL_HOST).values_list('search', flat=True))
# TODO: allintitle: "child abuse"
# TODO: intitle: "child abuse"
# list_keyword_search + ['allintitle: "{}"'.format(s) for s in list_keyword_search] + ['intitle: "{}"'.format(s) for s in list_keyword_search]
# Merge searches
list_search = list_keyword_search + ["site:{}".format(u) for u in list_url_host]
logger.debug("Fetching from keyword search: {}".format(list_search))
# Search
for keyword_search in list_search:
# TODO: language & country customization
# DDG News
time.sleep(5)
raw_urls, source = search_ddg(keyword_search, category="news", timelimit="d", max_results=None, region = "wt-wt")
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, source)
# GNews
time.sleep(5)
raw_urls, source = search_gnews(keyword_search, language="en", country="US")
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, source)
# DDG Text
time.sleep(5)
raw_urls, source = search_ddg(keyword_search, category="text", timelimit="d", max_results=None, region = "wt-wt")
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, source)
# GoogleNews news
time.sleep(5)
raw_urls, source = search_googlenews_news(keyword_search, period="1d", language="en", country="US")
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, source)
# GoogleNews general
time.sleep(5)
raw_urls, source = search_googlenews_general(keyword_search, period="1d", language="en", country="US", max_pages=5)
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, source)
# TODO:
# SearxNG
"""
period = "day"
for searx_instance in get_searxng_instances():
dict_params_news = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "news", "period": period}
dict_params_general = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "general", "period": period}
# Append thread
FetcherSearxNews(**dict_params_news).fetch_articles(self.db_handler)
FetcherSearxNews(**dict_params_general).fetch_articles(self.db_handler)
"""
# TODO: https://github.com/tasos-py/Search-Engines-Scraper/tree/master
except Exception as e:
logger.warning("Exception in FetchSearcher.run(): {}\n{}".format(e, traceback.format_exc()))

View File

@@ -0,0 +1,129 @@
import traceback
import random
import time
from .logger import get_logger
logger = get_logger()
from googlenewsdecoder import gnewsdecoder
from gnews import GNews
from duckduckgo_search import DDGS
from GoogleNews import GoogleNews
###########################################################################
def decode_gnews_urls(encoded_urls):
# DecodeURLs
list_decoded_urls = []
for url in encoded_urls:
try:
# Decode URL, with interval time to avoid block
decoded_url = gnewsdecoder(url, interval=5)
# Ok?
if decoded_url.get("status"):
list_decoded_urls.append(decoded_url["decoded_url"])
else:
logger.warning("Error decoding news.google.com, URL {}\nMessage: {}".format(url, decoded_url["message"]))
except Exception as e:
logger.warning("Error decoding news.google.com, URL: {}\n{}".format(url, traceback.format_exc()))
return list_decoded_urls
###########################################################################
def search_gnews(keyword_search, period="1d", language="en", country="US", max_results=100):
# [source] [category] [period] [language-country] [max_results]
source = "gnews {} {} {}-{} max_results={}".format("news", period, language, country, max_results).replace("None", "").strip()
logger.debug("Searching: {} --- Source:{}".format(keyword_search, source))
# Get news
results_gnews = GNews(language=language, country=country).get_news(keyword_search)
# Get list of encoded urls
encoded_urls = [e.get("url") for e in results_gnews]
# Decode
list_decoded_urls = decode_gnews_urls(encoded_urls)
return list_decoded_urls, source
###########################################################################
def search_ddg(keyword_search, category="news", timelimit="d", max_results=None, region="wt-wt"):
# [source] [category] [period] [language-country] [max_results]
source = "ddg {} {} {} max_results={}".format(category, timelimit, region, max_results).replace("None", "").strip()
logger.debug("Searching: {} --- Source:{}".format(keyword_search, source))
# region="{}-{}".format(langauge, country.lower())
# timelimit= # Options: d, w, m
# max_results # max number of results. If None, returns results only from the first response. Defaults to None
if (category == "news"):
news = DDGS().news(keyword_search, region=region, timelimit=timelimit, max_results=max_results)
urls = [e.get("url") for e in news]
if (category == "text"):
news = DDGS().text(keyword_search, region=region, timelimit=timelimit, max_results=max_results)
urls = [e.get("href") for e in news]
return urls, source
###########################################################################
def search_googlenews_news(keyword_search, period="1d", language="en", country="US"):
category = "news"
# [source] [category] [period] [language-country]
source = "googlenews {} {} {}-{}".format(category, period, language, country).replace("None", "").strip()
logger.debug("Searching: {} --- Source:{}".format(keyword_search, source))
# Initialize
googlenews = GoogleNews(period=period, lang=language, region=country)
googlenews.enableException(True)
try:
# Search
googlenews.get_news(keyword_search)
# Fetch
encoded_urls = googlenews.get_links()
# Decode
urls = decode_gnews_urls(encoded_urls)
except Exception as e:
logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc()))
urls = []
return urls, source
def search_googlenews_general(keyword_search, period="1d", language="en", country="US", max_pages=5):
category="general"
# [source] [category] [period] [language-country] [max_results]
source = "googlenews {} {} {}-{} max_pages={}".format(category, period, language, country, max_pages).replace("None", "").strip()
logger.debug("Searching: {} --- Source:{}".format(keyword_search, source))
# Initialize
googlenews = GoogleNews(period=period, lang=language, region=country)
googlenews.enableException(True)
try:
set_links = set()
# Search
googlenews.search(keyword_search)
# Iterate pages
for i in range(max_pages):
time.sleep(random.uniform(1, 2.5))
num_before = len(set_links)
# Get page
try:
links = googlenews.page_at(i+1)
except Exception as e:
logger.warning("Exception fetching page in GoogleNews {}: {}".format(source, str(e)))
break
# Links
for l in links:
# 'link': 'https://uk.news.yahoo.com/leaving-neverland-2-michael-jackson-lawyer-channel-4-102017088.html&ved=2ahUKEwjl38eJm5aMAxVvqJUCHXgnGzwQxfQBegQICRAC&usg=AOvVaw1osa6b3o_xXfcNinMDpLoK'
set_links.add( l.get("link").split("&ved=")[0] )
# Finished?
if (num_before == len(set_links)):
break
# To list
urls = list(set_links)
except Exception as e:
logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc()))
urls = []
return urls, source
###########################################################################
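All four helpers share the same contract: each returns a (urls, source) pair, where source is a human-readable label recording the engine and its parameters. A minimal usage sketch mirroring what FetchSearcher.run() does per engine (wrapper name invented):

from .fetch_search_utils import search_ddg, search_gnews
from .db_utils import DB_Handler

def run_single_search(keyword_search: str) -> None:
    # Each helper returns (list_of_urls, source_label); the label lets
    # DB_Handler attribute every inserted URL to the engine that found it.
    for urls, source in (search_ddg(keyword_search, category="news", timelimit="d"),
                         search_gnews(keyword_search, language="en", country="US")):
        DB_Handler().insert_raw_urls(urls, source)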

View File

@@ -12,6 +12,9 @@ def process_url(url):
try:
# Process
article = newspaper.article(url)
except newspaper.ArticleBinaryDataException as e:
logger.warning("ArticleBinaryDataException (binary data) for input URL {}\n{}".format(url, str(e)))
return {"override_status": "invalid"}
except newspaper.ArticleException as e:
logger.warning("ArticleException for input URL {}\n{}".format(url, str(e)))
return None
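Together with the db_utils change above, the return value of process_url acts as a small status channel: None signals a processing error, {"override_status": "invalid"} marks binary/non-article content, and a url_canonical that differs from url marks a duplicate. A hedged sketch of a consumer branching on that contract (the handler name is an assumption; the branching mirrors DB_Handler above):

def handle_processed(obj_url, dict_url_data, set_status, STATUS):
    # Hypothetical consumer of process_url()'s return value
    if dict_url_data is None:
        # Hard processing error
        set_status(obj_url, STATUS.ERROR)
        return
    if dict_url_data.get("override_status") == "invalid":
        # e.g. binary data instead of an article
        set_status(obj_url, STATUS.INVALID)
        return
    canonical = dict_url_data.get("url_canonical")
    if canonical is not None and dict_url_data.get("url") != canonical:
        # Canonical URL differs -> treat as duplicate
        set_status(obj_url, STATUS.DUPLICATE)
        return
    # Otherwise continue with normal processing of a valid URL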