General search fix, status pattern match regex, find feeds on startup

Luciano Gervasoni
2025-04-09 15:52:35 +02:00
parent 296a8fe8a8
commit f369b23d81
22 changed files with 538 additions and 356 deletions


@@ -81,6 +81,7 @@ class DB_Handler():
except Exception as e:
logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error):
def set_status(obj_url, status):
@@ -89,17 +90,17 @@ class DB_Handler():
obj_url.status = status
obj_url.save()
##### Filter URL? -> Invalid
if (status_pattern_match == "invalid"):
logger.debug("Domain filter applied to input URL: {}".format(obj_url.url))
# Found a pattern match -> Override status
if (status_pattern_match is not None):
logger.debug("Pattern match, status '{}' for input URL: {}".format(status_pattern_match, obj_url.url))
# Update status
set_status(obj_url, Urls.STATUS_ENUM.INVALID)
# Next URL
return
set_status(obj_url, status_pattern_match)
##### Filter URL? -> Invalid (don't extract content)
if (status_pattern_match == "invalid"):
return
##### Process URL
try:
# Get data
# Extract URL content
dict_url_data = process_url(obj_url.url)
except Exception as e:
if (raise_exception_on_error):
@@ -110,25 +111,10 @@ class DB_Handler():
# Set status to error
dict_url_data = None
# (dict_url_data is None) or (Exception while processing URL) ? -> Error status
if (dict_url_data is None):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.ERROR)
# Next URL
return
# Invalid? e.g. binary data
if (dict_url_data.get("override_status") == "invalid"):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.INVALID)
# Next URL
return
##### Canonical URL different? -> Duplicate
if (dict_url_data.get("url_canonical") is not None) and(dict_url_data.get("url") != dict_url_data.get("url_canonical")):
if (dict_url_data is not None) and (dict_url_data.get("url_canonical") is not None) and (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.DUPLICATE)
# Get or create URL with canonical form
obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
# Get the source-search IDs associated to obj_url.id
@@ -136,42 +122,54 @@ class DB_Handler():
for obj_url_source_search in list_url_source_search:
# Associate same sources to url_canonical (it might already exist)
UrlsSourceSearch.objects.get_or_create(id_url=obj_url_canonical, id_source=obj_url_source_search.id_source, id_search=obj_url_source_search.id_search)
# URLs duplicate association
UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url)
# TODO: return obj_url_canonical so as to directly process the recently inserted URL
# Wherever this function is called, add:
# self._process_single_url(obj_url_canonical, status_pattern_match, raise_exception_on_error)
# Next URL
return
##### Valid URL
# Update status
set_status(obj_url, Urls.STATUS_ENUM.VALID)
# Not overriding status given pattern matching?
if (status_pattern_match is None):
# (dict_url_data is None) or (Exception while processing URL) ? -> Error status
if (dict_url_data is None):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.ERROR)
# Next URL
return
# Invalid? e.g. binary data
if (dict_url_data.get("override_status") == "invalid"):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.INVALID)
# Next URL
return
##### Valid URL
# Update status
set_status(obj_url, Urls.STATUS_ENUM.VALID)
try:
# Create or update extracted URL data
UrlContent.objects.update_or_create(
id_url=obj_url,
defaults = {
"date_published" : dict_url_data.get("publish_date"),
"title" : dict_url_data.get("title"),
"description" : dict_url_data.get("description"),
"content" : dict_url_data.get("content"),
"valid_content" : dict_url_data.get("valid_content"),
"language" : dict_url_data.get("language"),
"keywords" : dict_url_data.get("keywords"),
"tags" : dict_url_data.get("tags"),
"authors" : dict_url_data.get("authors"),
"image_main_url" : dict_url_data.get("image_main_url"),
"images_url" : dict_url_data.get("images_url"),
"videos_url" : dict_url_data.get("videos_url"),
"url_host" : dict_url_data.get("url_host"),
"site_name" : dict_url_data.get("site_name"),
}
)
if (dict_url_data is not None):
# Create or update extracted URL data
UrlContent.objects.update_or_create(
id_url=obj_url,
defaults = {
"date_published" : dict_url_data.get("publish_date"),
"title" : dict_url_data.get("title"),
"description" : dict_url_data.get("description"),
"content" : dict_url_data.get("content"),
"valid_content" : dict_url_data.get("valid_content"),
"language" : dict_url_data.get("language"),
"keywords" : dict_url_data.get("keywords"),
"tags" : dict_url_data.get("tags"),
"authors" : dict_url_data.get("authors"),
"image_main_url" : dict_url_data.get("image_main_url"),
"images_url" : dict_url_data.get("images_url"),
"videos_url" : dict_url_data.get("videos_url"),
"url_host" : dict_url_data.get("url_host"),
"site_name" : dict_url_data.get("site_name"),
}
)
except Exception as e:
logger.debug("Error in update_or_create UrlContent: {}\ndict_url_data: {}\n{}\n{}".format(obj_url.url, dict_url_data, str(e), traceback.format_exc()))
@@ -179,13 +177,12 @@ class DB_Handler():
def process_raw_urls(self, batch_size):
def _get_status_pattern_matching(url, list_pattern_status_tuple):
""" Be careful: Regex pattern should update status on "valid", "invalid", and "unknown" status only
"""
""" Be careful: Regex pattern should update status on "valid", "invalid", and "unknown" status only """
# Sort pattern tuples by priority. (pattern, priority, status)
for regex_pattern, regex_priority, status_if_match in sorted(list_pattern_status_tuple, key=lambda tup: tup[1], reverse=True):
# Regular expression pattern matching: https://regexr.com/
if bool(re.match(regex_pattern, obj_url.url)):
logger.debug("Regex pattern found, status '{}' for URL: {}".format(status_if_match, url))
# logger.debug("Regex pattern found, status '{}' for URL: {}".format(status_if_match, url))
return status_if_match
return None
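Read in isolation, the priority-ordered regex matching behaves as in the following minimal sketch. The pattern table and its two entries are assumptions for illustration only, and the sketch matches against the url parameter directly (the hunk closes over obj_url.url instead):

import re

# Hypothetical pattern table: (regex_pattern, priority, status_if_match).
# Higher priority wins; the first matching pattern decides the status override.
LIST_PATTERN_STATUS_TUPLE = [
    (r"^https?://[^/]+/tag/", 10, "invalid"),   # assumed example pattern
    (r"^https?://[^/]+/news/", 5, "valid"),     # assumed example pattern
]

def get_status_pattern_matching(url, list_pattern_status_tuple):
    # Sort tuples by priority, highest first, so stronger rules are tested earlier
    for regex_pattern, regex_priority, status_if_match in sorted(
            list_pattern_status_tuple, key=lambda tup: tup[1], reverse=True):
        if re.match(regex_pattern, url):
            return status_if_match
    # No pattern matched -> caller keeps the normal processing path
    return None

Per the hunk above, a returned "invalid" sets the status and skips content extraction, while "valid" or "unknown" set the status but still let the URL content be extracted.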


@@ -49,11 +49,11 @@ class FetchSearcher():
"language": "en",
"country": "US",
# "period": ["7d", "1d"], # TODO: List of periods to iterate
}
}
for SearchInstance in ListSearchInstances:
# Sleep between requests, avoid too many requests...
time.sleep(int(os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", 5)))
time.sleep(float(os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", 5)))
SearchInstance(args).fetch_articles(db_writer, obj_search)
# TODO: https://github.com/tasos-py/Search-Engines-Scraper/tree/master
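Switching int() to float() here (and in the other sleeps changed below) lets the env variables carry sub-second values. A minimal sketch of the parsing, using the FETCHER_BETWEEN_SEARCHES_SLEEP variable from this hunk:

import os
import time

# float() accepts both "5" and "0.5"; int("0.5") would raise ValueError
sleep_seconds = float(os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", 5))
time.sleep(sleep_seconds)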


@@ -55,8 +55,8 @@ class FetcherAbstract(ABC):
keyword_search = "{}{}".format("site:", keyword_search)
# Keyword search & using a General search? -> ${SEARCH} news after:${LAST_WEEK}
if ("general" in source_name) and (obj_search.type == Search.TYPE_ENUM.KEYWORD_SEARCH):
start_date = timezone.now() - timedelta(days=7)
keyword_search = "{}{}".format(keyword_search, "news after:{}-{}-{}".format(start_date.month, start_date.day, start_date.year))
logger.debug("Appending news to general search")
keyword_search = "{}{}".format(keyword_search, "news")
logger.debug("Starting search: {} - {}".format(keyword_search, source_name))
# Fetch
@@ -194,7 +194,7 @@ class SearchGoogleGeneral(FetcherAbstract):
# Iterate pages
for i in range(self.pages):
# Sleep between pages fetch
time.sleep(int(os.getenv("FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP", 4)))
time.sleep(float(os.getenv("FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP", 4)))
# Number of URLs fetched so far
num_before = len(set_links)
# Get page
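The page loop throttles each fetch and counts links before it; a sketch of the overall shape, assuming a hypothetical fetch_page callable, with the early exit on no new links an assumption (the hunk is truncated before the loop's exit condition):

import os
import time

def fetch_links(pages, fetch_page):
    # fetch_page is a hypothetical callable: page index -> iterable of result links
    set_links = set()
    for i in range(pages):
        # Sleep between page fetches to avoid too-many-requests errors
        time.sleep(float(os.getenv("FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP", 4)))
        # Number of URLs fetched so far
        num_before = len(set_links)
        set_links.update(fetch_page(i))
        # No new links on this page? Assume the results are exhausted
        if len(set_links) == num_before:
            break
    return set_links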


@@ -6,7 +6,7 @@ logger = get_logger()
from googlenewsdecoder import gnewsdecoder
def decode_gnews_urls(encoded_urls, interval=int(os.getenv("FETCHER_GNEWS_DECODE_SLEEP", 2))):
def decode_gnews_urls(encoded_urls, interval=float(os.getenv("FETCHER_GNEWS_DECODE_SLEEP", 2))):
logger.debug("Decoding gnews URLs")
# DecodeURLs
list_decoded_urls = []
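The decode loop presumably continues along these lines; a sketch that assumes gnewsdecoder accepts an interval keyword for throttling and returns a dict with "status" and "decoded_url" keys, as in the library's documented usage (the hunk is truncated after the list initialisation):

import os
from googlenewsdecoder import gnewsdecoder

def decode_gnews_urls(encoded_urls, interval=float(os.getenv("FETCHER_GNEWS_DECODE_SLEEP", 2))):
    list_decoded_urls = []
    for encoded_url in encoded_urls:
        # interval sleeps between consecutive decode calls (assumed keyword)
        result = gnewsdecoder(encoded_url, interval=interval)
        if result.get("status"):
            list_decoded_urls.append(result.get("decoded_url"))
    return list_decoded_urls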


@@ -41,7 +41,7 @@ def url_host_slowdown(url, url_host_slowdown_seconds):
def process_url(url):
try:
# Slow down if required to avoid too many requests error
url_host_slowdown(url, url_host_slowdown_seconds=int(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
url_host_slowdown(url, url_host_slowdown_seconds=float(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
# Process
article = newspaper.article(url)
except newspaper.ArticleBinaryDataException:
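url_host_slowdown itself is not shown in this hunk; one plausible implementation, sketched purely as an assumption, keeps a per-host timestamp and sleeps until the configured gap has elapsed:

import time
from urllib.parse import urlparse

_last_request_per_host = {}  # hypothetical module-level cache: host -> last request time

def url_host_slowdown(url, url_host_slowdown_seconds):
    # Throttle consecutive requests to the same host
    host = urlparse(url).netloc
    elapsed = time.time() - _last_request_per_host.get(host, 0.0)
    if elapsed < url_host_slowdown_seconds:
        time.sleep(url_host_slowdown_seconds - elapsed)
    _last_request_per_host[host] = time.time()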