Refactoring fetcher, working feeds and raw url writer

Luciano Gervasoni
2025-03-12 17:56:40 +01:00
parent e124dbc21a
commit 61c31ee9aa
24 changed files with 2085 additions and 194 deletions


@@ -0,0 +1,190 @@
from ..models import Urls, UrlContent, UrlsSource, Source, WebsiteToFilter, StatusPatternMatching
from .url_processor import process_url
from django.utils import timezone
from django.core.cache import cache
import hashlib
from datetime import timedelta
import re
import traceback
from .logger import get_logger
logger = get_logger()


class DB_Handler():

    def __init__(self):
        logger.debug("Initializing URL DB Handler")

    def _get_safe_cache_key(self, raw_key):
        """Generate a safe cache key using an MD5 hash"""
        return hashlib.md5(raw_key.encode()).hexdigest()

    def _cache_key(self, cache_key, cache_timeout=86400):
        cache.set(self._get_safe_cache_key(cache_key), True, timeout=cache_timeout)

    def _is_cached_key(self, cache_key):
        # Returns True if cached
        return cache.get(self._get_safe_cache_key(cache_key)) is not None
    def insert_raw_urls(self, urls, source):
        try:
            logger.debug("Inserting raw URLs")
            # Empty?
            if len(urls) == 0:
                logger.debug("Empty batch of urls (not writing to DB) for source: {}".format(source))
                return
            url_object_to_insert = []
            # Per URL
            for url in urls:
                ### Already processed URL?
                if self._is_cached_key(url):
                    logger.debug("Already cached URL: {}".format(url))
                    if self._is_cached_key("{}{}".format(source, url)):
                        logger.debug("Already cached (source, URL): {} {}".format(source, url))
                    else:
                        ### Insert source
                        # Get the source (create if not exists)
                        source_obj, created = Source.objects.get_or_create(source=source)
                        # Get URL ID
                        url_obj = Urls.objects.get(url=url)
                        # Create (id_source, id_url)
                        UrlsSource.objects.create(id_source=source_obj.id, id_url=url_obj.id)
                else:
                    # Add object to insert
                    url_object_to_insert.append(Urls(url=url))
            ### Bulk insert URLs, ignore conflicts if a url already exists
            Urls.objects.bulk_create(url_object_to_insert, ignore_conflicts=True)
            # Insert or update cache
            for url in urls:
                self._cache_key(url)
                self._cache_key("{}{}".format(source, url))
            logger.info("Inserted #{} raw URLs".format(len(url_object_to_insert)))
        except Exception as e:
            logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
    def _get_status_pattern_matching(self, url, article_status, list_pattern_status_tuple):
        # Sort the (pattern, priority, status) tuples by priority, highest first,
        # so the first matching pattern wins
        list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True)
        # Regex patterns only override the "valid", "invalid" and "unknown" statuses;
        # "raw", "duplicate" and "error" remain the way they are
        if article_status in ["valid", "invalid", "unknown"]:
            # Regular expression pattern matching: https://regexr.com/
            for regex_pattern, regex_priority, status_if_match in list_pattern_status_tuple:
                # Matching? Update article status
                if re.match(regex_pattern, url):
                    if status_if_match != article_status:
                        logger.debug("Regex pattern found, updating status from '{}' to '{}' for URL: {}".format(article_status, status_if_match, url))
                    return status_if_match
        # Pattern matching not required or no pattern matched: keep the original article status
        return article_status
    def process_raw_urls(self, time_delta=timedelta(days=1), batch_size=50):
        try:
            logger.debug("Processing raw URLs")
            # Get list of domains to filter
            list_domains_to_filter = WebsiteToFilter.objects.values_list('url_host', flat=True)
            # Get list of (pattern, priority, status) tuples to override status if required
            list_pattern_status_tuple = list(StatusPatternMatching.objects.values_list("pattern", "priority", "status"))
            # Fetched during the last time_delta (24 hours by default)
            time_delta_ts = timezone.now() - time_delta
            # Get batch of URLs with status='raw' fetched since time_delta_ts
            raw_urls = Urls.objects.filter(status='raw', ts_fetch__gte=time_delta_ts)[:batch_size]
            # List of objects to bulk update
            updating_urls = []
            # Per URL
            for obj_url in raw_urls:
                ##### Any domain to filter included in URL? -> Invalid
                if any(d in obj_url.url for d in list_domains_to_filter):
                    logger.debug("Domain filter applied to input URL: {}".format(obj_url.url))
                    # Update status
                    obj_url.status = 'invalid'
                    # Append to bulk update
                    updating_urls.append(obj_url)
                    # Next URL
                    continue
                ##### Process URL
                try:
                    # Get data
                    dict_url_data = process_url(obj_url.url)
                    # Not None, or handle as exception
                    assert dict_url_data is not None
                except Exception as e:
                    logger.debug("Error processing URL: {}\n{}".format(obj_url.url, str(e)))
                    # Update status
                    obj_url.status = 'error'
                    # Append to bulk update
                    updating_urls.append(obj_url)
                    # Next URL
                    continue
                ##### Canonical URL different? -> Duplicate
                if dict_url_data.get("url") != dict_url_data.get("url_canonical"):
                    # Update status
                    obj_url.status = 'duplicate'
                    # Append to bulk update
                    updating_urls.append(obj_url)
                    # Get or create URL with canonical form (get_or_create returns an (obj, created) tuple)
                    obj_url_canonical, _ = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
                    # Associate same sources to url -> url_canonical
                    # Get the sources associated to obj_url.id
                    url_sources = UrlsSource.objects.filter(id_url=obj_url.id)
                    for url_source_obj in url_sources:
                        # Associate same source to url_canonical (it might already exist)
                        UrlsSource.objects.get_or_create(id_source=url_source_obj.id_source, id_url=obj_url_canonical.id)
                    # Next URL
                    continue
                ##### Valid URL
                # Update status
                obj_url.status = 'valid'
                # Append to bulk update
                updating_urls.append(obj_url)
                # Create or update the extracted URL data (update_or_create keyed on id_url,
                # so there is a single content row per URL)
                UrlContent.objects.update_or_create(
                    id_url=obj_url.id,
                    defaults={
                        "date_published": dict_url_data.get("publish_date"),
                        "title": dict_url_data.get("title"),
                        "description": dict_url_data.get("description"),
                        "content": dict_url_data.get("content"),
                        "valid_content": dict_url_data.get("valid_content"),
                        "language": dict_url_data.get("language"),
                        "keywords": dict_url_data.get("keywords"),
                        "tags": dict_url_data.get("tags"),
                        "authors": dict_url_data.get("authors"),
                        "image_main": dict_url_data.get("image_main"),
                        "images_url": dict_url_data.get("images_url"),
                        "videos_url": dict_url_data.get("videos_url"),
                        "url_host": dict_url_data.get("url_host"),
                        "site_name": dict_url_data.get("site_name"),
                    },
                )
            ##### Override status if pattern matching?
            for obj_url in updating_urls:
                # Check if article status needs to be updated with pattern matching
                status_pattern_matching = self._get_status_pattern_matching(obj_url.url, obj_url.status, list_pattern_status_tuple)
                # Update status?
                if status_pattern_matching != obj_url.status:
                    logger.debug("Pattern matching, overriding with status {} for URL: {}".format(status_pattern_matching, obj_url.url))
                    # Update; no need to append to updating_urls, the object is already included
                    obj_url.status = status_pattern_matching
            # Bulk update
            Urls.objects.bulk_update(updating_urls, ['status'])
            logger.debug("Finished processing raw URLs")
        except Exception as e:
            logger.warning("Exception processing raw URLs: {}\n{}".format(e, traceback.format_exc()))