Working feed fetching and parsing; process raw and error URLs
@@ -1,11 +1,10 @@
from ..models import Urls, UrlContent, UrlsSource, Source, WebsiteToFilter, StatusPatternMatching
from ..models import Urls, UrlContent, UrlsSource, UrlsDuplicate, Source, StatusPatternMatching
from .url_processor import process_url
from django.utils import timezone
from django.core.cache import cache
from django.db import IntegrityError
import hashlib
from datetime import timedelta
import re
import time
import traceback
from .logger import get_logger

logger = get_logger()
@@ -13,17 +12,29 @@ logger = get_logger()
class DB_Handler():

    def __init__(self):
        logger.debug("Initializing URL DB Handler")
        # Inserting raw URL, cache time: 1 day
        self._cache_timeout_insert_url = 86400
        # Processing error URL, cache time: 2 days
        self._cache_timeout_error_url = 86400*2
        # URL host slowdown
        self.url_host_slowdown_seconds = 5

    def _get_safe_cache_key(self, raw_key):
        """Generate a safe cache key using an MD5 hash"""
        return hashlib.md5(raw_key.encode()).hexdigest()

    def _cache_key(self, cache_key, cache_timeout=86400):
        cache.set(self._get_safe_cache_key(cache_key), True, timeout=cache_timeout)
    def _cache_key(self, cache_key, hash_encode, cache_timeout):
        if (hash_encode):
            cache.set(self._get_safe_cache_key(cache_key), True, timeout=cache_timeout)
        else:
            cache.set(cache_key, True, timeout=cache_timeout)

    def _is_cached_key(self, cache_key):
    def _is_cached_key(self, cache_key, hash_encoded):
        # Returns True if cached
        return cache.get(self._get_safe_cache_key(cache_key)) is not None
        if (hash_encoded):
            return cache.get(self._get_safe_cache_key(cache_key)) is not None
        else:
            return cache.get(cache_key) is not None
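
A minimal usage sketch of the cache helpers above (assuming Django's cache backend is configured; the URL and the "error_42" key are made-up examples): URLs are stored under an MD5 hash so special characters never break the cache key, while plain bookkeeping keys can skip hashing.

handler = DB_Handler()
handler._cache_key("https://example.com/article?id=1", hash_encode=True, cache_timeout=86400)
handler._is_cached_key("https://example.com/article?id=1", hash_encoded=True)   # -> True
handler._cache_key("error_42", hash_encode=False, cache_timeout=86400*2)
handler._is_cached_key("error_42", hash_encoded=False)                          # -> True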

    def insert_raw_urls(self, urls, source):
@@ -53,10 +64,10 @@ class DB_Handler():
            for url in urls_clean:

                ### Already processed URL?
                if (self._is_cached_key(url)):
                if (self._is_cached_key(url, hash_encoded=True)):
                    logger.debug("Already cached URL: {}".format(url))

                if (self._is_cached_key("{}{}".format(source, url))):
                if (self._is_cached_key("{}{}".format(source, url), hash_encoded=True)):
                    logger.debug("Already cached (source, URL): {} {}".format(source, url))
                else:
                    ### Insert (URL_id, source_id), since not cached
@@ -92,139 +103,189 @@ class DB_Handler():

            # Insert or update cache
            for url in urls_clean:
                self._cache_key(url)
                self._cache_key("{}{}".format(source, url))
                # Hash encode URLs for special characters
                self._cache_key(url, hash_encode=True, cache_timeout=self._cache_timeout_insert_url)
                self._cache_key("{}{}".format(source, url), hash_encode=True, cache_timeout=self._cache_timeout_insert_url)

            logger.info("Inserted #{} raw URLs".format(len(urls_to_insert)))

        except Exception as e:
            logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))

    def _get_status_pattern_matching(self, url, article_status, list_pattern_status_tuple):
        # Sort pattern tuples by priority. (pattern, priority, status)
        list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True)
    def _get_url_host(self, url):
        # URL no protocol, first substring before '/'
        url_host = url.replace("https://", "").replace("http://", "").split("/")[0]
        return url_host
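
For illustration (hypothetical inputs), the string-based host extraction behaves roughly like this; urllib.parse.urlparse is a more general alternative if other schemes ever appear:

# _get_url_host("https://news.example.com/politics/story.html") -> "news.example.com"
# _get_url_host("http://example.org")                           -> "example.org"
from urllib.parse import urlparse
urlparse("https://news.example.com/politics/story.html").netloc   # -> "news.example.com"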

    def _url_host_slowdown(self, url, url_host_slowdown_seconds):
        ### Avoid (frequent) too many requests to the same URL host
        # Get URL host
        url_host = self._get_url_host(url)
        # Recently processed URL host? -> Slow down required
        last_cached_timestamp = cache.get("processed_{}".format(url_host), None)
        if last_cached_timestamp:
            # Get time since last processed URL host (in seconds)
            time_since_last_processed = time.time() - last_cached_timestamp
            # Amount of time required to sleep?
            slowdown_required = max(0, url_host_slowdown_seconds - time_since_last_processed)
            logger.debug("Slow down (sleeping {:.2f}) for URL host {}".format(slowdown_required, url_host))
            # Sleep
            time.sleep(slowdown_required)
        # About to process URL host, cache time
        cache.set("processed_{}".format(url_host), time.time(), timeout=60*5)  # Expire after 5 minutes
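
The method above is a per-host throttle: the last request timestamp for each host is cached, and the next request sleeps only for whatever remains of the minimum interval. A self-contained sketch of the same idea, using an in-memory dict as a stand-in for Django's cache (not the project code):

import time

_last_request = {}  # host -> timestamp of the previous request

def throttle(host, min_interval_seconds=5):
    last = _last_request.get(host)
    if last is not None:
        # Sleep only for the part of the interval that has not elapsed yet
        remaining = max(0, min_interval_seconds - (time.time() - last))
        time.sleep(remaining)
    _last_request[host] = time.time()

throttle("example.com")  # first call: no sleep
throttle("example.com")  # second call: sleeps roughly 5 seconds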

        # Regex pattern to update status on "valid", "invalid", and "unknown" status only
        # Status "raw", "duplicated" and "error" should remain the way they are
        # Assumption: List of patterns sorted by importance
        if (article_status in ["valid", "invalid", "unknown"]):
            # Regular expression pattern matching: https://regexr.com/
            for regex_pattern, regex_priority, status_if_match in list_pattern_status_tuple:
                # Matching? Update article status
                if bool(re.match(regex_pattern, url)):
                    if (status_if_match != article_status):
                        logger.debug("Regex pattern found, updating status from '{}' to '{}' for URL: {}".format(article_status, status_if_match, url))

    def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error):
        ##### Filter URL? -> Invalid
        if (status_pattern_match == "invalid"):
            logger.debug("Domain filter applied to input URL: {}".format(obj_url.url))
            # Update status
            obj_url.status = Urls.STATUS_ENUM.INVALID
            obj_url.save()
            # updating_urls.append(obj_url)
            # Next URL
            return

        ##### Process URL
        try:
            # Slow down if required to avoid too many requests error
            self._url_host_slowdown(obj_url.url, self.url_host_slowdown_seconds)
            # Get data
            dict_url_data = process_url(obj_url.url)
            # Not none or handle as exception
            assert(dict_url_data is not None)
        except Exception as e:
            if (raise_exception_on_error):
                # Simply raise exception
                raise Exception("Error processing URL")
            else:
                # Set status to error
                logger.debug("Error processing URL: {}\n{}\n{}".format(obj_url.url, str(e), traceback.format_exc()))
                # Update status
                obj_url.status = Urls.STATUS_ENUM.ERROR
                obj_url.save()
                # updating_urls.append(obj_url)
                # Next URL
                return

        ##### Canonical URL different? -> Duplicate
        if (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
            # Update status
            obj_url.status = Urls.STATUS_ENUM.DUPLICATE
            obj_url.save()
            # updating_urls.append(obj_url)

            # Get or create URL with canonical form
            obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
            # Get the sources id associated to obj_url.id
            url_sources = UrlsSource.objects.filter(id_url=obj_url)
            for url_source_obj in url_sources:
                # Associate same sources to url_canonical (it might already exist)
                obj_urls_source, created = UrlsSource.objects.get_or_create(id_source=url_source_obj.id_source, id_url=obj_url_canonical)

            # URLs duplicate association
            obj_urls_duplicate, created = UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url)

            # Next URL
            return

        ##### Valid URL
        # Update status
        obj_url.status = Urls.STATUS_ENUM.VALID
        obj_url.save()
        # updating_urls.append(obj_url)

        # Create or update extracted URL data
        UrlContent.objects.update_or_create(
            id_url=obj_url,
            defaults={
                "date_published": dict_url_data.get("publish_date"),
                "title": dict_url_data.get("title"),
                "description": dict_url_data.get("description"),
                "content": dict_url_data.get("content"),
                "valid_content": dict_url_data.get("valid_content"),
                "language": dict_url_data.get("language"),
                "keywords": dict_url_data.get("keywords"),
                "tags": dict_url_data.get("tags"),
                "authors": dict_url_data.get("authors"),
                "image_main_url": dict_url_data.get("image_main_url"),
                "images_url": dict_url_data.get("images_url"),
                "videos_url": dict_url_data.get("videos_url"),
                "url_host": dict_url_data.get("url_host"),
                "site_name": dict_url_data.get("site_name"),
            }
        )
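
_process_single_url assumes process_url returns a dict with at least the keys read above. A hypothetical return value might look like this (all values are illustrative, only the key names come from the code):

dict_url_data = {
    "url": "https://example.com/story",            # URL as fetched
    "url_canonical": "https://example.com/story",  # if different from "url", the URL is treated as a duplicate
    "publish_date": "2023-01-01",
    "title": "Example title",
    "description": "Example description",
    "content": "Full article text...",
    "valid_content": True,
    "language": "en",
    "keywords": ["example"],
    "tags": [],
    "authors": ["Jane Doe"],
    "image_main_url": None,
    "images_url": [],
    "videos_url": [],
    "url_host": "example.com",
    "site_name": "Example",
}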

    def process_raw_urls(self, batch_size):

        def _get_status_pattern_matching(url, list_pattern_status_tuple):
            """ Be careful: Regex pattern should update status on "valid", "invalid", and "unknown" status only
            """
            # Sort pattern tuples by priority. (pattern, priority, status)
            for regex_pattern, regex_priority, status_if_match in sorted(list_pattern_status_tuple, key=lambda tup: tup[1], reverse=True):
                # Regular expression pattern matching: https://regexr.com/
                if bool(re.match(regex_pattern, url)):
                    logger.debug("Regex pattern found, status '{}' for URL: {}".format(status_if_match, url))
                    return status_if_match
            # Pattern matching not required or not found, original article status
            return article_status
            return None
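
StatusPatternMatching rows are consumed as (pattern, priority, status) tuples; the list is sorted by descending priority, and the first re.match decides the status. A hypothetical example (patterns and priorities are made up):

list_pattern_status_tuple = [
    (r"^https://example\.com/tag/.*", 1, "invalid"),    # low priority
    (r"^https://example\.com/news/.*", 10, "valid"),    # checked first after sorting
]
# A URL matching both patterns gets "valid", since priority 10 is evaluated before priority 1;
# a URL matching neither keeps its original status (the helper above returns None).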

    def process_error_urls(self, batch_size=50):
        # Get batch of URLs, status='error'
        #error_urls = Urls.objects.SORTBY TS_FETCH....filter(status=Urls.STATUS_ENUM.RAW, ts_fetch__gte=time_delta_ts)[:batch_size]
        pass

    def process_raw_urls(self, time_delta=timedelta(days=1), batch_size=50):
        try:
            logger.debug("Processing raw URLs")

            # Get list of domains to filter
            list_domains_to_filter = WebsiteToFilter.objects.values_list('url_host', flat=True)
            # Get batch of URLs, status='raw'
            raw_urls = Urls.objects.order_by("-ts_fetch").filter(status=Urls.STATUS_ENUM.RAW)[:batch_size]

            if (len(raw_urls) == 0):
                logger.debug("No raw URLs to process")
                return

            # Get list of (pattern, priority, status) tuples to override status if required
            list_pattern_status_tuple = list(StatusPatternMatching.objects.values_list("pattern", "priority", "status"))

            # Fetched during last 24 hours
            time_delta_ts = timezone.now() - time_delta
            # Get batch of URLs, status='raw' and fetched X days ago
            raw_urls = Urls.objects.filter(status=Urls.STATUS_ENUM.RAW, ts_fetch__gte=time_delta_ts)[:batch_size]
            # List of objects to bulk update
            updating_urls = []
            # updating_urls = []

            # Per URL
            for obj_url in raw_urls:
                ##### Any domain to filter included in URL? -> Invalid
                if (any([d in obj_url.url for d in list_domains_to_filter])):
                    logger.debug("Domain filter applied to input URL: {}".format(obj_url.url))
                    # Update status
                    obj_url.status = Urls.STATUS_ENUM.INVALID
                    obj_url.save()
                    updating_urls.append(obj_url)
                    # Next URL
                    continue

                ##### Process URL
                try:
                    # Get data
                    dict_url_data = process_url(obj_url.url)
                    # Not none or handle as exception
                    assert(dict_url_data is not None)
                except Exception as e:
                    logger.debug("Error processing URL: {}\n{}\n{}".format(obj_url.url, str(e), traceback.format_exc()))
                    # Update status
                    obj_url.status = Urls.STATUS_ENUM.ERROR
                    obj_url.save()
                    updating_urls.append(obj_url)
                    # Next URL
                    continue
                # Override status if pattern matching?
                status_pattern_match = _get_status_pattern_matching(obj_url.url, list_pattern_status_tuple)
                # Process URL
                self._process_single_url(obj_url, status_pattern_match, raise_exception_on_error=False)

                ##### Canonical URL different? -> Duplicate
                if (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
                    # Update status
                    obj_url.status = Urls.STATUS_ENUM.DUPLICATE
                    obj_url.save()
                    updating_urls.append(obj_url)

                    # Get or create URL with canonical form
                    obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
                    # Get the sources id associated to obj_url.id
                    url_sources = UrlsSource.objects.filter(id_url=obj_url)
                    for url_source_obj in url_sources:
                        # Associate same sources to url_canonical (it might already exist)
                        UrlsSource.objects.get_or_create(id_source=url_source_obj.id_source, id_url=obj_url_canonical)

                    # Next URL
                    continue

                ##### Valid URL
                # Update status
                obj_url.status = Urls.STATUS_ENUM.VALID
                obj_url.save()
                updating_urls.append(obj_url)

                # Create extracted URL data
                UrlContent.objects.create(
                    id_url=obj_url,
                    date_published=dict_url_data.get("publish_date"),
                    title=dict_url_data.get("title"),
                    description=dict_url_data.get("description"),
                    content=dict_url_data.get("content"),
                    valid_content=dict_url_data.get("valid_content"),
                    language=dict_url_data.get("language"),
                    keywords=dict_url_data.get("keywords"),
                    tags=dict_url_data.get("tags"),
                    authors=dict_url_data.get("authors"),
                    image_main_url=dict_url_data.get("image_main_url"),
                    images_url=dict_url_data.get("images_url"),
                    videos_url=dict_url_data.get("videos_url"),
                    url_host=dict_url_data.get("url_host"),
                    site_name=dict_url_data.get("site_name"),
                )

            ##### Override status if pattern matching?
            for obj_url in updating_urls:
                # Check if article status needs to be updated with pattern matching
                status_pattern_matching = self._get_status_pattern_matching(obj_url.url, obj_url.status, list_pattern_status_tuple)
                # Update status?
                if (status_pattern_matching != obj_url.status):
                    logger.debug("Pattern matching, overriding with status {} for URL: {}".format(status_pattern_matching, obj_url.url))
                    # Update, no need to append to updating_urls, already included
                    obj_url.status = status_pattern_matching
                    obj_url.save()

            # TODO: Fix enum type issue. Bulk update
            # TODO: Fix enum type issue. Bulk update instead of .save() for each object
            # Urls.objects.bulk_update(updating_urls, ['status'])

            logger.info("Updated #{} raw URLs".format(len(updating_urls)))
            logger.info("Updated #{} raw URLs".format(len(raw_urls)))

        except Exception as e:
            logger.warning("Exception processing raw URLs: {}\n{}".format(e, traceback.format_exc()))

    def process_error_urls(self, batch_size):
        try:
            logger.debug("Processing error URLs")

            # Keep track of processed and skipped "error" URLs
            num_urls_skipped, num_urls_processed = 0, 0
            # Get batch of URLs, status='error'
            error_urls = Urls.objects.order_by("-ts_fetch").filter(status=Urls.STATUS_ENUM.ERROR)[num_urls_skipped:batch_size+num_urls_skipped]

            while ((len(error_urls) > 0) and (num_urls_processed < batch_size)):
                # Per URL
                for obj_url in error_urls:
                    # URL ID cached? -> Tried to process recently already, skip
                    if (self._is_cached_key("error_{}".format(obj_url.id), hash_encoded=False)):
                        num_urls_skipped += 1
                        continue

                    try:
                        # Process URL
                        self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True)
                        num_urls_processed += 1
                    except Exception as e:
                        # Error, cache to avoid re-processing for X time
                        self._cache_key("error_{}".format(obj_url.id), hash_encode=False, cache_timeout=self._cache_timeout_error_url)
                        num_urls_skipped += 1

                # Get following batch of URLs, status='error'
                error_urls = Urls.objects.order_by("-ts_fetch").filter(status=Urls.STATUS_ENUM.ERROR)[num_urls_skipped:batch_size+num_urls_skipped]

            logger.info("Updated #{}, skipped #{} error URLs".format(num_urls_processed, num_urls_skipped))
        except Exception as e:
            logger.warning("Exception processing error URLs: {}\n{}".format(e, traceback.format_exc()))
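
process_error_urls pages through the error queue with a sliding offset: URLs that were skipped (recently tried and still cached as errors) push the window forward so the same rows are not re-fetched forever, while successfully processed URLs drop out of the queryset on their own. A minimal sketch of that windowing idea with hypothetical callables standing in for the ORM query and the per-item work (fetch_batch, should_skip, and process are not project names):

def process_error_queue(fetch_batch, should_skip, process, batch_size):
    """fetch_batch(offset, limit) returns the items currently in 'error' state for that window."""
    skipped = processed = 0
    batch = fetch_batch(skipped, batch_size)
    while batch and processed < batch_size:
        for item in batch:
            if should_skip(item):
                skipped += 1
                continue
            try:
                process(item)       # on success the item leaves the error queue
                processed += 1
            except Exception:
                skipped += 1        # stays in the queue; the offset moves past it next round
        # Advance past everything skipped so far and fetch the next window
        batch = fetch_batch(skipped, batch_size)
    return processed, skipped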