Process error URLs with marreta. Update env vars
This commit is contained in:
@@ -6,16 +6,14 @@ from django.utils import timezone
|
||||
from datetime import timedelta
|
||||
from .url_processor import process_url, get_with_protocol
|
||||
import re
|
||||
import os
|
||||
import traceback
|
||||
from .logger import get_logger
|
||||
logger = get_logger()
|
||||
|
||||
class DB_Handler():
|
||||
def __init__(self):
|
||||
# Inserting raw URL, cache time: 1 day
|
||||
self._cache_timeout_insert_url = 86400
|
||||
# Processing error URL, cache time: 2 days
|
||||
self._cache_timeout_error_url = 86400*2
|
||||
pass
|
||||
|
||||
def insert_raw_urls(self, urls, obj_source, obj_search):
|
||||
try:
|
||||
@@ -75,8 +73,8 @@ class DB_Handler():
|
||||
|
||||
# Insert or update cache
|
||||
for url in urls_clean:
|
||||
cache.set("insert_{}".format(url), True, timeout=self._cache_timeout_insert_url)
|
||||
cache.set("insert_{}{}{}".format(url, obj_source.source, obj_search.search), True, timeout=self._cache_timeout_insert_url)
|
||||
cache.set("insert_{}".format(url), True, timeout=int(os.getenv("FETCHER_INSERT_URL_CACHE_TIME", 86400)))
|
||||
cache.set("insert_{}{}{}".format(url, obj_source.source, obj_search.search), True, timeout=int(os.getenv("FETCHER_INSERT_URL_CACHE_TIME", 86400)))
|
||||
|
||||
logger.info("Inserted #{} raw URLs, Source-Search {} - {}".format(len(urls_to_insert), obj_source.source, obj_search.search))
|
||||
|
||||
@@ -84,7 +82,7 @@ class DB_Handler():
|
||||
logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
|
||||
|
||||
|
||||
def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error):
|
||||
def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error, override_url=None):
|
||||
|
||||
def set_status(obj_url, status):
|
||||
# Update status if setting a new value
|
||||
@@ -102,8 +100,13 @@ class DB_Handler():
|
||||
return
|
||||
|
||||
try:
|
||||
# Override URL for request?
|
||||
if (override_url is not None):
|
||||
url_of_interest = override_url
|
||||
else:
|
||||
url_of_interest = obj_url.url
|
||||
# Extract URL content
|
||||
dict_url_data = process_url(obj_url.url)
|
||||
dict_url_data = process_url(url_of_interest)
|
||||
except Exception as e:
|
||||
if (raise_exception_on_error):
|
||||
# Simply raise exception, handled in a different way
|
||||
@@ -232,11 +235,12 @@ class DB_Handler():
|
||||
|
||||
try:
|
||||
# Process URL
|
||||
self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True)
|
||||
override_url = "https://marreta.pcdomanual.com/p/{}".format(obj_url.url)
|
||||
self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True, override_url=override_url)
|
||||
num_urls_processed += 1
|
||||
except Exception as e:
|
||||
# Error, cache to avoid re-processing for X time
|
||||
cache.set("error_{}".format(obj_url.id), True, timeout=self._cache_timeout_insert_url)
|
||||
cache.set("error_{}".format(obj_url.id), True, timeout=int(os.getenv("FETCHER_ERROR_URL_CACHE_TIME", 86400)))
|
||||
num_urls_skipped += 1
|
||||
|
||||
# Get following batch of URLs, status='error'
|
||||
|
||||
@@ -100,6 +100,7 @@ def background_task(process_type: str):
|
||||
number_pages = int(process_type.split("_")[-1])
|
||||
except Exception as e:
|
||||
number_pages = -1
|
||||
|
||||
FetchMissingKids().run(number_pages=number_pages)
|
||||
|
||||
elif ("process_" in process_type):
|
||||
@@ -108,6 +109,7 @@ def background_task(process_type: str):
|
||||
batch_size = int(process_type.split("_")[-1])
|
||||
except Exception as e:
|
||||
batch_size = None
|
||||
|
||||
# Task type
|
||||
if ("process_raw_urls" in process_type):
|
||||
DB_Handler().process_raw_urls(batch_size=batch_size)
|
||||
@@ -122,6 +124,7 @@ def background_task(process_type: str):
|
||||
older_than_days = float(process_type.split("_")[-1])
|
||||
except Exception as e:
|
||||
older_than_days = None
|
||||
|
||||
DB_Handler().clean_old_url_content(older_than_days=older_than_days)
|
||||
|
||||
else:
|
||||
|
||||
Reference in New Issue
Block a user