Process all missing kids task, urls views cleaner, adding language filter WIP
This commit is contained in:
@@ -61,9 +61,9 @@ class DB_Handler():
|
||||
# URL
|
||||
obj_url, created = Urls.objects.get_or_create(url=url)
|
||||
if (created):
|
||||
logger.info("CREATED: {}".format(obj_url.url))
|
||||
logger.debug("Inserted: {}".format(obj_url.url))
|
||||
else:
|
||||
logger.info("NOT CREATED: {}".format(obj_url.url))
|
||||
logger.debug("Not inserted: {}".format(obj_url.url))
|
||||
# (URL, source, search)
|
||||
UrlsSourceSearch.objects.get_or_create(id_url=obj_url, id_source=obj_source, id_search=obj_search)
|
||||
except Exception as e:
|
||||
@@ -76,7 +76,7 @@ class DB_Handler():
|
||||
cache.set("insert_{}".format(url), True, timeout=self._cache_timeout_insert_url)
|
||||
cache.set("insert_{}{}{}".format(url, obj_source.source, obj_search.search), True, timeout=self._cache_timeout_insert_url)
|
||||
|
||||
logger.info("Inserted #{} raw URLs".format(len(urls_to_insert)))
|
||||
logger.info("Inserted #{} raw URLs, Source-Search {}-{}".format(len(urls_to_insert), obj_source.source, obj_search.search))
|
||||
|
||||
except Exception as e:
|
||||
logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
|
||||
@@ -243,15 +243,19 @@ class DB_Handler():
|
||||
except Exception as e:
|
||||
logger.warning("Exception processing error URLs: {}\n{}".format(e, traceback.format_exc()))
|
||||
|
||||
def process_missing_kids_urls(self, batch_size):
|
||||
def process_missing_kids_urls(self, batch_size=None):
|
||||
try:
|
||||
logger.debug("Processing MissingKids URLs")
|
||||
logger.debug("Processing MissingKids URLs - batch_size={}".format(batch_size))
|
||||
# Get batch of URLs, %missingkids.org/poster% AND (status='valid' OR status='invalid')
|
||||
missingkids_urls = Urls.objects.order_by("-ts_fetch").filter(
|
||||
(Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster"))
|
||||
&
|
||||
(Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.ERROR))
|
||||
)[:batch_size]
|
||||
)
|
||||
|
||||
# Get batch size
|
||||
if (batch_size is not None):
|
||||
missingkids_urls = missingkids_urls[:batch_size]
|
||||
|
||||
# Per URL
|
||||
for obj_url in missingkids_urls:
|
||||
|
||||
@@ -64,7 +64,7 @@ def process_url(url):
|
||||
if ("Website protected with PerimeterX" in str(e.args)):
|
||||
logger.debug("TODO: process_url Implement bypass PerimeterX")
|
||||
|
||||
logger.warning("ArticleException for input URL {}\n{}".format(url, str(e.args)))
|
||||
logger.debug("ArticleException for input URL {}\n{}".format(url, str(e.args)))
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
|
||||
|
||||
Reference in New Issue
Block a user