Add task to process all MissingKids URLs, clean up URL views, add language filter (WIP)

This commit is contained in:
Luciano Gervasoni
2025-03-28 16:21:35 +01:00
parent e34284abbe
commit b3f896b35a
11 changed files with 284 additions and 196 deletions

View File

@@ -61,9 +61,9 @@ class DB_Handler():
# URL
obj_url, created = Urls.objects.get_or_create(url=url)
if (created):
logger.info("CREATED: {}".format(obj_url.url))
logger.debug("Inserted: {}".format(obj_url.url))
else:
logger.info("NOT CREATED: {}".format(obj_url.url))
logger.debug("Not inserted: {}".format(obj_url.url))
# (URL, source, search)
UrlsSourceSearch.objects.get_or_create(id_url=obj_url, id_source=obj_source, id_search=obj_search)
except Exception as e:
@@ -76,7 +76,7 @@ class DB_Handler():
cache.set("insert_{}".format(url), True, timeout=self._cache_timeout_insert_url)
cache.set("insert_{}{}{}".format(url, obj_source.source, obj_search.search), True, timeout=self._cache_timeout_insert_url)
logger.info("Inserted #{} raw URLs".format(len(urls_to_insert)))
logger.info("Inserted #{} raw URLs, Source-Search {}-{}".format(len(urls_to_insert), obj_source.source, obj_search.search))
except Exception as e:
logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
@@ -243,15 +243,19 @@ class DB_Handler():
except Exception as e:
logger.warning("Exception processing error URLs: {}\n{}".format(e, traceback.format_exc()))
def process_missing_kids_urls(self, batch_size):
def process_missing_kids_urls(self, batch_size=None):
try:
logger.debug("Processing MissingKids URLs")
logger.debug("Processing MissingKids URLs - batch_size={}".format(batch_size))
# Get URLs matching (%missingkids.org/poster% OR %missingkids.org/new-poster%) AND (status='valid' OR status='invalid' OR status='error'), most recently fetched first
missingkids_urls = Urls.objects.order_by("-ts_fetch").filter(
(Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster"))
&
(Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.ERROR))
)[:batch_size]
)
# Limit to the first batch_size URLs; when batch_size is None, process all of them
if (batch_size is not None):
missingkids_urls = missingkids_urls[:batch_size]
# Per URL
for obj_url in missingkids_urls:

View File

@@ -64,7 +64,7 @@ def process_url(url):
if ("Website protected with PerimeterX" in str(e.args)):
logger.debug("TODO: process_url Implement bypass PerimeterX")
logger.warning("ArticleException for input URL {}\n{}".format(url, str(e.args)))
logger.debug("ArticleException for input URL {}\n{}".format(url, str(e.args)))
return None
except Exception as e:
logger.warning("Exception for input URL {}\n{}".format(url, str(e)))