diff --git a/app_urls/api/src/db_utils.py b/app_urls/api/src/db_utils.py index d34c075..79a02c8 100644 --- a/app_urls/api/src/db_utils.py +++ b/app_urls/api/src/db_utils.py @@ -61,9 +61,9 @@ class DB_Handler(): # URL obj_url, created = Urls.objects.get_or_create(url=url) if (created): - logger.info("CREATED: {}".format(obj_url.url)) + logger.debug("Inserted: {}".format(obj_url.url)) else: - logger.info("NOT CREATED: {}".format(obj_url.url)) + logger.debug("Not inserted: {}".format(obj_url.url)) # (URL, source, search) UrlsSourceSearch.objects.get_or_create(id_url=obj_url, id_source=obj_source, id_search=obj_search) except Exception as e: @@ -76,7 +76,7 @@ class DB_Handler(): cache.set("insert_{}".format(url), True, timeout=self._cache_timeout_insert_url) cache.set("insert_{}{}{}".format(url, obj_source.source, obj_search.search), True, timeout=self._cache_timeout_insert_url) - logger.info("Inserted #{} raw URLs".format(len(urls_to_insert))) + logger.info("Inserted #{} raw URLs, Source-Search {}-{}".format(len(urls_to_insert), obj_source.source, obj_search.search)) except Exception as e: logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc())) @@ -243,15 +243,19 @@ class DB_Handler(): except Exception as e: logger.warning("Exception processing error URLs: {}\n{}".format(e, traceback.format_exc())) - def process_missing_kids_urls(self, batch_size): + def process_missing_kids_urls(self, batch_size=None): try: - logger.debug("Processing MissingKids URLs") + logger.debug("Processing MissingKids URLs - batch_size={}".format(batch_size)) # Get batch of URLs, %missingkids.org/poster% AND (status='valid' OR status='invalid') missingkids_urls = Urls.objects.order_by("-ts_fetch").filter( (Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster")) & (Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.ERROR)) - )[:batch_size] + ) + + # Get batch size + if (batch_size is not None): + missingkids_urls = missingkids_urls[:batch_size] # Per URL for obj_url in missingkids_urls: diff --git a/app_urls/api/src/url_processor.py b/app_urls/api/src/url_processor.py index 56b5a33..04f5e5e 100644 --- a/app_urls/api/src/url_processor.py +++ b/app_urls/api/src/url_processor.py @@ -64,7 +64,7 @@ def process_url(url): if ("Website protected with PerimeterX" in str(e.args)): logger.debug("TODO: process_url Implement bypass PerimeterX") - logger.warning("ArticleException for input URL {}\n{}".format(url, str(e.args))) + logger.debug("ArticleException for input URL {}\n{}".format(url, str(e.args))) return None except Exception as e: logger.warning("Exception for input URL {}\n{}".format(url, str(e))) diff --git a/app_urls/api/tasks.py b/app_urls/api/tasks.py index d7ea90f..4193bab 100644 --- a/app_urls/api/tasks.py +++ b/app_urls/api/tasks.py @@ -56,6 +56,12 @@ def process_missing_kids_urls(batch_size=50): DB_Handler().process_missing_kids_urls(batch_size=batch_size) logger.info("Task completed: {}".format(task)) +@job('default') +def process_missing_kids_urls_all(batch_size=None): + task = "Process Missing Kids URLs ALL" + logger.info("Task triggered: {}".format(task)) + DB_Handler().process_missing_kids_urls(batch_size=batch_size) + logger.info("Task completed: {}".format(task)) @@ -76,7 +82,10 @@ def background_task(process_type: str): # FetchMissingKids().run() elif ("process_" in process_type): # Batch size encoded in URL - batch_size = int(process_type.split("_")[-1]) + try: + batch_size = int(process_type.split("_")[-1]) + except Exception as e: + batch_size = None # Task type if ("process_raw_urls" in process_type): DB_Handler().process_raw_urls(batch_size=batch_size) @@ -87,7 +96,6 @@ def background_task(process_type: str): else: logger.info("Task unknown!: {}".format(process_type)) - ''' # Selenium based elif (process_type == "fetch_missing_kids_reduced"): diff --git a/app_urls/api/templates/urls.html b/app_urls/api/templates/OBSOLETE_urls.html similarity index 100% rename from app_urls/api/templates/urls.html rename to app_urls/api/templates/OBSOLETE_urls.html diff --git a/app_urls/api/templates/urls_partial.html b/app_urls/api/templates/OBSOLETE_urls_partial.html similarity index 100% rename from app_urls/api/templates/urls_partial.html rename to app_urls/api/templates/OBSOLETE_urls_partial.html diff --git a/app_urls/api/templates/charts.html b/app_urls/api/templates/charts.html index f39445d..9cffde6 100644 --- a/app_urls/api/templates/charts.html +++ b/app_urls/api/templates/charts.html @@ -60,6 +60,7 @@
| URL | @@ -126,7 +185,7 @@||
|---|---|---|
| Fetch Date | -{{ url_item.ts_fetch }} UTC | +|
| Source | @@ -142,59 +201,59 @@||
| URL host | -{{ url_content.url_host }} | +{{ url_content.url_host }} |
| Site name | -{{ url_content.site_name }} | +{{ url_content.site_name|default:"" }} |
| Published Date | -{{ url_content.date_published }} UTC | +|
| Valid news article content? | +Valid news content? | {{ url_content.valid_content }} |
| Tags | -{{ url_content.tags }} | +{{ url_content.tags|default:"" }} |
| Authors | -{{ url_content.authors }} | +{{ url_content.authors|default:"" }} |
| Keywords | -{{ url_content.keywords }} | +{{ url_content.keywords|default:"" }} |
| Language | -{{ url_content.language }} | +{{ url_content.language|default:"" }} |
| Main image | -{{ url_content.image_main_url }} | +{{ url_content.image_main_url|default:"" }} |
| Image URLs | -{{ url_content.image_urls }} | +{{ url_content.image_urls|default:"" }} |
| Video URLs | -{{ url_content.videos_url }} | +{{ url_content.videos_url|default:"" }} |
| Title | -{{ url_content.title }} | +{{ url_content.title|default:"" }} |
| Description | -{{ url_content.description }} | +{{ url_content.description|default:"" }} |
| Content | -{{ url_content.content }} | +{{ url_content.content|default:"" }} |