diff --git a/app_urls/fetcher/src/db_utils.py b/app_urls/fetcher/src/db_utils.py
index de8956a..47e1853 100644
--- a/app_urls/fetcher/src/db_utils.py
+++ b/app_urls/fetcher/src/db_utils.py
@@ -6,6 +6,7 @@ from django.utils import timezone
 from datetime import timedelta
 from .fetch_utils_url_processor import process_url, get_with_protocol
 import re
+import requests
 import os
 import traceback
 from .logger import get_logger
@@ -80,24 +81,56 @@ class DB_Handler():
         except Exception as e:
             logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
 
+    def _set_status(self, obj_url, status):
+        """Set obj_url.status to status, saving only when the value actually changes."""
+        if (obj_url.status != status):
+            obj_url.status = status
+            obj_url.save()
+
+    def _set_duplicate_and_insert_canonical(self, obj_url, url_canonical):
+        """Mark obj_url as DUPLICATE and link it, with its source/search rows, to the url_canonical entry."""
+        self._set_status(obj_url, Urls.STATUS_ENUM.DUPLICATE)
+        # Get or create URL with canonical form
+        obj_url_canonical, created = Urls.objects.get_or_create(url=url_canonical)
+        # Get the source-search IDs associated to obj_url.id
+        list_url_source_search = UrlsSourceSearch.objects.filter(id_url=obj_url)
+        for obj_url_source_search in list_url_source_search:
+            # Associate same sources to url_canonical (it might already exist)
+            UrlsSourceSearch.objects.get_or_create(id_url=obj_url_canonical, id_source=obj_url_source_search.id_source, id_search=obj_url_source_search.id_search)
+        # URLs duplicate association
+        UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url)
     def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error, paywall_bypass=False):
-
-        def set_status(obj_url, status):
-            # Update status if setting a new value
-            if (obj_url.status != status):
-                obj_url.status = status
-                obj_url.save()
+        ##########################################################################
+        # URL pattern: missingkids.org/poster OR missingkids.org/new-poster
+        if ("missingkids.org" in obj_url.url) and ("poster" in obj_url.url):
+            # Request, following redirects; the timeout keeps a stalled server from blocking the worker
+            r = requests.get(obj_url.url, allow_redirects=True, timeout=30)
+
+            if (r.url != obj_url.url):
+                # Canonical
+                url_canonical = r.url
+                # Set duplicate, and insert new canonical form
+                self._set_duplicate_and_insert_canonical(obj_url, url_canonical)
+            elif (r.status_code == 200):
+                self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
+            elif (r.status_code == 404):
+                self._set_status(obj_url, Urls.STATUS_ENUM.INVALID)
+            else:
+                logger.info("Unknown request status: {} for missing kids request: {}".format(r.status_code, obj_url.url))
+
+            return
+        ##########################################################################
 
         # Found a pattern match -> Override status
         if (status_pattern_match is not None):
             logger.debug("Pattern match, status '{}' for input URL: {}".format(status_pattern_match, obj_url.url))
             # Update status
-            set_status(obj_url, status_pattern_match)
+            self._set_status(obj_url, status_pattern_match)
 
             ##### Filter URL? -> Invalid (don't extract content)
             if (status_pattern_match == "invalid"):
                 return
-
+
         try:
             # Extract URL content
             dict_url_data = process_url(obj_url.url, paywall_bypass)
@@ -111,19 +144,9 @@ class DB_Handler():
             dict_url_data = None
 
         ##### Canonical URL different? -> Duplicate
-        if (dict_url_data is not None) and (dict_url_data.get("url_canonical") is not None) and (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
-            # Update status
-            set_status(obj_url, Urls.STATUS_ENUM.DUPLICATE)
-            # Get or create URL with canonical form
-            obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
-            # Get the source-search IDs associated to obj_url.id
-            list_url_source_search = UrlsSourceSearch.objects.filter(id_url=obj_url)
-            for obj_url_source_search in list_url_source_search:
-                # Associate same sources to url_canonical (it might already exist)
-                UrlsSourceSearch.objects.get_or_create(id_url=obj_url_canonical, id_source=obj_url_source_search.id_source, id_search=obj_url_source_search.id_search)
-            # URLs duplciate association
-            UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url)
-
+        if (dict_url_data is not None) and (dict_url_data.get("url_canonical") is not None) and (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
+            # URL as duplicate, insert canonical URL
+            self._set_duplicate_and_insert_canonical(obj_url, dict_url_data.get("url_canonical"))
             # Next URL
             return
 
@@ -132,20 +155,20 @@ class DB_Handler():
         # (dict_url_data is None) or (Exception while processing URL) ? -> Error status
         if (dict_url_data is None):
             # Update status
-            set_status(obj_url, Urls.STATUS_ENUM.ERROR)
+            self._set_status(obj_url, Urls.STATUS_ENUM.ERROR)
             # Next URL
             return
 
         # Invalid? e.g. binary data
         if (dict_url_data.get("override_status") == "invalid"):
             # Update status
-            set_status(obj_url, Urls.STATUS_ENUM.INVALID)
+            self._set_status(obj_url, Urls.STATUS_ENUM.INVALID)
             # Next URL
             return
 
         ##### Valid URL
         # Update status
-        set_status(obj_url, Urls.STATUS_ENUM.VALID)
+        self._set_status(obj_url, Urls.STATUS_ENUM.VALID)
 
         try:
             if (dict_url_data is not None):
@@ -260,14 +283,12 @@ class DB_Handler():
             # Per URL
             for obj_url in missingkids_urls:
                 try:
-                    # Process URL. If no exception -> Valid
-                    self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True)
+                    # Process URL
+                    self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=None)
                 except Exception as e:
-                    # Raised exception -> Invalid (404 error)
-                    obj_url.status = Urls.STATUS_ENUM.INVALID
-                    obj_url.save()
+                    logger.warning("Unknown error processing missing kids poster for URL: {}\n{}".format(obj_url.url, str(e)))
 
-            logger.info("Verified status of #{} missingkids.org/poster URLs".format(len(missingkids_urls)))
+            logger.info("Verified status of #{} missingkids.org/poster / missingkids.org/new-poster URLs".format(len(missingkids_urls)))
         except Exception as e:
             logger.warning("Exception processing MissingKids URLs: {}\n{}".format(e, traceback.format_exc()))
 
diff --git a/app_urls/init_data.json b/app_urls/init_data.json
index e3d3e8f..0f2b15e 100644
--- a/app_urls/init_data.json
+++ b/app_urls/init_data.json
@@ -30,6 +30,5 @@
         [".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50],
         [".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50],
         [".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50]
-        [".*missingkids\\.org\\/poster\\/.*", "valid", 50]
     ]
 }