General search fix, status pattern match regex, find feeds on startup

2025-04-09 15:52:35 +02:00
parent 296a8fe8a8
commit f369b23d81
22 changed files with 538 additions and 356 deletions
--- a/app_urls/fetcher/src/db_utils.py
+++ b/app_urls/fetcher/src/db_utils.py
@@ -81,6 +81,7 @@ class DB_Handler():
        except Exception as e:
            logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))

+
    def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error):
        
        def set_status(obj_url, status):
@@ -89,17 +90,17 @@ class DB_Handler():
                obj_url.status = status
                obj_url.save()

-        ##### Filter URL? -> Invalid
-        if (status_pattern_match == "invalid"):
-            logger.debug("Domain filter applied to input URL: {}".format(obj_url.url))
+        # Found a pattern match -> Override status
+        if (status_pattern_match is not None):
+            logger.debug("Pattern match, status '{}' for input URL: {}".format(status_pattern_match, obj_url.url))
            # Update status
-            set_status(obj_url, Urls.STATUS_ENUM.INVALID)
-            # Next URL
-            return
+            set_status(obj_url, status_pattern_match)
+            ##### Filter URL? -> Invalid (don't extract content)
+            if (status_pattern_match == "invalid"):
+                return
        
-        ##### Process URL
        try:
-            # Get data
+            # Extract URL content
            dict_url_data = process_url(obj_url.url)
        except Exception as e:
            if (raise_exception_on_error):
@@ -110,25 +111,10 @@ class DB_Handler():
                # Set status to error
                dict_url_data = None
        
-        # (dict_url_data is None) or (Exception while processing URL) ? -> Error status
-        if (dict_url_data is None):
-            # Update status
-            set_status(obj_url, Urls.STATUS_ENUM.ERROR)
-            # Next URL
-            return
-
-        # Invalid? e.g. binary data
-        if (dict_url_data.get("override_status") == "invalid"):
-            # Update status
-            set_status(obj_url, Urls.STATUS_ENUM.INVALID)
-            # Next URL
-            return
-
        ##### Canonical URL different? -> Duplicate
-        if (dict_url_data.get("url_canonical") is not None) and(dict_url_data.get("url") != dict_url_data.get("url_canonical")):
+        if (dict_url_data is not None) and (dict_url_data.get("url_canonical") is not None) and (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
            # Update status
            set_status(obj_url, Urls.STATUS_ENUM.DUPLICATE)
-            
            # Get or create URL with canonical form
            obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
            # Get the source-search IDs associated to obj_url.id
@@ -136,42 +122,54 @@ class DB_Handler():
            for obj_url_source_search in list_url_source_search:
                # Associate same sources to url_canonical (it might already exist)
                UrlsSourceSearch.objects.get_or_create(id_url=obj_url_canonical, id_source=obj_url_source_search.id_source, id_search=obj_url_source_search.id_search)
-            
            # URLs duplicate association
            UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url)

-            # TODO: return obj_url_canonical so as to directly process the recently inserted URL
-            # Wherever this function is called, add:
-            # self._process_single_url(obj_url_canonical, status_pattern_match, raise_exception_on_error)
-
            # Next URL
            return
-        
-        ##### Valid URL
-        # Update status
-        set_status(obj_url, Urls.STATUS_ENUM.VALID)
+
+        # Not overriding status given pattern matching?
+        if (status_pattern_match is None):
+            # (dict_url_data is None) or (Exception while processing URL) ? -> Error status
+            if (dict_url_data is None):
+                # Update status
+                set_status(obj_url, Urls.STATUS_ENUM.ERROR)
+                # Next URL
+                return
+            
+            # Invalid? e.g. binary data
+            if (dict_url_data.get("override_status") == "invalid"):
+                # Update status
+                set_status(obj_url, Urls.STATUS_ENUM.INVALID)
+                # Next URL
+                return
+
+            ##### Valid URL
+            # Update status
+            set_status(obj_url, Urls.STATUS_ENUM.VALID)

        try:
-            # Create or update extracted URL data
-            UrlContent.objects.update_or_create(
-                id_url=obj_url,
-                defaults = {
-                    "date_published" : dict_url_data.get("publish_date"),
-                    "title" : dict_url_data.get("title"),
-                    "description" : dict_url_data.get("description"),
-                    "content" : dict_url_data.get("content"),
-                    "valid_content" : dict_url_data.get("valid_content"),
-                    "language" : dict_url_data.get("language"),
-                    "keywords" : dict_url_data.get("keywords"),
-                    "tags" : dict_url_data.get("tags"),
-                    "authors" : dict_url_data.get("authors"),
-                    "image_main_url" : dict_url_data.get("image_main_url"),
-                    "images_url" : dict_url_data.get("images_url"),
-                    "videos_url" : dict_url_data.get("videos_url"),
-                    "url_host" : dict_url_data.get("url_host"),
-                    "site_name" : dict_url_data.get("site_name"),
-                }
-            )
+            if (dict_url_data is not None):
+                # Create or update extracted URL data
+                UrlContent.objects.update_or_create(
+                    id_url=obj_url,
+                    defaults = {
+                        "date_published" : dict_url_data.get("publish_date"),
+                        "title" : dict_url_data.get("title"),
+                        "description" : dict_url_data.get("description"),
+                        "content" : dict_url_data.get("content"),
+                        "valid_content" : dict_url_data.get("valid_content"),
+                        "language" : dict_url_data.get("language"),
+                        "keywords" : dict_url_data.get("keywords"),
+                        "tags" : dict_url_data.get("tags"),
+                        "authors" : dict_url_data.get("authors"),
+                        "image_main_url" : dict_url_data.get("image_main_url"),
+                        "images_url" : dict_url_data.get("images_url"),
+                        "videos_url" : dict_url_data.get("videos_url"),
+                        "url_host" : dict_url_data.get("url_host"),
+                        "site_name" : dict_url_data.get("site_name"),
+                    }
+                )
        except Exception as e:
            logger.debug("Error in update_or_create UrlContent: {}\ndict_url_data: {}\n{}\n{}".format(obj_url.url, dict_url_data, str(e), traceback.format_exc()))

@@ -179,13 +177,12 @@ class DB_Handler():
    def process_raw_urls(self, batch_size):

        def _get_status_pattern_matching(url, list_pattern_status_tuple):
-            """ Be careful: Regex pattern should update status on "valid", "invalid", and "unknown" status only
-            """
+            """ Return the status of the highest-priority regex pattern that matches `url`, or None if no pattern matches.
+
+            list_pattern_status_tuple holds (pattern, priority, status) tuples; higher priority values are tried first.
+            Patterns are applied with re.match, i.e. they must match from the beginning of the URL.
+            Be careful: Regex pattern should update status on "valid", "invalid", and "unknown" status only
+            """
            # Sort pattern tuples by priority. (pattern, priority, status)
            for regex_pattern, regex_priority, status_if_match in sorted(list_pattern_status_tuple, key=lambda tup: tup[1], reverse=True):
                # Regular expression pattern matching: https://regexr.com/
-                if bool(re.match(regex_pattern, obj_url.url)):
-                    logger.debug("Regex pattern found, status '{}' for URL: {}".format(status_if_match, url))
+                # Match against the `url` argument, not the enclosing-scope obj_url, so the helper only depends on its inputs
+                if re.match(regex_pattern, url):
+                    # logger.debug("Regex pattern found, status '{}' for URL: {}".format(status_if_match, url))
                    return status_if_match
            return None