General search fix, status pattern match regex, find feeds on startup

Luciano Gervasoni
2025-04-09 15:52:35 +02:00
parent 296a8fe8a8
commit f369b23d81
22 changed files with 538 additions and 356 deletions

View File

@@ -81,6 +81,7 @@ class DB_Handler():
except Exception as e:
logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error):
def set_status(obj_url, status):
@@ -89,17 +90,17 @@ class DB_Handler():
obj_url.status = status
obj_url.save()
##### Filter URL? -> Invalid
if (status_pattern_match == "invalid"):
logger.debug("Domain filter applied to input URL: {}".format(obj_url.url))
# Found a pattern match -> Override status
if (status_pattern_match is not None):
logger.debug("Pattern match, status '{}' for input URL: {}".format(status_pattern_match, obj_url.url))
# Update status
set_status(obj_url, Urls.STATUS_ENUM.INVALID)
# Next URL
return
set_status(obj_url, status_pattern_match)
##### Filter URL? -> Invalid (don't extract content)
if (status_pattern_match == "invalid"):
return
##### Process URL
try:
# Get data
# Extract URL content
dict_url_data = process_url(obj_url.url)
except Exception as e:
if (raise_exception_on_error):
@@ -110,25 +111,10 @@ class DB_Handler():
# Set status to error
dict_url_data = None
# (dict_url_data is None) or (Exception while processing URL) ? -> Error status
if (dict_url_data is None):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.ERROR)
# Next URL
return
# Invalid? e.g. binary data
if (dict_url_data.get("override_status") == "invalid"):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.INVALID)
# Next URL
return
##### Canonical URL different? -> Duplicate
if (dict_url_data.get("url_canonical") is not None) and(dict_url_data.get("url") != dict_url_data.get("url_canonical")):
if (dict_url_data is not None) and (dict_url_data.get("url_canonical") is not None) and (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.DUPLICATE)
# Get or create URL with canonical form
obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
# Get the source-search IDs associated to obj_url.id
@@ -136,42 +122,54 @@ class DB_Handler():
for obj_url_source_search in list_url_source_search:
# Associate same sources to url_canonical (it might already exist)
UrlsSourceSearch.objects.get_or_create(id_url=obj_url_canonical, id_source=obj_url_source_search.id_source, id_search=obj_url_source_search.id_search)
# URLs duplicate association
UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url)
# TODO: return obj_url_canonical so as to directly process the recently inserted URL
# Wherever this function is called, add:
# self._process_single_url(obj_url_canonical, status_pattern_match, raise_exception_on_error)
# Next URL
return
##### Valid URL
# Update status
set_status(obj_url, Urls.STATUS_ENUM.VALID)
# Not overriding status given pattern matching?
if (status_pattern_match is None):
# (dict_url_data is None) or (Exception while processing URL) ? -> Error status
if (dict_url_data is None):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.ERROR)
# Next URL
return
# Invalid? e.g. binary data
if (dict_url_data.get("override_status") == "invalid"):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.INVALID)
# Next URL
return
##### Valid URL
# Update status
set_status(obj_url, Urls.STATUS_ENUM.VALID)
try:
# Create or update extracted URL data
UrlContent.objects.update_or_create(
id_url=obj_url,
defaults = {
"date_published" : dict_url_data.get("publish_date"),
"title" : dict_url_data.get("title"),
"description" : dict_url_data.get("description"),
"content" : dict_url_data.get("content"),
"valid_content" : dict_url_data.get("valid_content"),
"language" : dict_url_data.get("language"),
"keywords" : dict_url_data.get("keywords"),
"tags" : dict_url_data.get("tags"),
"authors" : dict_url_data.get("authors"),
"image_main_url" : dict_url_data.get("image_main_url"),
"images_url" : dict_url_data.get("images_url"),
"videos_url" : dict_url_data.get("videos_url"),
"url_host" : dict_url_data.get("url_host"),
"site_name" : dict_url_data.get("site_name"),
}
)
if (dict_url_data is not None):
# Create or update extracted URL data
UrlContent.objects.update_or_create(
id_url=obj_url,
defaults = {
"date_published" : dict_url_data.get("publish_date"),
"title" : dict_url_data.get("title"),
"description" : dict_url_data.get("description"),
"content" : dict_url_data.get("content"),
"valid_content" : dict_url_data.get("valid_content"),
"language" : dict_url_data.get("language"),
"keywords" : dict_url_data.get("keywords"),
"tags" : dict_url_data.get("tags"),
"authors" : dict_url_data.get("authors"),
"image_main_url" : dict_url_data.get("image_main_url"),
"images_url" : dict_url_data.get("images_url"),
"videos_url" : dict_url_data.get("videos_url"),
"url_host" : dict_url_data.get("url_host"),
"site_name" : dict_url_data.get("site_name"),
}
)
except Exception as e:
logger.debug("Error in update_or_create UrlContent: {}\ndict_url_data: {}\n{}\n{}".format(obj_url.url, dict_url_data, str(e), traceback.format_exc()))
@@ -179,13 +177,12 @@ class DB_Handler():
def process_raw_urls(self, batch_size):
def _get_status_pattern_matching(url, list_pattern_status_tuple):
""" Be careful: Regex pattern should update status on "valid", "invalid", and "unknown" status only
"""
""" Be careful: Regex pattern should update status on "valid", "invalid", and "unknown" status only """
# Sort pattern tuples by priority. (pattern, priority, status)
for regex_pattern, regex_priority, status_if_match in sorted(list_pattern_status_tuple, key=lambda tup: tup[1], reverse=True):
# Regular expression pattern matching: https://regexr.com/
if bool(re.match(regex_pattern, obj_url.url)):
logger.debug("Regex pattern found, status '{}' for URL: {}".format(status_if_match, url))
# logger.debug("Regex pattern found, status '{}' for URL: {}".format(status_if_match, url))
return status_if_match
return None
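A standalone sketch of the priority-ordered matching above, with a hypothetical pattern list (the real (pattern, priority, status) tuples come from the database):

import re

# (regex_pattern, priority, status_if_match); higher priority is checked first
list_pattern_status_tuple = [
    (r"^https?://[^/]*example\.com/tag/", 10, "invalid"),
    (r"^https?://[^/]*example\.com/", 5, "valid"),
]

def get_status_pattern_matching(url, list_pattern_status_tuple):
    for regex_pattern, regex_priority, status_if_match in sorted(list_pattern_status_tuple, key=lambda tup: tup[1], reverse=True):
        if re.match(regex_pattern, url):
            return status_if_match
    return None

print(get_status_pattern_matching("https://www.example.com/tag/politics", list_pattern_status_tuple))  # -> invalid
print(get_status_pattern_matching("https://www.example.com/article-1", list_pattern_status_tuple))     # -> valid
print(get_status_pattern_matching("https://other.org/", list_pattern_status_tuple))                    # -> None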

View File

@@ -49,11 +49,11 @@ class FetchSearcher():
"language": "en",
"country": "US",
# "period": ["7d", "1d"], # TODO: List of periods to iterate
}
}
for SearchInstance in ListSearchInstances:
# Sleep between requests, avoid too many requests...
time.sleep(int(os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", 5)))
time.sleep(float(os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", 5)))
SearchInstance(args).fetch_articles(db_writer, obj_search)
# TODO: https://github.com/tasos-py/Search-Engines-Scraper/tree/master
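The int() -> float() changes in this commit (here and in the fetchers below) allow fractional sleep intervals; int() would reject a value like "2.5":

import os

os.environ["FETCHER_BETWEEN_SEARCHES_SLEEP"] = "2.5"
float(os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", 5))  # -> 2.5
int(os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", 5))    # raises ValueError: invalid literal for int() with base 10: '2.5'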

View File

@@ -55,8 +55,8 @@ class FetcherAbstract(ABC):
keyword_search = "{}{}".format("site:", keyword_search)
# Keyword search & using a General search? -> ${SEARCH} news after:${LAST_WEEK}
if ("general" in source_name) and (obj_search.type == Search.TYPE_ENUM.KEYWORD_SEARCH):
start_date = timezone.now() - timedelta(days=7)
keyword_search = "{}{}".format(keyword_search, "news after:{}-{}-{}".format(start_date.month, start_date.day, start_date.year))
logger.debug("Appending news to general search")
keyword_search = "{}{}".format(keyword_search, "news")
logger.debug("Starting search: {} - {}".format(keyword_search, source_name))
# Fetch
@@ -194,7 +194,7 @@ class SearchGoogleGeneral(FetcherAbstract):
# Iterate pages
for i in range(self.pages):
# Sleep between pages fetch
time.sleep(int(os.getenv("FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP", 4)))
time.sleep(float(os.getenv("FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP", 4)))
# Number of URLs fetched so far
num_before = len(set_links)
# Get page
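The page loop above records len(set_links) before each fetch, presumably to stop paging once a page adds no new links; a minimal standalone sketch of that pattern (fetch_page is a hypothetical helper returning the links of one result page):

import os
import time

def collect_links(fetch_page, pages):
    set_links = set()
    for i in range(pages):
        # Sleep between page fetches, as in SearchGoogleGeneral above
        time.sleep(float(os.getenv("FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP", 4)))
        num_before = len(set_links)
        set_links.update(fetch_page(i))
        if len(set_links) == num_before:  # page added nothing new -> stop early
            break
    return set_links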

View File

@@ -6,7 +6,7 @@ logger = get_logger()
from googlenewsdecoder import gnewsdecoder
def decode_gnews_urls(encoded_urls, interval=int(os.getenv("FETCHER_GNEWS_DECODE_SLEEP", 2))):
def decode_gnews_urls(encoded_urls, interval=float(os.getenv("FETCHER_GNEWS_DECODE_SLEEP", 2))):
logger.debug("Decoding gnews URLs")
# DecodeURLs
list_decoded_urls = []
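A sketch of the decode loop this function presumably runs, assuming gnewsdecoder() returns a dict with "status" and "decoded_url" keys (the return shape is an assumption based on the package's documentation, not this diff):

import os
from googlenewsdecoder import gnewsdecoder

def decode_gnews_urls_sketch(encoded_urls, interval=float(os.getenv("FETCHER_GNEWS_DECODE_SLEEP", 2))):
    list_decoded_urls = []
    for url in encoded_urls:
        result = gnewsdecoder(url, interval=interval)  # interval throttles consecutive decode requests
        if result.get("status"):
            list_decoded_urls.append(result.get("decoded_url"))
    return list_decoded_urls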

View File

@@ -41,7 +41,7 @@ def url_host_slowdown(url, url_host_slowdown_seconds):
def process_url(url):
try:
# Slow down if required to avoid too many requests error
url_host_slowdown(url, url_host_slowdown_seconds=int(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
url_host_slowdown(url, url_host_slowdown_seconds=float(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
# Process
article = newspaper.article(url)
except newspaper.ArticleBinaryDataException:
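url_host_slowdown itself is not part of this diff; a hypothetical sketch of such a per-host throttle, keyed on the URL's host:

import time
from urllib.parse import urlparse

_last_request_per_host = {}

def url_host_slowdown(url, url_host_slowdown_seconds):
    # Hypothetical implementation: wait until at least url_host_slowdown_seconds
    # have elapsed since the previous request to the same host.
    host = urlparse(url).netloc
    last = _last_request_per_host.get(host)
    if last is not None:
        remaining = url_host_slowdown_seconds - (time.monotonic() - last)
        if remaining > 0:
            time.sleep(remaining)
    _last_request_per_host[host] = time.monotonic()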

View File

@@ -4,6 +4,7 @@ from . import views
urlpatterns = [
path('', views.link_list, name='link_list'),
#
path('logs/database', views.log_db, name='log_db'),
path('logs/<str:log_type>', views.logs, name='logs'),
#
path('task/<str:task>', views.trigger_task, name='trigger_task'),
@@ -17,4 +18,5 @@ urlpatterns = [
path('urls/', views.filtered_urls, name='filtered_urls'),
path('urls/<int:id>/', views.url_detail_view, name='url_detail'),
path('urls/<int:id>/fetch/', views.fetch_details, name='fetch_details'),
path('urls/content_generation', views.content_generation, name='content_generation'),
]
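Note that the new 'logs/database' route sits above 'logs/<str:log_type>': Django tries patterns in order, so the literal path must be declared first or "database" would be captured as a log_type and routed to views.logs. A quick check (hypothetical, assuming this urls.py is included at the project root without a prefix):

from django.urls import resolve

assert resolve('/logs/database').url_name == 'log_db'  # literal route wins, listed first
assert resolve('/logs/debug').url_name == 'logs'       # everything else falls through to the generic view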

View File

@@ -1,44 +1,16 @@
from .tasks import background_task
from .views_base import link_list, logs, log_db, trigger_task
from django.core.paginator import Paginator
from django.shortcuts import render, get_object_or_404
from django.http import StreamingHttpResponse, JsonResponse, HttpResponse
from django.contrib.auth.decorators import login_required
import ollama
from django.http import StreamingHttpResponse, JsonResponse
from django.db.models import Q, Count
from django.utils import timezone
from django.utils.timezone import now, timedelta
from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch, UrlsDuplicate
import ollama
import os
from .src.logger import get_logger
logger = get_logger()
#from datetime import timedelta
####################################################################################################
def trigger_task(request, task):
# Enqueue function in "default" queue
background_task.delay(task)
return JsonResponse({"message": "Task has been enqueued!", "task": task})
####################################################################################################
def link_list(request):
# Base URL path
app_url = request.build_absolute_uri()
# Tasks
links_fetch = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all"]
links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"]
# List of links
list_links = \
[ os.path.join(app_url, "admin"), os.path.join(app_url, "urls") ] + \
[ os.path.join(app_url, "logs", log_type) for log_type in ["debug", "info", "warning"] ] + \
[ os.path.join(app_url, "task", l) for l in links_fetch + links_process ]
# Json
return JsonResponse({"links": list_links })
####################################################################################################
def logs(request, log_type):
# Capture output: python manage.py rqstats
try:
with open( os.path.join( os.getenv("PATH_LOGS_DIRECTORY", "logs"), "{}.log".format(log_type) ), "r") as f:
file_content = f.read()
except Exception as e:
file_content = "Error reading logs for log type :{}".format(log_type)
return HttpResponse(file_content, content_type="text/plain")
####################################################################################################
class OllamaClient():
@@ -57,13 +29,6 @@ class OllamaClient():
def get_prompt(self):
return "Rewrite the text below into a clear and concise summary, presenting the key points as if they are newly written insights. Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states'. Instead, write in a natural, standalone format that feels like an original explanation. Keep it brief, engaging, informative, in the style of a news article, and no longer than a paragraph:"
#return "Provide a summary of the content below, avoid mentioning the source of information, and only answer with the summary. The summary needs to be brief and compact, consisting of one paragraph."
#return "Explain in a single and compact paragraph the what, why, when, where, who, and how of the content below. Also provide a single paragraph summary of the content:"
#return "Provide in one paragraph the what, why, when, where, who, and how of the content below. Also provide a one paragraph summary of the content:"
#return "Provide two summaries of the content below, and avoid mentioning the source of information. First, provide a very brief and compact paragraph summary. Second, provide a larger and more detailed summary, which describe the what, why, when, where, who, and how of the content:"
# return "Imagine you are a journalist, TLDR in a paragraph. Only answer with the summary:"
#return "Below you will find the whole content of a news article:\n{}\nProvide a concise summary of one paragraph maximum of the content.".format(content)
def fetch_details(request, id):
url_item = get_object_or_404(Urls, id=id)
@@ -83,7 +48,6 @@ def fetch_details(request, id):
return StreamingHttpResponse(stream_response(), content_type="text/plain")
def url_detail_view(request, id):
url_item = get_object_or_404(Urls, id=id)
url_sources = list(Source.objects.filter(urlssourcesearch__id_url=url_item).distinct())
@@ -114,13 +78,6 @@ def url_detail_view(request, id):
return render(request, 'url_detail.html', context)
####################################################################################################
from django.shortcuts import render
from django.http import JsonResponse
from django.db.models import Count
from datetime import timedelta
from django.utils import timezone
from .models import Urls, UrlsSourceSearch
def charts(request):
return render(request, 'charts.html')
@@ -202,14 +159,7 @@ def urls_per_search(request):
return JsonResponse(data)
####################################################################################################
from django.shortcuts import render
from .models import Urls, Search, Source
from django.db.models import Q
from django.utils.timezone import now, timedelta
def filtered_urls(request):
statuses = Urls.STATUS_ENUM.choices
@@ -342,4 +292,16 @@ def filtered_urls(request):
}
return render(request, 'filtered_urls.html', context)
####################################################################################################
def content_generation(request):
# https://fetcher.matitos.org/urls/?per_page=100&days=1&valid_content=True&min_sources=1&search=13&status=all&language=all&source=all
'''
# Get list of URLs ID
selected_urls = request.GET.getlist('urls', [])
# Sample URLs
selected_urls = [13460, 13455, 13454, 13452, 13210]
'''
####################################################################################################
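content_generation is still an empty stub; a hypothetical sketch of how it might collect the selected URLs' content (the urls query parameter and the sample IDs come from the commented-out lines above; the helper name, filters, and response shape are assumptions):

from django.http import JsonResponse
from .models import UrlContent

def content_generation_sketch(request):
    # Hypothetical: read selected URL IDs from the query string,
    # e.g. /urls/content_generation?urls=13460&urls=13455
    selected_ids = request.GET.getlist('urls')
    contents = UrlContent.objects.filter(id_url__in=selected_ids, valid_content=True)
    return JsonResponse({
        "urls": [{"id": c.id_url_id, "title": c.title, "content": c.content} for c in contents],
    })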

View File

@@ -0,0 +1,74 @@
import os
import psycopg
from .tasks import background_task
from django.http import JsonResponse, HttpResponse
####################################################################################################
def trigger_task(request, task):
# Enqueue function in "default" queue
background_task.delay(task)
return JsonResponse({"message": "Task has been enqueued!", "task": task})
####################################################################################################
def link_list(request):
# Base URL path
app_url = request.build_absolute_uri()
# Tasks
links_fetch = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all"]
links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"]
# List of links
list_links = \
[ os.path.join(app_url, "admin"), os.path.join(app_url, "urls") ] + \
[ os.path.join(app_url, "logs", log_type) for log_type in ["database", "debug", "info", "warning"] ] + \
[ os.path.join(app_url, "task", l) for l in links_fetch + links_process ]
# Json
return JsonResponse({"links": list_links })
####################################################################################################
def logs(request, log_type):
# Capture output: python manage.py rqstats
try:
with open( os.path.join( os.getenv("PATH_LOGS_DIRECTORY", "logs"), "{}.log".format(log_type) ), "r") as f:
file_content = f.read()
except Exception as e:
file_content = "Error reading logs for log type :{}".format(log_type)
return HttpResponse(file_content, content_type="text/plain")
####################################################################################################
def log_db(request):
# TODO: Django connection
connection_info = "host={} port={} dbname={} user={} password={} connect_timeout=60".format(
os.environ.get("DB_HOST", "localhost"),
os.environ.get("DB_PORT", "5432"),
os.environ.get("DB_NAME", "matitos"),
os.environ.get("DB_USER", "supermatitos"),
os.environ.get("DB_PASSWORD", "supermatitos")
)
# Connect to an existing database
with psycopg.connect(connection_info) as conn:
# Open a cursor to perform database operations
with conn.cursor() as cur:
# Create URLs table
r = cur.execute("""
SELECT
relname AS "relation",
pg_size_pretty(pg_total_relation_size(C.oid)) AS "total_size"
FROM
pg_class C
LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace)
WHERE
nspname NOT IN ('pg_catalog', 'information_schema')
AND C.relkind <> 'i'
AND nspname !~ '^pg_toast'
ORDER BY
pg_total_relation_size(C.oid) DESC
LIMIT 100;
""").fetchall()
return HttpResponse( "\n".join([str(e) for e in r]) )
####################################################################################################
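Regarding the "TODO: Django connection" note above: the same relation-size report could reuse Django's configured database connection instead of assembling a psycopg DSN from environment variables; a sketch (log_db_django is a hypothetical name, the query is the one used above):

from django.db import connection
from django.http import HttpResponse

def log_db_django(request):
    # Reuse the connection Django already opened with the project's DB settings.
    with connection.cursor() as cur:
        cur.execute("""
            SELECT relname AS "relation",
                   pg_size_pretty(pg_total_relation_size(C.oid)) AS "total_size"
            FROM pg_class C
            LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace)
            WHERE nspname NOT IN ('pg_catalog', 'information_schema')
              AND C.relkind <> 'i'
              AND nspname !~ '^pg_toast'
            ORDER BY pg_total_relation_size(C.oid) DESC
            LIMIT 100;
        """)
        rows = cur.fetchall()
    return HttpResponse("\n".join(str(row) for row in rows), content_type="text/plain")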