Min num of sources filter, initialization scripts, docker ready to use dev mode

2025-04-04 16:56:27 +02:00
parent 76079d7bd0
commit 9127552bfd
10 changed files with 132 additions and 83 deletions
--- a/README.md
+++ b/README.md
@@ -1,44 +1,46 @@
 # Matitos

- Scheduled tasks
-    - Fetcher -> Inserts raw URLs
-        - Fetch parsing URL host
-        - Fetch from RSS feed
-        - Fetch keyword search (Google search & news, DuckDuckGo, ...)
-            ++ Sources -> Robustness to TooManyRequests block
-                - Selenium based
-                    - Sites change their logic, request captcha, ...
-                - Brave Search API
-                    - Free up to X requests per day. Need credit card association (no charges)
-                - Bing API
-                    - Subscription required
-                - Yandex. No API?
-            ++ Proxy / VPN?
-                TooManyRequests, ...
-            ++ Search per locale (nl-NL, fr-FR, en-GB)
-    - Process URLs -> Updates raw URLs
-        - Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date
-        - Determines if it is a valid article content
+- URLs Fetcher -> Inserts raw URLs
+    - Fetch parsing URL host
+    - Fetch from RSS feed
+    - Fetch keyword search (Google search & news, DuckDuckGo, ...)
+        ++ Sources -> Robustness to TooManyRequests block
+            - Selenium based
+                - Sites change their logic, request captcha, ...
+            - Brave Search API
+                - Free up to X requests per day. Need credit card association (no charges)
+            - Bing API
+                - Subscription required
+            - Yandex. No API?
        ++ Proxy / VPN?
-            Bypass geoblock
-    - Valid URLs
-        - Generate summary
-            - One paragraph
-            - At most three paragraphs
-        - Classification
-            - 5W: Who, What, When, Where, Why of a Story
-            - Related to child abuse?
-            - ...
+            TooManyRequests, ...
+        ++ Search per locale (nl-NL, fr-FR, en-GB)
+
+- URLs Processing -> Updates raw URLs
+    - Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date
+    - Determines if it is a valid article content
+    ++ Proxy / VPN?
+        Bypass geoblock

 - Visualization of URLs
    - Filter URLs
-        - By status, search, source, language
+        - By status, search, source, language, ...
    - Charts

+- Valid URLs
+    - Generate summary
+        - One paragraph
+        - At most three paragraphs
+    - Classification
+        - 5W: Who, What, When, Where, Why of a Story
+        - Related to child abuse?
+        - ...
+
 - Content generation
-    - Select URLs:
+    - URLs Selection
        - Valid content
-        - language=en
-        - published_date during last_week
-        - Use classifications
+        - Language of interest
+        - Published (or fetch) date during last_week
+        - Fetched by at least N sources
+        - Use classifications and summaries
    - Merge summaries, ...
--- a/app_urls/Dockerfile
+++ b/app_urls/Dockerfile
@@ -19,31 +19,10 @@ RUN pip install --no-cache-dir -r requirements.txt

 COPY --chown=appuser:appuser . /opt/app/

-RUN chmod -R 755 /opt/app
-RUN chown -R appuser:appuser /opt/app
+RUN chmod -R 755 /opt
+RUN chown -R appuser:appuser /opt
+
 USER appuser

-# Initialization script
-RUN echo '#!/bin/bash' > /opt/app/initialize.sh && \
-   echo 'if [ "${INITIALIZE_DB}" = false ]; then' >> /opt/app/initialize.sh && \
-   echo 'echo "Initialization not required"' >> /opt/app/initialize.sh && \
-   echo 'else' >> /opt/app/initialize.sh && \
-   echo 'echo "Initializating database"' >> /opt/app/initialize.sh && \
-   echo 'python db.py --initialize_tables --initialize_data' >> /opt/app/initialize.sh && \
-   echo 'python manage.py makemigrations fetcher; python manage.py migrate --fake-initial' >> /opt/app/initialize.sh && \
-   echo 'python manage.py createsuperuser --noinput' >> /opt/app/initialize.sh && \
-   echo 'python manage.py collectstatic --no-input' >> /opt/app/initialize.sh && \
-   echo 'python manage.py import --filename scheduled_tasks.json' >> /opt/app/initialize.sh && \
-   echo 'fi' >> /opt/app/initialize.sh && \
-   chmod +x /opt/app/initialize.sh
-
-# Serving script
-RUN echo '#!/bin/bash' > /opt/app/run.sh && \
-   # Prod mode:
-   echo 'gunicorn core.wsgi:application --bind 0.0.0.0:8000 & python manage.py rqworker high default low' >> /opt/app/run.sh && \
-   # Dev mode:
-   #echo 'gunicorn core.wsgi:application --reload --bind 0.0.0.0:8000 & python manage.py rqworker high default low' >> /opt/app/run.sh && \
-   chmod +x /opt/app/run.sh
-
 # Run Django’s server & workers
-CMD ["sh", "-c", "/opt/app/initialize.sh && /opt/app/run.sh"]
+CMD ["sh", "-c", "/opt/app/script_initialize.sh && /opt/app/script_run.sh"]
--- a/app_urls/db.py
+++ b/app_urls/db.py
@@ -134,10 +134,18 @@ def initialize_data():
            with conn.transaction() as tx:
                # Feeds
                cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC', 'rss_feed');" )
+                cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://feeds.feedburner.com/breitbart', 'rss_feed');" )
+                cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('http://feeds.feedburner.com/zerohedge/feed', 'rss_feed');" )
+                cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://moxie.foxnews.com/google-publisher/latest.xml', 'rss_feed');" )
+                cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=15837362', 'rss_feed');" )
+                cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100727362', 'rss_feed');" )
                # Websites of interest
                cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/poster', 'url_host');" )
                cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/new-poster', 'url_host');" )
                cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('breitbart.com', 'url_host');" )
+                cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('zerohedge.com', 'url_host');" )
+                cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('foxnews.com', 'url_host');" )
+                cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('cnbc.com', 'url_host');" )
                # Search keywords
                cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search');" )
                # TODO: Language per search
@@ -146,12 +154,34 @@ def initialize_data():
                
                # Status update based on pattern matching (with priority to apply in order). Regex test https://regex101.com/
                cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("youtube.com/"))) )
+                cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("yewtu.be/"))) )
                cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("tiktok.com/"))) )
                cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("twitter.com/"))) )
                cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("reddit.com/"))) )
                cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("libreddit.de/"))) )
                cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("radio.foxnews.com/"))) )

+                """ # TODO: To review with new scheme
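+                # Status update based on pattern matching (with priority to apply in order). NOTE: these disabled statements call `cursor.execute`, while the active statements above call `cur.execute`; rename before re-enabling.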
+                cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*cnbc.com/(video|quotes)/.*', 100, 'invalid');" )
+                cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*foxnews.com/(video|category)/.*', 100, 'invalid');" )
+                cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*breitbart.com/(tag|author)/.*', 100, 'invalid');" )
+
+                cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*zerohedge.com/(economics|political|markets)/.*', 50, 'valid');" )
+                cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*breitbart.com/(economy|entertainment|border|crime|clips)/.*', 50, 'valid');" )
+                cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*foxnews.com/(lifestyle|opinion|sports|world)/.*', 50, 'valid');" )
+                cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*cnbc.com/[0-9]{4}/[0-9]{2}/[0-9]{2}/.*', 50, 'valid');" )
+
+                cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*bbc.com/news/.*', 50, 'valid');" )
+                cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*msn.com/[A-z]{2}-[A-z]{2}/news/.*', 50, 'valid');" )
+                cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*newschannel9.com/news/.*', 50, 'valid');" )
+                
+                cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*radaronline.com/p.*', 25, 'valid');" )
+                cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*okmagazine.com/p.*', 25, 'valid');" )
+                cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*9news.com.au/national.*', 25, 'valid');" )
+                """
+
+
 def main(name):
    print('Hello, %s!' % name)

--- a/app_urls/fetcher/models.py
+++ b/app_urls/fetcher/models.py
@@ -99,6 +99,7 @@ class UrlsDuplicate(models.Model):

 class UrlsSourceSearch(models.Model):
    id_url = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url', primary_key=True)  # The composite primary key (id_url, id_source, id_search) found, that is not supported. The first column is selected.
+    #id_url = models.ForeignKey(Urls, models.DO_NOTHING, db_column='id_url')
    id_source = models.ForeignKey(Source, models.DO_NOTHING, db_column='id_source')
    id_search = models.ForeignKey(Search, models.DO_NOTHING, db_column='id_search')

--- a/app_urls/fetcher/templates/filtered_urls.html
+++ b/app_urls/fetcher/templates/filtered_urls.html
@@ -331,6 +331,12 @@ input[type="checkbox"] {
                    </label><br>
                {% endfor %}

+                <!-- Minimum Sources Count Box -->
+                <h3>Min #Sources</h3>
+                <div>
+                    <input type="number" id="minSourceCount" name="min_sources" value="{{ selected_min_sources }}" min="1" style="width: 60px; text-align: center;">
+                </div>
+
                <!-- Filter by language -->
                <h3>Language</h3>
                <button type="button" class="toggle-all-btn" data-toggle="language">Toggle All</button><br>
@@ -538,6 +544,10 @@ input[type="checkbox"] {
                }
            });

+            // Min number of sources
+            //const minSearchCount = document.getElementById('minSourceCount').value;
+            //params.set('min_search_count', minSearchCount);
+
            // Submit the form after updating all sections
            document.getElementById("filterForm").submit();
        }
@@ -566,6 +576,9 @@ input[type="checkbox"] {
                updateFormParameters();
            });
        });
+        document.getElementById('minSourceCount').addEventListener('change', function() {
+            updateFormParameters();
+        });
        document.getElementById('perPageSelect').addEventListener('change', function() {
            updateFormParameters();
        });
--- a/app_urls/fetcher/templates/url_detail.html
+++ b/app_urls/fetcher/templates/url_detail.html
@@ -198,7 +198,7 @@
            </tr>
            <tr>
                <th>Status</th>
-                <td>{{ url_item.status }}</td>
+                <td>{{ url_item.status }} {% if url_canonical != None %}<a href="/urls/{{ url_canonical.id }}" target="_blank">[{{ url_canonical.id }}]</a>{% endif %} </td>
            </tr>
            <tr>
                <th>URL host</th>
--- a/app_urls/fetcher/views.py
+++ b/app_urls/fetcher/views.py
@@ -6,6 +6,8 @@ from django.contrib.auth.decorators import login_required
 import ollama
 from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch, UrlsDuplicate
 import os
+from .src.logger import get_logger
+logger = get_logger()

 ####################################################################################################
 def trigger_task(request, task):
@@ -94,13 +96,11 @@ def url_detail_view(request, id):
    url_item = get_object_or_404(Urls, id=id)
    url_sources = list(Source.objects.filter(urlssourcesearch__id_url=url_item).distinct())
    url_searches = list(Search.objects.filter(urlssourcesearch__id_url=url_item).distinct())
-    # url_source_search = UrlsSourceSearch.objects.filter(id_url=url_item)

-    url_duplicate = UrlsDuplicate.objects.get(id_url_duplicated=url_item)
-    #id_url_canonical = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url_canonical', primary_key=True)  # The composite primary key (id_url_canonical, id_url_duplicated) found, that is not supported. The first column is selected.
-    #id_url_duplicated = models.ForeignKey(Urls, models.DO_NOTHING, db_column='id_url_duplicated', related_name='urlsduplicate_id_url_duplicated_set')
-    
-    url_duplicate.id_url_duplicated
+    if (url_item.status == Urls.STATUS_ENUM.DUPLICATE):
+        url_canonical = UrlsDuplicate.objects.get(id_url_duplicated=url_item).id_url_canonical
+    else:
+        url_canonical = None

    try:
        url_content = UrlContent.objects.get(pk=id)
@@ -117,6 +117,7 @@ def url_detail_view(request, id):
        'models': ollama.get_models(),
        'prompt': ollama.get_prompt(),
        'url_content': url_content,
+        'url_canonical': url_canonical,
    }
    return render(request, 'url_detail.html', context)

@@ -232,6 +233,7 @@ def filtered_urls(request):
    selected_source = request.GET.getlist('source', ["null"])
    selected_language = request.GET.getlist('language', ["null"])
    selected_valid_contents = request.GET.getlist('valid_content', ["null"])
+    selected_min_sources = int(request.GET.get('min_sources', 1))
    selected_days = request.GET.get("days", 30)
    per_page = request.GET.get('per_page', 100)  # Default is X URLs per page
    page_number = request.GET.get('page')  # Get the current page number
@@ -298,6 +300,9 @@ def filtered_urls(request):
            # Update query
            query &= (subquery)

+        if (selected_min_sources > 1):
+            query &= Q(pk__in=UrlsSourceSearch.objects.values('id_url').annotate(search_count=Count('id_source', distinct=True)).filter(search_count__gte=selected_min_sources).values('id_url'))
+
        # Run query
        urls = Urls.objects.filter(query).distinct() # .order_by('-ts_fetch')

@@ -333,6 +338,7 @@ def filtered_urls(request):
        'selected_source': selected_source,
        'selected_language': selected_language,
        'selected_valid_contents': selected_valid_contents,
+        "selected_min_sources": selected_min_sources,
        "selected_days": selected_days,
        # Map
        "sources_map": sources_map,
--- a/app_urls/script_initialize.sh
+++ b/app_urls/script_initialize.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+# One-time setup; skipped only when INITIALIZE_DB is exactly "false" (unset or any other value runs it).
+if [ "${INITIALIZE_DB}" = false ]; then
+    echo "Initialization not required"
+else
+    echo "Initializating database"
+    python db.py --initialize_tables --initialize_data  # no 'set -e' in this script: later steps still run if one fails
+    python manage.py makemigrations fetcher; python manage.py migrate --fake-initial
+    python manage.py createsuperuser --noinput  # presumably reads DJANGO_SUPERUSER_* from the environment - confirm
+    python manage.py collectstatic --no-input
+    python manage.py import --filename scheduled_tasks.json
+fi
--- a/app_urls/script_run.sh
+++ b/app_urls/script_run.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+# Start gunicorn in the background plus the RQ workers; DJANGO_DEBUG=true/True adds --reload and debug logging.
+if [ "${DJANGO_DEBUG}" = true ] || [ "${DJANGO_DEBUG}" = "True" ]; then  # '||' (logical OR): a '|' pipeline would only report the last test
+    gunicorn core.wsgi:application --reload --log-level debug --bind 0.0.0.0:8000 --timeout 300 & python manage.py rqworker high default low
+else
+    gunicorn core.wsgi:application --bind 0.0.0.0:8000 --timeout 300 & python manage.py rqworker high default low
+fi
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -7,7 +7,7 @@ services:
    build:
      context: ./app_selenium
    container_name: fetcher_app_selenium
-    # restart: unless-stopped
+    restart: unless-stopped
    shm_size: 512mb
    environment:
      - SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE:-4}
@@ -28,17 +28,18 @@ services:
    build:
      context: ./app_urls
    container_name: fetcher_app_urls
-    # restart: unless-stopped
+    restart: unless-stopped
    environment:
      # Initialization
-      - INITIALIZE_DB=${INITIALIZE_DB:-true}
+      - INITIALIZE_DB=${INITIALIZE_DB:-false}  # Related to DB persistence
      - DJANGO_SUPERUSER_USERNAME=${DJANGO_SUPERUSER_USERNAME:-matitos}
      - DJANGO_SUPERUSER_PASSWORD=${DJANGO_SUPERUSER_PASSWORD:-matitos}
      - DJANGO_SUPERUSER_EMAIL=${DJANGO_SUPERUSER_EMAIL:-matitos@matitos.org}
      # Django
+      - DJANGO_ALLOWED_HOSTS=${DJANGO_ALLOWED_HOSTS:-*} # host1,host2
      - DJANGO_SECRET_KEY=${DJANGO_SECRET_KEY:-abc123456789qwerty}
      - DJANGO_DEBUG=${DJANGO_DEBUG:-False}
-      - DJANGO_ALLOWED_HOSTS=${DJANGO_ALLOWED_HOSTS:-*} # host1,host2
+      - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY:-/opt/logs}
      # Database
      - DB_NAME=${DB_NAME:-matitos}
      - DB_USER=${DB_USER:-supermatitos}
@@ -49,8 +50,6 @@ services:
      - REDIS_PORT=${REDIS_PORT:-6379}
      # Job timeout: 30 min
      - JOB_DEFAULT_TIMEOUT=${RQ_DEFAULT_TIMEOUT:-1800}
-      # Logs path
-      - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY:-logs}
      # Fetcher
      - FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP-2}
      - FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP:-5}
@@ -60,8 +59,8 @@ services:
      - SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT:-http://fetcher_app_selenium:80}
      - ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA:-https://ollamamodel.matitos.org}
    ########################
-    #volumes:   # Dev mode
-    #  - ./app_urls:/opt/app
+    volumes:   # Dev mode
+      - ./app_urls:/opt/app
    ########################
    ports:
      - 8000:8000
@@ -100,12 +99,12 @@ services:
    ports:
      - 6379 #:6379

-  fetcher_dozzle:
-    container_name: fetcher_dozzle
-    image: amir20/dozzle:latest
-    volumes:
-      - /var/run/docker.sock:/var/run/docker.sock:ro
-    ports:
-      - 8888:8080
-    environment:
-      - DOZZLE_FILTER="name=fetcher_"
+  #fetcher_dozzle:
+  #  container_name: fetcher_dozzle
+  #  image: amir20/dozzle:latest
+  #  volumes:
+  #    - /var/run/docker.sock:/var/run/docker.sock:ro
+  #  ports:
+  #    - 8888:8080
+  #  environment:
+  #    - DOZZLE_FILTER="name=fetcher_"