diff --git a/README.md b/README.md index 54ff357..a3a9df0 100644 --- a/README.md +++ b/README.md @@ -1,44 +1,46 @@ # Matitos -- Scheduled tasks - - Fetcher -> Inserts raw URLs - - Fetch parsing URL host - - Fetch from RSS feed - - Fetch keyword search (Google search & news, DuckDuckGo, ...) - ++ Sources -> Robustness to TooManyRequests block - - Selenium based - - Sites change their logic, request captcha, ... - - Brave Search API - - Free up to X requests per day. Need credit card association (no charges) - - Bing API - - Subscription required - - Yandex. No API? - ++ Proxy / VPN? - TooManyRequests, ... - ++ Search per locale (nl-NL, fr-FR, en-GB) - - Process URLs -> Updates raw URLs - - Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date - - Determines if it is a valid article content +- URLs Fetcher -> Inserts raw URLs + - Fetch parsing URL host + - Fetch from RSS feed + - Fetch keyword search (Google search & news, DuckDuckGo, ...) + ++ Sources -> Robustness to TooManyRequests block + - Selenium based + - Sites change their logic, request captcha, ... + - Brave Search API + - Free up to X requests per day. Need credit card association (no charges) + - Bing API + - Subscription required + - Yandex. No API? ++ Proxy / VPN? - Bypass geoblock - - Valid URLs - - Generate summary - - One paragraph - - At most three paragraphs - - Classification - - 5W: Who, What, When, Where, Why of a Story - - Related to child abuse? - - ... + TooManyRequests, ... + ++ Search per locale (nl-NL, fr-FR, en-GB) + +- URLs Processing -> Updates raw URLs + - Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date + - Determines if it is a valid article content + ++ Proxy / VPN? + Bypass geoblock - Visualization of URLs - Filter URLs - - By status, search, source, language + - By status, search, source, language, ... - Charts +- Valid URLs + - Generate summary + - One paragraph + - At most three paragraphs + - Classification + - 5W: Who, What, When, Where, Why of a Story + - Related to child abuse? + - ... + - Content generation - - Select URLs: + - URLs Selection - Valid content - - language=en - - published_date during last_week - - Use classifications + - Language of interest + - Published (or fetch) date during last_week + - Fetched by at least N sources + - Use classifications and summaries - Merge summaries, ... \ No newline at end of file diff --git a/app_urls/Dockerfile b/app_urls/Dockerfile index b1dd674..03d18bf 100644 --- a/app_urls/Dockerfile +++ b/app_urls/Dockerfile @@ -19,31 +19,10 @@ RUN pip install --no-cache-dir -r requirements.txt COPY --chown=appuser:appuser . 
/opt/app/ -RUN chmod -R 755 /opt/app -RUN chown -R appuser:appuser /opt/app +RUN chmod -R 755 /opt +RUN chown -R appuser:appuser /opt + USER appuser -# Initialization script -RUN echo '#!/bin/bash' > /opt/app/initialize.sh && \ - echo 'if [ "${INITIALIZE_DB}" = false ]; then' >> /opt/app/initialize.sh && \ - echo 'echo "Initialization not required"' >> /opt/app/initialize.sh && \ - echo 'else' >> /opt/app/initialize.sh && \ - echo 'echo "Initializating database"' >> /opt/app/initialize.sh && \ - echo 'python db.py --initialize_tables --initialize_data' >> /opt/app/initialize.sh && \ - echo 'python manage.py makemigrations fetcher; python manage.py migrate --fake-initial' >> /opt/app/initialize.sh && \ - echo 'python manage.py createsuperuser --noinput' >> /opt/app/initialize.sh && \ - echo 'python manage.py collectstatic --no-input' >> /opt/app/initialize.sh && \ - echo 'python manage.py import --filename scheduled_tasks.json' >> /opt/app/initialize.sh && \ - echo 'fi' >> /opt/app/initialize.sh && \ - chmod +x /opt/app/initialize.sh - -# Serving script -RUN echo '#!/bin/bash' > /opt/app/run.sh && \ - # Prod mode: - echo 'gunicorn core.wsgi:application --bind 0.0.0.0:8000 & python manage.py rqworker high default low' >> /opt/app/run.sh && \ - # Dev mode: - #echo 'gunicorn core.wsgi:application --reload --bind 0.0.0.0:8000 & python manage.py rqworker high default low' >> /opt/app/run.sh && \ - chmod +x /opt/app/run.sh - # Run Django’s server & workers -CMD ["sh", "-c", "/opt/app/initialize.sh && /opt/app/run.sh"] +CMD ["sh", "-c", "/opt/app/script_initialize.sh && /opt/app/script_run.sh"] diff --git a/app_urls/db.py b/app_urls/db.py index 7f1bedf..4e8fc80 100644 --- a/app_urls/db.py +++ b/app_urls/db.py @@ -134,10 +134,18 @@ def initialize_data(): with conn.transaction() as tx: # Feeds cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC', 'rss_feed');" ) + cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://feeds.feedburner.com/breitbart', 'rss_feed');" ) + cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('http://feeds.feedburner.com/zerohedge/feed', 'rss_feed');" ) + cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://moxie.foxnews.com/google-publisher/latest.xml', 'rss_feed');" ) + cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=15837362', 'rss_feed');" ) + cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100727362', 'rss_feed');" ) # Websites of interest cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/poster', 'url_host');" ) cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/new-poster', 'url_host');" ) cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('breitbart.com', 'url_host');" ) + cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('zerohedge.com', 'url_host');" ) + cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('foxnews.com', 'url_host');" ) + cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('cnbc.com', 'url_host');" ) # Search keywords cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search');" ) # TODO: Language per search @@ -146,12 +154,34 @@ def initialize_data(): # Status update based on pattern matching (with priority to apply in order). 
Regex test https://regex101.com/ cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("youtube.com/"))) ) + cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("yewtu.be/"))) ) cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("tiktok.com/"))) ) cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("twitter.com/"))) ) cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("reddit.com/"))) ) cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("libreddit.de/"))) ) cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("radio.foxnews.com/"))) ) + """ # TODO: To review with new scheme + # Status update based on pattern matching (with priority to apply in order) + cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*cnbc.com/(video|quotes)/.*', 100, 'invalid');" ) + cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*foxnews.com/(video|category)/.*', 100, 'invalid');" ) + cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*breitbart.com/(tag|author)/.*', 100, 'invalid');" ) + + cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*zerohedge.com/(economics|political|markets)/.*', 50, 'valid');" ) + cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*breitbart.com/(economy|entertainment|border|crime|clips)/.*', 50, 'valid');" ) + cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*foxnews.com/(lifestyle|opinion|sports|world)/.*', 50, 'valid');" ) + cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*cnbc.com/[0-9]{4}/[0-9]{2}/[0-9]{2}/.*', 50, 'valid');" ) + + cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*bbc.com/news/.*', 50, 'valid');" ) + cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*msn.com/[A-z]{2}-[A-z]{2}/news/.*', 50, 'valid');" ) + cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*newschannel9.com/news/.*', 50, 'valid');" ) + + cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*radaronline.com/p.*', 25, 'valid');" ) + cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*okmagazine.com/p.*', 25, 'valid');" ) + cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*9news.com.au/national.*', 25, 'valid');" ) + """ + + def main(name): print('Hello, %s!' 
% name) diff --git a/app_urls/fetcher/models.py b/app_urls/fetcher/models.py index 72c2811..f33dd46 100644 --- a/app_urls/fetcher/models.py +++ b/app_urls/fetcher/models.py @@ -99,6 +99,7 @@ class UrlsDuplicate(models.Model): class UrlsSourceSearch(models.Model): id_url = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url', primary_key=True) # The composite primary key (id_url, id_source, id_search) found, that is not supported. The first column is selected. + #id_url = models.ForeignKey(Urls, models.DO_NOTHING, db_column='id_url') id_source = models.ForeignKey(Source, models.DO_NOTHING, db_column='id_source') id_search = models.ForeignKey(Search, models.DO_NOTHING, db_column='id_search') diff --git a/app_urls/fetcher/templates/filtered_urls.html b/app_urls/fetcher/templates/filtered_urls.html index 81efd19..040a32b 100644 --- a/app_urls/fetcher/templates/filtered_urls.html +++ b/app_urls/fetcher/templates/filtered_urls.html @@ -331,6 +331,12 @@ input[type="checkbox"] {
 {% endfor %}
+
+    <div>
+        <label for="minSourceCount">Min #Sources</label>
+        <input type="number" id="minSourceCount" name="min_sources" min="1" value="{{ selected_min_sources }}">
+    </div>
+
     Language
@@ -538,6 +544,10 @@ input[type="checkbox"] { } }); + // Min number of sources + //const minSearchCount = document.getElementById('minSourceCount').value; + //params.set('min_search_count', minSearchCount); + // Submit the form after updating all sections document.getElementById("filterForm").submit(); } @@ -566,6 +576,9 @@ input[type="checkbox"] { updateFormParameters(); }); }); + document.getElementById('minSourceCount').addEventListener('change', function() { + updateFormParameters(); + }); document.getElementById('perPageSelect').addEventListener('change', function() { updateFormParameters(); }); diff --git a/app_urls/fetcher/templates/url_detail.html b/app_urls/fetcher/templates/url_detail.html index 4a4377f..f774cd7 100644 --- a/app_urls/fetcher/templates/url_detail.html +++ b/app_urls/fetcher/templates/url_detail.html @@ -198,7 +198,7 @@ Status - {{ url_item.status }} + {{ url_item.status }} {% if url_canonical != None %}[{{ url_canonical.id }}]{% endif %} URL host diff --git a/app_urls/fetcher/views.py b/app_urls/fetcher/views.py index 49335ce..739098c 100644 --- a/app_urls/fetcher/views.py +++ b/app_urls/fetcher/views.py @@ -6,6 +6,8 @@ from django.contrib.auth.decorators import login_required import ollama from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch, UrlsDuplicate import os +from .src.logger import get_logger +logger = get_logger() #################################################################################################### def trigger_task(request, task): @@ -94,13 +96,11 @@ def url_detail_view(request, id): url_item = get_object_or_404(Urls, id=id) url_sources = list(Source.objects.filter(urlssourcesearch__id_url=url_item).distinct()) url_searches = list(Search.objects.filter(urlssourcesearch__id_url=url_item).distinct()) - # url_source_search = UrlsSourceSearch.objects.filter(id_url=url_item) - - url_duplicate = UrlsDuplicate.objects.get(id_url_duplicated=url_item) - #id_url_canonical = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url_canonical', primary_key=True) # The composite primary key (id_url_canonical, id_url_duplicated) found, that is not supported. The first column is selected. 
- #id_url_duplicated = models.ForeignKey(Urls, models.DO_NOTHING, db_column='id_url_duplicated', related_name='urlsduplicate_id_url_duplicated_set') - - url_duplicate.id_url_duplicated + + if (url_item.status == Urls.STATUS_ENUM.DUPLICATE): + url_canonical = UrlsDuplicate.objects.get(id_url_duplicated=url_item).id_url_canonical + else: + url_canonical = None try: url_content = UrlContent.objects.get(pk=id) @@ -117,6 +117,7 @@ def url_detail_view(request, id): 'models': ollama.get_models(), 'prompt': ollama.get_prompt(), 'url_content': url_content, + 'url_canonical': url_canonical, } return render(request, 'url_detail.html', context) @@ -232,6 +233,7 @@ def filtered_urls(request): selected_source = request.GET.getlist('source', ["null"]) selected_language = request.GET.getlist('language', ["null"]) selected_valid_contents = request.GET.getlist('valid_content', ["null"]) + selected_min_sources = int(request.GET.get('min_sources', 1)) selected_days = request.GET.get("days", 30) per_page = request.GET.get('per_page', 100) # Default is X URLs per page page_number = request.GET.get('page') # Get the current page number @@ -298,6 +300,9 @@ def filtered_urls(request): # Update query query &= (subquery) + if (selected_min_sources > 1): + query &= Q(pk__in=UrlsSourceSearch.objects.values('id_url').annotate(search_count=Count('id_source', distinct=True)).filter(search_count__gte=selected_min_sources).values('id_url')) + # Run query urls = Urls.objects.filter(query).distinct() # .order_by('-ts_fetch') @@ -333,6 +338,7 @@ def filtered_urls(request): 'selected_source': selected_source, 'selected_language': selected_language, 'selected_valid_contents': selected_valid_contents, + "selected_min_sources": selected_min_sources, "selected_days": selected_days, # Map "sources_map": sources_map, diff --git a/app_urls/script_initialize.sh b/app_urls/script_initialize.sh new file mode 100755 index 0000000..522a4d7 --- /dev/null +++ b/app_urls/script_initialize.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +if [ "${INITIALIZE_DB}" = false ]; then + echo "Initialization not required" +else + echo "Initializing database" + python db.py --initialize_tables --initialize_data + python manage.py makemigrations fetcher; python manage.py migrate --fake-initial + python manage.py createsuperuser --noinput + python manage.py collectstatic --no-input + python manage.py import --filename scheduled_tasks.json +fi diff --git a/app_urls/script_run.sh b/app_urls/script_run.sh new file mode 100755 index 0000000..27de589 --- /dev/null +++ b/app_urls/script_run.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +if [ "${DJANGO_DEBUG}" = true ] || [ "${DJANGO_DEBUG}" = "True" ]; then + gunicorn core.wsgi:application --reload --log-level debug --bind 0.0.0.0:8000 --timeout 300 & python manage.py rqworker high default low +else + gunicorn core.wsgi:application --bind 0.0.0.0:8000 --timeout 300 & python manage.py rqworker high default low +fi diff --git a/docker-compose.yml b/docker-compose.yml index f695204..795254f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -7,7 +7,7 @@ services: build: context: ./app_selenium container_name: fetcher_app_selenium - # restart: unless-stopped + restart: unless-stopped shm_size: 512mb environment: - SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE:-4} @@ -28,17 +28,18 @@ services: build: context: ./app_urls container_name: fetcher_app_urls - # restart: unless-stopped + restart: unless-stopped environment: # Initialization - - INITIALIZE_DB=${INITIALIZE_DB:-true} + - INITIALIZE_DB=${INITIALIZE_DB:-false} #
Related to DB persistence - DJANGO_SUPERUSER_USERNAME=${DJANGO_SUPERUSER_USERNAME:-matitos} - DJANGO_SUPERUSER_PASSWORD=${DJANGO_SUPERUSER_PASSWORD:-matitos} - DJANGO_SUPERUSER_EMAIL=${DJANGO_SUPERUSER_EMAIL:-matitos@matitos.org} # Django + - DJANGO_ALLOWED_HOSTS=${DJANGO_ALLOWED_HOSTS:-*} # host1,host2 - DJANGO_SECRET_KEY=${DJANGO_SECRET_KEY:-abc123456789qwerty} - DJANGO_DEBUG=${DJANGO_DEBUG:-False} - - DJANGO_ALLOWED_HOSTS=${DJANGO_ALLOWED_HOSTS:-*} # host1,host2 + - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY:-/opt/logs} # Database - DB_NAME=${DB_NAME:-matitos} - DB_USER=${DB_USER:-supermatitos} @@ -49,8 +50,6 @@ services: - REDIS_PORT=${REDIS_PORT:-6379} # Job timeout: 30 min - JOB_DEFAULT_TIMEOUT=${RQ_DEFAULT_TIMEOUT:-1800} - # Logs path - - PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY:-logs} # Fetcher - FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP-2} - FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP:-5} @@ -60,8 +59,8 @@ services: - SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT:-http://fetcher_app_selenium:80} - ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA:-https://ollamamodel.matitos.org} ######################## - #volumes: # Dev mode - # - ./app_urls:/opt/app + volumes: # Dev mode + - ./app_urls:/opt/app ######################## ports: - 8000:8000 @@ -100,12 +99,12 @@ services: ports: - 6379 #:6379 - fetcher_dozzle: - container_name: fetcher_dozzle - image: amir20/dozzle:latest - volumes: - - /var/run/docker.sock:/var/run/docker.sock:ro - ports: - - 8888:8080 - environment: - - DOZZLE_FILTER="name=fetcher_" + #fetcher_dozzle: + # container_name: fetcher_dozzle + # image: amir20/dozzle:latest + # volumes: + # - /var/run/docker.sock:/var/run/docker.sock:ro + # ports: + # - 8888:8080 + # environment: + # - DOZZLE_FILTER="name=fetcher_"
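
Note on the STATUS_PATTERN_MATCHING rows seeded in db.py: this patch only inserts the rules, the job that applies them is not shown here. A minimal sketch of how such rules could be consumed — the resolve_status helper, the in-memory rule list and the "first match in priority order wins" policy are assumptions for illustration, not code from this repository:

# Illustrative sketch only: applying STATUS_PATTERN_MATCHING rules to a URL.
# Assumption: rules are evaluated in priority order and the first matching
# pattern decides the status; unmatched URLs keep their current status.
import re

def resolve_status(url, rules):
    """rules: iterable of (pattern, priority, status) rows."""
    for pattern, _priority, status in sorted(rules, key=lambda row: row[1]):
        if re.match(pattern, url):
            return status
    return None

rules = [
    (r'.*' + re.escape('youtube.com/') + r'.*', 50, 'invalid'),
    (r'.*cnbc.com/[0-9]{4}/[0-9]{2}/[0-9]{2}/.*', 50, 'valid'),
]
print(resolve_status('https://www.cnbc.com/2024/05/01/markets.html', rules))  # -> 'valid'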
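
Note on the min_sources filter added to filtered_urls(): a standalone sketch of what the new subquery computes, using the models from fetcher/models.py (the urls_seen_by_at_least helper and the source_count alias are illustrative; the patch itself names the annotation search_count, but in both cases it is distinct id_source values that get counted):

# Sketch: keep only URLs recorded in URLS_SOURCE_SEARCH for at least n distinct sources.
from django.db.models import Count, Q
from fetcher.models import Urls, UrlsSourceSearch

def urls_seen_by_at_least(n):
    ids = (
        UrlsSourceSearch.objects
        .values('id_url')                                      # GROUP BY id_url
        .annotate(source_count=Count('id_source', distinct=True))
        .filter(source_count__gte=n)                           # HAVING COUNT(DISTINCT id_source) >= n
        .values('id_url')
    )
    return Urls.objects.filter(Q(pk__in=ids))

The view reaches the same expression through the new min_sources query-string parameter (e.g. ?min_sources=2), read into selected_min_sources with a default of 1; with the default the extra subquery is skipped.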