Min num of sources filter, initialization scripts, docker ready to use dev mode
This commit is contained in:
68
README.md
68
README.md
@@ -1,44 +1,46 @@
|
|||||||
# Matitos
|
# Matitos
|
||||||
|
|
||||||
- Scheduled tasks
|
- URLs Fetcher -> Inserts raw URLs
|
||||||
- Fetcher -> Inserts raw URLs
|
- Fetch parsing URL host
|
||||||
- Fetch parsing URL host
|
- Fetch from RSS feed
|
||||||
- Fetch from RSS feed
|
- Fetch keyword search (Google search & news, DuckDuckGo, ...)
|
||||||
- Fetch keyword search (Google search & news, DuckDuckGo, ...)
|
++ Sources -> Robustness to TooManyRequests block
|
||||||
++ Sources -> Robustness to TooManyRequests block
|
- Selenium based
|
||||||
- Selenium based
|
- Sites change their logic, request captcha, ...
|
||||||
- Sites change their logic, request captcha, ...
|
- Brave Search API
|
||||||
- Brave Search API
|
- Free up to X requests per day. Need credit card association (no charges)
|
||||||
- Free up to X requests per day. Need credit card association (no charges)
|
- Bing API
|
||||||
- Bing API
|
- Subscription required
|
||||||
- Subscription required
|
- Yandex. No API?
|
||||||
- Yandex. No API?
|
|
||||||
++ Proxy / VPN?
|
|
||||||
TooManyRequests, ...
|
|
||||||
++ Search per locale (nl-NL, fr-FR, en-GB)
|
|
||||||
- Process URLs -> Updates raw URLs
|
|
||||||
- Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date
|
|
||||||
- Determines if it is a valid article content
|
|
||||||
++ Proxy / VPN?
|
++ Proxy / VPN?
|
||||||
Bypass geoblock
|
TooManyRequests, ...
|
||||||
- Valid URLs
|
++ Search per locale (nl-NL, fr-FR, en-GB)
|
||||||
- Generate summary
|
|
||||||
- One paragraph
|
- URLs Processing -> Updates raw URLs
|
||||||
- At most three paragraphs
|
- Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date
|
||||||
- Classification
|
- Determines if it is a valid article content
|
||||||
- 5W: Who, What, When, Where, Why of a Story
|
++ Proxy / VPN?
|
||||||
- Related to child abuse?
|
Bypass geoblock
|
||||||
- ...
|
|
||||||
|
|
||||||
- Visualization of URLs
|
- Visualization of URLs
|
||||||
- Filter URLs
|
- Filter URLs
|
||||||
- By status, search, source, language
|
- By status, search, source, language, ...
|
||||||
- Charts
|
- Charts
|
||||||
|
|
||||||
|
- Valid URLs
|
||||||
|
- Generate summary
|
||||||
|
- One paragraph
|
||||||
|
- At most three paragraphs
|
||||||
|
- Classification
|
||||||
|
- 5W: Who, What, When, Where, Why of a Story
|
||||||
|
- Related to child abuse?
|
||||||
|
- ...
|
||||||
|
|
||||||
- Content generation
|
- Content generation
|
||||||
- Select URLs:
|
- URLs Selection
|
||||||
- Valid content
|
- Valid content
|
||||||
- language=en
|
- Language of interest
|
||||||
- published_date during last_week
|
- Published (or fetch) date during last_week
|
||||||
- Use classifications
|
- Fetched by at least N sources
|
||||||
|
- Use classifications and summaries
|
||||||
- Merge summaries, ...
|
- Merge summaries, ...
|
||||||
@@ -19,31 +19,10 @@ RUN pip install --no-cache-dir -r requirements.txt
|
|||||||
|
|
||||||
COPY --chown=appuser:appuser . /opt/app/
|
COPY --chown=appuser:appuser . /opt/app/
|
||||||
|
|
||||||
RUN chmod -R 755 /opt/app
|
RUN chmod -R 755 /opt
|
||||||
RUN chown -R appuser:appuser /opt/app
|
RUN chown -R appuser:appuser /opt
|
||||||
|
|
||||||
USER appuser
|
USER appuser
|
||||||
|
|
||||||
# Initialization script
|
|
||||||
RUN echo '#!/bin/bash' > /opt/app/initialize.sh && \
|
|
||||||
echo 'if [ "${INITIALIZE_DB}" = false ]; then' >> /opt/app/initialize.sh && \
|
|
||||||
echo 'echo "Initialization not required"' >> /opt/app/initialize.sh && \
|
|
||||||
echo 'else' >> /opt/app/initialize.sh && \
|
|
||||||
echo 'echo "Initializating database"' >> /opt/app/initialize.sh && \
|
|
||||||
echo 'python db.py --initialize_tables --initialize_data' >> /opt/app/initialize.sh && \
|
|
||||||
echo 'python manage.py makemigrations fetcher; python manage.py migrate --fake-initial' >> /opt/app/initialize.sh && \
|
|
||||||
echo 'python manage.py createsuperuser --noinput' >> /opt/app/initialize.sh && \
|
|
||||||
echo 'python manage.py collectstatic --no-input' >> /opt/app/initialize.sh && \
|
|
||||||
echo 'python manage.py import --filename scheduled_tasks.json' >> /opt/app/initialize.sh && \
|
|
||||||
echo 'fi' >> /opt/app/initialize.sh && \
|
|
||||||
chmod +x /opt/app/initialize.sh
|
|
||||||
|
|
||||||
# Serving script
|
|
||||||
RUN echo '#!/bin/bash' > /opt/app/run.sh && \
|
|
||||||
# Prod mode:
|
|
||||||
echo 'gunicorn core.wsgi:application --bind 0.0.0.0:8000 & python manage.py rqworker high default low' >> /opt/app/run.sh && \
|
|
||||||
# Dev mode:
|
|
||||||
#echo 'gunicorn core.wsgi:application --reload --bind 0.0.0.0:8000 & python manage.py rqworker high default low' >> /opt/app/run.sh && \
|
|
||||||
chmod +x /opt/app/run.sh
|
|
||||||
|
|
||||||
# Run Django’s server & workers
|
# Run Django’s server & workers
|
||||||
CMD ["sh", "-c", "/opt/app/initialize.sh && /opt/app/run.sh"]
|
CMD ["sh", "-c", "/opt/app/script_initialize.sh && /opt/app/script_run.sh"]
|
||||||
|
|||||||
@@ -134,10 +134,18 @@ def initialize_data():
|
|||||||
with conn.transaction() as tx:
|
with conn.transaction() as tx:
|
||||||
# Feeds
|
# Feeds
|
||||||
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC', 'rss_feed');" )
|
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC', 'rss_feed');" )
|
||||||
|
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://feeds.feedburner.com/breitbart', 'rss_feed');" )
|
||||||
|
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('http://feeds.feedburner.com/zerohedge/feed', 'rss_feed');" )
|
||||||
|
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://moxie.foxnews.com/google-publisher/latest.xml', 'rss_feed');" )
|
||||||
|
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=15837362', 'rss_feed');" )
|
||||||
|
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100727362', 'rss_feed');" )
|
||||||
# Websites of interest
|
# Websites of interest
|
||||||
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/poster', 'url_host');" )
|
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/poster', 'url_host');" )
|
||||||
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/new-poster', 'url_host');" )
|
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/new-poster', 'url_host');" )
|
||||||
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('breitbart.com', 'url_host');" )
|
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('breitbart.com', 'url_host');" )
|
||||||
|
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('zerohedge.com', 'url_host');" )
|
||||||
|
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('foxnews.com', 'url_host');" )
|
||||||
|
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('cnbc.com', 'url_host');" )
|
||||||
# Search keywords
|
# Search keywords
|
||||||
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search');" )
|
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search');" )
|
||||||
# TODO: Language per search
|
# TODO: Language per search
|
||||||
@@ -146,12 +154,34 @@ def initialize_data():
|
|||||||
|
|
||||||
# Status update based on pattern matching (with priority to apply in order). Regex test https://regex101.com/
|
# Status update based on pattern matching (with priority to apply in order). Regex test https://regex101.com/
|
||||||
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("youtube.com/"))) )
|
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("youtube.com/"))) )
|
||||||
|
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("yewtu.be/"))) )
|
||||||
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("tiktok.com/"))) )
|
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("tiktok.com/"))) )
|
||||||
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("twitter.com/"))) )
|
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("twitter.com/"))) )
|
||||||
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("reddit.com/"))) )
|
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("reddit.com/"))) )
|
||||||
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("libreddit.de/"))) )
|
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("libreddit.de/"))) )
|
||||||
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("radio.foxnews.com/"))) )
|
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("radio.foxnews.com/"))) )
|
||||||
|
|
||||||
|
""" # TODO: To review with new scheme
|
||||||
|
# Status update based on pattern matching (with priority to apply in order)
|
||||||
|
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*cnbc.com/(video|quotes)/.*', 100, 'invalid');" )
|
||||||
|
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*foxnews.com/(video|category)/.*', 100, 'invalid');" )
|
||||||
|
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*breitbart.com/(tag|author)/.*', 100, 'invalid');" )
|
||||||
|
|
||||||
|
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*zerohedge.com/(economics|political|markets)/.*', 50, 'valid');" )
|
||||||
|
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*breitbart.com/(economy|entertainment|border|crime|clips)/.*', 50, 'valid');" )
|
||||||
|
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*foxnews.com/(lifestyle|opinion|sports|world)/.*', 50, 'valid');" )
|
||||||
|
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*cnbc.com/[0-9]{4}/[0-9]{2}/[0-9]{2}/.*', 50, 'valid');" )
|
||||||
|
|
||||||
|
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*bbc.com/news/.*', 50, 'valid');" )
|
||||||
|
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*msn.com/[A-z]{2}-[A-z]{2}/news/.*', 50, 'valid');" )
|
||||||
|
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*newschannel9.com/news/.*', 50, 'valid');" )
|
||||||
|
|
||||||
|
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*radaronline.com/p.*', 25, 'valid');" )
|
||||||
|
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*okmagazine.com/p.*', 25, 'valid');" )
|
||||||
|
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*9news.com.au/national.*', 25, 'valid');" )
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
def main(name):
|
def main(name):
|
||||||
print('Hello, %s!' % name)
|
print('Hello, %s!' % name)
|
||||||
|
|
||||||
|
|||||||
@@ -99,6 +99,7 @@ class UrlsDuplicate(models.Model):
|
|||||||
|
|
||||||
class UrlsSourceSearch(models.Model):
|
class UrlsSourceSearch(models.Model):
|
||||||
id_url = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url', primary_key=True) # The composite primary key (id_url, id_source, id_search) found, that is not supported. The first column is selected.
|
id_url = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url', primary_key=True) # The composite primary key (id_url, id_source, id_search) found, that is not supported. The first column is selected.
|
||||||
|
#id_url = models.ForeignKey(Urls, models.DO_NOTHING, db_column='id_url')
|
||||||
id_source = models.ForeignKey(Source, models.DO_NOTHING, db_column='id_source')
|
id_source = models.ForeignKey(Source, models.DO_NOTHING, db_column='id_source')
|
||||||
id_search = models.ForeignKey(Search, models.DO_NOTHING, db_column='id_search')
|
id_search = models.ForeignKey(Search, models.DO_NOTHING, db_column='id_search')
|
||||||
|
|
||||||
|
|||||||
@@ -331,6 +331,12 @@ input[type="checkbox"] {
|
|||||||
</label><br>
|
</label><br>
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
|
|
||||||
|
<!-- Minimum Sources Count Box -->
|
||||||
|
<h3>Min #Sources</h3>
|
||||||
|
<div>
|
||||||
|
<input type="number" id="minSourceCount" name="min_sources" value="{{ selected_min_sources }}" min="1" style="width: 60px; text-align: center;">
|
||||||
|
</div>
|
||||||
|
|
||||||
<!-- Filter by language -->
|
<!-- Filter by language -->
|
||||||
<h3>Language</h3>
|
<h3>Language</h3>
|
||||||
<button type="button" class="toggle-all-btn" data-toggle="language">Toggle All</button><br>
|
<button type="button" class="toggle-all-btn" data-toggle="language">Toggle All</button><br>
|
||||||
@@ -538,6 +544,10 @@ input[type="checkbox"] {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Min number of sources
|
||||||
|
//const minSearchCount = document.getElementById('minSourceCount').value;
|
||||||
|
//params.set('min_search_count', minSearchCount);
|
||||||
|
|
||||||
// Submit the form after updating all sections
|
// Submit the form after updating all sections
|
||||||
document.getElementById("filterForm").submit();
|
document.getElementById("filterForm").submit();
|
||||||
}
|
}
|
||||||
@@ -566,6 +576,9 @@ input[type="checkbox"] {
|
|||||||
updateFormParameters();
|
updateFormParameters();
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
document.getElementById('minSourceCount').addEventListener('change', function() {
|
||||||
|
updateFormParameters();
|
||||||
|
});
|
||||||
document.getElementById('perPageSelect').addEventListener('change', function() {
|
document.getElementById('perPageSelect').addEventListener('change', function() {
|
||||||
updateFormParameters();
|
updateFormParameters();
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -198,7 +198,7 @@
|
|||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<th>Status</th>
|
<th>Status</th>
|
||||||
<td>{{ url_item.status }}</td>
|
<td>{{ url_item.status }} {% if url_canonical != None %}<a href="/urls/{{ url_canonical.id }}" target="_blank">[{{ url_canonical.id }}]</a>{% endif %} </td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<th>URL host</th>
|
<th>URL host</th>
|
||||||
|
|||||||
@@ -6,6 +6,8 @@ from django.contrib.auth.decorators import login_required
|
|||||||
import ollama
|
import ollama
|
||||||
from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch, UrlsDuplicate
|
from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch, UrlsDuplicate
|
||||||
import os
|
import os
|
||||||
|
from .src.logger import get_logger
|
||||||
|
logger = get_logger()
|
||||||
|
|
||||||
####################################################################################################
|
####################################################################################################
|
||||||
def trigger_task(request, task):
|
def trigger_task(request, task):
|
||||||
@@ -94,13 +96,11 @@ def url_detail_view(request, id):
|
|||||||
url_item = get_object_or_404(Urls, id=id)
|
url_item = get_object_or_404(Urls, id=id)
|
||||||
url_sources = list(Source.objects.filter(urlssourcesearch__id_url=url_item).distinct())
|
url_sources = list(Source.objects.filter(urlssourcesearch__id_url=url_item).distinct())
|
||||||
url_searches = list(Search.objects.filter(urlssourcesearch__id_url=url_item).distinct())
|
url_searches = list(Search.objects.filter(urlssourcesearch__id_url=url_item).distinct())
|
||||||
# url_source_search = UrlsSourceSearch.objects.filter(id_url=url_item)
|
|
||||||
|
if (url_item.status == Urls.STATUS_ENUM.DUPLICATE):
|
||||||
url_duplicate = UrlsDuplicate.objects.get(id_url_duplicated=url_item)
|
url_canonical = UrlsDuplicate.objects.get(id_url_duplicated=url_item).id_url_canonical
|
||||||
#id_url_canonical = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url_canonical', primary_key=True) # The composite primary key (id_url_canonical, id_url_duplicated) found, that is not supported. The first column is selected.
|
else:
|
||||||
#id_url_duplicated = models.ForeignKey(Urls, models.DO_NOTHING, db_column='id_url_duplicated', related_name='urlsduplicate_id_url_duplicated_set')
|
url_canonical = None
|
||||||
|
|
||||||
url_duplicate.id_url_duplicated
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
url_content = UrlContent.objects.get(pk=id)
|
url_content = UrlContent.objects.get(pk=id)
|
||||||
@@ -117,6 +117,7 @@ def url_detail_view(request, id):
|
|||||||
'models': ollama.get_models(),
|
'models': ollama.get_models(),
|
||||||
'prompt': ollama.get_prompt(),
|
'prompt': ollama.get_prompt(),
|
||||||
'url_content': url_content,
|
'url_content': url_content,
|
||||||
|
'url_canonical': url_canonical,
|
||||||
}
|
}
|
||||||
return render(request, 'url_detail.html', context)
|
return render(request, 'url_detail.html', context)
|
||||||
|
|
||||||
@@ -232,6 +233,7 @@ def filtered_urls(request):
|
|||||||
selected_source = request.GET.getlist('source', ["null"])
|
selected_source = request.GET.getlist('source', ["null"])
|
||||||
selected_language = request.GET.getlist('language', ["null"])
|
selected_language = request.GET.getlist('language', ["null"])
|
||||||
selected_valid_contents = request.GET.getlist('valid_content', ["null"])
|
selected_valid_contents = request.GET.getlist('valid_content', ["null"])
|
||||||
|
# Parse the "Min #Sources" filter defensively: a cleared number input submits
# "min_sources=" (empty string) and hand-edited URLs may carry non-numeric
# text; int() on either would raise and turn the page into a 500.
# Fall back to 1 (no filtering) and never go below the input's min of 1.
try:
    selected_min_sources = max(1, int(request.GET.get('min_sources') or 1))
except (TypeError, ValueError):
    selected_min_sources = 1
|
||||||
selected_days = request.GET.get("days", 30)
|
selected_days = request.GET.get("days", 30)
|
||||||
per_page = request.GET.get('per_page', 100) # Default is X URLs per page
|
per_page = request.GET.get('per_page', 100) # Default is X URLs per page
|
||||||
page_number = request.GET.get('page') # Get the current page number
|
page_number = request.GET.get('page') # Get the current page number
|
||||||
@@ -298,6 +300,9 @@ def filtered_urls(request):
|
|||||||
# Update query
|
# Update query
|
||||||
query &= (subquery)
|
query &= (subquery)
|
||||||
|
|
||||||
|
if (selected_min_sources > 1):
|
||||||
|
query &= Q(pk__in=UrlsSourceSearch.objects.values('id_url').annotate(search_count=Count('id_source', distinct=True)).filter(search_count__gte=selected_min_sources).values('id_url'))
|
||||||
|
|
||||||
# Run query
|
# Run query
|
||||||
urls = Urls.objects.filter(query).distinct() # .order_by('-ts_fetch')
|
urls = Urls.objects.filter(query).distinct() # .order_by('-ts_fetch')
|
||||||
|
|
||||||
@@ -333,6 +338,7 @@ def filtered_urls(request):
|
|||||||
'selected_source': selected_source,
|
'selected_source': selected_source,
|
||||||
'selected_language': selected_language,
|
'selected_language': selected_language,
|
||||||
'selected_valid_contents': selected_valid_contents,
|
'selected_valid_contents': selected_valid_contents,
|
||||||
|
"selected_min_sources": selected_min_sources,
|
||||||
"selected_days": selected_days,
|
"selected_days": selected_days,
|
||||||
# Map
|
# Map
|
||||||
"sources_map": sources_map,
|
"sources_map": sources_map,
|
||||||
|
|||||||
12
app_urls/script_initialize.sh
Executable file
12
app_urls/script_initialize.sh
Executable file
@@ -0,0 +1,12 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Skip initialization when INITIALIZE_DB is "false" or "False" (both spellings
# are accepted, matching how DJANGO_DEBUG is checked in script_run.sh).
if [ "${INITIALIZE_DB}" = false ] || [ "${INITIALIZE_DB}" = "False" ]; then
|
||||||
|
echo "Initialization not required"
|
||||||
|
else
|
||||||
|
echo "Initializating database"
|
||||||
|
python db.py --initialize_tables --initialize_data
|
||||||
|
python manage.py makemigrations fetcher; python manage.py migrate --fake-initial
|
||||||
|
python manage.py createsuperuser --noinput
|
||||||
|
python manage.py collectstatic --no-input
|
||||||
|
python manage.py import --filename scheduled_tasks.json
|
||||||
|
fi
|
||||||
7
app_urls/script_run.sh
Executable file
7
app_urls/script_run.sh
Executable file
@@ -0,0 +1,7 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Dev mode when DJANGO_DEBUG is "true" or "True". The two tests must be joined
# with "||" (logical OR); a single "|" builds a pipeline whose status is only
# that of the last test, so "true" would never match.
if [ "${DJANGO_DEBUG}" = true ] || [ "${DJANGO_DEBUG}" = "True" ]; then
|
||||||
|
gunicorn core.wsgi:application --reload --log-level debug --bind 0.0.0.0:8000 --timeout 300 & python manage.py rqworker high default low
|
||||||
|
else
|
||||||
|
gunicorn core.wsgi:application --bind 0.0.0.0:8000 --timeout 300 & python manage.py rqworker high default low
|
||||||
|
fi
|
||||||
@@ -7,7 +7,7 @@ services:
|
|||||||
build:
|
build:
|
||||||
context: ./app_selenium
|
context: ./app_selenium
|
||||||
container_name: fetcher_app_selenium
|
container_name: fetcher_app_selenium
|
||||||
# restart: unless-stopped
|
restart: unless-stopped
|
||||||
shm_size: 512mb
|
shm_size: 512mb
|
||||||
environment:
|
environment:
|
||||||
- SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE:-4}
|
- SELENIUM_SLEEP_PER_PAGE=${SELENIUM_SLEEP_PER_PAGE:-4}
|
||||||
@@ -28,17 +28,18 @@ services:
|
|||||||
build:
|
build:
|
||||||
context: ./app_urls
|
context: ./app_urls
|
||||||
container_name: fetcher_app_urls
|
container_name: fetcher_app_urls
|
||||||
# restart: unless-stopped
|
restart: unless-stopped
|
||||||
environment:
|
environment:
|
||||||
# Initialization
|
# Initialization
|
||||||
- INITIALIZE_DB=${INITIALIZE_DB:-true}
|
- INITIALIZE_DB=${INITIALIZE_DB:-false} # Related to DB persistence
|
||||||
- DJANGO_SUPERUSER_USERNAME=${DJANGO_SUPERUSER_USERNAME:-matitos}
|
- DJANGO_SUPERUSER_USERNAME=${DJANGO_SUPERUSER_USERNAME:-matitos}
|
||||||
- DJANGO_SUPERUSER_PASSWORD=${DJANGO_SUPERUSER_PASSWORD:-matitos}
|
- DJANGO_SUPERUSER_PASSWORD=${DJANGO_SUPERUSER_PASSWORD:-matitos}
|
||||||
- DJANGO_SUPERUSER_EMAIL=${DJANGO_SUPERUSER_EMAIL:-matitos@matitos.org}
|
- DJANGO_SUPERUSER_EMAIL=${DJANGO_SUPERUSER_EMAIL:-matitos@matitos.org}
|
||||||
# Django
|
# Django
|
||||||
|
- DJANGO_ALLOWED_HOSTS=${DJANGO_ALLOWED_HOSTS:-*} # host1,host2
|
||||||
- DJANGO_SECRET_KEY=${DJANGO_SECRET_KEY:-abc123456789qwerty}
|
- DJANGO_SECRET_KEY=${DJANGO_SECRET_KEY:-abc123456789qwerty}
|
||||||
- DJANGO_DEBUG=${DJANGO_DEBUG:-False}
|
- DJANGO_DEBUG=${DJANGO_DEBUG:-False}
|
||||||
- DJANGO_ALLOWED_HOSTS=${DJANGO_ALLOWED_HOSTS:-*} # host1,host2
|
- PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY:-/opt/logs}
|
||||||
# Database
|
# Database
|
||||||
- DB_NAME=${DB_NAME:-matitos}
|
- DB_NAME=${DB_NAME:-matitos}
|
||||||
- DB_USER=${DB_USER:-supermatitos}
|
- DB_USER=${DB_USER:-supermatitos}
|
||||||
@@ -49,8 +50,6 @@ services:
|
|||||||
- REDIS_PORT=${REDIS_PORT:-6379}
|
- REDIS_PORT=${REDIS_PORT:-6379}
|
||||||
# Job timeout: 30 min
|
# Job timeout: 30 min
|
||||||
- JOB_DEFAULT_TIMEOUT=${RQ_DEFAULT_TIMEOUT:-1800}
|
- JOB_DEFAULT_TIMEOUT=${RQ_DEFAULT_TIMEOUT:-1800}
|
||||||
# Logs path
|
|
||||||
- PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY:-logs}
|
|
||||||
# Fetcher
|
# Fetcher
|
||||||
- FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP-2}
|
- FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP-2}
|
||||||
- FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP:-5}
|
- FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP:-5}
|
||||||
@@ -60,8 +59,8 @@ services:
|
|||||||
- SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT:-http://fetcher_app_selenium:80}
|
- SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT:-http://fetcher_app_selenium:80}
|
||||||
- ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA:-https://ollamamodel.matitos.org}
|
- ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA:-https://ollamamodel.matitos.org}
|
||||||
########################
|
########################
|
||||||
#volumes: # Dev mode
|
volumes: # Dev mode
|
||||||
# - ./app_urls:/opt/app
|
- ./app_urls:/opt/app
|
||||||
########################
|
########################
|
||||||
ports:
|
ports:
|
||||||
- 8000:8000
|
- 8000:8000
|
||||||
@@ -100,12 +99,12 @@ services:
|
|||||||
ports:
|
ports:
|
||||||
- 6379 #:6379
|
- 6379 #:6379
|
||||||
|
|
||||||
fetcher_dozzle:
|
#fetcher_dozzle:
|
||||||
container_name: fetcher_dozzle
|
# container_name: fetcher_dozzle
|
||||||
image: amir20/dozzle:latest
|
# image: amir20/dozzle:latest
|
||||||
volumes:
|
# volumes:
|
||||||
- /var/run/docker.sock:/var/run/docker.sock:ro
|
# - /var/run/docker.sock:/var/run/docker.sock:ro
|
||||||
ports:
|
# ports:
|
||||||
- 8888:8080
|
# - 8888:8080
|
||||||
environment:
|
# environment:
|
||||||
- DOZZLE_FILTER="name=fetcher_"
|
# - DOZZLE_FILTER="name=fetcher_"
|
||||||
|
|||||||
Reference in New Issue
Block a user