diff --git a/README.md b/README.md
index 54ff357..a3a9df0 100644
--- a/README.md
+++ b/README.md
@@ -1,44 +1,46 @@
# Matitos
-- Scheduled tasks
- - Fetcher -> Inserts raw URLs
- - Fetch parsing URL host
- - Fetch from RSS feed
- - Fetch keyword search (Google search & news, DuckDuckGo, ...)
- ++ Sources -> Robustness to TooManyRequests block
- - Selenium based
- - Sites change their logic, request captcha, ...
- - Brave Search API
- - Free up to X requests per day. Need credit card association (no charges)
- - Bing API
- - Subscription required
- - Yandex. No API?
- ++ Proxy / VPN?
- TooManyRequests, ...
- ++ Search per locale (nl-NL, fr-FR, en-GB)
- - Process URLs -> Updates raw URLs
- - Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date
- - Determines if it is a valid article content
+- URLs Fetcher -> Inserts raw URLs
+ - Fetch by parsing URL host
+ - Fetch from RSS feed
+ - Fetch by keyword search (Google search & news, DuckDuckGo, ...)
+ ++ Sources -> Robustness to TooManyRequests block
+ - Selenium based
+ - Sites change their logic, request captcha, ...
+ - Brave Search API
+ - Free up to X requests per day. Requires associating a credit card (no charges)
+ - Bing API
+ - Subscription required
+ - Yandex. No API?
++ Proxy / VPN?
- Bypass geoblock
- - Valid URLs
- - Generate summary
- - One paragraph
- - At most three paragraphs
- - Classification
- - 5W: Who, What, When, Where, Why of a Story
- - Related to child abuse?
- - ...
+ TooManyRequests, ...
+ ++ Search per locale (nl-NL, fr-FR, en-GB)
+
+- URLs Processing -> Updates raw URLs
+ - Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date
+ - Determines if it is a valid article content
+ ++ Proxy / VPN?
+ Bypass geoblock
- Visualization of URLs
- Filter URLs
- - By status, search, source, language
+ - By status, search, source, language, ...
- Charts
+- Valid URLs
+ - Generate summary
+ - One paragraph
+ - At most three paragraphs
+ - Classification
+ - 5W: Who, What, When, Where, Why of a Story
+ - Related to child abuse?
+ - ...
+
- Content generation
- - Select URLs:
+ - URLs Selection
- Valid content
- - language=en
- - published_date during last_week
- - Use classifications
+ - Language of interest
+ - Published (or fetched) date during last_week
+ - Fetched by at least N sources
+ - Use classifications and summaries
- Merge summaries, ...
\ No newline at end of file
diff --git a/app_urls/Dockerfile b/app_urls/Dockerfile
index b1dd674..03d18bf 100644
--- a/app_urls/Dockerfile
+++ b/app_urls/Dockerfile
@@ -19,31 +19,10 @@ RUN pip install --no-cache-dir -r requirements.txt
COPY --chown=appuser:appuser . /opt/app/
-RUN chmod -R 755 /opt/app
-RUN chown -R appuser:appuser /opt/app
+RUN chmod -R 755 /opt
+RUN chown -R appuser:appuser /opt
+
USER appuser
-# Initialization script
-RUN echo '#!/bin/bash' > /opt/app/initialize.sh && \
- echo 'if [ "${INITIALIZE_DB}" = false ]; then' >> /opt/app/initialize.sh && \
- echo 'echo "Initialization not required"' >> /opt/app/initialize.sh && \
- echo 'else' >> /opt/app/initialize.sh && \
- echo 'echo "Initializating database"' >> /opt/app/initialize.sh && \
- echo 'python db.py --initialize_tables --initialize_data' >> /opt/app/initialize.sh && \
- echo 'python manage.py makemigrations fetcher; python manage.py migrate --fake-initial' >> /opt/app/initialize.sh && \
- echo 'python manage.py createsuperuser --noinput' >> /opt/app/initialize.sh && \
- echo 'python manage.py collectstatic --no-input' >> /opt/app/initialize.sh && \
- echo 'python manage.py import --filename scheduled_tasks.json' >> /opt/app/initialize.sh && \
- echo 'fi' >> /opt/app/initialize.sh && \
- chmod +x /opt/app/initialize.sh
-
-# Serving script
-RUN echo '#!/bin/bash' > /opt/app/run.sh && \
- # Prod mode:
- echo 'gunicorn core.wsgi:application --bind 0.0.0.0:8000 & python manage.py rqworker high default low' >> /opt/app/run.sh && \
- # Dev mode:
- #echo 'gunicorn core.wsgi:application --reload --bind 0.0.0.0:8000 & python manage.py rqworker high default low' >> /opt/app/run.sh && \
- chmod +x /opt/app/run.sh
-
# Run Django’s server & workers
-CMD ["sh", "-c", "/opt/app/initialize.sh && /opt/app/run.sh"]
+CMD ["sh", "-c", "/opt/app/script_initialize.sh && /opt/app/script_run.sh"]
diff --git a/app_urls/db.py b/app_urls/db.py
index 7f1bedf..4e8fc80 100644
--- a/app_urls/db.py
+++ b/app_urls/db.py
@@ -134,10 +134,18 @@ def initialize_data():
with conn.transaction() as tx:
# Feeds
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC', 'rss_feed');" )
+ cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://feeds.feedburner.com/breitbart', 'rss_feed');" )
+ cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('http://feeds.feedburner.com/zerohedge/feed', 'rss_feed');" )
+ cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://moxie.foxnews.com/google-publisher/latest.xml', 'rss_feed');" )
+ cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=15837362', 'rss_feed');" )
+ cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100727362', 'rss_feed');" )
# Websites of interest
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/poster', 'url_host');" )
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/new-poster', 'url_host');" )
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('breitbart.com', 'url_host');" )
+ cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('zerohedge.com', 'url_host');" )
+ cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('foxnews.com', 'url_host');" )
+ cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('cnbc.com', 'url_host');" )
# Search keywords
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search');" )
# TODO: Language per search
@@ -146,12 +154,34 @@ def initialize_data():
# Status update based on pattern matching (with priority to apply in order). Regex test https://regex101.com/
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("youtube.com/"))) )
+ cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("yewtu.be/"))) )
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("tiktok.com/"))) )
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("twitter.com/"))) )
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("reddit.com/"))) )
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("libreddit.de/"))) )
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("radio.foxnews.com/"))) )
+ """ # TODO: To review with new scheme
+ # Status update based on pattern matching (with priority to apply in order)
+ cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*cnbc.com/(video|quotes)/.*', 100, 'invalid');" )
+ cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*foxnews.com/(video|category)/.*', 100, 'invalid');" )
+ cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*breitbart.com/(tag|author)/.*', 100, 'invalid');" )
+
+ cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*zerohedge.com/(economics|political|markets)/.*', 50, 'valid');" )
+ cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*breitbart.com/(economy|entertainment|border|crime|clips)/.*', 50, 'valid');" )
+ cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*foxnews.com/(lifestyle|opinion|sports|world)/.*', 50, 'valid');" )
+ cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*cnbc.com/[0-9]{4}/[0-9]{2}/[0-9]{2}/.*', 50, 'valid');" )
+
+ cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*bbc.com/news/.*', 50, 'valid');" )
+ cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*msn.com/[A-z]{2}-[A-z]{2}/news/.*', 50, 'valid');" )
+ cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*newschannel9.com/news/.*', 50, 'valid');" )
+
+ cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*radaronline.com/p.*', 25, 'valid');" )
+ cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*okmagazine.com/p.*', 25, 'valid');" )
+ cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*9news.com.au/national.*', 25, 'valid');" )
+ """
+
+
def main(name):
print('Hello, %s!' % name)
diff --git a/app_urls/fetcher/models.py b/app_urls/fetcher/models.py
index 72c2811..f33dd46 100644
--- a/app_urls/fetcher/models.py
+++ b/app_urls/fetcher/models.py
@@ -99,6 +99,7 @@ class UrlsDuplicate(models.Model):
class UrlsSourceSearch(models.Model):
id_url = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url', primary_key=True) # The composite primary key (id_url, id_source, id_search) found, that is not supported. The first column is selected.
+ #id_url = models.ForeignKey(Urls, models.DO_NOTHING, db_column='id_url')
id_source = models.ForeignKey(Source, models.DO_NOTHING, db_column='id_source')
id_search = models.ForeignKey(Search, models.DO_NOTHING, db_column='id_search')
diff --git a/app_urls/fetcher/templates/filtered_urls.html b/app_urls/fetcher/templates/filtered_urls.html
index 81efd19..040a32b 100644
--- a/app_urls/fetcher/templates/filtered_urls.html
+++ b/app_urls/fetcher/templates/filtered_urls.html
@@ -331,6 +331,12 @@ input[type="checkbox"] {
{% endfor %}
+
+