diff --git a/1-DB.ipynb b/1-DB.ipynb
index 65e0993..f2306d3 100644
--- a/1-DB.ipynb
+++ b/1-DB.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
@@ -11,7 +11,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 9,
"metadata": {},
"outputs": [
{
@@ -21,24 +21,16 @@
"db_postgres\n",
"db_redis\n",
"\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 1/0\n",
- " ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
+ " ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 1/3\n",
- " ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
- " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
- "\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 1/3\n",
- " ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.3s \u001b[0m\n",
- " ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.3s \u001b[0m\n",
- " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
- "\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 1/3\n",
- " ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.4s \u001b[0m\n",
- " ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.4s \u001b[0m\n",
+ " ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 3/3\u001b[0m\n",
- " \u001b[32m✔\u001b[0m Container db_redis \u001b[32mStarted\u001b[0m \u001b[34m0.4s \u001b[0m\n",
- " \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.4s \u001b[0m\n",
+ " \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.2s \u001b[0m\n",
+ " \u001b[32m✔\u001b[0m Container db_redis \u001b[32mStarted\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h"
]
@@ -50,7 +42,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
@@ -167,7 +159,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
@@ -219,7 +211,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 12,
"metadata": {},
"outputs": [
{
@@ -268,7 +260,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 13,
"metadata": {},
"outputs": [
{
@@ -293,7 +285,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 14,
"metadata": {},
"outputs": [
{
diff --git a/app_urls/README.md b/app_urls/README.md
index 9edfd16..942c24d 100644
--- a/app_urls/README.md
+++ b/app_urls/README.md
@@ -2,8 +2,13 @@
```
conda create -n matitos_urls python=3.12
conda activate matitos_urls
-pip install django psycopg[binary] django-redis django-rq
+# Core
+pip install django psycopg[binary] django-redis django-tasks-scheduler
+# django-rq (replaced by django-tasks-scheduler)
+# Fetcher
pip install feedparser python-dateutil newspaper4k lxml[html_clean] googlenewsdecoder gnews duckduckgo_search GoogleNews
+# News visualization
+pip install ollama
```
* From automated inspectdb
@@ -89,6 +94,8 @@ RQ_DEFAULT_RESULT_TTL=${RQ_DEFAULT_RESULT_TTL:-3600}
python manage.py inspectdb
# Migrations
python manage.py makemigrations api; python manage.py migrate --fake-initial
+# Create user
+python manage.py createsuperuser
```
* Deploy
diff --git a/app_urls/api/models.py b/app_urls/api/models.py
index 1e0babc..f23ae0b 100644
--- a/app_urls/api/models.py
+++ b/app_urls/api/models.py
@@ -17,7 +17,7 @@ class Search(models.Model):
db_table = 'search'
def __str__(self):
- return "[{}] {}".format(self.type, self.search)
+ return "[{}]->{}".format(self.type, self.search)
class Source(models.Model):
id = models.SmallAutoField(primary_key=True)
@@ -28,7 +28,7 @@ class Source(models.Model):
db_table = 'source'
def __str__(self):
- return self.source
+ return "[{}]".format(self.source)
class StatusPatternMatching(models.Model):
pattern = models.TextField(primary_key=True)
@@ -82,7 +82,7 @@ class Urls(models.Model):
ordering = ["-ts_fetch"]
def __str__(self):
- return "{} {} {}".format(self.url, self.ts_fetch, self.status)
+ return "URL: {} Fetch:{} Status:{}".format(self.url, self.ts_fetch, self.status)
class UrlsDuplicate(models.Model):
@@ -95,8 +95,7 @@ class UrlsDuplicate(models.Model):
unique_together = (('id_url_canonical', 'id_url_duplicated'),)
def __str__(self):
- return Urls(id=self.id_url_duplicated), Urls(id=self.id_url_canonical)
-
+ return "{} {} ".format(self.id_url_duplicated, self.id_url_canonical)
class UrlsSourceSearch(models.Model):
id_url = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url', primary_key=True) # The composite primary key (id_url, id_source, id_search) found, that is not supported. The first column is selected.
@@ -109,4 +108,4 @@ class UrlsSourceSearch(models.Model):
unique_together = (('id_url', 'id_source', 'id_search'),)
def __str__(self):
- return Urls(id=self.id_url), Source(id=self.id_source), Search(id=self.id_search)
+ return "{} {} {}".format(self.id_source, self.id_search, self.id_url)
diff --git a/app_urls/api/obsolete_src/db_utils.py b/app_urls/api/obsolete_src/db_utils.py
deleted file mode 100644
index eca1a73..0000000
--- a/app_urls/api/obsolete_src/db_utils.py
+++ /dev/null
@@ -1,502 +0,0 @@
-import psycopg
-import redis
-import traceback
-import random
-import requests
-import json
-import os
-from .url_utils import process_article
-from .logger import get_logger
-logger = get_logger()
-
-# TODO: URL_DB_HANDLER, _get_search_list, _get_url_host, _get_url_host_list, ...
-# The rest, elsewhere
-
-class DB_Handler():
- def __init__(self, db_connect_info, redis_connect_info):
- logger.debug("Initializing URL DB writer")
- self.db_connect_info = db_connect_info
- self.redis_instance = redis.Redis(host=redis_connect_info.get("host"), port=redis_connect_info.get("port"))
- self.redis_expiry_seconds = redis_connect_info.get("expiry_seconds", 172800) # Default: 48 hours
-
- try:
- self.redis_instance.ping()
- logger.debug("Succesfully pinged Redis")
- except Exception as e:
- logger.warning("Error trying to ping Redis: {}".format(str(e)))
-
- def get_urls_count(self, last_minutes_check):
- #####################
- ### Get number of URLs within last X minutes
- #####################
- try:
- # Update
- with psycopg.connect(self.db_connect_info) as conn:
- # Open cursor
- cursor = conn.cursor()
- num_urls = cursor.execute("SELECT COUNT(*) FROM URLS WHERE ts_fetch >= current_timestamp - interval '{} minutes';".format(last_minutes_check)).fetchone()[0]
- except Exception as e:
- logger.warning("Error updating URLs status: {}".format(str(e)))
- num_urls = None
- return num_urls
-
- def _get_url_host_list(self):
- try:
- with psycopg.connect(self.db_connect_info) as conn:
- # List of URL host
- list_url_host = [l[0] for l in conn.execute("SELECT url_host FROM WEBSITE_OF_INTEREST;").fetchall()]
- # Clean http / https from URLs
- list_url_host = [l.replace("https://", "").replace("http://", "") for l in list_url_host]
- # Clean last slash if exists
- list_url_host = [ l if not l.endswith("/") else l[:-1] for l in list_url_host]
- except Exception as e:
- logger.warning("Exception fetching URL host list: " + str(e))
- list_url_host = []
- return list_url_host
-
- def _get_search_list(self):
- try:
- with psycopg.connect(self.db_connect_info) as conn:
- # List of keyword searches
- list_search_text = [l[0] for l in conn.execute("SELECT keyword_search FROM SEARCH;").fetchall()]
- except Exception as e:
- logger.warning("Exception fetching searches list: " + str(e))
- list_search_text = []
- return list_search_text
-
- def _get_feed_urls(self):
- try:
- with psycopg.connect(self.db_connect_info) as conn:
- list_url_feeds = conn.execute("SELECT rss_feed FROM FEED;").fetchall()
- # Decode (tuple with 1 element)
- list_url_feeds = [l[0] for l in list_url_feeds]
- except Exception as e:
- logger.warning("Exception fetching RSS sites: " + str(e))
- list_url_feeds = []
- return list_url_feeds
-
- def _get_url_hosts(self):
- try:
- with psycopg.connect(self.db_connect_info) as conn:
- list_url_hosts = conn.execute("SELECT url_host FROM WEBSITE_OF_INTEREST;").fetchall()
- # Decode (tuple with 1 element)
- list_url_hosts = [l[0] for l in list_url_hosts]
- except Exception as e:
- logger.warning("Exception fetching RSS sites: " + str(e))
- list_url_hosts = []
- return list_url_hosts
-
- def _format(self, values):
- # Repalce single quote ' with ''. Based on https://stackoverflow.com/a/12320729
- # String -> 'string', Int -> '1' (string-based), None -> NULL (no quotes for pgSQL to interpret Null value)
- if (type(values) == list) or (type(values) == tuple):
- insert_args = "(" + ", ".join([ "NULL" if v is None else "'" + str(v).replace("'", "''") + "'" for v in values]) + ")"
- elif (type(values) == str):
- insert_args = "({})".format( "NULL" if values is None else "'" + values.replace("'", "''") + "'" )
- else:
- logger.warning("Error formatting input values: {}".format(values))
- assert False
- return insert_args
-
- def _get_cached_canonical_url(self, url):
- ### Redis: URL processed recently? -> Avoid increasing SERIAL counter & efficiency of DB
- try:
- filter_url = self.redis_instance.get(url)
- if (filter_url is not None):
- filter_url = filter_url.decode("utf-8")
- except Exception as e:
- logger.warning("Exception querying Redis: {}".format(str(e)))
- filter_url = None
- return filter_url
-
- def _update_urls_status(self, dict_status_ids):
- #####################
- ### Update status to array of URL IDs
- #####################
- try:
- # Update
- with psycopg.connect(self.db_connect_info) as conn:
- # Open cursor
- cursor = conn.cursor()
- # Autocommit at end of transaction (Atomic insert of URLs and sources)
- with conn.transaction() as tx:
- for key_status, value_ids in dict_status_ids.items():
- cursor.execute("UPDATE URLS SET status='{}' WHERE id IN ({});".format(key_status, ",".join([str(v) for v in value_ids])))
- except Exception as e:
- logger.warning("Error updating URLs status: {}".format(str(e)))
-
- def _get_missing_kids_urls(self, num_urls=None):
- #####################
- ### Get list of Missing Kids URLs
- #####################
- try:
- missing_kids_ids_and_urls = []
- if (num_urls is None):
- limit = 500
- else:
- limit = num_urls
- offset = 0
- with psycopg.connect(self.db_connect_info) as conn:
- # Open cursor
- cursor = conn.cursor()
- while True:
- # Query
- missing_kids_ids_and_urls_query = cursor.execute("SELECT id, url, status FROM URLS WHERE url LIKE '%missingkids.org/poster%' ORDER BY ts_fetch DESC LIMIT {} OFFSET {};".format(limit, offset)).fetchall()
- # Finished?
- if (len(missing_kids_ids_and_urls_query) == 0):
- break
- # Extend
- missing_kids_ids_and_urls = missing_kids_ids_and_urls + missing_kids_ids_and_urls_query
- # Offset
- offset += len(missing_kids_ids_and_urls_query)
- # Stop?
- if (num_urls is not None) and (len(missing_kids_ids_and_urls) >= num_urls):
- break
-
- except Exception as e:
- logger.warning("Error getting Missing Kids URLs: {}".format(str(e)))
- missing_kids_ids_and_urls = []
- return missing_kids_ids_and_urls
-
- def _get_error_urls(self, num_urls=None):
- #####################
- ### Get list of Missing Kids URLs
- #####################
- try:
- error_urls = []
- if (num_urls is None):
- limit = 500
- else:
- limit = num_urls
- offset = 0
- with psycopg.connect(self.db_connect_info) as conn:
- # Open cursor
- cursor = conn.cursor()
- while True:
- # Query
- error_urls_query = cursor.execute("SELECT id, url FROM URLS WHERE status='error' ORDER BY ts_fetch DESC LIMIT {} OFFSET {};".format(limit, offset)).fetchall()
- # Finished?
- if (len(error_urls_query) == 0):
- break
- # Extend
- error_urls = error_urls + error_urls_query
- # Offset
- offset += len(error_urls_query)
- # Stop?
- if (num_urls is not None) and (len(error_urls) >= num_urls):
- break
-
- except Exception as e:
- logger.warning("Error getting Error URLs: {}".format(str(e)))
- error_urls = []
- return error_urls
-
- def _decode_urls(self, urls_fetched, list_domains_to_filter, list_pattern_status_tuple): # TODO: language for urls_fetched...
- """
- # TODO: REFACTOR
- For each input url
-
- Already processed?
- -> Update on Redis expire time
- -> Associate to source
- Not processed? Get main URL:
- -> URL Canonical valid?
- -> Rely on this as main URL
- -> URL Canonical not valid?
- -> Use input url, unless it's a news.google.com link
- -> If news.google.com link, filter out. REDIS?
- Main URL processing:
- -> Update in REDIS, association url -> url_canonical
- -> url != url_canonical: Add in duplicate table
- If both != news.google.com
- """
-
- # URLs to insert, URLs duplicated association, URL to Canonical form
- list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical = [], [], {}
-
- # URL VS CANONICAL:
- # News URL returned: https://news.google.com/articles/CBMifmh0dHBzOi8vd3d3LmJyZWl0YmFydC5jb20vMm5kLWFtZW5kbWVudC8yMDIzLzA0LzAzL2dvdi1kZXNhbnRpcy1zaWducy1iaWxsLW1ha2luZy1mbG9yaWRhLXRoZS0yNnRoLWNvbnN0aXR1dGlvbmFsLWNhcnJ5LXN0YXRlL9IBAA?hl=en-US&gl=US&ceid=US%3Aen
- # Corresponds to canonical URL: https://www.breitbart.com/2nd-amendment/2023/04/03/gov-desantis-signs-bill-making-florida-the-26th-constitutional-carry-state/
-
- for url in urls_fetched:
- # Domain to filter? Input url
- filter_due_to_domain = False
- for domain_to_filter in list_domains_to_filter:
- if (domain_to_filter in url):
- logger.debug("Domain filter applied based on {} for input URL: {}".format(domain_to_filter, url))
- filter_due_to_domain = True
- if (filter_due_to_domain):
- continue
-
- # URL processed recently? -> Filter and avoid increasing SERIAL counter & efficiency of DB
- cached_canonical_url = self._get_cached_canonical_url(url)
- if (cached_canonical_url is not None):
- # Even if url processed, need to add url_canonical to list_filtered_urls, so as to associate search source to canonical URL (canonical is the main URL entry)
- dict_full_urls_to_canonical[url] = cached_canonical_url # X -> Y
- # If url has been processed, so was its canonical form
- logger.debug("Filtering out already inserted (processed) URL and its canonical form: {} {}".format(url, cached_canonical_url))
- continue
-
- # Process TODO: Add language...
- url_canonical, article_elements, article_status = process_article(url, list_pattern_status_tuple)
- # TODO: Store article_elements information to insert into OS after inserted into DB (and therefore having associated url_id)
-
- # Could not retrieve redirection for news.google.com based URL? Continue (avoid inserting in DB)
- if (url_canonical is None) and ("news.google.com" in url):
- logger.debug("Filtering empty canonical link for base URL based on news.google.com: {}".format(url))
- continue
- # Canonical URL still news.google.com? Continue (avoid inserting in DB)
- if (url_canonical is not None) and ("news.google.com" in url_canonical):
- logger.debug("Filtering canonical news.google.com based URL: {}".format(url_canonical))
- continue
-
- # Domain to filter? Input canonical_url
- filter_due_to_domain = False
- for domain_to_filter in list_domains_to_filter:
- if (url_canonical is not None) and (domain_to_filter in url_canonical):
- filter_due_to_domain = True
- if (filter_due_to_domain):
- logger.info("Filtering due to domain input URL, Canonical_URL: {} {}".format(url, url_canonical))
- continue
-
- if (url_canonical is None) or (article_status == "error"):
- logger.debug("Processing failed for URL: {}".format(url))
- # Still insert URL with "error"? -> If processed later, might have inconsistent sources (url vs url_canonical). Only store if not news.google.com based
- if ("news.google.com" in url) or ("consent.google.com" in url):
- logging.debug("Not able to process Google News link, skipping: {}".format(url))
- else:
- dict_full_urls_to_canonical[url] = url # X -> X
- list_insert_url_tuple_args.append( (url, article_status) )
- continue
-
- # URL was not processed (not sure canonical yet). Generate URL_CANONICAL <-> URL_ORIGINAL association if they're different
- if (url_canonical != url):
- list_tuple_canonical_duplicate_urls.append( (url_canonical, url) )
- # Dict: url -> canonical (update association)
- dict_full_urls_to_canonical[url] = url_canonical # X -> Y or X
-
- # Canonical URL processed recently? -> Filter and avoid increasing SERIAL counter & efficiency of DB
- if (self._get_cached_canonical_url(url_canonical) is not None):
- # Canonical URL was already processed
- logger.debug("Filtering out already inserted (processed) URL canonical: {}".format(url_canonical))
- else:
- # Insert url_canonical to DB formatted
- list_insert_url_tuple_args.append( (url_canonical, article_status) )
- # Canonical URL different? Process
- if (url_canonical != url):
- if ("news.google.com" in url) or ("consent.google.com" in url):
- logging.debug("Not adding google.news.com based link, skipping: {}".format(url))
- else:
- # Fetched url -> duplicate (using canonical as main link)
- article_status = "duplicate"
- # Insert url (non-canonical) to DB formatted
- list_insert_url_tuple_args.append( (url, article_status) )
-
- return list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical
-
- def _insert_urls(self, cursor, list_insert_url_tuple_args):
- #####################
- ### Insert URLs with status
- #####################
- if (len(list_insert_url_tuple_args) > 0):
- insert_args = ', '.join( [ self._format(t) for t in list_insert_url_tuple_args] )
- # Insert. (url_1, status_1), (url_2, status_2), ...
- sql_code = "INSERT INTO URLS {} VALUES {} ON CONFLICT (url) DO NOTHING;".format("(url, status)", insert_args)
- # logger.debug("SQL CODE: {}".format(sql_code))
- c = cursor.execute(sql_code)
- # NOTE: Not using "RETURNING id" since previously inserted URLs are not returned (ON CONFLICT)
- # https://stackoverflow.com/questions/35949877/how-to-include-excluded-rows-in-returning-from-insert-on-conflict/35953488#35953488
-
- def _insert_urls_duplicated(self, cursor, list_tuple_canonical_duplicate_urls):
- #####################
- ### Insert duplicated URLs
- #####################
- if (len(list_tuple_canonical_duplicate_urls) > 0):
- # Flatten, format, set to remove duplicates
- args_duplicated_urls_set = "(" + ', '.join( set( [ "'" + str(y).replace("'", "''") + "'" for x in list_tuple_canonical_duplicate_urls for y in x] ) ) + ")"
-
- # Dict: url -> id
- dict_url_to_id = {}
- # Get url -> id association to populate duplicated URLs
- for (id_, url_) in cursor.execute("SELECT id, url FROM URLS WHERE url IN {};".format(args_duplicated_urls_set)).fetchall():
- dict_url_to_id[url_] = id_
-
- # Convert tuples (url_canonical, url) -> (id_url_canonical, id_url) to insert in DB
- # ORIGINAL CODE. Issue, might not have found association to all urls
- ### list_tuple_canonical_duplicate_urls_ids = [ (dict_url_to_id[t[0]], dict_url_to_id[t[1]]) for t in list_tuple_canonical_duplicate_urls]
-
- list_tuple_canonical_duplicate_urls_ids = []
- for (url_1, url_2) in list_tuple_canonical_duplicate_urls:
- id_url_1, id_url_2 = dict_url_to_id.get(url_1), dict_url_to_id.get(url_2)
- if (id_url_1 is None) or (id_url_2 is None):
- logger.debug("Skipping duplicate association due to no url -> id_url mapping available for tuple: {} {}".format(url_1, url_2))
- else:
- list_tuple_canonical_duplicate_urls_ids.append( (id_url_1, id_url_2) )
-
- if (len(list_tuple_canonical_duplicate_urls_ids) > 0):
- insert_args = ', '.join( [ self._format(t) for t in list_tuple_canonical_duplicate_urls_ids] )
- # Insert. (id_url_canonical_1, id_url_1), ...
- sql_code = "INSERT INTO URLS_DUPLICATE {} VALUES {} ON CONFLICT DO NOTHING;".format("(id_url_canonical, id_url_duplicated)", insert_args)
- # logger.debug("SQL CODE: {}".format(sql_code))
- c = cursor.execute(sql_code)
-
- def _get_pattern_status_list(self):
- #####################
- ### Get list of domains to filter
- #####################
- # TODO: Cache on redis and query once every N hours? ...
- try:
- with psycopg.connect(self.db_connect_info) as conn:
- # Open cursor
- cursor = conn.cursor()
- # TODO: Cache on Redis
- list_pattern_status = cursor.execute("SELECT pattern, priority, status FROM STATUS_PATTERN_MATCHING;").fetchall()
- except Exception as e:
- logger.warning("Error getting pattern status list: {}".format(str(e)))
- list_pattern_status = []
- return list_pattern_status
-
- def _get_domains_to_filter(self):
- #####################
- ### Get list of domains to filter
- #####################
- # TODO: Cache on redis and query once every N hours? ...
- try:
- with psycopg.connect(self.db_connect_info) as conn:
- # Open cursor
- cursor = conn.cursor()
- # TODO: Cache on Redis
- sites_to_filter = [e[0] for e in cursor.execute("SELECT url_host FROM WEBSITE_TO_FILTER;").fetchall() ]
- except Exception as e:
- logger.warning("Error getting domains to filter: {}".format(str(e)))
- sites_to_filter = []
- return sites_to_filter
-
- def _get_cached_source_id(self, source):
- ### Redis: URL processed recently? -> Avoid increasing SERIAL counter & efficiency of DB
- try:
- source_id = self.redis_instance.get(source)
- if (source_id is not None):
- source_id = source_id.decode("utf-8")
- except Exception as e:
- logger.warning("Exception querying Redis: {}".format(str(e)))
- source_id = None
- return source_id
-
- def _get_source_id(self, cursor, source):
- #####################
- ### Get source corresponding id
- #####################
- # Cached?
- id_source = self._get_cached_source_id(source)
- if (id_source is None):
- c = cursor.execute("SELECT id FROM SOURCE WHERE source='{}'".format(source.replace("'", "''"))).fetchone()
- if (c is None) or (len(c) == 0):
- # Source does not exist, insert and get id
- c = cursor.execute("INSERT INTO SOURCE (source) VALUES ('{}') RETURNING id;".format(source.replace("'", "''"))).fetchone()
- # Decode source id
- id_source = c[0]
- # Cache
- print("*"*10, source, id_source)
- self.redis_instance.set(source, id_source, ex=self.redis_expiry_seconds)
- return id_source
-
- def _get_urls_id(self, cursor, urls_full):
- #####################
- ### Get id of inserted and filtered URLs
- #####################
- # TODO: Cache url -> url_id, url_canonical
- if (len(urls_full) == 0):
- return []
- # Get inserted and filtered URL ids (unnested). Filtered URLs are also retrieved since they might have been fetched from a new source
- in_inserted_filtered_urls = "(" + ', '.join(["'" + u.replace("'", "''") + "'" for u in urls_full]) + ")"
- id_urls_related = [ i[0] for i in cursor.execute("SELECT id FROM URLS WHERE url IN {};".format(in_inserted_filtered_urls)).fetchall() ]
- return id_urls_related
-
- def _insert_urls_source(self, cursor, id_urls_related, id_source):
- #####################
- ### Insert URL sources: (id_url_1, id_source), (id_url_2, id_source), ...
- #####################
- if (len(id_urls_related) == 0) or (id_source is None):
- return
- columns = "(id_url, id_source)"
- insert_args = ', '.join( [ self._format([id_url, id_source]) for id_url in id_urls_related ] )
- # Insert
- sql_code = "INSERT INTO URLS_SOURCE {} VALUES {} ON CONFLICT DO NOTHING;".format(columns, insert_args)
- # logger.debug("SQL CODE: {}".format(sql_code))
- c = cursor.execute(sql_code)
-
- def write_batch(self, urls_fetched, source):
- # Chunks of 50 elements
- n = 50
- # Divide in small chunks
- urls_fetched_chunks = [urls_fetched[i:i + n] for i in range(0, len(urls_fetched), n)]
- # Process
- for urls_fetched_chunk_i in urls_fetched_chunks:
- self._write_small_batch(urls_fetched_chunk_i, source)
-
- def _write_small_batch(self, urls_fetched, source):
- try:
- logger.info("Fetched #{} URLs, source: {}".format(len(urls_fetched), source))
-
- if (len(urls_fetched) == 0):
- logger.debug("Empty batch of urls (not writing to DB) for source: {}".format(source))
- return
-
- # Shuffle URLs to reduce continuous URLs of same URL host (minimize chance of being blocked for too many continuous requests)
- random.shuffle(urls_fetched)
-
- # Get list of domains to filter
- list_domains_to_filter = self._get_domains_to_filter()
- # Get list of (pattern, priority, status) tuples to override status if required
- list_pattern_status_tuple = self._get_pattern_status_list()
- # Sort pattern tuples by priority
- list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True)
-
- # Process URLs to update DB
- list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical = self._decode_urls(urls_fetched, list_domains_to_filter, list_pattern_status_tuple)
- # Full set of URL and its canonical form (to associate them to a search), both to insert and filter
- urls_full = set(dict_full_urls_to_canonical.keys()).union( set(dict_full_urls_to_canonical.values()) )
-
- # Insert
- with psycopg.connect(self.db_connect_info) as conn:
- # Open cursor
- cursor = conn.cursor()
- # Autocommit at end of transaction (Atomic insert of URLs and sources)
- with conn.transaction() as tx:
- # Insert processed URLs
- self._insert_urls(cursor, list_insert_url_tuple_args)
- # Insert URLs duplicated (canonical != fetched url)
- self._insert_urls_duplicated(cursor, list_tuple_canonical_duplicate_urls)
-
- # Get source id in DB
- id_source = self._get_source_id(cursor, source)
- # Get IDs of all related URLs
- id_urls_related = self._get_urls_id(cursor, urls_full)
- # Insert search source associated to URLs
- self._insert_urls_source(cursor, id_urls_related, id_source)
-
- # Update Redis status of inserted and filtered URLs after writing to DB
- for url, url_canonical in dict_full_urls_to_canonical.items():
- try:
- # Set with updated expiry time
- self.redis_instance.set(url, url_canonical, ex=self.redis_expiry_seconds)
- if (url != url_canonical):
- self.redis_instance.set(url_canonical, url_canonical, ex=self.redis_expiry_seconds)
- except Exception as e:
- logger.warning("Exception running set in Redis: {}".format(str(e)))
-
- if (len(list_insert_url_tuple_args) > 0):
- try:
- webhook_token = os.environ.get("CLIQ_WEBHOOK_TOKEN")
- endpoint_message = "https://cliq.zoho.com/api/v2/channelsbyname/urlretrievalbot/message?zapikey={}".format(webhook_token)
-
- payload = json.dumps({"text": "Fetched #{} new URLs, source: {}".format(len(list_insert_url_tuple_args), source) })
- r = requests.post(endpoint_message, data=payload)
- except Exception as e:
- logger.warning("Webhook failed: {}".format(str(e)))
-
- logger.debug("URL DB write finished")
- except Exception as e:
- logger.warning( "Exception writing to URL_DB:\n{}".format(traceback.format_exc()) )
- logger.debug( "Exception --- List of URLs: {}".format(str(urls_fetched)) )
\ No newline at end of file
diff --git a/app_urls/api/obsolete_src/fetch_feed.py b/app_urls/api/obsolete_src/fetch_feed.py
deleted file mode 100644
index dc12736..0000000
--- a/app_urls/api/obsolete_src/fetch_feed.py
+++ /dev/null
@@ -1,48 +0,0 @@
-from .db_utils import DB_Handler
-import feedparser
-import dateutil
-from .logger import get_logger
-logger = get_logger()
-
-class FetchFeeds():
- def __init__(self, db_handler: DB_Handler) -> None:
- logger.debug("Initializing News feed")
- self.db_handler = db_handler
-
- def run(self):
- try:
- logger.debug("Starting NewsFeed.run()")
- # Get feeds
- list_url_feeds = self.db_handler._get_feed_urls()
- logger.debug("Fetching news from feeds: {}".format(str(list_url_feeds)))
-
- # Process via RSS feeds
- for url_feed in list_url_feeds:
- # Initialize
- urls_fetched, urls_publish_date = [], []
- # Fetch feeds
- feeds = feedparser.parse(url_feed)
- # Parse
- for f in feeds.get("entries", []):
- # Get URL
- url = f.get("link", None)
- # Process?
- if (url is not None):
- # Available publish date?
- publish_date_parsed = f.get("published_parsed")
- if (publish_date_parsed is None):
- publish_date = f.get("published", None)
- if (publish_date is not None):
- publish_date_parsed = dateutil.parser.parse(publish_date)
-
- # Published date
- urls_publish_date.append(publish_date_parsed)
- # URL
- urls_fetched.append(url)
-
- # URL fetching source
- source = "feed {}".format(url_feed)
- # Write to DB
- self.db_handler.write_batch(urls_fetched, source)
- except Exception as e:
- logger.warning("Exception in NewsFeed.run(): {}".format(str(e)))
diff --git a/app_urls/api/obsolete_src/fetch_parser.py b/app_urls/api/obsolete_src/fetch_parser.py
deleted file mode 100644
index c3a73cb..0000000
--- a/app_urls/api/obsolete_src/fetch_parser.py
+++ /dev/null
@@ -1,45 +0,0 @@
-from .db_utils import DB_Handler
-import newspaper
-from .logger import get_logger
-logger = get_logger()
-
-class FetchParser():
- def __init__(self, db_handler: DB_Handler) -> None:
- logger.debug("Initializing News SiteParsing newspaper4k")
- self.db_handler = db_handler
-
- # TODO: MOVE LOGIC ELSEWHERE!
- def _postprocess(self, article_urls):
- return [url.replace("#comment-stream", "") for url in article_urls]
-
- def run(self):
- try:
- logger.debug("Starting NewsSiteParsing.run() for {}")
-
- # Get URL hosts
- list_url_hosts = self.db_handler._get_url_hosts()
- logger.info("Fetching news by parsing URL hosts: {}".format(str(list_url_hosts)))
-
- # Process newspaper4k build method
- for url_host_feed in list_url_hosts:
- # Protocol
- if not (url_host_feed.startswith("http")):
- url_host_feed_formatted = "https://" + url_host_feed
- else:
- url_host_feed_formatted = url_host_feed
-
- logger.debug("Fetching newspaper4k parsing based on URL: {}".format(url_host_feed_formatted))
- # Source object
- url_host_built = newspaper.build(url_host_feed_formatted)
- # Get articles URL list
- urls_fetched = url_host_built.article_urls()
- # TODO: MOVE!
- # Post-processing
- urls_fetched = self._postprocess(urls_fetched)
-
- # URL fetching source
- source = "newspaper4k {}".format(url_host_feed)
- # Write to DB
- self.db_handler.write_batch(urls_fetched, source)
- except Exception as e:
- logger.warning("Exception in NewsSiteParsing.run(): {}".format(str(e)))
\ No newline at end of file
diff --git a/app_urls/api/obsolete_src/fetch_search.py b/app_urls/api/obsolete_src/fetch_search.py
deleted file mode 100644
index 9d6a17d..0000000
--- a/app_urls/api/obsolete_src/fetch_search.py
+++ /dev/null
@@ -1,73 +0,0 @@
-from .db_utils import DB_Handler
-from .utils import get_searxng_instances
-from .fetch_search_sources import FetcherDuckDuckGo, FetcherGNews, FetcherGoogleNews, FetcherSearxNews, FetcherPreSearch
-from .logger import get_logger
-logger = get_logger()
-
-class FetchSearcher():
- def __init__(self, db_handler: DB_Handler, full=True) -> None:
- logger.debug("Initializing News feed")
- self.db_handler = db_handler
- self.full_search = full
-
- def _run_fetching(self, search_text):
- logger.debug("Starting _run_fetching() for {}".format(search_text))
-
- # Common parameters
- lang, region = "en", "US"
-
- ### PreSearch
- dict_params_news = {"search": search_text}
- FetcherPreSearch(**dict_params_news).fetch_articles(self.db_handler)
-
- ### DuckDuckGo
- period = "d"
- dict_params_news = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "news", "period": period}
- FetcherDuckDuckGo(**dict_params_news).fetch_articles(self.db_handler)
- dict_params_general = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "general", "period": period}
- FetcherDuckDuckGo(**dict_params_general).fetch_articles(self.db_handler)
-
- if (self.full_search):
- # Avoid site:{} search due to G-Bypass required time
- if ("site:" not in search_text):
- ### GNews
- dict_params = {"search": search_text, "lang": "wt", "region": "wt", "period": period}
- FetcherGNews(**dict_params).fetch_articles(self.db_handler)
-
- ### GoogleNews
- dict_params_news = {"search": search_text, "lang": lang, "region": region, "search_category": "news", "period": period}
- FetcherGoogleNews(**dict_params_news).fetch_articles(self.db_handler)
- # dict_params_general = {"search": search_text, "lang": lang, "region": region, "search_category": "general", "period": period}
-
- if False:
- ### SearxNG
- period = "day"
- for searx_instance in get_searxng_instances():
- dict_params_news = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "news", "period": period}
- dict_params_general = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "general", "period": period}
- # Append thread
- FetcherSearxNews(**dict_params_news).fetch_articles(self.db_handler)
- FetcherSearxNews(**dict_params_general).fetch_articles(self.db_handler)
-
- logger.debug("Finished _run_fetching()")
-
- def run(self):
- try:
- logger.info("Fetching text searches & URL hosts of interest")
-
- # Get text searches of interest
- list_search_text_of_interest = self.db_handler._get_search_list()
-
- # Get URL host of interest
- list_url_host = self.db_handler._get_url_host_list()
- # Get text searches for URL hosts
- list_search_text_url_host = ["site:{}".format(l) for l in list_url_host]
-
- for search_text in list_search_text_of_interest + list_search_text_url_host:
- logger.debug("Fetching news for search: {}".format(search_text))
- self._run_fetching(search_text)
-
- logger.info("Finished fetching text searches & URL hosts of interest")
- except Exception as e:
- logger.warning("Exception in NewsSearch.run(): {}".format(str(e)))
-
\ No newline at end of file
diff --git a/app_urls/api/obsolete_src/fetch_search_sources.py b/app_urls/api/obsolete_src/fetch_search_sources.py
deleted file mode 100644
index 25813b5..0000000
--- a/app_urls/api/obsolete_src/fetch_search_sources.py
+++ /dev/null
@@ -1,384 +0,0 @@
-from duckduckgo_search import DDGS
-from gnews import GNews
-from GoogleNews import GoogleNews
-
-import requests
-from bs4 import BeautifulSoup
-import os
-import time
-import json
-import numpy as np
-import random
-from .google_bypass import GoogleByPass
-from abc import ABC, abstractmethod
-from .logger import get_logger
-logger = get_logger()
-
-
-
-# Generic fetcher (fetches articles, writes to DB)
-class FetcherAbstract(ABC):
- @abstractmethod
- def _fetch(self):
- pass
-
- def fetch_articles(self, db_writer):
- logger.debug("Starting fetch() for {}".format(self.name))
- # Fetch articles
- list_news = self._fetch()
- logger.info("Found #{} articles for search: {}".format(len(list_news), self.name))
- # Write to DB
- db_writer.write_batch(list_news, self.name)
-
-# https://techblog.willshouse.com/2012/01/03/most-common-user-agents/
-
-user_agents_list = [
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
- "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
- "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0",
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15",
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/111.0",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
- "Mozilla/5.0 (Windows NT 10.0; rv:111.0) Gecko/20100101 Firefox/111.0",
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
- "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0",
- "Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0",
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15",
- "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
- "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.44",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 OPR/96.0.0.0",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 OPR/97.0.0.0",
- "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
- "Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48",
- "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.34",
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.39",
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.2 Safari/605.1.15",
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
- "Mozilla/5.0 (Windows NT 10.0; rv:112.0) Gecko/20100101 Firefox/112.0",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.51",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0",
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/112.0",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
- "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0",
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/110.0",
- "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0",
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
- "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0",
- "Mozilla/5.0 (X11; CrOS x86_64 14541.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.0.2246 Yowser/2.5 Safari/537.36",
- "Mozilla/5.0 (X11; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
- "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.6.1 Safari/605.1.15",
- "Mozilla/5.0 (Windows NT 6.1; rv:102.0) Gecko/20100101 Goanna/6.0 Firefox/102.0 PaleMoon/32.0.0",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.41",
- "Mozilla/5.0 (Windows NT 10.0; rv:110.0) Gecko/20100101 Firefox/110.0",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 YaBrowser/23.1.5.708 Yowser/2.5 Safari/537.36",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0",
- "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
-]
-
-
-
-
-
-class FetcherPreSearch(FetcherAbstract):
- def __init__(self, search):
- """
- # period ->
- - h = hours (eg: 12h)
- - d = days (eg: 7d)
- - m = months (eg: 6m)
- - y = years (eg: 1y)
- """
- self.search = search
- self.period = "1d" # TODO Fixed for the moment
- # self.lang = lang
- # self.region = region
- search_category = "news"
- self.name = "presearch {} {} {}".format(search, search_category, self.period)
-
- def _fetch(self):
- try:
- # PreSearch fetching endpoint, parameter search keyword
- presearch_fetch_endpoint = "http://selenium_app:80/fetch_presearch/?search_keyword={}".format(self.search)
- # Timeout: 15 minutes
- r = requests.get(presearch_fetch_endpoint, timeout=900)
- # Decode
- list_news = json.loads(r.text).get("list_urls", [])
- except Exception as e:
- logger.warning("Timeout on request: {}. {}".format(presearch_fetch_endpoint, str(e)))
- list_news = []
- return list_news
-
-
-
-class FetcherGNews(FetcherAbstract):
- def __init__(self, search, period, lang="en", region="US"):
- """
- # period ->
- - h = hours (eg: 12h)
- - d = days (eg: 7d)
- - m = months (eg: 6m)
- - y = years (eg: 1y)
- """
- self.search = search
- self.period = period
- self.lang = lang
- self.region = region
- search_category = "news"
- self.name = "gnews {} {} {} {}".format(search, search_category, period, "{}-{}".format(lang, region))
-
- def _fetch(self):
- try:
- list_dict_news = GNews(self.lang, self.region, period=self.period).get_news(self.search)
- # Decode
- list_news = []
- for l in list_dict_news:
- list_news.append(l.get("url"))
- except Exception as e:
- logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
- list_news = []
-
- # Bypass Google links
- list_news_redirections = GoogleByPass().bypass_google_urls(list_news)
-
- return list_news_redirections
-
-class FetcherGoogleNews(FetcherAbstract):
- def __init__(self, search, search_category="news", period="1d", lang="en", region="US"):
- assert(search_category in ["news", "general"])
-
- self.lang = lang
- self.region = region
- self.period = period
- self.search_category = search_category
- self.search = search
- self.name = "googlenews {} {} {} {}".format(search, search_category, period, "{}-{}".format(lang, region))
-
- def _fetch(self):
- try:
- # Initialize
- g = GoogleNews(encode="utf-8", period=self.period, lang=self.lang, region=self.region)
- g.enableException(True)
-
- if (self.search_category == "general"):
- set_links = set()
- # Search
- g.search(self.search)
-
- # Iterate pages
- MAX_ITER_PAGES = 15
- for i in range(MAX_ITER_PAGES):
- time.sleep(random.uniform(1, 1.5))
- num_before = len(set_links)
-
- # Get page
- try:
- links = g.page_at(i)
- except Exception as e:
- logger.warning("Exception fetching page in GoogleNews {}: {}".format(self.name, str(e)))
- break
- # Links
- for l in links:
- # '/url?esrc=s&q=&rct=j&sa=U&url=https://www.breitbart.com/news/scent-of-luxury-indias-jasmine-infuses-global-perfume/&ved=2ahUKEwjOybGSiN-AAxX1gv0HHfqSBpMQxfQBegQICBAC&usg=AOvVaw06GdoHyzPbIopUaEuUSQPQ'
- url = l.get("link").split("url=")[-1]
- set_links.add(url)
-
- num_after = len(set_links)
-
- # Finished?
- if (num_before == num_after):
- logger.debug("Iterated {} pages on GoogleNews general search".format(i))
- break
- # To list
- list_news = list(set_links)
- elif (self.search_category == "news"):
- # Search
- g.get_news(self.search)
- # Fetch
- list_news = g.get_links()
-
- except Exception as e:
- logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
- list_news = []
-
- # Bypass Google links
- list_news_redirections = GoogleByPass().bypass_google_urls(list_news)
-
- return list_news_redirections
-
-class FetcherDuckDuckGo(FetcherAbstract):
- def __init__(self, search, search_category, period, lang="wt", region="wt"):
- assert(search_category in ["news", "general"])
- assert(period in ["d", "w", "m", "y"])
- self.search = search
- self.search_category = search_category
- self.period = period
- self.lang_region = "{}-{}".format(lang, region)
- self.name = "duckduckgo {} {} {} {}".format(search, search_category, "1{}".format(period), region)
-
- def _fetch(self):
- try:
- list_news = []
- with DDGS(timeout=10) as ddgs:
- if (self.search_category == "general"):
- generator_links = ddgs.text(keywords=self.search, timelimit=self.period, region=self.lang_region)
- elif (self.search_category == "news"):
- generator_links = ddgs.news(keywords=self.search, timelimit=self.period, region=self.lang_region)
-
- for l in generator_links:
- list_news.append( l.get("url", l.get("href")) )
-
- except Exception as e:
- logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
- list_news = []
- return list_news
-
-
-class FetcherSearxNews(FetcherAbstract):
- def __init__(self, search="child abuse", searx_instance="https://serx.ml/", lang="en", region="US", search_category="news", period="day"):
- assert(search_category in ["news", "general"])
- assert(period in [None, "day", "week", "month", "year"])
- # Random header (minimize prob of web-scrapping detection)
- self.headers = {
- 'User-agent': str(np.random.choice(user_agents_list)),
- 'Accept-Encoding': 'gzip, deflate',
- 'Accept': '*/*',
- 'Connection': 'keep-alive',
- }
- """ # Optional header
- self.headers = {
- 'User-agent': str(np.random.choice(user_agents_list)),
- 'Accept-Encoding': 'gzip, deflate, br',
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
- 'Connection': 'keep-alive',
- 'Upgrade-Insecure-Requests': '1',
- 'TE': 'trailers',
- 'Sec-Fetch-Site': 'cross-site',
- 'Sec-Fetch-Mode': 'navigate',
- 'Sec-Fetch-Dest': 'document',
- }
- """
- self.search = search
- self.searx_instance = searx_instance
- self.lang_region = "{}-{}".format(lang, region)
- self.search_category = search_category
- self.period = period
- self.t_sleep_lower, self.t_sleep_higher = 0.5, 1.5
- self.request_timeout = 240
-
- period_name_mapping = {
- None: "no_date_range",
- "day": "1d",
- "week": "1w",
- "month": "1m",
- "year": "1y",
- }
- self.name = "searxng {} {} {} {} {}".format(searx_instance.replace("https://", "").replace("/", ""), search, search_category, period_name_mapping[period], self.lang_region)
- logger.info("SearX - Initialized SearX fetcher: {}".format(self.name))
-
- def _request_and_decode(self, url_search):
- # Initial random time sleep (minimize chance of getting blocked)
- time.sleep(random.uniform(self.t_sleep_lower, self.t_sleep_higher))
- # Request
- logger.debug("SearX - Searching: {}".format(url_search))
- try:
- r = requests.get(url_search, headers=self.headers, timeout=self.request_timeout)
- except Exception as e:
- logger.warning("SearX - Exception in request: {}".format(url_search), "\n", str(e))
- return []
-
- if (r.status_code == 200):
- # Status code Ok
- pass
- elif (r.status_code == 429):
- # TooManyRequests, "Rate limit exceeded"
- logger.warning("SearX {} - Too many requests while running: {}. Request output: {}".format(self.name, r.url, r.text))
- return []
- elif (r.status_code != 200):
- logger.warning("SearX {} - Status code: {}. Request output: {}".format(self.name, r.status_code, r.text))
- return []
- else:
- logger.debug("SearX - Status code: {}".format(r.status_code))
-
- # Decode request
- soup = BeautifulSoup(r.text, 'html.parser')
- page_url_set = set()
- # h3 links
- for elem in soup.find_all('h3'):
- # Get url
- url = elem.find('a').get('href')
- page_url_set.add(url)
- return page_url_set
-
- def _get_news_list(self):
- ############################################################
- # Domain & search parameter
- search_domain = os.path.join(self.searx_instance, "search?q=")
- # Search keywords
- search_formatted = self.search.replace(" ", "+").replace(":", "%3A")
- # Period formatted
- period_formatted = "&time_range={}".format(self.period) if self.period is not None else ""
- # Search parameters
- search_parameters = "&category_{}=on&language={}{}".format(self.search_category, self.lang_region, period_formatted)
- # Combined url search
- url_search_nopage = "{}{}{}".format(search_domain, search_formatted, search_parameters)
- ############################################################
-
- # Request and decode on page=1
- url_set = self._request_and_decode(url_search_nopage)
- # No results?
- if (len(url_set) == 0):
- logger.warning("SearX {} - Empty results on search: {}".format(self.name, url_search_nopage))
- return []
-
- # Iterate pages
- search_numpage = 2
- while True:
- # Combine url search with page number
- url_search_with_page = "{}&pageno={}".format(url_search_nopage, search_numpage)
- # Request and decode on page=X
- url_set_i = self._request_and_decode(url_search_with_page)
-
- # Length before merging
- length_current = len(url_set)
- # Merge
- url_set = url_set.union(url_set_i)
- # Length after merging
- length_merged = len(url_set)
-
- # No new elements?
- if (length_current == length_merged):
- logger.debug("SearX {} - Finished processing search, #pages: {}".format(self.name, search_numpage))
- break
- # Next page
- search_numpage += 1
-
- return list(url_set)
-
- def _fetch(self):
- try:
- # Fetch news
- list_news = self._get_news_list()
- except Exception as e:
- logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
- list_news = []
- return list_news
diff --git a/app_urls/api/obsolete_src/google_bypass.py b/app_urls/api/obsolete_src/google_bypass.py
deleted file mode 100644
index 6e34e72..0000000
--- a/app_urls/api/obsolete_src/google_bypass.py
+++ /dev/null
@@ -1,26 +0,0 @@
-import requests
-import json
-from .logger import get_logger
-logger = get_logger()
-
-class GoogleByPass():
- def __init__(self) -> None:
- pass
-
- def bypass_google_urls(self, list_urls):
- if (len(list_urls) == 0):
- return []
-
- try:
- # Endpoint
- gbypass_endpoint = "http://selenium_app:80/get_redirection"
- # Timeout: 20 minutes
- timeout = 60*20
- r = requests.post(gbypass_endpoint, json={"list_urls": list_urls}, timeout=timeout)
- # Decode
- list_urls_redirections = json.loads(r.text).get("list_urls_redirections", [])
- except Exception as e:
- logger.warning("Exception on request: {}. {}".format(gbypass_endpoint, str(e)))
- list_urls_redirections = []
-
- return list_urls_redirections
diff --git a/app_urls/api/obsolete_src/logger.py b/app_urls/api/obsolete_src/logger.py
deleted file mode 100644
index 83f00b3..0000000
--- a/app_urls/api/obsolete_src/logger.py
+++ /dev/null
@@ -1,22 +0,0 @@
-import logging
-
-import os
-os.makedirs("logs", exist_ok=True)
-
-logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
-logger = logging.getLogger("news_fetcher")
-logger.setLevel(logging.INFO)
-
-# To file log: INFO / WARNING / ERROR
-fh = logging.handlers.RotatingFileHandler(filename="logs/log_app_fetcher.log", mode="a", maxBytes=10000000, backupCount=4)
-fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
-logger.addHandler(fh)
-
-# To file log: WARNING / ERROR
-fh_ = logging.handlers.RotatingFileHandler(filename="logs/log_app_fetcher_error.log", mode="a", maxBytes=10000000, backupCount=1)
-fh_.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
-fh_.setLevel(logging.WARNING)
-logger.addHandler(fh_)
-
-def get_logger():
- return logger
diff --git a/app_urls/api/obsolete_src/missing_kids_fetch.py b/app_urls/api/obsolete_src/missing_kids_fetch.py
deleted file mode 100644
index ea92cb7..0000000
--- a/app_urls/api/obsolete_src/missing_kids_fetch.py
+++ /dev/null
@@ -1,36 +0,0 @@
-from .db_utils import DB_Handler
-import requests
-import json
-from .logger import get_logger
-logger = get_logger()
-
-class MissingKidsFetch():
- def __init__(self, db_handler: DB_Handler, num_pages) -> None:
- logger.debug("Initializing News MissingKids")
- self.db_handler = db_handler
- self.num_pages = num_pages
- self.missingkids_fetch_endpoint = "http://selenium_app:80/get_missing_kids/?pages={}"
-
- def run(self):
- try:
- logger.debug("Starting NewsMissingKids.run()")
- try:
- # Timeout
- if (self.num_pages > 15):
- timeout = 60*90 # 1.5h
- else:
- timeout = 60*5 # 5 min
- # Request
- r = requests.get(self.missingkids_fetch_endpoint.format(self.num_pages), timeout=timeout)
- # Decode
- urls_fetched = json.loads(r.text).get("list_urls", [])
- except Exception as e:
- logger.warning("Timeout on request: {}. {}".format(missingkids_fetch_endpoint, str(e)))
- urls_fetched = []
-
- # URL fetching source
- source = "missingkids fetcher"
- # Write to DB
- self.db_handler.write_batch(urls_fetched, source)
- except Exception as e:
- logger.warning("Exception in NewsMissingKids.run(): {}".format(str(e)))
diff --git a/app_urls/api/obsolete_src/missing_kids_status.py b/app_urls/api/obsolete_src/missing_kids_status.py
deleted file mode 100644
index df0768a..0000000
--- a/app_urls/api/obsolete_src/missing_kids_status.py
+++ /dev/null
@@ -1,98 +0,0 @@
-from .db_utils import URL_DB_Writer
-from .url_utils import get_missing_kid_status
-from .logger import get_logger
-logger = get_logger()
-
-
-def get_missing_kid_status(url, return_canonical_url=False):
- import time
- import requests
-
- # Sleep
- time.sleep(0.75)
- try:
- # Request
- r = requests.get(url, timeout=300)
- # Decode
- status_code = r.status_code
- # Canonical URL removing parameters
- url_canonical = r.url
- except Exception as e:
- logger.warning("Exception on get URL status request: {}. {}".format(url, str(e)))
- status_code = None
- url_canonical = url
-
- if (status_code == 200):
- status = "valid"
- elif (status_code == 404):
- status = "invalid"
- else:
- status = "unknown"
-
- logger.debug("Missing Kid URL {} status: {}".format(url, status))
- if (return_canonical_url):
- return status, url_canonical
- else:
- return status
-
-class MissingKidsStatus():
- def __init__(self, db_connect_info, redis_connect_info, num_urls) -> None:
- self.num_urls = num_urls
- self.db_connect_info = db_connect_info
- self.redis_connect_info = redis_connect_info
- self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info)
-
- def update_missing_kids_status(self):
- try:
- logger.info("Starting updating status to Missing Kids URLs, limit #URLs: {}".format(self.num_urls))
- # List of URLs
- list_ids_and_urls = self.db_writer._get_missing_kids_urls(self.num_urls)
- # Dict: status -> IDs to update to new status
- dict_status_ids, dict_status_urls = {}, {}
- # Check URLs with invalid status?
- skip_invalid_check = False
-
- flush_every, flush_current = 20, 0
- # Iterate URLs
- for (id, url, current_status) in list_ids_and_urls:
- # Skip duplicate URLs
- if (current_status == "duplicate"):
- continue
- # Skip invalid URLs?
- if (skip_invalid_check):
- if (current_status == "invalid"):
- continue
-
- # Get status
- new_status = get_missing_kid_status(url)
- # Different? Update
- if (current_status != new_status):
- # Extend array
- dict_status_ids[new_status] = dict_status_ids.get(new_status, []) + [id]
- # Debugging dict
- dict_status_urls[new_status] = dict_status_urls.get(new_status, []) + [url]
- # +1 processed
- flush_current += 1
-
- # Flush batch?
- if (flush_every == flush_current):
- logger.info("Updating status to Missing Kids URLs: {}".format(dict_status_urls))
- # Update DB
- self.db_writer._update_urls_status(dict_status_ids)
- # Reset
- flush_current = 0
- dict_status_ids, dict_status_urls = {}, {}
-
- # Flush remaining batch
- if (flush_current > 0):
- logger.info("Updating status to Missing Kids URLs: {}".format(dict_status_urls))
- # Update DB
- self.db_writer._update_urls_status(dict_status_ids)
- # Reset
- flush_current = 0
- dict_status_ids, dict_status_urls = {}, {}
-
- logger.info("Finished updating status to Missing Kids URLs")
- except Exception as e:
- logger.warning("Exception in MissingKidsStatus.run(): {}".format(str(e)))
-
\ No newline at end of file
diff --git a/app_urls/api/obsolete_src/url_status.py b/app_urls/api/obsolete_src/url_status.py
deleted file mode 100644
index 3948417..0000000
--- a/app_urls/api/obsolete_src/url_status.py
+++ /dev/null
@@ -1,62 +0,0 @@
-from .db_utils import URL_DB_Writer
-from .url_utils import process_article
-from .logger import get_logger
-logger = get_logger()
-
-class UpdateErrorURLs():
- def __init__(self, db_connect_info, redis_connect_info, num_urls) -> None:
- self.num_urls = num_urls
- self.db_connect_info = db_connect_info
- self.redis_connect_info = redis_connect_info
- self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info)
-
- def update_error_urls_status(self):
- try:
- logger.info("Starting updating status to URLs with error, limit #URLs: {}".format(self.num_urls))
- # List of URLs with status 'error'
- list_ids_and_urls = self.db_writer._get_error_urls(self.num_urls)
- # Current status
- current_status = "error"
- # Dict: status -> IDs to update to new status
- dict_status_ids, dict_status_urls = {}, {}
-
- # Get list of (pattern, priority, status) tuples to override status if required
- list_pattern_status_tuple = self.db_writer._get_pattern_status_list()
- # Sort pattern tuples by priority
- list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True)
-
- flush_every, flush_current = 20, 0
- # Iterate URLs
- for (id, url) in list_ids_and_urls:
- # Get status
- url_canonical, article_elements, new_status = process_article(url, list_pattern_status_tuple)
- # Different? Update
- if (current_status != new_status):
- # Extend array
- dict_status_ids[new_status] = dict_status_ids.get(new_status, []) + [id]
- # Debugging dict
- dict_status_urls[new_status] = dict_status_urls.get(new_status, []) + [url]
- # +1 processed
- flush_current += 1
-
- # Flush batch?
- if (flush_every == flush_current):
- logger.info("Updating status to URLs with error: {}".format(dict_status_urls))
- # Update DB
- self.db_writer._update_urls_status(dict_status_ids)
- # Reset
- flush_current = 0
- dict_status_ids, dict_status_urls = {}, {}
-
- # Flush remaining batch
- if (flush_current > 0):
- logger.info("Updating status to URLs with error: {}".format(dict_status_urls))
- # Update DB
- self.db_writer._update_urls_status(dict_status_ids)
- # Reset
- flush_current = 0
- dict_status_ids, dict_status_urls = {}, {}
-
- logger.info("Finished updating status to URLs with error")
- except Exception as e:
- logger.warning("Exception in UpdateErrorURLs.run(): {}".format(str(e)))
diff --git a/app_urls/api/obsolete_src/url_utils.py b/app_urls/api/obsolete_src/url_utils.py
deleted file mode 100644
index ca57cb4..0000000
--- a/app_urls/api/obsolete_src/url_utils.py
+++ /dev/null
@@ -1,263 +0,0 @@
-from gnews import GNews
-import dateutil.parser
-from datetime import datetime, timedelta
-from .utils import remove_http_s
-import time
-import random
-import traceback
-import requests
-import json
-import re
-from bs4 import BeautifulSoup
-
-from .logger import get_logger
-logger = get_logger()
-
-def get_published_date(article):
- try:
- """
- # Already fetched publish date information?
- if (publish_date_ is not None):
- return publish_date_
- """
-
- # List of potential publish dates
- potential_dates = []
- # Publish date is the best match
- potential_dates.append(article.publish_date)
- # Publish date metadata is the following best match
- potential_dates.append(article.meta_data.get('article', {}).get("published_time", None))
- # Iterate remaining keys
- for key in article.meta_data.keys():
- if ("date" in key):
- potential_dates.append(article.meta_data[key])
-
- def invalid_date(p_date):
- # Today + 2 days, article from the future?
- today_plus_two = datetime.utcnow() + timedelta(days=2)
- # Article from the future?
- return p_date.timestamp() > today_plus_two.timestamp()
-
- for date_ in potential_dates:
- # String date? parse
- if (type(date_) == str):
- try:
- date_ = dateutil.parser.parse(date_)
- except Exception as e:
- logger.info("Invalid date found while parsing potential date: {} for URL: {}".format(date_, article.url))
- date_ = None
- # Valid?
- if (date_ is not None) and (not invalid_date(date_)):
- return date_
-
- logger.debug("Article with no published date: {}".format(article.url))
- return None
- except Exception as e:
- logger.info("Error while retrieving published date for URL: {}".format(article.url))
- return None
-
-def get_url_host(article_source_url, url):
- # https://www.blabla.com/blabla -> www.blabla.com
- if (article_source_url != ""):
- # Article source URL already extracted, save path if any
- return remove_http_s(article_source_url) # .split("/")[0]
- else:
- return remove_http_s(url).split("/")[0]
-
-def get_status_pattern_matching(url, article_status, list_pattern_status_tuple):
- # Regex pattern to update status on "valid", "invalid", and "unknown" status only
- # Status "raw", "duplicated" and "error" should remain the way they are
- # Assumption: List of patterns sorted by importance
- if (article_status in ["valid", "invalid", "unknown"]):
- # Regular expression pattern matching: https://regexr.com/
- for regex_pattern, regex_priority, status_if_match in list_pattern_status_tuple:
- # Matching?
- matching = bool(re.match(regex_pattern, url))
- # Update article status
- if (matching):
- if (status_if_match != article_status):
- logger.debug("Regex pattern found, updating status from '{}' to '{}' for URL: {}".format(article_status, status_if_match, url))
- return status_if_match
- # Pattern matching not required or not found, original article status
- return article_status
-
-
-
-def bypass_google_link(article_url):
-
- def bypass_google_consent(article_url):
- # Sample URL: https://consent.google.com/m?continue=https://news.google.com/rss/articles/CBMiMGh0dHBzOi8vd3d3Lm1pc3NpbmdraWRzLm9yZy9wb3N0ZXIvbmNtYy84NjAxMTkvMdIBAA?oc%3D5&gl=NL&m=0&pc=n&cm=2&hl=en-US&src=1
- article_url_no_consent = article_url.replace("https://consent.google.com/m?continue=", "")
-
- # https://stackoverflow.com/questions/76063646/how-can-i-have-redirection-link-from-google-news-link-using-requests
- headers = {
- 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
- }
- cookies = {'CONSENT': 'YES+cb.20220419-08-p0.cs+FX+111'}
-
- try:
- # Request
- r = requests.get(article_url_no_consent, headers=headers, cookies=cookies, timeout=300)
- # Decode
- soup = BeautifulSoup(r.text, 'html.parser')
- url_of_interest = soup.a['href']
- except Exception as e:
- logger.warning("Exception on request trying to G_bypass with headers: {}. {}".format(article_url_no_consent, str(e)))
- url_of_interest = None
-
- # Not able to bypass?
- if (url_of_interest == "") or ("support.google.com" in url_of_interest) or ("news.google.com" in url_of_interest):
- url_of_interest = None
- return url_of_interest
-
- def bypass_google_using_service(article_url):
- try:
- # e.g.: url = "https://news.google.com/articles/CBMiX2h0dHBzOi8vd3d3LmZveGJ1c2luZXNzLmNvbS9wb2xpdGljcy9kaXNuZXktc3Vlcy1mbG9yaWRhLWdvdi1yb24tZGVzYW50aXMtbG9zcy1zcGVjaWFsLWRpc3RyaWN00gEA?hl=en-US&gl=US&ceid=US%3Aen"
- gbypass_endpoint = "http://selenium_app:80/get_redirection"
- # Timeout: 5 minutes
- r = requests.post(gbypass_endpoint, json={"url": article_url}, timeout=300)
- # Decode
- redirect_url = json.loads(r.text).get("redirect_url", "")
- except Exception as e:
- logger.warning("Exception on request: {}. {}".format(gbypass_endpoint, str(e)))
- redirect_url = ""
-
- return redirect_url
-
- logger.debug("Starting gbypass_endpoint()")
-
- article_url_bypassed = None
- # Bypass using request
- if ("consent.google.com" in article_url):
- article_url_bypassed = bypass_google_consent(article_url)
- # Not bypassed yet? Bypass using service
- if (article_url_bypassed is None):
- article_url_bypassed = bypass_google_using_service(article_url)
-
- # if (article_url_bypassed is None) or (article_url_bypassed == "") or ("news.google.com" in article_url_bypassed):
- if (article_url_bypassed == "") or (article_url_bypassed is None):
- # Empty URL returned by Gbypass
- logger.warning("Error while bypassing Gnews for URL: {}".format(article_url))
- return None
- else:
- logger.debug("Correctly bypassed GNews to URL_redirect, from URL: {} {}".format(article_url_bypassed, article_url))
- return article_url_bypassed
-
-def process_article(article_url, list_pattern_status_tuple, language="en"):
- # TODO:
- """
- https://github.com/fhamborg/news-please
- https://github.com/fhamborg/Giveme5W1H
-
- https://github.com/santhoshse7en/news-fetch
- """
- try:
- logger.debug("Starting process_article()")
-
- if ("news.google.com" in article_url) or ("consent.google.com" in article_url):
- # Bypass to get redirection
- article_url = bypass_google_link(article_url)
- # Error?
- if (article_url is None):
- return None, {}, "error"
- elif ("missingkids.org/poster" in article_url):
- # Get status
- article_status, url_canonical = get_missing_kid_status(article_url, return_canonical_url=True)
- article_elements = {
- "url_full": article_url,
- "url_canonical": url_canonical
- }
- return url_canonical, article_elements, article_status
- else:
- # Avoid Too many requests (feeds, ...)
- time.sleep(0.75)
-
- logger.debug("Processing: {}".format(article_url))
-
- # Default status unless something happens
- article_status = "valid"
-
- # Parse article
- # TODO: :param proxy: The proxy parameter is a dictionary with a single key-value pair. self._proxy = {'http': proxy, 'https': proxy} if proxy else None
- # TODO: Language per config
- article = GNews(language).get_full_article(url=article_url)
-
- # Article parsed?
- if (article is None) or (not article.is_parsed):
- logger.debug("Article not parsed: {}".format(article_url))
- return article_url, {}, "error"
-
- # Canonical link as main URL
- url_canonical = article.canonical_link
- # Empty canonical URL?
- if (article.canonical_link is None) or (article.canonical_link == ""):
- # URL with parameters? e.g. some zerohedge news fetched from newspaper3k end with #comment-stream -> Remove extra parameter in link
- if ("?" in article.url) or (article.url.endswith("#comment-stream")) or (article.url.endswith("#disqus_thread")):
- logger.debug("Article URL contains parameters, trying to clean URL: {}".format(article.url))
- try:
- # Remove text after parameter call
- url = article.url.split("?")[0]
- # Remove comment-stream
- url = url.replace("#comment-stream", "").replace("#disqus_thread", "")
- # Article
- article_attempt = GNews(language).get_full_article(url=url)
- # Retrieving same title? Update article based on clean URL
- if (article_attempt is not None) and (article_attempt.title == article.title):
- article = article_attempt
- except Exception as e:
- logger.info("Article parsing of URL without parameters failed: {}".format(article.url))
- else: # Default behaviour
- logger.debug("Article canonical link is empty, assuming URL=URL_CANONICAL: {}".format(article.url))
-
- # By default, URL same as canonical
- url_canonical = article.url
-
- elif (article.url != article.canonical_link):
- # If different, stick to canonical URL
- logger.debug("Article URL and canonical link are different: {} {}".format(article.url, article.canonical_link))
- else:
- # If same, continue...
- pass
-
- # Update config to determine if content is valid
- article.config.MIN_WORD_COUNT = 150
- article.config.MIN_SENT_COUNT = 6
-
- # Valid URL?
- if (not article.is_valid_url()):
- logger.debug("Not a valid news article: {}".format(url_canonical))
- article_status = "invalid"
- # Is the article's body text long enough to meet standard article requirements?
- if (not article.is_valid_body()):
- logger.debug("Article body not valid: {}".format(url_canonical))
- article_status = "unknown"
-
- if (article.images != article.imgs):
- logger.debug("Article images and imgs are different: {} {}".format(article.images, article.imgs))
-
- # article.keywords, article.meta_keywords, article.summary
- # article.movies
- # article.top_image
-
- # Check if article status needs to be updated
- article_status = get_status_pattern_matching(url_canonical, article_status, list_pattern_status_tuple)
-
- article_elements = {
- 'url_full': article.url, # https://www.breitbart.com/tech/2022/10/03/report-election-integrity-project-worked-with-feds-to-censor-news-sites-in-2020/
- 'url_host': get_url_host(article.source_url, url_canonical), # www.breitbart.com
- 'title': article.title, # Report: ‘Election Integrity’ Partnership Worked with Feds to Censor News Sites in 2020
- 'description': article.meta_description, # Coalition committed to respond in ‘early 2022’ but failed to do so, while Labor has not issued a full response since taking office
- 'text': article.text, # ${Article content}
- 'published_date': get_published_date(article), # python.datetime format, obtained from "YYYY-MM-DD" or '2022-10-03T20:54:17+00:00'
- 'authors': article.authors, # ['Christopher Knaus']
- 'language': article.meta_lang, # en
- 'tags': list(article.tags), # ['Wide Open Border', '’My Son Hunter’ Movie', ...]
- 'images': list(article.images), # [URL_IMAGE_1, URL_IMAGE_2, ...]
- 'url_canonical': url_canonical, # Canonical URL (redirection)
- # 'html': article.html, # HTML article
- }
- logger.debug("Processing OK: {}".format(url_canonical))
- return url_canonical, article_elements, article_status
- except Exception as e:
- logger.warning("Exception processing url: {}\n{}".format(article_url, traceback.format_exc()))
- return None, {}, "error"
\ No newline at end of file
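The deleted get_status_pattern_matching() is the clearest statement of the regex override rule that the surviving code still applies (see the status_pattern_match branch kept in db_utils.py below). A self-contained sketch of that rule; the pattern tuple in the demo is made up:

```python
import re

def status_by_pattern(url, current_status, pattern_status_tuples):
    """Override parse-derived statuses via regex rules; (pattern, priority, status) tuples
    are assumed to be pre-sorted by priority, as in the deleted code."""
    # "raw", "duplicate" and "error" are never overridden
    if current_status not in ("valid", "invalid", "unknown"):
        return current_status
    for regex_pattern, _priority, status_if_match in pattern_status_tuples:
        if re.match(regex_pattern, url):
            return status_if_match
    return current_status

# Hypothetical rule: youtube.com links are never valid news articles
rules = [(r"^https://www\.youtube\.com/", 10, "invalid")]
print(status_by_pattern("https://www.youtube.com/watch?v=abc", "valid", rules))  # -> invalid
```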
diff --git a/app_urls/api/obsolete_src/utils.py b/app_urls/api/obsolete_src/utils.py
deleted file mode 100644
index 76ae07a..0000000
--- a/app_urls/api/obsolete_src/utils.py
+++ /dev/null
@@ -1,33 +0,0 @@
-
-def remove_http_s(url):
- url = url.replace("https://", "") if url.startswith("https://") else url
- url = url.replace("http://", "") if url.startswith("http://") else url
- return url
-
-def is_valid_url(url):
- if (url.startswith("https://")):
- return True
- else:
- return False
-
-def get_searxng_instances():
- # SearxNG instances: https://searx.space/
- searx_instances = set()
- searx_instances.add("https://searx.work/")
- searx_instances.add("https://search.ononoki.org/")
- searx_instances.add("https://searxng.nicfab.eu/")
- searx_instances.add("https://searx.be/")
-
- # searx_instances.add("https://searx.fmac.xyz/")
- # searx_instances.add("https://northboot.xyz/") # FIX
-
- # searx_instances.add("https://serx.ml/") # Offline
- # searx_instances.add("https://searx.ru/")
- # searx_instances.add("https://searx.sp-codes.de/")
- # searx_instances.add("https://searxng.nicfab.eu/")
- # searx_instances.add("https://s.frlt.one/")
- # searx_instances.add("https://search.sapti.me/")
-
- # To list
- list_searx_instances = list(searx_instances)
- return list_searx_instances
\ No newline at end of file
diff --git a/app_urls/api/src/db_utils.py b/app_urls/api/src/db_utils.py
index 6ea318e..96799fe 100644
--- a/app_urls/api/src/db_utils.py
+++ b/app_urls/api/src/db_utils.py
@@ -14,8 +14,6 @@ class DB_Handler():
self._cache_timeout_insert_url = 86400
# Processing error URL, cache time: 2 days
self._cache_timeout_error_url = 86400*2
- # URL host slowdown
- self.url_host_slowdown_seconds = 5
def insert_raw_urls(self, urls, obj_source, obj_search):
try:
@@ -90,13 +88,6 @@ class DB_Handler():
if (obj_url.status != status):
obj_url.status = status
obj_url.save()
- # updating_urls.append(obj_url)
-
- # TODO: Fix enum type issue. Bulk update instead of .save() for each object
- # List of objects to bulk update
- # updating_urls = []
- # ... general processing, append to updating_urls
- # Urls.objects.bulk_update(updating_urls, ['status'])
##### Filter URL? -> Invalid
if (status_pattern_match == "invalid"):
diff --git a/app_urls/api/tasks.py b/app_urls/api/tasks.py
index 3e4d163..a390ebe 100644
--- a/app_urls/api/tasks.py
+++ b/app_urls/api/tasks.py
@@ -1,11 +1,11 @@
-from django_rq import job
+# from django_rq import job
+from scheduler import job
from .src.fetch_feed import FetchFeeds
from .src.fetch_parser import FetchParser
from .src.fetch_search import FetchSearcher
from .src.db_utils import DB_Handler
'''
-from src.fetch_search import FetchSearcher
from src.missing_kids_fetch import MissingKidsFetch
from src.missing_kids_status import MissingKidsStatus
'''
@@ -13,12 +13,56 @@ from src.missing_kids_status import MissingKidsStatus
from .src.logger import get_logger
logger = get_logger()
-@job
+@job('default')
def fetch_feeds():
- logger.info("Task triggered: {}".format("FetchFeeds"))
+ task = "Fetch Feeds"
+ logger.info("Task triggered: {}".format(task))
FetchFeeds().run()
+ logger.info("Task completed: {}".format(task))
-@job
+@job('default')
+def fetch_parser():
+ task = "Fetch Parser"
+ logger.info("Task triggered: {}".format(task))
+ FetchParser().run()
+ logger.info("Task completed: {}".format(task))
+
+@job('default')
+def fetch_search():
+ task = "Fetch Search"
+ logger.info("Task triggered: {}".format(task))
+ FetchSearcher().run()
+ logger.info("Task completed: {}".format(task))
+
+# TODO: fetch_missing_kids()
+
+@job('default')
+def process_raw_urls(batch_size=50):
+ task = "Process raw URLs"
+ logger.info("Task triggered: {}".format(task))
+ DB_Handler().process_raw_urls(batch_size=batch_size)
+ logger.info("Task completed: {}".format(task))
+
+@job('default')
+def process_error_urls(batch_size=50):
+ task = "Process error URLs"
+ logger.info("Task triggered: {}".format(task))
+ DB_Handler().process_error_urls(batch_size=batch_size)
+ logger.info("Task completed: {}".format(task))
+
+@job('default')
+def process_missing_kids_urls(batch_size=50):
+ task = "Process Missing Kids URLs"
+ logger.info("Task triggered: {}".format(task))
+ DB_Handler().process_missing_kids_urls(batch_size=batch_size)
+ logger.info("Task completed: {}".format(task))
+
+
+@job('default')
def background_task(process_type: str):
logger.info("Task triggered: {}".format(process_type))
@@ -46,21 +90,11 @@ def background_task(process_type: str):
'''
-
- elif (process_type == "search") or (process_type == "search_full"):
- FetchSearcher(cred.db_connect_info, cred.redis_connect_info, full=True).run()
- elif (process_type == "search_reduced"):
- FetchSearcher(cred.db_connect_info, cred.redis_connect_info, full=False).run()
-
# Selenium based
elif (process_type == "fetch_missing_kids_reduced"):
MissingKidsFetch(db_handler, num_pages=4).run()
elif (process_type == "fetch_missing_kids_full"):
MissingKidsFetch(db_handler, num_pages=100000).run()
-
- else:
- logger.error("Task error, unknown type: {}".format(process_type))
- return
'''
logger.info("Task completed: {}".format(process_type))
diff --git a/app_urls/api/templates/item_list.html b/app_urls/api/templates/item_list.html
new file mode 100644
index 0000000..f33e579
--- /dev/null
+++ b/app_urls/api/templates/item_list.html
@@ -0,0 +1,508 @@
+
+
+
+
+
+ News
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% include 'item_list_partial.html' %}
+
+
+
+
+
+
+
+
diff --git a/app_urls/api/templates/item_list_partial.html b/app_urls/api/templates/item_list_partial.html
new file mode 100644
index 0000000..d41c3ea
--- /dev/null
+++ b/app_urls/api/templates/item_list_partial.html
@@ -0,0 +1,87 @@
+{% load custom_filters %}
+
+
+
+
+
+ | URL |
+ Fetch date |
+ Sources |
+ Status |
+ Action |
+
+
+
+ {% for item in page_obj %}
+
+ | {{ item.url }} |
+ {{ item.ts_fetch }} |
+
+ {% with sources_map|dict_get:item.id as sources %}
+ {% if sources %}
+ {% for source in sources %}
+ {{ source }}
+ {% endfor %}
+ {% else %}
+ No sources
+ {% endif %}
+ {% endwith %}
+ |
+
+ {% if item.status == 'raw' %}
+ {{ item.status|capfirst }}
+ {% elif item.status == 'error' %}
+ {{ item.status|capfirst }}
+ {% elif item.status == 'valid' %}
+ {{ item.status|capfirst }}
+ {% elif item.status == 'unknown' %}
+ {{ item.status|capfirst }}
+ {% elif item.status == 'invalid' %}
+ {{ item.status|capfirst }}
+ {% elif item.status == 'duplicate' %}
+ {{ item.status|capfirst }}
+ {% else %}
+ Unknown
+ {% endif %}
+ |
+
+ Details
+ |
+
+
+ {% empty %}
+
+ | No items available. |
+
+ {% endfor %}
+
+
+
+
+
+
+
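item_list_partial.html is also what views.news() returns for AJAX requests, wrapped as {"items_html": ...}, so the page's JavaScript (largely stripped from the templates above) can swap the table body in place. A rough sketch of that round trip from Python; the filter values are examples:

```python
import requests

resp = requests.get(
    "http://localhost:8000/api/url/",
    params={"page": 2, "items": 50, "status": "valid,unknown"},
    headers={"X-Requested-With": "XMLHttpRequest"},  # selects the JSON branch in views.news()
    timeout=30,
)
print(resp.json()["items_html"][:200])  # rendered table rows for the requested page
```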
diff --git a/app_urls/api/templates/url_detail.html b/app_urls/api/templates/url_detail.html
new file mode 100644
index 0000000..d920ff2
--- /dev/null
+++ b/app_urls/api/templates/url_detail.html
@@ -0,0 +1,211 @@
+
+
+
+
+
+ {% block title %}News{% endblock %}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
URL Details
+
+
+ | URL |
+ {{ url_item.url }} |
+
+
+ | Fetch Date |
+ {{ url_item.ts_fetch }} |
+
+
+ | Sources |
+ {{ sources|join:", " }} |
+
+
+ | Status |
+ {{ url_item.status }} |
+
+
+ | Title |
+ {{ url_content.title }} |
+
+
+ | Description |
+ {{ url_content.description }} |
+
+
+ | Content |
+ {{ url_content.content }} |
+
+
+ | Tags |
+ {{ url_content.tags }} |
+
+
+ | Authors |
+ {{ url_content.authors }} |
+
+
+ | Image URLs |
+ {{ url_content.image_urls }} |
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Loading...
+
+
+
+
+
+
+
+ {% block extra_js %}{% endblock %}
+
+
diff --git a/app_urls/api/templatetags/__init__.py b/app_urls/api/templatetags/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app_urls/api/templatetags/custom_filters.py b/app_urls/api/templatetags/custom_filters.py
new file mode 100644
index 0000000..f4ad62b
--- /dev/null
+++ b/app_urls/api/templatetags/custom_filters.py
@@ -0,0 +1,8 @@
+from django import template
+
+register = template.Library()
+
+@register.filter
+def dict_get(dictionary, key):
+ """Custom filter to get a value from a dictionary in Django templates."""
+ return dictionary.get(key, [])
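The filter exists because Django templates cannot index a dictionary with a variable key; item_list_partial.html calls it as sources_map|dict_get:item.id. A quick sketch of the expected behaviour (the sample data is invented):

```python
from api.templatetags.custom_filters import dict_get

# sources_map maps a Urls id to its list of source names, as the commented-out
# block in views.news() would build it
sources_map = {1: ["rss_feed", "searxng"], 2: []}

print(dict_get(sources_map, 1))   # ['rss_feed', 'searxng']
print(dict_get(sources_map, 99))  # [] -- missing keys fall back to an empty list
```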
diff --git a/app_urls/api/urls.py b/app_urls/api/urls.py
index 78e35f1..b7c08ec 100644
--- a/app_urls/api/urls.py
+++ b/app_urls/api/urls.py
@@ -1,7 +1,10 @@
from django.urls import path
-from .views import trigger_task, link_list
+from . import views
urlpatterns = [
- path('links', link_list, name='link_list'),
- path('<str:task>', trigger_task, name='trigger_task'),
+ path('', views.link_list, name='link_list'),
+ path('url/', views.news, name='url_list'),
+ path('url/<int:id>/', views.url_detail_view, name='url_detail'),
+ path('url/<int:id>/fetch/', views.fetch_details, name='fetch_details'),
+ path('task/<str:task>', views.trigger_task, name='trigger_task'),
]
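Mounted under api/ in core/urls.py, the named routes resolve as below (a sketch assuming the <int:id> and <str:task> converters restored above):

```python
# e.g. in `python manage.py shell`
from django.urls import reverse

reverse("url_list")                             # '/api/url/'
reverse("url_detail", args=[42])                # '/api/url/42/'
reverse("fetch_details", args=[42])             # '/api/url/42/fetch/'
reverse("trigger_task", args=["fetch_feeds"])   # '/api/task/fetch_feeds'
```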
diff --git a/app_urls/api/views.py b/app_urls/api/views.py
index 07b8294..b503803 100644
--- a/app_urls/api/views.py
+++ b/app_urls/api/views.py
@@ -1,30 +1,130 @@
-import django_rq
-from django.http import JsonResponse
+# import django_rq
from .tasks import background_task
+from django.http import JsonResponse
import os
-from .src.logger import get_logger
-logger = get_logger()
-
-# TODO: Queues with priorities, process_raw_urls, process_error_urls least priority due to slowdown logic
def trigger_task(request, task):
- """View that enqueues a task."""
+ # View that enqueues a task
- """
- if ("fetch_" in task):
- priority = "low"
- job_timeout="30m"
- elif ("process_" in task):
- priority = "medium"
- job_timeout="30m"
- """
+ # Enqueue function in "default" queue
+ background_task.delay(task)
+ return JsonResponse({"message": "Task has been enqueued!", "task": task})
- queue = django_rq.get_queue('default') # Get the default queue
- job = queue.enqueue(background_task, task, job_timeout="30m")
- return JsonResponse({"message": "Task has been enqueued!", "job_id": job.id})
+ # queue = django_rq.get_queue('default') # Get the default queue
+ # job = queue.enqueue(background_task, task, job_timeout="30m")
+ # return JsonResponse({"message": "Task has been enqueued!", "job_id": job.id})
def link_list(request):
- prefix = "http://localhost:8000/api"
+ prefix = "http://localhost:8000/api/task"
links = ["fetch_feeds", "fetch_parser", "fetch_search", "process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_500000"]
+
db_links = ["http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id&limit=500"]
- return JsonResponse({"links": db_links + [os.path.join(prefix, l) for l in links]})
+ return JsonResponse({"links": ["http://localhost:8000/api/url"] + db_links + [os.path.join(prefix, l) for l in links]})
+
+
+from django.http import StreamingHttpResponse, HttpResponse, JsonResponse
+from django.shortcuts import render, get_object_or_404
+from django.core.paginator import Paginator
+import requests
+from django.http import StreamingHttpResponse
+import json
+import time
+import ollama
+
+from .models import Urls, Source, Search, UrlsSourceSearch, UrlContent
+
+# Create your views here.
+def news(request):
+ # URLs
+ urls = Urls.objects.all()
+ # Sources
+ sources = Source.objects.all()
+ searches = Search.objects.all()
+
+ # Parameters
+ page_number = request.GET.get("page", 1)
+ num_items = request.GET.get("items", 15)
+ source_ids = request.GET.get("sources", ','.join([str(s.id) for s in sources]))
+ status_filters = request.GET.get("status", None)
+
+ # Filters
+ if (status_filters) and (status_filters != "all"):
+ urls = urls.filter(status__in=status_filters.split(","))
+ if (source_ids) and (source_ids != "all"):
+ # TODO: Distinct needed?
+ # urls = urls.filter(urlssource__id_source__in=source_ids.split(",")).distinct()
+ pass
+
+ # Pagination
+ paginator = Paginator(urls, num_items)
+ page_obj = paginator.get_page(page_number)
+
+ # Map URL IDs to their sources, only for subset of URLs (page of interest)
+ sources_map = {}
+ """
+ sources_map = {
+ url.id: list(Source.objects.filter(urlssource__id_url=url).values_list('source', flat=True))
+ for url in page_obj.object_list
+ }
+ """
+
+ context = {
+ "page_obj": page_obj,
+ "sources": sources,
+ "sources_map": sources_map,
+ "list_status": Urls.STATUS_ENUM.values,
+ "list_urls_per_page": [15, 50, 100],
+ }
+
+ # If request is AJAX, return JSON response
+ if request.headers.get("X-Requested-With") == "XMLHttpRequest":
+ return JsonResponse({'items_html': render(request, 'item_list_partial.html', context).content.decode('utf-8')})
+
+ return render(request, "item_list.html", context)
+
+
+def url_detail_view(request, id):
+ url_item = get_object_or_404(Urls, id=id)
+ url_sources = list(Source.objects.filter(urlssource__id_url=url_item).values_list('source', flat=True))
+ try:
+ url_content = UrlContent.objects.get(pk=id)
+ except UrlContent.DoesNotExist:
+ url_content = {}
+
+ # TODO: https://github.com/ollama/ollama-python?tab=readme-ov-file#async-client
+ # LLM models available
+ client = ollama.Client(host = 'https://ollamamodel.matitos.org')
+ models = sorted([m.model for m in client.list().models])
+ # default_model = "llama3.2:3b"
+
+ context = {
+ 'url_item': url_item,
+ 'sources': url_sources,
+ 'models': models,
+ #'default_model': default_model,
+ 'prompt': "Provide in one paragraph the what, why, when, where, who, and how of the content below. Also provide a one paragraph summary of the content:",
+ #"prompt": "Image you are a journalist, TLDR in a paragraph:",
+ #"prompt": "Below you will find the whole content of a news article:\n{}\nProvide a concise summary of one paragraph maximum of the content.".format(content)
+ 'url_content': url_content,
+ }
+ return render(request, 'url_detail.html', context)
+
+def fetch_details(request, id):
+ url_item = get_object_or_404(Urls, id=id)
+ url_param = request.GET.get("url", "") # Get URL
+ model = request.GET.get("model", "") # Get LLM model
+ text = request.GET.get("text", "") # Get LLM prompt
+
+ # LLM
+ client = ollama.Client(host = 'https://ollamamodel.matitos.org')
+
+ def stream_response():
+ msg_content = {
+ "role": "user",
+ "content": text,
+ }
+ response = client.chat(model=model, messages=[msg_content], stream=True)
+ for chunk in response:
+ yield chunk["message"]["content"] # Stream each chunk of text
+
+ return StreamingHttpResponse(stream_response(), content_type="text/plain")
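fetch_details streams the Ollama completion back as chunked text/plain, which is what lets the detail page render the summary incrementally. A rough client-side sketch using requests; the host, id, model and prompt are placeholders:

```python
import requests

params = {
    "url": "https://example.org/some-article",              # placeholder
    "model": "llama3.2:3b",                                  # placeholder model name
    "text": "Summarize the article in one paragraph: ...",   # the prompt sent to the model
}
# /api/url/<id>/fetch/ as wired in api/urls.py; 42 is a placeholder id
with requests.get(
    "http://localhost:8000/api/url/42/fetch/",
    params=params,
    stream=True,
    timeout=300,
) as r:
    for chunk in r.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)
```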
diff --git a/app_urls/core/settings.py b/app_urls/core/settings.py
index 25720d3..9083892 100644
--- a/app_urls/core/settings.py
+++ b/app_urls/core/settings.py
@@ -38,7 +38,8 @@ INSTALLED_APPS = [
'django.contrib.sessions',
'django.contrib.messages',
'django.contrib.staticfiles',
- 'django_rq',
+ # 'django_rq',
+ 'scheduler',
'api',
]
@@ -92,7 +93,6 @@ DATABASES = {
CACHES = {
"default": {
- #"BACKEND": "django.core.cache.backends.redis.RedisCache",
"BACKEND": "django_redis.cache.RedisCache",
"LOCATION": "redis://{}:{}".format(
os.environ.get("REDIS_HOST", "localhost"),
@@ -105,15 +105,36 @@ CACHES = {
}
}
+'''
RQ_QUEUES = {
'default': {
'HOST': os.environ.get("REDIS_HOST", "localhost"),
'PORT': os.environ.get("REDIS_PORT", 6379),
'DB': os.environ.get("REDIS_DB", 0),
'DEFAULT_TIMEOUT': os.environ.get("RQ_DEFAULT_TIMEOUT", 900),
- 'DEFAULT_RESULT_TTL': os.environ.get("RQ_DEFAULT_RESULT_TTL", 3600),
+ # 'DEFAULT_RESULT_TTL': os.environ.get("RQ_DEFAULT_RESULT_TTL", 3600),
}
}
+'''
+
+# https://django-tasks-scheduler.readthedocs.io/en/latest/configuration/
+SCHEDULER_QUEUES = {
+ 'default': {
+ 'HOST': os.environ.get("REDIS_HOST", "localhost"),
+ 'PORT': os.environ.get("REDIS_PORT", 6379),
+ 'DB': os.environ.get("REDIS_DB", 0),
+ 'DEFAULT_TIMEOUT': os.environ.get("RQ_DEFAULT_TIMEOUT", 60*15),
+ #'USERNAME': 'some-user',
+ #'PASSWORD': 'some-password',
+ #'DEFAULT_TIMEOUT': 360,
+ }
+}
+SCHEDULER_CONFIG = {
+ 'EXECUTIONS_IN_PAGE': 20,
+ 'DEFAULT_RESULT_TTL': 60*60*12, # 12 hours
+ 'DEFAULT_TIMEOUT': os.environ.get("RQ_DEFAULT_TIMEOUT", 60*15), # 15 minutes
+ 'SCHEDULER_INTERVAL': 10, # 10 seconds
+}
# Password validation
# https://docs.djangoproject.com/en/5.1/ref/settings/#auth-password-validators
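One subtlety in both SCHEDULER_QUEUES and SCHEDULER_CONFIG: os.environ.get() returns a string whenever RQ_DEFAULT_TIMEOUT is set, while the 60*15 fallback is an int, so the timeout's type depends on the deployment. If the scheduler expects an integer, a small helper keeps it consistent (a sketch, not part of the diff):

```python
import os

def env_int(name: str, default: int) -> int:
    """Read an integer environment variable, falling back to `default`."""
    value = os.environ.get(name)
    return int(value) if value is not None else default

# e.g. in settings.py:
#   'DEFAULT_TIMEOUT': env_int("RQ_DEFAULT_TIMEOUT", 60 * 15),
```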
diff --git a/app_urls/core/urls.py b/app_urls/core/urls.py
index a0122ef..818f64a 100644
--- a/app_urls/core/urls.py
+++ b/app_urls/core/urls.py
@@ -19,5 +19,7 @@ from django.urls import path, include
urlpatterns = [
path('admin/', admin.site.urls),
- path('api/', include('api.urls'))
+ path('api/', include('api.urls')),
+ #path('scheduler/', include('django_rq.urls')),
+ path('scheduler/', include('scheduler.urls')),
]
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
index c77a75d..de4ea11 100644
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@@ -41,6 +41,17 @@ services:
ports:
- 8080:8080
+ matitos_dozzle:
+ container_name: dozzle
+ image: amir20/dozzle:latest
+ volumes:
+ - /var/run/docker.sock:/var/run/docker.sock:ro
+ ports:
+ - 8888:8080
+ environment:
+ - DOZZLE_FILTER="name=matitos_" # Need container name matitos_ ?
+
+
# django:
# Env: DB_HOST=matitos_db
# DJANGO_DB_NAME=${DB_DATABASE_NAME:-matitos}