Refactoring fetcher, working feeds and raw url writer

This commit is contained in:
Luciano Gervasoni
2025-03-12 17:56:40 +01:00
parent e124dbc21a
commit 61c31ee9aa
24 changed files with 2085 additions and 194 deletions

View File

@@ -118,14 +118,23 @@
" title TEXT,\n",
" description TEXT,\n",
" content TEXT,\n",
" valid_content BOOLEAN,\n",
" language CHAR(2), -- ISO 639-1 Code\n",
" keywords TEXT[],\n",
" tags TEXT[],\n",
" authors TEXT[],\n",
" image_urls TEXT[]\n",
" image_main TEXT,\n",
" images_url TEXT[],\n",
" videos_url TEXT[],\n",
" url_host TEXT, -- www.breitbart.com\n",
" site_name TEXT -- Breitbart News\n",
" );\n",
" CREATE INDEX idx_tags ON URL_CONTENT USING GIN(tags);\n",
" CREATE INDEX idx_authors ON URL_CONTENT USING GIN(authors);\n",
" CREATE INDEX idx_date_published ON URL_CONTENT (date_published);\n",
" CREATE INDEX idx_valid_content ON URL_CONTENT (valid_content);\n",
" CREATE INDEX idx_language ON URL_CONTENT (language);\n",
" CREATE INDEX idx_url_host ON URL_CONTENT (url_host);\n",
" \"\"\")\n",
"\n",
" # Feeds\n",

View File

@@ -101,95 +101,11 @@
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('https://foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert',\n",
" 'foxnews.com')"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# !pip install trafilatura trafilatura[all] cchardet\n",
"import courlan\n",
"url = \"https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n",
"url = \"https://foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n",
"courlan.check_url(url)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"import newspaper\n",
"\n",
"article = newspaper.article(url)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"datetime.datetime(2025, 3, 4, 4, 0, 31, tzinfo=tzoffset(None, -18000))"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"article.publish_date"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# !pip install trafilatura\n",
"import trafilatura\n",
@@ -197,9 +113,18 @@
"\n",
"url = \"https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n",
"# url = \"https://www.missingkids.org/poster/USVA/VA25-0820/1\"\n",
"url = \"https://www.bloomberg.com/news/articles/2025-03-12/eu-launches-metals-tariff-retaliation-on-26-billion-of-us-goods\"\n",
"\n",
"# Fetch\n",
"doc = trafilatura.fetch_url(url)\n",
"doc = trafilatura.fetch_url(url)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Content & metadata\n",
"metadata = trafilatura.extract_metadata(doc)\n",
"content = trafilatura.extract(doc)"
@@ -207,40 +132,9 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'author': 'Audrey Conklin',\n",
" 'body': <Element body at 0x7e22813ce400>,\n",
" 'categories': [],\n",
" 'comments': None,\n",
" 'commentsbody': <Element body at 0x7e22813ce180>,\n",
" 'date': '2025-03-03',\n",
" 'description': \"Disgraced parenting blogger and mom of six Ruby Franke's \"\n",
" '\"power\" and public image\" allowed her crimes against her '\n",
" 'children to go \"unchecked,\" according to a defense attorney.',\n",
" 'filedate': '2025-03-08',\n",
" 'fingerprint': None,\n",
" 'hostname': 'foxnews.com',\n",
" 'id': None,\n",
" 'image': 'https://static.foxnews.com/foxnews.com/content/uploads/2024/03/967e1c1b-Franke.jpg',\n",
" 'language': None,\n",
" 'license': None,\n",
" 'pagetype': 'article',\n",
" 'raw_text': None,\n",
" 'sitename': 'Fox News',\n",
" 'tags': [],\n",
" 'text': None,\n",
" 'title': \"Utah mommy blogger Ruby Franke's power, public image allowed child \"\n",
" \"abuse to go 'unchecked': expert\",\n",
" 'url': 'https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert'}\n"
]
}
],
"outputs": [],
"source": [
"pprint(metadata.as_dict())"
]
@@ -263,85 +157,50 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 18.6 ms, sys: 40 μs, total: 18.7 ms\n",
"Wall time: 18 ms\n"
]
},
{
"data": {
"text/plain": [
"'en'"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"'''\n",
"!pip install lingua-language-detector\n",
"import lingua\n",
"ld = lingua.LanguageDetectorBuilder.from_all_languages().build()\n",
"l = ld.detect_language_of(content)\n",
"'''\n",
"# !pip install newspaper4k\n",
"# !pip install langdetect \n",
"import newspaper\n",
"import langdetect\n",
"langdetect.DetectorFactory.seed = 0\n",
"langdetect.detect(content)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# !pip install newspaper4k"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import newspaper\n",
"\n",
"\n",
"\n",
"# url = \"https://www.missingkids.org/poster/USVA/VA25-0820/1\"\n",
"#url = \"https://www.waff.com/2025/03/11/colbert-heights-high-school-employee-arrested-child-abuse/\"\n",
"\n",
"\n",
"\n",
"#url = \"https://www.bloomberg.com/news/articles/2025-03-12/eu-launches-metals-tariff-retaliation-on-26-billion-of-us-goods\"\n",
"\n",
"\n",
"url = \"https://apnews.com/article/canada-trump-us-tariffs-steel-2517a6a2baf0596cb1a43d3a7d1e7939\"\n",
"url = \"https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n",
"url = \"https://www.missingkids.org/poster/USVA/VA25-0820/1\"\n",
"#url = \"https://www.ft.com/content/6d7c6915-4ceb-43fc-9896-590036b12a87\"\n",
"#url = \"https://www.lanacion.com.ar/politica/milei-en-bahia-blanca-un-viaje-sorpresa-para-frenar-las-criticas-y-mostrar-cercania-nid12032025/\"\n",
"#url = \"https://www.missingkids.org/poster/NCMC/2043547/1\"\n",
"\n",
"article = newspaper.article(url)\n",
"try:\n",
" article = newspaper.article(url)\n",
"except newspaper.ArticleException as e:\n",
" print(\"ArticleException: {}\".format(str(e)))\n",
"except Exception as e:\n",
" print(\"Err: {}\".format(str(e)))\n",
"\n",
"url_photo = set([i for i in article.images if \"api.missingkids.org/photographs\" in i])"
"# url_photo = set([i for i in article.images if \"api.missingkids.org/photographs\" in i])\n",
"# article.is_valid_url(), article.is_parsed, article.is_media_news(), article.is_valid_body()\n",
"article.meta_data\n",
"\n"
]
},
{
@@ -351,6 +210,13 @@
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,

View File

@@ -3,6 +3,36 @@
conda create -n matitos_urls python=3.12
conda activate matitos_urls
pip install django psycopg[binary] django-rq
pip install feedparser python-dateutil newspaper4k lxml[html_clean]
```
* From automated inspectdb
```
# 1) Inspect DB, generate models.py
python manage.py inspectdb
# 2) models.py, within class Urls, add:
class STATUS_ENUM(models.TextChoices):
RAW = "raw"
ERROR = "error"
VALID = "valid"
UNKNOWN = "unknown"
INVALID = "invalid"
DUPLICATE = "duplicate"
# Update status
status = models.TextField(choices=STATUS_ENUM, default=STATUS_ENUM.RAW) # This field type is a guess.
# To class Meta, add default ordering
class Meta:
managed = False
db_table = 'urls' # db_table = '{}_urls'.format(project_name)
ordering = ["-ts_fetch"]
# Fields default:
ts_fetch = models.DateTimeField(auto_now_add=True)
status = models.TextField(default='raw') # This field type is a guess.
```
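With the STATUS_ENUM snippet above in place, the enum members behave as plain strings in the ORM. A minimal usage sketch (the import path is an assumption, not necessarily the actual app name):
```
from matitos_urls.models import Urls  # module path is an assumption

# New rows pick up status="raw" (field default) and ts_fetch via auto_now_add
Urls.objects.create(url="https://example.com/some-article")

# TextChoices members compare as their string values, so they work directly in filters
raw_urls = Urls.objects.filter(status=Urls.STATUS_ENUM.RAW)
```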
* Environment variables
@@ -25,10 +55,20 @@ python manage.py makemigrations
python manage.py migrate --fake
```
* Deploy
```
# Server
python manage.py runserver
# Worker
python manage.py rqworker default
while true; do python manage.py rqworker default --burst; sleep 5; done
```
* Utils
```
python manage.py rqstats
python manage.py rqstats --interval=1 # Refreshes every second
python manage.py rqstats --json # Output as JSON
python manage.py rqstats --yaml # Output as YAML
```
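* Enqueue a job (reference sketch; the job function below is a placeholder, not one defined in this repo)
```
import django_rq

def example_fetch_job():
    # Placeholder body; the project points this at its fetcher classes
    print("fetching...")

# Same "default" queue served by `python manage.py rqworker default`
queue = django_rq.get_queue("default")
queue.enqueue(example_fetch_job)
```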

View File

@@ -55,8 +55,8 @@ class UrlContent(models.Model):
class Urls(models.Model):
url = models.TextField(unique=True)
ts_fetch = models.DateTimeField()
status = models.TextField() # This field type is a guess.
ts_fetch = models.DateTimeField(auto_now_add=True)
status = models.TextField(default='raw') # This field type is a guess.
class Meta:
managed = False

View File

@@ -0,0 +1,502 @@
import psycopg
import redis
import traceback
import random
import requests
import json
import os
from .url_utils import process_article
from .logger import get_logger
logger = get_logger()
# TODO: URL_DB_HANDLER, _get_search_list, _get_url_host, _get_url_host_list, ...
# The rest, elsewhere
class DB_Handler():
def __init__(self, db_connect_info, redis_connect_info):
logger.debug("Initializing URL DB writer")
self.db_connect_info = db_connect_info
self.redis_instance = redis.Redis(host=redis_connect_info.get("host"), port=redis_connect_info.get("port"))
self.redis_expiry_seconds = redis_connect_info.get("expiry_seconds", 172800) # Default: 48 hours
try:
self.redis_instance.ping()
logger.debug("Succesfully pinged Redis")
except Exception as e:
logger.warning("Error trying to ping Redis: {}".format(str(e)))
def get_urls_count(self, last_minutes_check):
#####################
### Get number of URLs within last X minutes
#####################
try:
# Update
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
num_urls = cursor.execute("SELECT COUNT(*) FROM URLS WHERE ts_fetch >= current_timestamp - interval '{} minutes';".format(last_minutes_check)).fetchone()[0]
except Exception as e:
logger.warning("Error updating URLs status: {}".format(str(e)))
num_urls = None
return num_urls
def _get_url_host_list(self):
try:
with psycopg.connect(self.db_connect_info) as conn:
# List of URL host
list_url_host = [l[0] for l in conn.execute("SELECT url_host FROM WEBSITE_OF_INTEREST;").fetchall()]
# Clean http / https from URLs
list_url_host = [l.replace("https://", "").replace("http://", "") for l in list_url_host]
# Clean last slash if exists
list_url_host = [ l if not l.endswith("/") else l[:-1] for l in list_url_host]
except Exception as e:
logger.warning("Exception fetching URL host list: " + str(e))
list_url_host = []
return list_url_host
def _get_search_list(self):
try:
with psycopg.connect(self.db_connect_info) as conn:
# List of keyword searches
list_search_text = [l[0] for l in conn.execute("SELECT keyword_search FROM SEARCH;").fetchall()]
except Exception as e:
logger.warning("Exception fetching searches list: " + str(e))
list_search_text = []
return list_search_text
def _get_feed_urls(self):
try:
with psycopg.connect(self.db_connect_info) as conn:
list_url_feeds = conn.execute("SELECT rss_feed FROM FEED;").fetchall()
# Decode (tuple with 1 element)
list_url_feeds = [l[0] for l in list_url_feeds]
except Exception as e:
logger.warning("Exception fetching RSS sites: " + str(e))
list_url_feeds = []
return list_url_feeds
def _get_url_hosts(self):
try:
with psycopg.connect(self.db_connect_info) as conn:
list_url_hosts = conn.execute("SELECT url_host FROM WEBSITE_OF_INTEREST;").fetchall()
# Decode (tuple with 1 element)
list_url_hosts = [l[0] for l in list_url_hosts]
except Exception as e:
logger.warning("Exception fetching RSS sites: " + str(e))
list_url_hosts = []
return list_url_hosts
def _format(self, values):
# Replace single quote ' with ''. Based on https://stackoverflow.com/a/12320729
# String -> 'string', Int -> '1' (string-based), None -> NULL (no quotes for pgSQL to interpret Null value)
if (type(values) == list) or (type(values) == tuple):
insert_args = "(" + ", ".join([ "NULL" if v is None else "'" + str(v).replace("'", "''") + "'" for v in values]) + ")"
elif (type(values) == str):
insert_args = "({})".format( "NULL" if values is None else "'" + values.replace("'", "''") + "'" )
else:
logger.warning("Error formatting input values: {}".format(values))
assert False
return insert_args
def _get_cached_canonical_url(self, url):
### Redis: URL processed recently? -> Avoid increasing SERIAL counter & efficiency of DB
try:
filter_url = self.redis_instance.get(url)
if (filter_url is not None):
filter_url = filter_url.decode("utf-8")
except Exception as e:
logger.warning("Exception querying Redis: {}".format(str(e)))
filter_url = None
return filter_url
def _update_urls_status(self, dict_status_ids):
#####################
### Update status to array of URL IDs
#####################
try:
# Update
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
# Autocommit at end of transaction (Atomic insert of URLs and sources)
with conn.transaction() as tx:
for key_status, value_ids in dict_status_ids.items():
cursor.execute("UPDATE URLS SET status='{}' WHERE id IN ({});".format(key_status, ",".join([str(v) for v in value_ids])))
except Exception as e:
logger.warning("Error updating URLs status: {}".format(str(e)))
def _get_missing_kids_urls(self, num_urls=None):
#####################
### Get list of Missing Kids URLs
#####################
try:
missing_kids_ids_and_urls = []
if (num_urls is None):
limit = 500
else:
limit = num_urls
offset = 0
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
while True:
# Query
missing_kids_ids_and_urls_query = cursor.execute("SELECT id, url, status FROM URLS WHERE url LIKE '%missingkids.org/poster%' ORDER BY ts_fetch DESC LIMIT {} OFFSET {};".format(limit, offset)).fetchall()
# Finished?
if (len(missing_kids_ids_and_urls_query) == 0):
break
# Extend
missing_kids_ids_and_urls = missing_kids_ids_and_urls + missing_kids_ids_and_urls_query
# Offset
offset += len(missing_kids_ids_and_urls_query)
# Stop?
if (num_urls is not None) and (len(missing_kids_ids_and_urls) >= num_urls):
break
except Exception as e:
logger.warning("Error getting Missing Kids URLs: {}".format(str(e)))
missing_kids_ids_and_urls = []
return missing_kids_ids_and_urls
def _get_error_urls(self, num_urls=None):
#####################
### Get list of URLs with status 'error'
#####################
try:
error_urls = []
if (num_urls is None):
limit = 500
else:
limit = num_urls
offset = 0
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
while True:
# Query
error_urls_query = cursor.execute("SELECT id, url FROM URLS WHERE status='error' ORDER BY ts_fetch DESC LIMIT {} OFFSET {};".format(limit, offset)).fetchall()
# Finished?
if (len(error_urls_query) == 0):
break
# Extend
error_urls = error_urls + error_urls_query
# Offset
offset += len(error_urls_query)
# Stop?
if (num_urls is not None) and (len(error_urls) >= num_urls):
break
except Exception as e:
logger.warning("Error getting Error URLs: {}".format(str(e)))
error_urls = []
return error_urls
def _decode_urls(self, urls_fetched, list_domains_to_filter, list_pattern_status_tuple): # TODO: language for urls_fetched...
"""
# TODO: REFACTOR
For each input url
Already processed?
-> Update on Redis expire time
-> Associate to source
Not processed? Get main URL:
-> URL Canonical valid?
-> Rely on this as main URL
-> URL Canonical not valid?
-> Use input url, unless it's a news.google.com link
-> If news.google.com link, filter out. REDIS?
Main URL processing:
-> Update in REDIS, association url -> url_canonical
-> url != url_canonical: Add in duplicate table
If both != news.google.com
"""
# URLs to insert, URLs duplicated association, URL to Canonical form
list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical = [], [], {}
# URL VS CANONICAL:
# News URL returned: https://news.google.com/articles/CBMifmh0dHBzOi8vd3d3LmJyZWl0YmFydC5jb20vMm5kLWFtZW5kbWVudC8yMDIzLzA0LzAzL2dvdi1kZXNhbnRpcy1zaWducy1iaWxsLW1ha2luZy1mbG9yaWRhLXRoZS0yNnRoLWNvbnN0aXR1dGlvbmFsLWNhcnJ5LXN0YXRlL9IBAA?hl=en-US&gl=US&ceid=US%3Aen
# Corresponds to canonical URL: https://www.breitbart.com/2nd-amendment/2023/04/03/gov-desantis-signs-bill-making-florida-the-26th-constitutional-carry-state/
for url in urls_fetched:
# Domain to filter? Input url
filter_due_to_domain = False
for domain_to_filter in list_domains_to_filter:
if (domain_to_filter in url):
logger.debug("Domain filter applied based on {} for input URL: {}".format(domain_to_filter, url))
filter_due_to_domain = True
if (filter_due_to_domain):
continue
# URL processed recently? -> Filter and avoid increasing SERIAL counter & efficiency of DB
cached_canonical_url = self._get_cached_canonical_url(url)
if (cached_canonical_url is not None):
# Even if url processed, need to add url_canonical to list_filtered_urls, so as to associate search source to canonical URL (canonical is the main URL entry)
dict_full_urls_to_canonical[url] = cached_canonical_url # X -> Y
# If url has been processed, so was its canonical form
logger.debug("Filtering out already inserted (processed) URL and its canonical form: {} {}".format(url, cached_canonical_url))
continue
# Process TODO: Add language...
url_canonical, article_elements, article_status = process_article(url, list_pattern_status_tuple)
# TODO: Store article_elements information to insert into OS after inserted into DB (and therefore having associated url_id)
# Could not retrieve redirection for news.google.com based URL? Continue (avoid inserting in DB)
if (url_canonical is None) and ("news.google.com" in url):
logger.debug("Filtering empty canonical link for base URL based on news.google.com: {}".format(url))
continue
# Canonical URL still news.google.com? Continue (avoid inserting in DB)
if (url_canonical is not None) and ("news.google.com" in url_canonical):
logger.debug("Filtering canonical news.google.com based URL: {}".format(url_canonical))
continue
# Domain to filter? Input canonical_url
filter_due_to_domain = False
for domain_to_filter in list_domains_to_filter:
if (url_canonical is not None) and (domain_to_filter in url_canonical):
filter_due_to_domain = True
if (filter_due_to_domain):
logger.info("Filtering due to domain input URL, Canonical_URL: {} {}".format(url, url_canonical))
continue
if (url_canonical is None) or (article_status == "error"):
logger.debug("Processing failed for URL: {}".format(url))
# Still insert URL with "error"? -> If processed later, might have inconsistent sources (url vs url_canonical). Only store if not news.google.com based
if ("news.google.com" in url) or ("consent.google.com" in url):
logging.debug("Not able to process Google News link, skipping: {}".format(url))
else:
dict_full_urls_to_canonical[url] = url # X -> X
list_insert_url_tuple_args.append( (url, article_status) )
continue
# URL was not processed (not sure canonical yet). Generate URL_CANONICAL <-> URL_ORIGINAL association if they're different
if (url_canonical != url):
list_tuple_canonical_duplicate_urls.append( (url_canonical, url) )
# Dict: url -> canonical (update association)
dict_full_urls_to_canonical[url] = url_canonical # X -> Y or X
# Canonical URL processed recently? -> Filter and avoid increasing SERIAL counter & efficiency of DB
if (self._get_cached_canonical_url(url_canonical) is not None):
# Canonical URL was already processed
logger.debug("Filtering out already inserted (processed) URL canonical: {}".format(url_canonical))
else:
# Insert url_canonical to DB formatted
list_insert_url_tuple_args.append( (url_canonical, article_status) )
# Canonical URL different? Process
if (url_canonical != url):
if ("news.google.com" in url) or ("consent.google.com" in url):
logging.debug("Not adding google.news.com based link, skipping: {}".format(url))
else:
# Fetched url -> duplicate (using canonical as main link)
article_status = "duplicate"
# Insert url (non-canonical) to DB formatted
list_insert_url_tuple_args.append( (url, article_status) )
return list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical
def _insert_urls(self, cursor, list_insert_url_tuple_args):
#####################
### Insert URLs with status
#####################
if (len(list_insert_url_tuple_args) > 0):
insert_args = ', '.join( [ self._format(t) for t in list_insert_url_tuple_args] )
# Insert. (url_1, status_1), (url_2, status_2), ...
sql_code = "INSERT INTO URLS {} VALUES {} ON CONFLICT (url) DO NOTHING;".format("(url, status)", insert_args)
# logger.debug("SQL CODE: {}".format(sql_code))
c = cursor.execute(sql_code)
# NOTE: Not using "RETURNING id" since previously inserted URLs are not returned (ON CONFLICT)
# https://stackoverflow.com/questions/35949877/how-to-include-excluded-rows-in-returning-from-insert-on-conflict/35953488#35953488
def _insert_urls_duplicated(self, cursor, list_tuple_canonical_duplicate_urls):
#####################
### Insert duplicated URLs
#####################
if (len(list_tuple_canonical_duplicate_urls) > 0):
# Flatten, format, set to remove duplicates
args_duplicated_urls_set = "(" + ', '.join( set( [ "'" + str(y).replace("'", "''") + "'" for x in list_tuple_canonical_duplicate_urls for y in x] ) ) + ")"
# Dict: url -> id
dict_url_to_id = {}
# Get url -> id association to populate duplicated URLs
for (id_, url_) in cursor.execute("SELECT id, url FROM URLS WHERE url IN {};".format(args_duplicated_urls_set)).fetchall():
dict_url_to_id[url_] = id_
# Convert tuples (url_canonical, url) -> (id_url_canonical, id_url) to insert in DB
# ORIGINAL CODE. Issue, might not have found association to all urls
### list_tuple_canonical_duplicate_urls_ids = [ (dict_url_to_id[t[0]], dict_url_to_id[t[1]]) for t in list_tuple_canonical_duplicate_urls]
list_tuple_canonical_duplicate_urls_ids = []
for (url_1, url_2) in list_tuple_canonical_duplicate_urls:
id_url_1, id_url_2 = dict_url_to_id.get(url_1), dict_url_to_id.get(url_2)
if (id_url_1 is None) or (id_url_2 is None):
logger.debug("Skipping duplicate association due to no url -> id_url mapping available for tuple: {} {}".format(url_1, url_2))
else:
list_tuple_canonical_duplicate_urls_ids.append( (id_url_1, id_url_2) )
if (len(list_tuple_canonical_duplicate_urls_ids) > 0):
insert_args = ', '.join( [ self._format(t) for t in list_tuple_canonical_duplicate_urls_ids] )
# Insert. (id_url_canonical_1, id_url_1), ...
sql_code = "INSERT INTO URLS_DUPLICATE {} VALUES {} ON CONFLICT DO NOTHING;".format("(id_url_canonical, id_url_duplicated)", insert_args)
# logger.debug("SQL CODE: {}".format(sql_code))
c = cursor.execute(sql_code)
def _get_pattern_status_list(self):
#####################
### Get list of (pattern, priority, status) tuples to override URL status
#####################
# TODO: Cache on redis and query once every N hours? ...
try:
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
# TODO: Cache on Redis
list_pattern_status = cursor.execute("SELECT pattern, priority, status FROM STATUS_PATTERN_MATCHING;").fetchall()
except Exception as e:
logger.warning("Error getting pattern status list: {}".format(str(e)))
list_pattern_status = []
return list_pattern_status
def _get_domains_to_filter(self):
#####################
### Get list of domains to filter
#####################
# TODO: Cache on redis and query once every N hours? ...
try:
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
# TODO: Cache on Redis
sites_to_filter = [e[0] for e in cursor.execute("SELECT url_host FROM WEBSITE_TO_FILTER;").fetchall() ]
except Exception as e:
logger.warning("Error getting domains to filter: {}".format(str(e)))
sites_to_filter = []
return sites_to_filter
def _get_cached_source_id(self, source):
### Redis: URL processed recently? -> Avoid increasing SERIAL counter & efficiency of DB
try:
source_id = self.redis_instance.get(source)
if (source_id is not None):
source_id = source_id.decode("utf-8")
except Exception as e:
logger.warning("Exception querying Redis: {}".format(str(e)))
source_id = None
return source_id
def _get_source_id(self, cursor, source):
#####################
### Get source corresponding id
#####################
# Cached?
id_source = self._get_cached_source_id(source)
if (id_source is None):
c = cursor.execute("SELECT id FROM SOURCE WHERE source='{}'".format(source.replace("'", "''"))).fetchone()
if (c is None) or (len(c) == 0):
# Source does not exist, insert and get id
c = cursor.execute("INSERT INTO SOURCE (source) VALUES ('{}') RETURNING id;".format(source.replace("'", "''"))).fetchone()
# Decode source id
id_source = c[0]
# Cache
print("*"*10, source, id_source)
self.redis_instance.set(source, id_source, ex=self.redis_expiry_seconds)
return id_source
def _get_urls_id(self, cursor, urls_full):
#####################
### Get id of inserted and filtered URLs
#####################
# TODO: Cache url -> url_id, url_canonical
if (len(urls_full) == 0):
return []
# Get inserted and filtered URL ids (unnested). Filtered URLs are also retrieved since they might have been fetched from a new source
in_inserted_filtered_urls = "(" + ', '.join(["'" + u.replace("'", "''") + "'" for u in urls_full]) + ")"
id_urls_related = [ i[0] for i in cursor.execute("SELECT id FROM URLS WHERE url IN {};".format(in_inserted_filtered_urls)).fetchall() ]
return id_urls_related
def _insert_urls_source(self, cursor, id_urls_related, id_source):
#####################
### Insert URL sources: (id_url_1, id_source), (id_url_2, id_source), ...
#####################
if (len(id_urls_related) == 0) or (id_source is None):
return
columns = "(id_url, id_source)"
insert_args = ', '.join( [ self._format([id_url, id_source]) for id_url in id_urls_related ] )
# Insert
sql_code = "INSERT INTO URLS_SOURCE {} VALUES {} ON CONFLICT DO NOTHING;".format(columns, insert_args)
# logger.debug("SQL CODE: {}".format(sql_code))
c = cursor.execute(sql_code)
def write_batch(self, urls_fetched, source):
# Chunks of 50 elements
n = 50
# Divide in small chunks
urls_fetched_chunks = [urls_fetched[i:i + n] for i in range(0, len(urls_fetched), n)]
# Process
for urls_fetched_chunk_i in urls_fetched_chunks:
self._write_small_batch(urls_fetched_chunk_i, source)
def _write_small_batch(self, urls_fetched, source):
try:
logger.info("Fetched #{} URLs, source: {}".format(len(urls_fetched), source))
if (len(urls_fetched) == 0):
logger.debug("Empty batch of urls (not writing to DB) for source: {}".format(source))
return
# Shuffle URLs to reduce continuous URLs of same URL host (minimize chance of being blocked for too many continuous requests)
random.shuffle(urls_fetched)
# Get list of domains to filter
list_domains_to_filter = self._get_domains_to_filter()
# Get list of (pattern, priority, status) tuples to override status if required
list_pattern_status_tuple = self._get_pattern_status_list()
# Sort pattern tuples by priority
list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True)
# Process URLs to update DB
list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical = self._decode_urls(urls_fetched, list_domains_to_filter, list_pattern_status_tuple)
# Full set of URL and its canonical form (to associate them to a search), both to insert and filter
urls_full = set(dict_full_urls_to_canonical.keys()).union( set(dict_full_urls_to_canonical.values()) )
# Insert
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
# Autocommit at end of transaction (Atomic insert of URLs and sources)
with conn.transaction() as tx:
# Insert processed URLs
self._insert_urls(cursor, list_insert_url_tuple_args)
# Insert URLs duplicated (canonical != fetched url)
self._insert_urls_duplicated(cursor, list_tuple_canonical_duplicate_urls)
# Get source id in DB
id_source = self._get_source_id(cursor, source)
# Get IDs of all related URLs
id_urls_related = self._get_urls_id(cursor, urls_full)
# Insert search source associated to URLs
self._insert_urls_source(cursor, id_urls_related, id_source)
# Update Redis status of inserted and filtered URLs after writing to DB
for url, url_canonical in dict_full_urls_to_canonical.items():
try:
# Set with updated expiry time
self.redis_instance.set(url, url_canonical, ex=self.redis_expiry_seconds)
if (url != url_canonical):
self.redis_instance.set(url_canonical, url_canonical, ex=self.redis_expiry_seconds)
except Exception as e:
logger.warning("Exception running set in Redis: {}".format(str(e)))
if (len(list_insert_url_tuple_args) > 0):
try:
webhook_token = os.environ.get("CLIQ_WEBHOOK_TOKEN")
endpoint_message = "https://cliq.zoho.com/api/v2/channelsbyname/urlretrievalbot/message?zapikey={}".format(webhook_token)
payload = json.dumps({"text": "Fetched #{} new URLs, source: {}".format(len(list_insert_url_tuple_args), source) })
r = requests.post(endpoint_message, data=payload)
except Exception as e:
logger.warning("Webhook failed: {}".format(str(e)))
logger.debug("URL DB write finished")
except Exception as e:
logger.warning( "Exception writing to URL_DB:\n{}".format(traceback.format_exc()) )
logger.debug( "Exception --- List of URLs: {}".format(str(urls_fetched)) )

View File

@@ -0,0 +1,48 @@
from .db_utils import DB_Handler
import feedparser
import dateutil.parser
from .logger import get_logger
logger = get_logger()
class FetchFeeds():
def __init__(self, db_handler: DB_Handler) -> None:
logger.debug("Initializing News feed")
self.db_handler = db_handler
def run(self):
try:
logger.debug("Starting NewsFeed.run()")
# Get feeds
list_url_feeds = self.db_handler._get_feed_urls()
logger.debug("Fetching news from feeds: {}".format(str(list_url_feeds)))
# Process via RSS feeds
for url_feed in list_url_feeds:
# Initialize
urls_fetched, urls_publish_date = [], []
# Fetch feeds
feeds = feedparser.parse(url_feed)
# Parse
for f in feeds.get("entries", []):
# Get URL
url = f.get("link", None)
# Process?
if (url is not None):
# Available publish date?
publish_date_parsed = f.get("published_parsed")
if (publish_date_parsed is None):
publish_date = f.get("published", None)
if (publish_date is not None):
publish_date_parsed = dateutil.parser.parse(publish_date)
# Published date
urls_publish_date.append(publish_date_parsed)
# URL
urls_fetched.append(url)
# URL fetching source
source = "feed {}".format(url_feed)
# Write to DB
self.db_handler.write_batch(urls_fetched, source)
except Exception as e:
logger.warning("Exception in NewsFeed.run(): {}".format(str(e)))

View File

@@ -0,0 +1,45 @@
from .db_utils import DB_Handler
import newspaper
from .logger import get_logger
logger = get_logger()
class FetchParser():
def __init__(self, db_handler: DB_Handler) -> None:
logger.debug("Initializing News SiteParsing newspaper4k")
self.db_handler = db_handler
# TODO: MOVE LOGIC ELSEWHERE!
def _postprocess(self, article_urls):
return [url.replace("#comment-stream", "") for url in article_urls]
def run(self):
try:
logger.debug("Starting NewsSiteParsing.run() for {}")
# Get URL hosts
list_url_hosts = self.db_handler._get_url_hosts()
logger.info("Fetching news by parsing URL hosts: {}".format(str(list_url_hosts)))
# Process newspaper4k build method
for url_host_feed in list_url_hosts:
# Protocol
if not (url_host_feed.startswith("http")):
url_host_feed_formatted = "https://" + url_host_feed
else:
url_host_feed_formatted = url_host_feed
logger.debug("Fetching newspaper4k parsing based on URL: {}".format(url_host_feed_formatted))
# Source object
url_host_built = newspaper.build(url_host_feed_formatted)
# Get articles URL list
urls_fetched = url_host_built.article_urls()
# TODO: MOVE!
# Post-processing
urls_fetched = self._postprocess(urls_fetched)
# URL fetching source
source = "newspaper4k {}".format(url_host_feed)
# Write to DB
self.db_handler.write_batch(urls_fetched, source)
except Exception as e:
logger.warning("Exception in NewsSiteParsing.run(): {}".format(str(e)))

View File

@@ -0,0 +1,73 @@
from .db_utils import DB_Handler
from .utils import get_searxng_instances
from .fetch_search_sources import FetcherDuckDuckGo, FetcherGNews, FetcherGoogleNews, FetcherSearxNews, FetcherPreSearch
from .logger import get_logger
logger = get_logger()
class FetchSearcher():
def __init__(self, db_handler: DB_Handler, full=True) -> None:
logger.debug("Initializing News feed")
self.db_handler = db_handler
self.full_search = full
def _run_fetching(self, search_text):
logger.debug("Starting _run_fetching() for {}".format(search_text))
# Common parameters
lang, region = "en", "US"
### PreSearch
dict_params_news = {"search": search_text}
FetcherPreSearch(**dict_params_news).fetch_articles(self.db_handler)
### DuckDuckGo
period = "d"
dict_params_news = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "news", "period": period}
FetcherDuckDuckGo(**dict_params_news).fetch_articles(self.db_handler)
dict_params_general = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "general", "period": period}
FetcherDuckDuckGo(**dict_params_general).fetch_articles(self.db_handler)
if (self.full_search):
# Avoid site:{} search due to G-Bypass required time
if ("site:" not in search_text):
### GNews
dict_params = {"search": search_text, "lang": "wt", "region": "wt", "period": period}
FetcherGNews(**dict_params).fetch_articles(self.db_handler)
### GoogleNews
dict_params_news = {"search": search_text, "lang": lang, "region": region, "search_category": "news", "period": period}
FetcherGoogleNews(**dict_params_news).fetch_articles(self.db_handler)
# dict_params_general = {"search": search_text, "lang": lang, "region": region, "search_category": "general", "period": period}
if False:
### SearxNG
period = "day"
for searx_instance in get_searxng_instances():
dict_params_news = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "news", "period": period}
dict_params_general = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "general", "period": period}
# Append thread
FetcherSearxNews(**dict_params_news).fetch_articles(self.db_handler)
FetcherSearxNews(**dict_params_general).fetch_articles(self.db_handler)
logger.debug("Finished _run_fetching()")
def run(self):
try:
logger.info("Fetching text searches & URL hosts of interest")
# Get text searches of interest
list_search_text_of_interest = self.db_handler._get_search_list()
# Get URL host of interest
list_url_host = self.db_handler._get_url_host_list()
# Get text searches for URL hosts
list_search_text_url_host = ["site:{}".format(l) for l in list_url_host]
for search_text in list_search_text_of_interest + list_search_text_url_host:
logger.debug("Fetching news for search: {}".format(search_text))
self._run_fetching(search_text)
logger.info("Finished fetching text searches & URL hosts of interest")
except Exception as e:
logger.warning("Exception in NewsSearch.run(): {}".format(str(e)))

View File

@@ -0,0 +1,384 @@
from duckduckgo_search import DDGS
from gnews import GNews
from GoogleNews import GoogleNews
import requests
from bs4 import BeautifulSoup
import os
import time
import json
import numpy as np
import random
from .google_bypass import GoogleByPass
from abc import ABC, abstractmethod
from .logger import get_logger
logger = get_logger()
# Generic fetcher (fetches articles, writes to DB)
class FetcherAbstract(ABC):
@abstractmethod
def _fetch(self):
pass
def fetch_articles(self, db_writer):
logger.debug("Starting fetch() for {}".format(self.name))
# Fetch articles
list_news = self._fetch()
logger.info("Found #{} articles for search: {}".format(len(list_news), self.name))
# Write to DB
db_writer.write_batch(list_news, self.name)
# https://techblog.willshouse.com/2012/01/03/most-common-user-agents/
user_agents_list = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/111.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; rv:111.0) Gecko/20100101 Firefox/111.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0",
"Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.44",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 OPR/96.0.0.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 OPR/97.0.0.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48",
"Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.34",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.39",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.2 Safari/605.1.15",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; rv:112.0) Gecko/20100101 Firefox/112.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.51",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/112.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/110.0",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0",
"Mozilla/5.0 (X11; CrOS x86_64 14541.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.0.2246 Yowser/2.5 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.6.1 Safari/605.1.15",
"Mozilla/5.0 (Windows NT 6.1; rv:102.0) Gecko/20100101 Goanna/6.0 Firefox/102.0 PaleMoon/32.0.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.41",
"Mozilla/5.0 (Windows NT 10.0; rv:110.0) Gecko/20100101 Firefox/110.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 YaBrowser/23.1.5.708 Yowser/2.5 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
]
class FetcherPreSearch(FetcherAbstract):
def __init__(self, search):
"""
# period ->
- h = hours (eg: 12h)
- d = days (eg: 7d)
- m = months (eg: 6m)
- y = years (eg: 1y)
"""
self.search = search
self.period = "1d" # TODO Fixed for the moment
# self.lang = lang
# self.region = region
search_category = "news"
self.name = "presearch {} {} {}".format(search, search_category, self.period)
def _fetch(self):
try:
# PreSearch fetching endpoint, parameter search keyword
presearch_fetch_endpoint = "http://selenium_app:80/fetch_presearch/?search_keyword={}".format(self.search)
# Timeout: 15 minutes
r = requests.get(presearch_fetch_endpoint, timeout=900)
# Decode
list_news = json.loads(r.text).get("list_urls", [])
except Exception as e:
logger.warning("Timeout on request: {}. {}".format(presearch_fetch_endpoint, str(e)))
list_news = []
return list_news
class FetcherGNews(FetcherAbstract):
def __init__(self, search, period, lang="en", region="US"):
"""
# period ->
- h = hours (eg: 12h)
- d = days (eg: 7d)
- m = months (eg: 6m)
- y = years (eg: 1y)
"""
self.search = search
self.period = period
self.lang = lang
self.region = region
search_category = "news"
self.name = "gnews {} {} {} {}".format(search, search_category, period, "{}-{}".format(lang, region))
def _fetch(self):
try:
list_dict_news = GNews(self.lang, self.region, period=self.period).get_news(self.search)
# Decode
list_news = []
for l in list_dict_news:
list_news.append(l.get("url"))
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
list_news = []
# Bypass Google links
list_news_redirections = GoogleByPass().bypass_google_urls(list_news)
return list_news_redirections
class FetcherGoogleNews(FetcherAbstract):
def __init__(self, search, search_category="news", period="1d", lang="en", region="US"):
assert(search_category in ["news", "general"])
self.lang = lang
self.region = region
self.period = period
self.search_category = search_category
self.search = search
self.name = "googlenews {} {} {} {}".format(search, search_category, period, "{}-{}".format(lang, region))
def _fetch(self):
try:
# Initialize
g = GoogleNews(encode="utf-8", period=self.period, lang=self.lang, region=self.region)
g.enableException(True)
if (self.search_category == "general"):
set_links = set()
# Search
g.search(self.search)
# Iterate pages
MAX_ITER_PAGES = 15
for i in range(MAX_ITER_PAGES):
time.sleep(random.uniform(1, 1.5))
num_before = len(set_links)
# Get page
try:
links = g.page_at(i)
except Exception as e:
logger.warning("Exception fetching page in GoogleNews {}: {}".format(self.name, str(e)))
break
# Links
for l in links:
# '/url?esrc=s&q=&rct=j&sa=U&url=https://www.breitbart.com/news/scent-of-luxury-indias-jasmine-infuses-global-perfume/&ved=2ahUKEwjOybGSiN-AAxX1gv0HHfqSBpMQxfQBegQICBAC&usg=AOvVaw06GdoHyzPbIopUaEuUSQPQ'
url = l.get("link").split("url=")[-1]
set_links.add(url)
num_after = len(set_links)
# Finished?
if (num_before == num_after):
logger.debug("Iterated {} pages on GoogleNews general search".format(i))
break
# To list
list_news = list(set_links)
elif (self.search_category == "news"):
# Search
g.get_news(self.search)
# Fetch
list_news = g.get_links()
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
list_news = []
# Bypass Google links
list_news_redirections = GoogleByPass().bypass_google_urls(list_news)
return list_news_redirections
class FetcherDuckDuckGo(FetcherAbstract):
def __init__(self, search, search_category, period, lang="wt", region="wt"):
assert(search_category in ["news", "general"])
assert(period in ["d", "w", "m", "y"])
self.search = search
self.search_category = search_category
self.period = period
self.lang_region = "{}-{}".format(lang, region)
self.name = "duckduckgo {} {} {} {}".format(search, search_category, "1{}".format(period), region)
def _fetch(self):
try:
list_news = []
with DDGS(timeout=10) as ddgs:
if (self.search_category == "general"):
generator_links = ddgs.text(keywords=self.search, timelimit=self.period, region=self.lang_region)
elif (self.search_category == "news"):
generator_links = ddgs.news(keywords=self.search, timelimit=self.period, region=self.lang_region)
for l in generator_links:
list_news.append( l.get("url", l.get("href")) )
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
list_news = []
return list_news
class FetcherSearxNews(FetcherAbstract):
def __init__(self, search="child abuse", searx_instance="https://serx.ml/", lang="en", region="US", search_category="news", period="day"):
assert(search_category in ["news", "general"])
assert(period in [None, "day", "week", "month", "year"])
# Random header (minimize prob of web-scrapping detection)
self.headers = {
'User-agent': str(np.random.choice(user_agents_list)),
'Accept-Encoding': 'gzip, deflate',
'Accept': '*/*',
'Connection': 'keep-alive',
}
""" # Optional header
self.headers = {
'User-agent': str(np.random.choice(user_agents_list)),
'Accept-Encoding': 'gzip, deflate, br',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'TE': 'trailers',
'Sec-Fetch-Site': 'cross-site',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Dest': 'document',
}
"""
self.search = search
self.searx_instance = searx_instance
self.lang_region = "{}-{}".format(lang, region)
self.search_category = search_category
self.period = period
self.t_sleep_lower, self.t_sleep_higher = 0.5, 1.5
self.request_timeout = 240
period_name_mapping = {
None: "no_date_range",
"day": "1d",
"week": "1w",
"month": "1m",
"year": "1y",
}
self.name = "searxng {} {} {} {} {}".format(searx_instance.replace("https://", "").replace("/", ""), search, search_category, period_name_mapping[period], self.lang_region)
logger.info("SearX - Initialized SearX fetcher: {}".format(self.name))
def _request_and_decode(self, url_search):
# Initial random time sleep (minimize chance of getting blocked)
time.sleep(random.uniform(self.t_sleep_lower, self.t_sleep_higher))
# Request
logger.debug("SearX - Searching: {}".format(url_search))
try:
r = requests.get(url_search, headers=self.headers, timeout=self.request_timeout)
except Exception as e:
logger.warning("SearX - Exception in request: {}".format(url_search), "\n", str(e))
return []
if (r.status_code == 200):
# Status code Ok
logger.debug("SearX - Status code: {}".format(r.status_code))
elif (r.status_code == 429):
# TooManyRequests, "Rate limit exceeded"
logger.warning("SearX {} - Too many requests while running: {}. Request output: {}".format(self.name, r.url, r.text))
return []
else:
# Any other status code is treated as a failed request
logger.warning("SearX {} - Status code: {}. Request output: {}".format(self.name, r.status_code, r.text))
return []
# Decode request
soup = BeautifulSoup(r.text, 'html.parser')
page_url_set = set()
# h3 links
for elem in soup.find_all('h3'):
# Get url
url = elem.find('a').get('href')
page_url_set.add(url)
return page_url_set
def _get_news_list(self):
############################################################
# Domain & search parameter
search_domain = os.path.join(self.searx_instance, "search?q=")
# Search keywords
search_formatted = self.search.replace(" ", "+").replace(":", "%3A")
# Period formatted
period_formatted = "&time_range={}".format(self.period) if self.period is not None else ""
# Search parameters
search_parameters = "&category_{}=on&language={}{}".format(self.search_category, self.lang_region, period_formatted)
# Combined url search
url_search_nopage = "{}{}{}".format(search_domain, search_formatted, search_parameters)
############################################################
# Request and decode on page=1
url_set = self._request_and_decode(url_search_nopage)
# No results?
if (len(url_set) == 0):
logger.warning("SearX {} - Empty results on search: {}".format(self.name, url_search_nopage))
return []
# Iterate pages
search_numpage = 2
while True:
# Combine url search with page number
url_search_with_page = "{}&pageno={}".format(url_search_nopage, search_numpage)
# Request and decode on page=X
url_set_i = self._request_and_decode(url_search_with_page)
# Length before merging
length_current = len(url_set)
# Merge
url_set = url_set.union(url_set_i)
# Length after merging
length_merged = len(url_set)
# No new elements?
if (length_current == length_merged):
logger.debug("SearX {} - Finished processing search, #pages: {}".format(self.name, search_numpage))
break
# Next page
search_numpage += 1
return list(url_set)
def _fetch(self):
try:
# Fetch news
list_news = self._get_news_list()
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
list_news = []
return list_news
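Since each concrete fetcher only supplies a name plus a _fetch() that returns a list of URLs, adding a new source is a matter of subclassing FetcherAbstract; a minimal hypothetical example:
```
class FetcherStaticList(FetcherAbstract):
    """Hypothetical fetcher returning a fixed URL list, useful for illustration/testing."""
    def __init__(self, urls):
        self.urls = urls
        self.name = "static list"

    def _fetch(self):
        # fetch_articles() logs the count and hands this list to db_writer.write_batch()
        return list(self.urls)
```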

View File

@@ -0,0 +1,26 @@
import requests
import json
from .logger import get_logger
logger = get_logger()
class GoogleByPass():
def __init__(self) -> None:
pass
def bypass_google_urls(self, list_urls):
if (len(list_urls) == 0):
return []
try:
# Endpoint
gbypass_endpoint = "http://selenium_app:80/get_redirection"
# Timeout: 20 minutes
timeout = 60*20
r = requests.post(gbypass_endpoint, json={"list_urls": list_urls}, timeout=timeout)
# Decode
list_urls_redirections = json.loads(r.text).get("list_urls_redirections", [])
except Exception as e:
logger.warning("Exception on request: {}. {}".format(gbypass_endpoint, str(e)))
list_urls_redirections = []
return list_urls_redirections

View File

@@ -0,0 +1,22 @@
import logging
import logging.handlers
import os
os.makedirs("logs", exist_ok=True)
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
logger.setLevel(logging.INFO)
# To file log: INFO / WARNING / ERROR
fh = logging.handlers.RotatingFileHandler(filename="logs/log_app_fetcher.log", mode="a", maxBytes=10000000, backupCount=4)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
logger.addHandler(fh)
# To file log: WARNING / ERROR
fh_ = logging.handlers.RotatingFileHandler(filename="logs/log_app_fetcher_error.log", mode="a", maxBytes=10000000, backupCount=1)
fh_.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh_.setLevel(logging.WARNING)
logger.addHandler(fh_)
def get_logger():
return logger

View File

@@ -0,0 +1,36 @@
from .db_utils import DB_Handler
import requests
import json
from .logger import get_logger
logger = get_logger()
class MissingKidsFetch():
def __init__(self, db_handler: DB_Handler, num_pages) -> None:
logger.debug("Initializing News MissingKids")
self.db_handler = db_handler
self.num_pages = num_pages
self.missingkids_fetch_endpoint = "http://selenium_app:80/get_missing_kids/?pages={}"
def run(self):
try:
logger.debug("Starting NewsMissingKids.run()")
try:
# Timeout
if (self.num_pages > 15):
timeout = 60*90 # 1.5h
else:
timeout = 60*5 # 5 min
# Request
r = requests.get(self.missingkids_fetch_endpoint.format(self.num_pages), timeout=timeout)
# Decode
urls_fetched = json.loads(r.text).get("list_urls", [])
except Exception as e:
logger.warning("Timeout on request: {}. {}".format(missingkids_fetch_endpoint, str(e)))
urls_fetched = []
# URL fetching source
source = "missingkids fetcher"
# Write to DB
self.db_handler.write_batch(urls_fetched, source)
except Exception as e:
logger.warning("Exception in NewsMissingKids.run(): {}".format(str(e)))

View File

@@ -0,0 +1,98 @@
from .db_utils import URL_DB_Writer
from .url_utils import get_missing_kid_status
from .logger import get_logger
logger = get_logger()
def get_missing_kid_status(url, return_canonical_url=False):
import time
import requests
# Sleep
time.sleep(0.75)
try:
# Request
r = requests.get(url, timeout=300)
# Decode
status_code = r.status_code
# Canonical URL removing parameters
url_canonical = r.url
except Exception as e:
logger.warning("Exception on get URL status request: {}. {}".format(url, str(e)))
status_code = None
url_canonical = url
if (status_code == 200):
status = "valid"
elif (status_code == 404):
status = "invalid"
else:
status = "unknown"
logger.debug("Missing Kid URL {} status: {}".format(url, status))
if (return_canonical_url):
return status, url_canonical
else:
return status
class MissingKidsStatus():
def __init__(self, db_connect_info, redis_connect_info, num_urls) -> None:
self.num_urls = num_urls
self.db_connect_info = db_connect_info
self.redis_connect_info = redis_connect_info
self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info)
def update_missing_kids_status(self):
try:
logger.info("Starting updating status to Missing Kids URLs, limit #URLs: {}".format(self.num_urls))
# List of URLs
list_ids_and_urls = self.db_writer._get_missing_kids_urls(self.num_urls)
# Dict: status -> IDs to update to new status
dict_status_ids, dict_status_urls = {}, {}
# Check URLs with invalid status?
skip_invalid_check = False
flush_every, flush_current = 20, 0
# Iterate URLs
for (id, url, current_status) in list_ids_and_urls:
# Skip duplicate URLs
if (current_status == "duplicate"):
continue
# Skip invalid URLs?
if (skip_invalid_check):
if (current_status == "invalid"):
continue
# Get status
new_status = get_missing_kid_status(url)
# Different? Update
if (current_status != new_status):
# Extend array
dict_status_ids[new_status] = dict_status_ids.get(new_status, []) + [id]
# Debugging dict
dict_status_urls[new_status] = dict_status_urls.get(new_status, []) + [url]
# +1 processed
flush_current += 1
# Flush batch?
if (flush_every == flush_current):
logger.info("Updating status to Missing Kids URLs: {}".format(dict_status_urls))
# Update DB
self.db_writer._update_urls_status(dict_status_ids)
# Reset
flush_current = 0
dict_status_ids, dict_status_urls = {}, {}
# Flush remaining batch
if (flush_current > 0):
logger.info("Updating status to Missing Kids URLs: {}".format(dict_status_urls))
# Update DB
self.db_writer._update_urls_status(dict_status_ids)
# Reset
flush_current = 0
dict_status_ids, dict_status_urls = {}, {}
logger.info("Finished updating status to Missing Kids URLs")
except Exception as e:
logger.warning("Exception in MissingKidsStatus.run(): {}".format(str(e)))

View File

@@ -0,0 +1,62 @@
from .db_utils import URL_DB_Writer
from .url_utils import process_article
from .logger import get_logger
logger = get_logger()
class UpdateErrorURLs():
def __init__(self, db_connect_info, redis_connect_info, num_urls) -> None:
self.num_urls = num_urls
self.db_connect_info = db_connect_info
self.redis_connect_info = redis_connect_info
self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info)
def update_error_urls_status(self):
try:
logger.info("Starting updating status to URLs with error, limit #URLs: {}".format(self.num_urls))
# List of URLs with status 'error'
list_ids_and_urls = self.db_writer._get_error_urls(self.num_urls)
# Current status
current_status = "error"
# Dict: status -> IDs to update to new status
dict_status_ids, dict_status_urls = {}, {}
# Get list of (pattern, priority, status) tuples to override status if required
list_pattern_status_tuple = self.db_writer._get_pattern_status_list()
# Sort pattern tuples by priority
list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True)
flush_every, flush_current = 20, 0
# Iterate URLs
for (id, url) in list_ids_and_urls:
# Get status
url_canonical, article_elements, new_status = process_article(url, list_pattern_status_tuple)
# Different? Update
if (current_status != new_status):
# Extend array
dict_status_ids[new_status] = dict_status_ids.get(new_status, []) + [id]
# Debugging dict
dict_status_urls[new_status] = dict_status_urls.get(new_status, []) + [url]
# +1 processed
flush_current += 1
# Flush batch?
if (flush_every == flush_current):
logger.info("Updating status to URLs with error: {}".format(dict_status_urls))
# Update DB
self.db_writer._update_urls_status(dict_status_ids)
# Reset
flush_current = 0
dict_status_ids, dict_status_urls = {}, {}
# Flush remaining batch
if (flush_current > 0):
logger.info("Updating status to URLs with error: {}".format(dict_status_urls))
# Update DB
self.db_writer._update_urls_status(dict_status_ids)
# Reset
flush_current = 0
dict_status_ids, dict_status_urls = {}, {}
logger.info("Finished updating status to URLs with error")
except Exception as e:
logger.warning("Exception in UpdateErrorURLs.run(): {}".format(str(e)))

View File

@@ -0,0 +1,263 @@
from gnews import GNews
import dateutil.parser
from datetime import datetime, timedelta
from .utils import remove_http_s
import time
import random
import traceback
import requests
import json
import re
from bs4 import BeautifulSoup
from .logger import get_logger
logger = get_logger()
def get_published_date(article):
try:
"""
# Already fetched publish date information?
if (publish_date_ is not None):
return publish_date_
"""
# List of potential publish dates
potential_dates = []
# Publish date is the best match
potential_dates.append(article.publish_date)
# Publish date metadata is the following best match
potential_dates.append(article.meta_data.get('article', {}).get("published_time", None))
# Iterate remaining keys
for key in article.meta_data.keys():
if ("date" in key):
potential_dates.append(article.meta_data[key])
def invalid_date(p_date):
# Allow up to two days in the future (clock skew / timezone differences)
today_plus_two = datetime.utcnow() + timedelta(days=2)
# Anything later than that is treated as an invalid date
return p_date.timestamp() > today_plus_two.timestamp()
for date_ in potential_dates:
# String date? parse
if (type(date_) == str):
try:
date_ = dateutil.parser.parse(date_)
except Exception as e:
logger.info("Invalid date found while parsing potential date: {} for URL: {}".format(date_, article.url))
date_ = None
# Valid?
if (date_ is not None) and (not invalid_date(date_)):
return date_
logger.debug("Article with no published date: {}".format(article.url))
return None
except Exception as e:
logger.info("Error while retrieving published date for URL: {}".format(article.url))
return None
def get_url_host(article_source_url, url):
# https://www.blabla.com/blabla -> www.blabla.com
if (article_source_url != ""):
# Article source URL already extracted, save path if any
return remove_http_s(article_source_url) # .split("/")[0]
else:
return remove_http_s(url).split("/")[0]
def get_status_pattern_matching(url, article_status, list_pattern_status_tuple):
# Regex pattern to update status on "valid", "invalid", and "unknown" status only
# Status "raw", "duplicated" and "error" should remain the way they are
# Assumption: List of patterns sorted by importance
if (article_status in ["valid", "invalid", "unknown"]):
# Regular expression pattern matching: https://regexr.com/
for regex_pattern, regex_priority, status_if_match in list_pattern_status_tuple:
# Matching?
matching = bool(re.match(regex_pattern, url))
# Update article status
if (matching):
if (status_if_match != article_status):
logger.debug("Regex pattern found, updating status from '{}' to '{}' for URL: {}".format(article_status, status_if_match, url))
return status_if_match
# Pattern matching not required or not found, original article status
return article_status
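As an aside, a minimal sketch of the (pattern, priority, status) tuples this helper expects; the patterns, priorities and URLs below are made-up examples, not values from the database:
# Hypothetical (pattern, priority, status) tuples -- not real configuration
example_patterns = [
    (r"https?://(www\.)?missingkids\.org/poster/.*", 10, "valid"),
    (r"https?://[^/]+/tag/.*", 5, "invalid"),
]
# Higher priority first, as the callers assume
example_patterns.sort(key=lambda tup: tup[1], reverse=True)
# A tag page initially marked "valid" would be overridden:
# get_status_pattern_matching("https://example.com/tag/politics", "valid", example_patterns) -> "invalid"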
def bypass_google_link(article_url):
def bypass_google_consent(article_url):
# Sample URL: https://consent.google.com/m?continue=https://news.google.com/rss/articles/CBMiMGh0dHBzOi8vd3d3Lm1pc3NpbmdraWRzLm9yZy9wb3N0ZXIvbmNtYy84NjAxMTkvMdIBAA?oc%3D5&gl=NL&m=0&pc=n&cm=2&hl=en-US&src=1
article_url_no_consent = article_url.replace("https://consent.google.com/m?continue=", "")
# https://stackoverflow.com/questions/76063646/how-can-i-have-redirection-link-from-google-news-link-using-requests
headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}
cookies = {'CONSENT': 'YES+cb.20220419-08-p0.cs+FX+111'}
try:
# Request
r = requests.get(article_url_no_consent, headers=headers, cookies=cookies, timeout=300)
# Decode
soup = BeautifulSoup(r.text, 'html.parser')
url_of_interest = soup.a['href']
except Exception as e:
logger.warning("Exception on request trying to G_bypass with headers: {}. {}".format(article_url_no_consent, str(e)))
url_of_interest = None
# Not able to bypass?
if (url_of_interest == "") or ("support.google.com" in url_of_interest) or ("news.google.com" in url_of_interest):
url_of_interest = None
return url_of_interest
def bypass_google_using_service(article_url):
try:
# e.g.: url = "https://news.google.com/articles/CBMiX2h0dHBzOi8vd3d3LmZveGJ1c2luZXNzLmNvbS9wb2xpdGljcy9kaXNuZXktc3Vlcy1mbG9yaWRhLWdvdi1yb24tZGVzYW50aXMtbG9zcy1zcGVjaWFsLWRpc3RyaWN00gEA?hl=en-US&gl=US&ceid=US%3Aen"
gbypass_endpoint = "http://selenium_app:80/get_redirection"
# Timeout: 5 minutes
r = requests.post(gbypass_endpoint, json={"url": article_url}, timeout=300)
# Decode
redirect_url = json.loads(r.text).get("redirect_url", "")
except Exception as e:
logger.warning("Exception on request: {}. {}".format(gbypass_endpoint, str(e)))
redirect_url = ""
return redirect_url
logger.debug("Starting gbypass_endpoint()")
article_url_bypassed = None
# Bypass using request
if ("consent.google.com" in article_url):
article_url_bypassed = bypass_google_consent(article_url)
# Not bypassed yet? Bypass using service
if (article_url_bypassed is None):
article_url_bypassed = bypass_google_using_service(article_url)
# if (article_url_bypassed is None) or (article_url_bypassed == "") or ("news.google.com" in article_url_bypassed):
if (article_url_bypassed == "") or (article_url_bypassed is None):
# Empty URL returned by Gbypass
logger.warning("Error while bypassing Gnews for URL: {}".format(article_url))
return None
else:
logger.debug("Correctly bypassed GNews to URL_redirect, from URL: {} {}".format(article_url_bypassed, article_url))
return article_url_bypassed
def process_article(article_url, list_pattern_status_tuple, language="en"):
# TODO:
"""
https://github.com/fhamborg/news-please
https://github.com/fhamborg/Giveme5W1H
https://github.com/santhoshse7en/news-fetch
"""
try:
logger.debug("Starting process_article()")
if ("news.google.com" in article_url) or ("consent.google.com" in article_url):
# Bypass to get redirection
article_url = bypass_google_link(article_url)
# Error?
if (article_url is None):
return None, {}, "error"
elif ("missingkids.org/poster" in article_url):
# Get status
article_status, url_canonical = get_missing_kid_status(article_url, return_canonical_url=True)
article_elements = {
"url_full": article_url,
"url_canonical": url_canonical
}
return url_canonical, article_elements, article_status
else:
# Avoid Too many requests (feeds, ...)
time.sleep(0.75)
logger.debug("Processing: {}".format(article_url))
# Default status unless something happens
article_status = "valid"
# Parse article
# TODO: :param proxy: The proxy parameter is a dictionary with a single key-value pair. self._proxy = {'http': proxy, 'https': proxy} if proxy else None
# TODO: Language per config
article = GNews(language).get_full_article(url=article_url)
# Article parsed?
if (article is None) or (not article.is_parsed):
logger.debug("Article not parsed: {}".format(article_url))
return article_url, {}, "error"
# Canonical link as main URL
url_canonical = article.canonical_link
# Empty canonical URL?
if (article.canonical_link is None) or (article.canonical_link == ""):
# URL with parameters? e.g. some zerohedge news fetched from newspaper3k end with #comment-stream -> Remove extra parameter in link
if ("?" in article.url) or (article.url.endswith("#comment-stream")) or (article.url.endswith("#disqus_thread")):
logger.debug("Article URL contains parameters, trying to clean URL: {}".format(article.url))
try:
# Remove text after parameter call
url = article.url.split("?")[0]
# Remove comment-stream
url = url.replace("#comment-stream", "").replace("#disqus_thread", "")
# Article
article_attempt = GNews(language).get_full_article(url=url)
# Retrieving same title? Update article based on clean URL
if (article_attempt is not None) and (article_attempt.title == article.title):
article = article_attempt
except Exception as e:
logger.info("Article parsing of URL without parameters failed: {}".format(article.url))
else: # Default behaviour
logger.debug("Article canonical link is empty, assuming URL=URL_CANONICAL: {}".format(article.url))
# By default, URL same as canonical
url_canonical = article.url
elif (article.url != article.canonical_link):
# If different, stick to canonical URL
logger.debug("Article URL and canonical link are different: {} {}".format(article.url, article.canonical_link))
else:
# If same, continue...
pass
# Update config to determine if content is valid
article.config.MIN_WORD_COUNT = 150
article.config.MIN_SENT_COUNT = 6
# Valid URL?
if (not article.is_valid_url()):
logger.debug("Not a valid news article: {}".format(url_canonical))
article_status = "invalid"
# Is the article's body text long enough to meet standard article requirements?
if (not article.is_valid_body()):
logger.debug("Article body not valid: {}".format(url_canonical))
article_status = "unknown"
if (article.images != article.imgs):
logger.debug("Article images and imgs are different: {} {}".format(article.images, article.imgs))
# article.keywords, article.meta_keywords, article.summary
# article.movies
# article.top_image
# Check if article status needs to be updated
article_status = get_status_pattern_matching(url_canonical, article_status, list_pattern_status_tuple)
article_elements = {
'url_full': article.url, # https://www.breitbart.com/tech/2022/10/03/report-election-integrity-project-worked-with-feds-to-censor-news-sites-in-2020/
'url_host': get_url_host(article.source_url, url_canonical), # www.breitbart.com
'title': article.title, # Report: Election Integrity Partnership Worked with Feds to Censor News Sites in 2020
'description': article.meta_description, # Coalition committed to respond in early 2022 but failed to do so, while Labor has not issued a full response since taking office
'text': article.text, # ${Article content}
'published_date': get_published_date(article), # python.datetime format, obtained from "YYYY-MM-DD" or '2022-10-03T20:54:17+00:00'
'authors': article.authors, # ['Christopher Knaus']
'language': article.meta_lang, # en
'tags': list(article.tags), # ['Wide Open Border', 'My Son Hunter Movie', ...]
'images': list(article.images), # [URL_IMAGE_1, URL_IMAGE_2, ...]
'url_canonical': url_canonical, # Canonical URL (redirection)
# 'html': article.html, # HTML article
}
logger.debug("Processing OK: {}".format(url_canonical))
return url_canonical, article_elements, article_status
except Exception as e:
logger.warning("Exception processing url: {}\n{}".format(article_url, traceback.format_exc()))
return None, {}, "error"

View File

@@ -0,0 +1,33 @@
def remove_http_s(url):
url = url.replace("https://", "") if url.startswith("https://") else url
url = url.replace("http://", "") if url.startswith("http://") else url
return url
def is_valid_url(url):
# Only https URLs are considered valid here
return url.startswith("https://")
def get_searxng_instances():
# SearxNG instances: https://searx.space/
searx_instances = set()
searx_instances.add("https://searx.work/")
searx_instances.add("https://search.ononoki.org/")
searx_instances.add("https://searxng.nicfab.eu/")
searx_instances.add("https://searx.be/")
# searx_instances.add("https://searx.fmac.xyz/")
# searx_instances.add("https://northboot.xyz/") # FIX
# searx_instances.add("https://serx.ml/") # Offline
# searx_instances.add("https://searx.ru/")
# searx_instances.add("https://searx.sp-codes.de/")
# searx_instances.add("https://searxng.nicfab.eu/")
# searx_instances.add("https://s.frlt.one/")
# searx_instances.add("https://search.sapti.me/")
# To list
list_searx_instances = list(searx_instances)
return list_searx_instances

View File

@@ -0,0 +1,190 @@
from ..models import Urls, UrlContent, UrlsSource, Source, WebsiteToFilter, StatusPatternMatching
from .url_processor import process_url
from django.utils import timezone
from django.core.cache import cache
import hashlib
from datetime import timedelta
import re
import traceback
from .logger import get_logger
logger = get_logger()
class DB_Handler():
def __init__(self):
logger.debug("Initializing URL DB Handler")
def _get_safe_cache_key(self, raw_key):
"""Generate a safe cache key using an MD5 hash"""
return hashlib.md5(raw_key.encode()).hexdigest()
def _cache_key(self, cache_key, cache_timeout=86400):
cache.set(self._get_safe_cache_key(cache_key), True, timeout=cache_timeout)
def _is_cached_key(self, cache_key):
# Returns True if cached
return cache.get(self._get_safe_cache_key(cache_key)) is not None
def insert_raw_urls(self, urls, source):
try:
logger.debug("Inserting raw URLs")
# Empty?
if (len(urls) == 0):
logger.debug("Empty batch of urls (not writing to DB) for source: {}".format(source))
return
url_object_to_insert = []
# Per URL
for url in urls:
### Already processed URL?
if (self._is_cached_key(url)):
logger.debug("Already cached URL: {}".format(url))
if (self._is_cached_key("{}{}".format(source, url))):
logger.debug("Already cached (source, URL): {} {}".format(source, url))
else:
### Insert source
# Get the source (create if not exists)
source_obj, created = Source.objects.get_or_create(source=source)
# Get URL ID
url_obj = Urls.objects.get(url=url)
# Create (id_source, id_url)
UrlsSource.objects.create(id_source=source_obj.id, id_url=url_obj.id)
else:
# Add object to insert
url_object_to_insert.append(Urls(url=url))
### Bulk insert URLs, ignore conflicts if a url exists
bulk_created_obj = Urls.objects.bulk_create(url_object_to_insert, ignore_conflicts=True)
# Insert or update cache
for url in urls:
self._cache_key(url)
self._cache_key("{}{}".format(source, url))
logger.info("Inserted #{} raw URLs".format(len(url_object_to_insert)))
except Exception as e:
logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
def _get_status_pattern_matching(self, url, article_status, list_pattern_status_tuple):
# Sort pattern tuples by priority. (pattern, priority, status)
list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True)
# Regex pattern to update status on "valid", "invalid", and "unknown" status only
# Status "raw", "duplicated" and "error" should remain the way they are
# Assumption: List of patterns sorted by importance
if (article_status in ["valid", "invalid", "unknown"]):
# Regular expression pattern matching: https://regexr.com/
for regex_pattern, regex_priority, status_if_match in list_pattern_status_tuple:
# Matching? Update article status
if bool(re.match(regex_pattern, url)):
if (status_if_match != article_status):
logger.debug("Regex pattern found, updating status from '{}' to '{}' for URL: {}".format(article_status, status_if_match, url))
return status_if_match
# Pattern matching not required or not found, original article status
return article_status
def process_raw_urls(self, time_delta=timedelta(days=1), batch_size=50):
try:
logger.debug("Processing raw URLs")
# Get list of domains to filter
list_domains_to_filter = WebsiteToFilter.objects.values_list('url_host', flat=True)
# Get list of (pattern, priority, status) tuples to override status if required
list_pattern_status_tuple = list(StatusPatternMatching.objects.values_list("pattern", "priority", "status"))
# Fetched during last 24 hours
time_delta_ts = timezone.now() - time_delta
# Get batch of URLs, status='raw' and fetched X days ago
raw_urls = Urls.objects.filter(status='raw', ts_fetch__gte=time_delta_ts)[:batch_size]
# List of objects to bulk update
updating_urls = []
# Per URL
for obj_url in raw_urls:
##### Any domain to filter included in URL? -> Invalid
if (any([d in obj_url.url for d in list_domains_to_filter])):
logger.debug("Domain filter applied to input URL: {}".format(obj_url.url))
# Update status
obj_url.status = 'invalid'
# Append to bulk update
updating_urls.append(obj_url)
# Next URL
continue
##### Process URL
try:
# Get data
dict_url_data = process_url(obj_url.url)
# Not none or handle as exception
assert(dict_url_data is not None)
except Exception as e:
logger.debug("Error processing URL: {}\n{}".format(obj_url.url, str(e)))
# Update status
obj_url.status = 'error'
# Append to bulk update
updating_urls.append(obj_url)
# Next URL
continue
##### Canonical URL different? -> Duplicate
if (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
# Update status
obj_url.status = 'duplicate'
# Append to bulk update
updating_urls.append(obj_url)
# Get or create URL with canonical form
obj_url_canonical, _ = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
# Associate same sources to url -> url_canonical
# Get the sources id associated to obj_url.id
url_sources = UrlsSource.objects.filter(id_url=obj_url.id)
for url_source_obj in url_sources:
# Associate same sources to url_canonical (it might already exist)
UrlsSource.objects.get_or_create(id_source=url_source_obj.id_source, id_url=obj_url_canonical.id)
# Next URL
continue
##### Valid URL
# Update status
obj_url.status = 'valid'
# Append to bulk update
updating_urls.append(obj_url)
# Create extracted URL data
UrlContent.objects.update_or_create(
id_url=obj_url.id,
defaults={
"date_published": dict_url_data.get("publish_date"),
"title": dict_url_data.get("title"),
"description": dict_url_data.get("description"),
"content": dict_url_data.get("content"),
"valid_content": dict_url_data.get("valid_content"),
"language": dict_url_data.get("language"),
"keywords": dict_url_data.get("keywords"),
"tags": dict_url_data.get("tags"),
"authors": dict_url_data.get("authors"),
"image_main": dict_url_data.get("image_main"),
"images_url": dict_url_data.get("images"), # process_url() returns these under "images"
"videos_url": dict_url_data.get("videos"), # ... and "videos"
"url_host": dict_url_data.get("url_host"),
"site_name": dict_url_data.get("site_name"),
},
)
##### Override status if pattern matching?
for obj_url in updating_urls:
# Check if article status needs to be updated with pattern matching
status_pattern_matching = self._get_status_pattern_matching(obj_url.url, obj_url.status, list_pattern_status_tuple)
# Update status?
if (status_pattern_matching != obj_url.status):
logger.debug("Pattern matching, overriding with status {} for URL: {}".format(status_pattern_matching, obj_url.url))
# Update, no need to append to updating_urls, already included
obj_url.status = status_pattern_matching
# Bulk update
Urls.objects.bulk_update(updating_urls, ['status'])
logger.debug("Finished processing raw URLs")
except Exception as e:
logger.warning("Exception processing raw URLs: {}\n{}".format(e, traceback.format_exc()))

View File

@@ -0,0 +1,50 @@
from .db_utils import DB_Handler
from ..models import Feed
import feedparser
import dateutil
import traceback
from .logger import get_logger
logger = get_logger()
class FetchFeeds():
def __init__(self) -> None:
logger.debug("Initializing News feed")
def run(self):
try:
logger.debug("Starting NewsFeed.run()")
# Get feeds
list_url_feeds = list(Feed.objects.values_list('rss_feed', flat=True))
logger.debug("Fetching news from feeds: {}".format(list_url_feeds))
# Process via RSS feeds
for url_feed in list_url_feeds:
# Initialize
urls_fetched, urls_publish_date = [], []
# Fetch feeds
feeds = feedparser.parse(url_feed)
# Parse
for f in feeds.get("entries", []):
# Get URL
url = f.get("link", None)
# Process?
if (url is not None):
# Available publish date?
publish_date_parsed = f.get("published_parsed")
if (publish_date_parsed is None):
publish_date = f.get("published", None)
if (publish_date is not None):
publish_date_parsed = dateutil.parser.parse(publish_date)
# Published date
urls_publish_date.append(publish_date_parsed)
# URL
urls_fetched.append(url)
# URL fetching source
source = "feed {}".format(url_feed)
# Write to DB
DB_Handler().insert_raw_urls(urls_fetched, source)
except Exception as e:
logger.warning("Exception in NewsFeed.run(): {}\n{}".format(e, traceback.format_exc()))

View File

@@ -0,0 +1,22 @@
import logging
import logging.handlers
import os
os.makedirs("logs", exist_ok=True)
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
logger.setLevel(logging.DEBUG)
# To file log: INFO / WARNING / ERROR
fh = logging.handlers.RotatingFileHandler(filename="logs/log_app_fetcher.log", mode="a", maxBytes=10000000, backupCount=4)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
logger.addHandler(fh)
# To file log: WARNING / ERROR
fh_ = logging.handlers.RotatingFileHandler(filename="logs/log_app_fetcher_error.log", mode="a", maxBytes=10000000, backupCount=1)
fh_.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh_.setLevel(logging.WARNING)
logger.addHandler(fh_)
def get_logger():
return logger
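Typical usage of this module elsewhere in the package, as a sketch:
from .logger import get_logger

logger = get_logger()
logger.info("Goes to the console and logs/log_app_fetcher.log")
logger.warning("Also lands in logs/log_app_fetcher_error.log")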

View File

@@ -0,0 +1,60 @@
from .logger import get_logger
logger = get_logger()
import newspaper
# pip install langdetect
#import langdetect
#langdetect.DetectorFactory.seed = 0
def process_url(url):
try:
# Process
article = newspaper.article(url)
except newspaper.ArticleException as e:
logger.warning("ArticleException for input URL {}\n{}".format(url, str(e)))
return None
except Exception as e:
logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
return None
dict_data = {
"url": url,
"url_canonical": article.canonical_link,
"url_host": article.source_url,
"site_name": article.meta_site_name,
"publish_date": article.publish_date,
"language": article.meta_lang, # langdetect.detect(article.text)
"title": article.title,
"description": article.meta_description,
"content": article.text,
"valid_content": article.is_valid_body(),
"keywords": [k for k in set(article.keywords + article.meta_keywords) if k!=""],
"tags": article.tags,
"authors": article.authors,
"image_main": article.top_image, # article.meta_img
"images": article.images,
"videos": article.videos,
}
'''
# TODO: If exists, add tags article.meta_data.get("classification-tags", "").split(",")
if (dict_data["tags"] is None):
dict_data["tags"] = []
for k in article.meta_data.keys():
if ("tags" in k):
dict_data["tags"] += article.meta_data[k].split(",")
'''
# Sanity check
for k in dict_data.keys():
if (isinstance(dict_data[k], (list, set, tuple))):
# Remove empty strings and normalize to list
dict_data[k] = [e for e in dict_data[k] if e != ""]
# NULL instead of empty list
if (len(dict_data[k]) == 0):
dict_data[k] = None
else:
# NULL instead of empty string
if (dict_data[k] == ""):
dict_data[k] = None
return dict_data
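A hedged example of calling process_url() directly; the URL is illustrative and None is returned on parsing failures:
data = process_url("https://www.example.com/news/some-article")  # placeholder URL
if data is not None:
    print(data["title"], data["publish_date"], data["valid_content"])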

View File

@@ -1,13 +1,65 @@
from django_rq import job
import time
from .src.fetch_feed import FetchFeeds
from .src.db_utils import DB_Handler
'''
from src.fetch_parser import FetchParser
from src.fetch_search import FetchSearcher
from src.missing_kids_fetch import MissingKidsFetch
from src.missing_kids_status import MissingKidsStatus
from src.url_status import UpdateErrorURLs
from src.db_utils import DB_Handler
from src.credentials import db_connect_info, redis_connect_info
# DB Handler
db_handler = DB_Handler(db_connect_info, redis_connect_info)
'''
import logging
logger = logging.getLogger(__name__)
@job
def task_1(message):
logger.info("Message: {}".format(message))
def background_task(process_type: str):
logger.info("Task triggered: {}".format(process_type))
try:
FetchFeeds().run()
# DB_Handler().process_raw_urls()
'''
if (process_type == "fetch_feeds"):
FetchFeeds(db_handler).run()
elif (process_type == "fetch_parser"):
FetchParser(db_handler).run()
elif (process_type == "search") or (process_type == "search_full"):
FetchSearcher(cred.db_connect_info, cred.redis_connect_info, full=True).run()
elif (process_type == "search_reduced"):
FetchSearcher(cred.db_connect_info, cred.redis_connect_info, full=False).run()
# Selenium based
elif (process_type == "fetch_missing_kids_reduced"):
MissingKidsFetch(db_handler, num_pages=4).run()
elif (process_type == "fetch_missing_kids_full"):
MissingKidsFetch(db_handler, num_pages=100000).run()
elif (process_type == "update_missing_kids_status_reduced"):
MissingKidsStatus(cred.db_connect_info, cred.redis_connect_info, num_urls=50).update_missing_kids_status()
elif (process_type == "update_missing_kids_status_full"):
MissingKidsStatus(cred.db_connect_info, cred.redis_connect_info, num_urls=None).update_missing_kids_status()
elif (process_type == "update_error_urls"):
UpdateErrorURLs(cred.db_connect_info, cred.redis_connect_info, num_urls=100).update_error_urls_status()
else:
logger.error("Task error, unknown type: {}".format(process_type))
return
'''
logger.info("Task completed: {}".format(process_type))
except Exception as e:
logger.error(e)

View File

@@ -2,5 +2,5 @@ from django.urls import path
from .views import trigger_task
urlpatterns = [
path('trigger_task/', trigger_task, name='trigger_task')
path('fetch', trigger_task, name='trigger_task')
]

View File

@@ -1,10 +1,10 @@
import django_rq
from django.http import JsonResponse
from .tasks import task_1
from .tasks import background_task
def trigger_task(request):
"""View that enqueues a task."""
queue = django_rq.get_queue('default') # Get the default queue
job = queue.enqueue(task_1, "Hello from Django RQ!")
job = queue.enqueue(background_task, "Hello from Django RQ!")
return JsonResponse({"message": "Task has been enqueued!", "job_id": job.id})
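Assuming the app's urls are mounted at the project root and the development server listens on localhost:8000 (both assumptions), a task can be triggered with a plain HTTP request:
import requests

r = requests.get("http://localhost:8000/fetch", timeout=30)
print(r.json())  # e.g. {"message": "Task has been enqueued!", "job_id": "..."}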

View File

@@ -91,6 +91,16 @@ DATABASES = {
}
}
CACHES = {
"default": {
"BACKEND": "django.core.cache.backends.redis.RedisCache",
"LOCATION": "redis://{}:{}".format(
os.environ.get("REDIS_HOST", "localhost"),
os.environ.get("REDIS_PORT", 6379)
),
}
}
RQ_QUEUES = {
'default': {
'HOST': os.environ.get("REDIS_HOST", "localhost"),