Dockerization, WhiteNoise static file serving, refactor

Luciano Gervasoni
2025-04-04 10:53:16 +02:00
parent 5addfa5ba9
commit 4dbe2e55ef
39 changed files with 708 additions and 1238 deletions

View File

@@ -0,0 +1,273 @@
from ..models import Urls, UrlContent, UrlsSourceSearch, UrlsDuplicate, StatusPatternMatching, Source, Search
from django.db.models import Q
from django.core.cache import cache
from django.db import IntegrityError
from .url_processor import process_url, get_with_protocol
import re
import traceback
from .logger import get_logger
logger = get_logger()
class DB_Handler():
def __init__(self):
# Inserting raw URL, cache time: 1 day
self._cache_timeout_insert_url = 86400
# Processing error URL, cache time: 2 days
self._cache_timeout_error_url = 86400*2
def insert_raw_urls(self, urls, obj_source, obj_search):
try:
logger.debug("Inserting raw URLs")
# Empty?
if (len(urls) == 0):
logger.debug("Empty batch of urls (not writing to DB) for source-search: {} - {}".format(obj_source.source, obj_search.search))
return
# Default protocol https://
urls_clean = [get_with_protocol(url) for url in urls]
urls_to_insert = []
# Per URL
for url in urls_clean:
### Already processed URL?
if (cache.get("insert_{}".format(url)) is not None):
logger.debug("Already cached URL: {}".format(url))
if (cache.get("insert_{}{}{}".format(url, obj_source.source, obj_search.search)) is not None):
logger.debug("Already cached (URL, source, search): {} {} {}".format(url, obj_source.source, obj_search.search))
else:
### Insert (URL_id, source_id, search_id), since not cached
# Get URL ID (should already be created)
obj_url, created = Urls.objects.get_or_create(url=url)
# Create (id_source, id_url) (shouldn't exist)
UrlsSourceSearch.objects.get_or_create(id_url=obj_url, id_source=obj_source, id_search=obj_search)
else:
# Add object to insert
# url_object_to_insert.append(Urls(url=url))
urls_to_insert.append(url)
### Insert URLs & (URL_id, source_id)
try:
### Bulk insert, fails if duplicated URL (not returning IDs when using ignore_conflicts=True)
# URLs (ignore_conflicts=False to return IDs)
bulk_created_urls = Urls.objects.bulk_create([Urls(url=url) for url in urls_to_insert], ignore_conflicts=False)
# (URL_id, source_id)
UrlsSourceSearch.objects.bulk_create([UrlsSourceSearch(id_url=obj_url, id_source=obj_source, id_search=obj_search) for obj_url in bulk_created_urls], ignore_conflicts=True)
except IntegrityError as e:
### Fallback to one-by-one insert
logger.debug("bulk_create exception while inserting raw URLs (fails if duplicated URL), falling back to non-bulk method")
# One by one
for url in urls_to_insert:
# URL
obj_url, created = Urls.objects.get_or_create(url=url)
if (created):
logger.debug("Inserted: {}".format(obj_url.url))
else:
logger.debug("Not inserted: {}".format(obj_url.url))
# (URL, source, search)
UrlsSourceSearch.objects.get_or_create(id_url=obj_url, id_source=obj_source, id_search=obj_search)
except Exception as e:
logger.warning("bulk_create unknown exception while inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
# Avoid caching due to error on insertion
urls_clean = []
# Insert or update cache
for url in urls_clean:
cache.set("insert_{}".format(url), True, timeout=self._cache_timeout_insert_url)
cache.set("insert_{}{}{}".format(url, obj_source.source, obj_search.search), True, timeout=self._cache_timeout_insert_url)
logger.info("Inserted #{} raw URLs, Source-Search {} - {}".format(len(urls_to_insert), obj_source.source, obj_search.search))
except Exception as e:
logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error):
def set_status(obj_url, status):
# Update status if setting a new value
if (obj_url.status != status):
obj_url.status = status
obj_url.save()
##### Filter URL? -> Invalid
if (status_pattern_match == "invalid"):
logger.debug("Domain filter applied to input URL: {}".format(obj_url.url))
# Update status
set_status(obj_url, Urls.STATUS_ENUM.INVALID)
# Next URL
return
##### Process URL
try:
# Get data
dict_url_data = process_url(obj_url.url)
except Exception as e:
if (raise_exception_on_error):
# Simply raise exception, handled in a different way
raise Exception("Error processing URL, raising exception as expected")
else:
logger.debug("Error processing URL: {}\n{}\n{}".format(obj_url.url, str(e), traceback.format_exc()))
# Set status to error
dict_url_data = None
# (dict_url_data is None) or (Exception while processing URL) ? -> Error status
if (dict_url_data is None):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.ERROR)
# Next URL
return
# Invalid? e.g. binary data
if (dict_url_data.get("override_status") == "invalid"):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.INVALID)
# Next URL
return
##### Canonical URL different? -> Duplicate
if (dict_url_data.get("url_canonical") is not None) and(dict_url_data.get("url") != dict_url_data.get("url_canonical")):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.DUPLICATE)
# Get or create URL with canonical form
obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
# Get the source-search IDs associated to obj_url.id
list_url_source_search = UrlsSourceSearch.objects.filter(id_url=obj_url)
for obj_url_source_search in list_url_source_search:
# Associate same sources to url_canonical (it might already exist)
UrlsSourceSearch.objects.get_or_create(id_url=obj_url_canonical, id_source=obj_url_source_search.id_source, id_search=obj_url_source_search.id_search)
# URLs duplicate association
UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url)
# TODO: return obj_url_canonical so as to directly process the recently inserted URL
# Wherever this function is called, add:
# self._process_single_url(obj_url_canonical, status_pattern_match, raise_exception_on_error)
# Next URL
return
##### Valid URL
# Update status
set_status(obj_url, Urls.STATUS_ENUM.VALID)
# Create or update extracted URL data
UrlContent.objects.update_or_create(
id_url=obj_url,
defaults = {
"date_published" : dict_url_data.get("publish_date"),
"title" : dict_url_data.get("title"),
"description" : dict_url_data.get("description"),
"content" : dict_url_data.get("content"),
"valid_content" : dict_url_data.get("valid_content"),
"language" : dict_url_data.get("language"),
"keywords" : dict_url_data.get("keywords"),
"tags" : dict_url_data.get("tags"),
"authors" : dict_url_data.get("authors"),
"image_main_url" : dict_url_data.get("image_main_url"),
"images_url" : dict_url_data.get("images_url"),
"videos_url" : dict_url_data.get("videos_url"),
"url_host" : dict_url_data.get("url_host"),
"site_name" : dict_url_data.get("site_name"),
}
)
def process_raw_urls(self, batch_size):
def _get_status_pattern_matching(url, list_pattern_status_tuple):
""" Be careful: Regex pattern should update status on "valid", "invalid", and "unknown" status only
"""
# Sort pattern tuples by priority. (pattern, priority, status)
for regex_pattern, regex_priority, status_if_match in sorted(list_pattern_status_tuple, key=lambda tup: tup[1], reverse=True):
# Regular expression pattern matching: https://regexr.com/
if re.match(regex_pattern, url):
logger.debug("Regex pattern found, status '{}' for URL: {}".format(status_if_match, url))
return status_if_match
return None
try:
logger.debug("Processing raw URLs")
# Get batch of URLs, status='raw'
raw_urls = Urls.objects.order_by("-ts_fetch").filter(status=Urls.STATUS_ENUM.RAW)[:batch_size]
if (len(raw_urls) == 0):
logger.debug("No raw URLs to process")
return
# Get list of (pattern, priority, status) tuples to override status if required
list_pattern_status_tuple = list(StatusPatternMatching.objects.values_list("pattern", "priority", "status"))
# Per URL
for obj_url in raw_urls:
# Override status if pattern matching?
status_pattern_match = _get_status_pattern_matching(obj_url.url, list_pattern_status_tuple)
# Process URL
self._process_single_url(obj_url, status_pattern_match, raise_exception_on_error=False)
logger.info("Updated #{} raw URLs".format(len(raw_urls)))
except Exception as e:
logger.warning("Exception processing raw URLs: {}\n{}".format(e, traceback.format_exc()))
def process_error_urls(self, batch_size):
try:
logger.debug("Processing error URLs")
# Keep track of processed and skipped "error" URLs
num_urls_skipped, num_urls_processed = 0, 0
# Get batch of URLs, status='error'
error_urls = Urls.objects.order_by("-ts_fetch").filter(status=Urls.STATUS_ENUM.ERROR)[num_urls_skipped:batch_size+num_urls_skipped]
while ((len(error_urls) > 0) and (num_urls_processed < batch_size)):
# Per URL
for obj_url in error_urls:
# URL ID cached? -> Tried to process recently already, skip
if (cache.get("error_{}".format(obj_url.id)) is not None):
logger.debug("Already cached URL ID: {}".format(obj_url.id))
num_urls_skipped += 1
continue
try:
# Process URL
self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True)
num_urls_processed += 1
except Exception as e:
# Error, cache to avoid re-processing for X time
cache.set("error_{}".format(obj_url.id), True, timeout=self._cache_timeout_insert_url)
num_urls_skipped += 1
# Get following batch of URLs, status='error'
error_urls = Urls.objects.order_by("-ts_fetch").filter(status=Urls.STATUS_ENUM.ERROR)[num_urls_skipped:batch_size+num_urls_skipped]
logger.info("Updated #{}, skipped #{} error URLs".format(num_urls_processed, num_urls_skipped))
except Exception as e:
logger.warning("Exception processing error URLs: {}\n{}".format(e, traceback.format_exc()))
def process_missing_kids_urls(self, batch_size=None):
try:
logger.debug("Processing MissingKids URLs - batch_size={}".format(batch_size))
# Get batch of URLs, %missingkids.org/poster% or %missingkids.org/new-poster%, with status 'valid', 'invalid' or 'error'
missingkids_urls = Urls.objects.order_by("-ts_fetch").filter(
(Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster"))
&
(Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.ERROR))
)
# Get batch size
if (batch_size is not None):
missingkids_urls = missingkids_urls[:batch_size]
# Per URL
for obj_url in missingkids_urls:
try:
# Process URL. If no exception -> Valid
self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True)
except Exception as e:
# Raised exception -> Invalid (404 error)
obj_url.status = Urls.STATUS_ENUM.INVALID
obj_url.save()
logger.info("Verified status of #{} missingkids.org/poster URLs".format(len(missingkids_urls)))
except Exception as e:
logger.warning("Exception processing MissingKids URLs: {}\n{}".format(e, traceback.format_exc()))

View File

@@ -0,0 +1,51 @@
from .db_utils import DB_Handler
from ..models import Search, Source
import feedparser
import dateutil.parser
import traceback
from .logger import get_logger
logger = get_logger()
class FetchFeeds():
def __init__(self) -> None:
logger.debug("Initializing Fetcher Feeds")
def run(self):
try:
logger.debug("Starting FetchFeeds.run()")
# Get source object
obj_source, created = Source.objects.get_or_create(source="feeds")
# Get feeds objects
list_obj_search_feeds = Search.objects.filter(type=Search.TYPE_ENUM.RSS_FEED)
logger.debug("Fetching from feeds: {}".format([e.search for e in list_obj_search_feeds]))
# Process via RSS feeds
for obj_search in list_obj_search_feeds:
# Initialize
urls_fetched, urls_publish_date = [], []
# Fetch feeds
feeds = feedparser.parse(obj_search.search)
# Parse
for f in feeds.get("entries", []):
# Get URL
url = f.get("link", None)
# Process?
if (url is not None):
# Available publish date?
publish_date_parsed = f.get("published_parsed")
if (publish_date_parsed is None):
publish_date = f.get("published", None)
if (publish_date is not None):
publish_date_parsed = dateutil.parser.parse(publish_date)
# Published date
urls_publish_date.append(publish_date_parsed)
# URL
urls_fetched.append(url)
# Write to DB
DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
except Exception as e:
logger.warning("Exception in FetchFeeds.run(): {}\n{}".format(e, traceback.format_exc()))

View File

@@ -0,0 +1,42 @@
from .db_utils import DB_Handler
from ..models import Search, Source
import os
import requests
import json
import traceback
from .logger import get_logger
logger = get_logger()
class FetchMissingKids():
def __init__(self) -> None:
logger.debug("Initializing Fetcher MissingKids")
def run(self, number_pages=-1):
try:
logger.debug("Starting MissingKids.run(), processing #{} pages".format(number_pages))
# Get source object
obj_source, created = Source.objects.get_or_create(source="missingkids.org")
# Get search object
obj_search, created = Search.objects.get_or_create(search="missingkids.org/poster", type=Search.TYPE_ENUM.URL_HOST)
try:
# Missing kids fetching endpoint, parameter number of pages to fetch
missingkids_fetch_endpoint = os.path.join(os.getenv("SELENIUM_ENDPOINT", "http://localhost:80"), "get_missing_kids/?pages={}".format(number_pages))
# Timeout
if (number_pages > 15) or (number_pages == -1):
timeout = 60*90 # 1.5h
else:
timeout = 60*10 # 10 min
# Request
r = requests.get(missingkids_fetch_endpoint, timeout=timeout)
# Decode
urls_fetched = json.loads(r.text).get("list_urls", [])
except Exception as e:
logger.warning("Timeout on request: {}. {}".format(missingkids_fetch_endpoint, str(e)))
urls_fetched = []
# Write to DB
DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
except Exception as e:
logger.warning("Exception in MissingKids.run(): {}\n{}".format(e, traceback.format_exc()))

View File

@@ -0,0 +1,46 @@
from .db_utils import DB_Handler
from ..models import Search, Source
from .url_processor import get_with_protocol, url_host_slowdown
import newspaper
import traceback
from .logger import get_logger
logger = get_logger()
class FetchParser():
def __init__(self) -> None:
logger.debug("Initializing Fetcher Parser")
def run(self):
try:
logger.debug("Starting FetchParser.run() for {}")
# Get source object
obj_source, created = Source.objects.get_or_create(source="newspaper4k")
# Get URL hosts
list_url_host = Search.objects.filter(type=Search.TYPE_ENUM.URL_HOST)
logger.debug("Fetching news by parsing URL hosts: {}".format([e.search for e in list_url_host]))
# Process newspaper4k build method
for obj_search in list_url_host:
# Protocol
url_host_protocol = get_with_protocol(obj_search.search)
logger.debug("Fetching newspaper4k parsing based on URL: {}".format(url_host_protocol))
# Make sure no requests made for the last X seconds
url_host_slowdown(url_host_protocol, url_host_slowdown_seconds=5)
try:
# Source object
url_host_built = newspaper.build(url_host_protocol)
# Get articles URL list
urls_fetched = url_host_built.article_urls()
except newspaper.exceptions.ArticleException as e:
logger.debug("ArticleException while parsing input URL {}\n{}".format(url_host_protocol, str(e.args)))
urls_fetched = []
except Exception as e:
logger.warning("Exception while parsing input URL {}\n{}".format(url_host_protocol, str(e)))
urls_fetched = []
# Write to DB
DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
except Exception as e:
logger.warning("Exception in FetchParser.run(): {}\n{}".format(e, traceback.format_exc()))

View File

@@ -0,0 +1,57 @@
from .db_utils import DB_Handler
from ..models import Search
from django.db.models import Q
import traceback
import time
import os
from .fetch_search_instances import ListSearchInstances
from .logger import get_logger
logger = get_logger()
class FetchSearcher():
def __init__(self) -> None:
logger.debug("Initializing Fetcher Searcher")
def run(self):
try:
logger.debug("Starting FetchSearcher.run()")
# Get search objects of interest
list_search_obj = Search.objects.filter(Q(type=Search.TYPE_ENUM.URL_HOST) | Q(type=Search.TYPE_ENUM.KEYWORD_SEARCH))
logger.debug("Fetching from search: {}".format(["{} ({})".format(e.search, e.type) for e in list_search_obj]))
# Search
for obj_search in list_search_obj:
# TODO: language & country customization
# Search
keyword_search = "{}{}".format("site:" if obj_search.type == Search.TYPE_ENUM.URL_HOST else "", obj_search.search)
if (obj_search.type == Search.TYPE_ENUM.KEYWORD_SEARCH):
# Add search with intitle keyword
# TODO: allintitle: "child abuse"
# TODO: intitle: "child abuse"
pass
# language, country = obj_search.language_country.split("-")
logger.debug("Starting keyword search: {}".format(keyword_search))
logger.debug("Search type: {}".format(obj_search.type))
# DB writer
db_writer = DB_Handler()
# Keyword arguments
args = {
"language": "en",
"country": "US",
# "period": ["7d", "1d"], # TODO: List of periods to iterate
}
for SearchInstance in ListSearchInstances:
# Sleep between requests, avoid too many requests...
time.sleep(int(os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", 5)))
SearchInstance(args).fetch_articles(db_writer, obj_search)
# TODO: https://github.com/tasos-py/Search-Engines-Scraper/tree/master
except Exception as e:
logger.warning("Exception in FetchSearcher.run(): {}\n{}".format(e, traceback.format_exc()))

View File

@@ -0,0 +1,308 @@
import time
import feedparser
import os
from django.utils import timezone
from datetime import timedelta
from ..models import Search, Source
from .fetch_utils import decode_gnews_urls
from .logger import get_logger
logger = get_logger()
from gnews import GNews
from duckduckgo_search import DDGS
from GoogleNews import GoogleNews
from search_engines import Yahoo, Aol
###########################################################################
###########################################################################
from abc import ABC, abstractmethod
# Generic fetcher (fetches articles, writes to DB)
class FetcherAbstract(ABC):
@abstractmethod
def _fetch_raw_urls(self):
pass
@abstractmethod
def _get_name(self):
pass
def _get_source_object(self, source):
# TODO: Cache
# self.cached_sources = {}
# Get source object
obj_source, created = Source.objects.get_or_create(source=source)
return obj_source
def _post_process_urls(self, raw_urls, obj_search):
# Searching URL Host based? Make sure results belong to that site
if (obj_search.type == Search.TYPE_ENUM.URL_HOST):
# Get clean URL host
url_host_clean = obj_search.search.replace("www.", "").replace("http://", "").replace("https://", "")
# Ensure URL host in URL
raw_urls = [u for u in raw_urls if url_host_clean in u]
return raw_urls
def fetch_articles(self, db_writer, obj_search):
# Source name
source_name = self._get_name()
# Search
keyword_search = obj_search.search
# URL Host search? -> site:${URL_HOST}
if (obj_search.type == Search.TYPE_ENUM.URL_HOST):
keyword_search = "{}{}".format("site:", keyword_search)
# Keyword search & using a General search? -> ${SEARCH} news after:${LAST_WEEK}
if ("general" in source_name) and (obj_search.type == Search.TYPE_ENUM.KEYWORD_SEARCH):
start_date = timezone.now() - timedelta(days=7)
keyword_search = "{}{}".format(keyword_search, "news after:{}-{}-{}".format(start_date.month, start_date.day, start_date.year))
logger.debug("Starting search: {} - {}".format(keyword_search, source_name))
# Fetch
raw_urls = self._fetch_raw_urls(keyword_search)
# Post-process
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
db_writer.insert_raw_urls(raw_urls, self._get_source_object(source_name), obj_search)
###########################################################################
class SearchGNews(FetcherAbstract):
def __init__(self, args={}):
super().__init__()
# Parameters
self.language = args.get("language", "en")
self.country = args.get("country", "US")
self.period = args.get("period", "7d")
self.max_results = args.get("max_results", 100)
def _get_name(self):
# [source] [period] [language-country] [max_results]
return "gnews {} {}-{} results={}".format(self.period, self.language, self.country, self.max_results).replace("results=None", "").strip()
def _fetch_raw_urls(self, keyword_search):
try:
# Get news
results_gnews = GNews(language=self.language, country=self.country, period=self.period, max_results=self.max_results).get_news(keyword_search)
# Get list of encoded urls
encoded_urls = [e.get("url") for e in results_gnews]
# Decode
urls = decode_gnews_urls(encoded_urls)
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchDuckDuckGoGeneral(FetcherAbstract):
def __init__(self, args={}):
super().__init__()
# Parameters
self.language = args.get("language", "wt")
self.country = args.get("country", "wt")
self.max_results = args.get("max_results", 20)
self.region = "{}-{}".format(self.language, self.country).lower()
self.period = None
def _get_name(self):
# [source] [language-country] [max_results]
return "ddg-general {} results={}".format(self.region, self.max_results).replace("results=None", "").strip()
def _fetch_raw_urls(self, keyword_search):
try:
news = DDGS().text(keyword_search, region=self.region, timelimit=self.period, max_results=self.max_results)
urls = [e.get("href") for e in news]
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchDuckDuckGoNews(FetcherAbstract):
def __init__(self, args={}):
super().__init__()
# Parameters
self.language = args.get("language", "wt")
self.country = args.get("country", "wt")
self.max_results = args.get("max_results", 100)
self.region = "{}-{}".format(self.language, self.country).lower()
self.period = None
def _get_name(self):
# [source] [language-country] [max_results]
return "ddg-news {} results={}".format(self.region, self.max_results).replace("results=None", "").strip()
def _fetch_raw_urls(self, keyword_search):
try:
news = DDGS().news(keyword_search, region=self.region, timelimit=self.period, max_results=self.max_results)
urls = [e.get("url") for e in news]
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchGoogleNews(FetcherAbstract):
def __init__(self, args={}):
super().__init__()
# Parameters
self.language = args.get("language", "en")
self.country = args.get("country", "US")
self.period = args.get("period", "7d")
def _get_name(self):
# [source] [period] [language-country]
return "googlenews {} {}-{}".format(self.period, self.language, self.country)
def _fetch_raw_urls(self, keyword_search):
try:
# Initialize
googlenews = GoogleNews(period=self.period, lang=self.language, region=self.country)
googlenews.enableException(True)
# Search
googlenews.get_news(keyword_search)
# Fetch
encoded_urls = googlenews.get_links()
# Decode
urls = decode_gnews_urls(encoded_urls)
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchGoogleGeneral(FetcherAbstract):
def __init__(self, args={}):
super().__init__()
# Parameters
self.language = args.get("language", "en")
self.country = args.get("country", "US")
self.period = args.get("period", "7d")
self.pages = args.get("pages", 1)
def _get_name(self):
# [source] [period] [language-country] [pages]
return "google-general {} {}-{} pages={}".format(self.period, self.language, self.country, self.pages).replace("pages=None", "").strip()
def _fetch_raw_urls(self, keyword_search):
try:
# Initialize
googlenews = GoogleNews(period=self.period, lang=self.language, region=self.country)
googlenews.enableException(True)
# Search
googlenews.search(keyword_search)
set_links = set()
# Iterate pages
for i in range(self.pages):
# Sleep between pages fetch
time.sleep(int(os.getenv("FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP", 4)))
# Number of URLs fetched so far
num_before = len(set_links)
# Get page
try:
links = googlenews.page_at(i+1)
except Exception as e:
logger.warning("Exception fetching page - {}: {}".format(self._get_name(), str(e)))
break
# Links
for l in links:
# 'link': 'https://uk.news.yahoo.com/leaving-neverland-2-michael-jackson-lawyer-channel-4-102017088.html&ved=2ahUKEwjl38eJm5aMAxVvqJUCHXgnGzwQxfQBegQICRAC&usg=AOvVaw1osa6b3o_xXfcNinMDpLoK'
set_links.add( l.get("link").split("&ved=")[0] )
# Finished?
if (num_before == len(set_links)):
break
# To list
urls = list(set_links)
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchGoogleNewsRSS(FetcherAbstract):
def __init__(self, args={}):
super().__init__()
# Parameters
self.language = args.get("language", "en")
self.country = args.get("country", "US")
def _get_name(self):
# [source] [language-country]
return "googlenews-rss {}-{}".format(self.language, self.country).strip()
def _fetch_raw_urls(self, keyword_search):
try:
# Search URL with parameters filled: https://news.google.com/rss/search?q={}&hl=en-US&gl=US&ceid=US:en
search_url = "https://news.google.com/rss/search?q={}&hl={}&gl={}&ceid={}:{}".format(keyword_search, "{}-{}".format(self.language, self.country.upper()), self.country.upper(), self.country.upper(), self.language)
# Control characters
search_url = search_url.replace(" ", "+") # urllib.parse.quote(search_url) # Issue: https%3A//news.google.com/rss/search%3Fq%3Dbreitbart.com%26hl%3Den-US%26gl%3DUS%26ceid%3DUS%3Aen
# Initialize
encoded_urls = []
# Fetch feeds
feeds = feedparser.parse(search_url)
# Parse
for f in feeds.get("entries", []):
# Encoded URL
encoded_url = f.get("link", None)
'''
# Available publish date?
publish_date_parsed = f.get("published_parsed")
if (publish_date_parsed is None):
publish_date = f.get("published", None)
if (publish_date is not None):
publish_date_parsed = dateutil.parser.parse(publish_date)
# Published date
urls_publish_date.append(publish_date_parsed)
'''
# Append
encoded_urls.append(encoded_url)
# Decode
urls = decode_gnews_urls(encoded_urls)
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchYahooGeneral(FetcherAbstract):
def __init__(self, args={}):
super().__init__()
# Parameters
self.pages = args.get("pages", 2)
def _get_name(self):
# [source] [language-country] [pages]
return "yahoo-general pages={}".format(self.pages).replace("pages=None", "").strip()
def _fetch_raw_urls(self, keyword_search):
try:
results = Yahoo().search(keyword_search, pages=self.pages)
urls = results.links()
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchAOLGeneral(FetcherAbstract):
def __init__(self, args={}):
super().__init__()
# Parameters
self.pages = args.get("pages", 2)
def _get_name(self):
# [source] [language-country] [pages]
return "aol-general pages={}".format(self.pages).replace("pages=None", "").strip()
def _fetch_raw_urls(self, keyword_search):
try:
results = Aol().search(keyword_search, pages=self.pages)
urls = results.links()
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
###########################################################################
# List of instances
ListSearchInstances = [SearchGNews, SearchDuckDuckGoNews, SearchGoogleNews, SearchAOLGeneral, SearchYahooGeneral, SearchDuckDuckGoGeneral, SearchGoogleGeneral, SearchGoogleNewsRSS]
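Adding another engine only requires implementing the two abstract hooks of FetcherAbstract and registering the class in ListSearchInstances; fetch_articles() already handles query building, URL-host post-filtering and DB insertion. A hedged sketch (the class and its backend are hypothetical):

class SearchExampleEngine(FetcherAbstract):
    def __init__(self, args={}):
        super().__init__()
        self.max_results = args.get("max_results", 20)
    def _get_name(self):
        # [source] [max_results]
        return "example-engine results={}".format(self.max_results)
    def _fetch_raw_urls(self, keyword_search):
        # Must return a plain list of URL strings; catch and log backend
        # errors here so fetch_articles() always receives a list.
        return []

ListSearchInstances.append(SearchExampleEngine)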

View File

@@ -0,0 +1,35 @@
import os
from django.core.cache import cache
from .logger import get_logger
logger = get_logger()
from googlenewsdecoder import gnewsdecoder
def decode_gnews_urls(encoded_urls, interval=int(os.getenv("FETCHER_GNEWS_DECODE_SLEEP", 2))):
logger.debug("Decoding gnews URLs")
# DecodeURLs
list_decoded_urls = []
for url in encoded_urls:
# Already cached?
decoded_url = cache.get("gnews_decode_{}".format(url))
if (decoded_url is not None):
logger.debug("Already cached decoded URL: {} -> {}".format(url, decoded_url))
# Append decoded URL
list_decoded_urls.append(decoded_url)
else:
try:
# Decode URL, with interval time to avoid block
decoded_url_dict = gnewsdecoder(url, interval=interval)
# Ok?
if decoded_url_dict.get("status"):
# Append decoded URL
decoded_url = decoded_url_dict["decoded_url"]
list_decoded_urls.append(decoded_url)
# Cache decoded URL
cache.set("gnews_decode_{}".format(url), decoded_url, timeout=60*60*12)
else:
logger.info("Bad status while decoding news.google.com, URL {}\n{}".format(url, decoded_url_dict.get("message")))
except Exception as e:
logger.warning("Error decoding news.google.com, URL: {}".format(url))
return list_decoded_urls
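A hedged usage sketch: decoding is throttled by the interval argument and memoized in the Django cache for 12 hours, so repeated calls with the same encoded URL are cheap (the article id below is illustrative, not a real one):

encoded = ["https://news.google.com/rss/articles/CBMiEWlsbHVzdHJhdGl2ZS1pZA"]
decoded = decode_gnews_urls(encoded, interval=2)
# Only successfully decoded URLs are appended, so len(decoded) <= len(encoded).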

View File

@@ -0,0 +1,33 @@
import logging
import logging.handlers
import os
# Get env var
logs_directory = os.getenv("PATH_LOGS_DIRECTORY", "logs")
# Directory of logs
os.makedirs(logs_directory, exist_ok=True)
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("fetcher")
logger.setLevel(logging.DEBUG)
# To file log: DEBUG / INFO / WARNING / ERROR / CRITICAL
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "debug.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)
# To file log: INFO / WARNING / ERROR / CRITICAL
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "info.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.INFO)
logger.addHandler(fh)
# To file log: WARNING / ERROR / CRITICAL
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "warning.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.WARNING)
logger.addHandler(fh)
def get_logger():
return logger

View File

@@ -0,0 +1,127 @@
from django.core.cache import cache
from .logger import get_logger
logger = get_logger()
import newspaper
import time
import os
from urllib.parse import unquote
import langdetect
langdetect.DetectorFactory.seed = 0
def get_with_protocol(url):
# http:// -> https://
url = url.replace("http://", "https://")
# "" -> https://
if not (url.startswith("https://")):
url = "https://" + url
return url
def get_url_host(url):
# URL no protocol, first substring before '/'
url_host = url.replace("https://", "").replace("http://", "").split("/")[0]
return url_host
def url_host_slowdown(url, url_host_slowdown_seconds):
### Avoid (frequent) too many requests to the same URL host
# Get URL host
url_host = get_url_host(url)
# Recently processed URL host? -> Slow down required
last_cached_timestamp = cache.get("process_{}".format(url_host).encode("utf-8"), None)
if last_cached_timestamp:
# Get time since last processed URL host (in seconds)
time_since_last_processed = time.time() - last_cached_timestamp
# Amount of time required to sleep?
slowdown_required = max(0, url_host_slowdown_seconds - time_since_last_processed)
logger.debug("Slow down (sleeping {:.2f}) for URL host {}".format(slowdown_required, url_host))
# Sleep
time.sleep(slowdown_required)
# About to process URL host, cache time
cache.set("process_{}".format(url_host).encode("utf-8"), time.time(), timeout=60*5) # Expire after 5 minutes
def process_url(url):
try:
# Slow down if required to avoid too many requests error
url_host_slowdown(url, url_host_slowdown_seconds=int(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
# Process
article = newspaper.article(url)
except newspaper.ArticleBinaryDataException:
logger.warning("ArticleException for input URL {}".format(url))
return {"override_status": "invalid"}
except newspaper.ArticleException as e:
# Too many requests? Cool down...
if ("Status code 429" in str(e.args)):
# TODO: cool down and retry once?, proxy/VPN, ...
logger.debug("TODO: process_url Implement code 429")
# Unavailable for legal reasons
if ("Status code 451" in str(e.args)):
# TODO: Bypass with VPN
logger.debug("TODO: process_url Implement code 451")
# CloudFlare protection?
if ("Website protected with Cloudflare" in str(e.args)):
logger.debug("TODO: process_url Implement bypass CloudFlare")
# PerimeterX protection?
if ("Website protected with PerimeterX" in str(e.args)):
logger.debug("TODO: process_url Implement bypass PerimeterX")
logger.debug("ArticleException for input URL {}\n{}".format(url, str(e.args)))
return None
except Exception as e:
logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
return None
try:
content_merged = "\n".join([article.title, article.meta_description, article.text])
if (len(content_merged) > int(os.getenv("FETCHER_LANGUAGE_DETECTION_MIN_CHAR", 100))):
language = langdetect.detect(content_merged)
else:
language = None
except Exception as e:
logger.info("Could not detect language: {}\n{}".format(url, str(e)))
language = None
dict_data = {
"url": url,
"url_canonical": article.canonical_link,
"url_host": article.source_url,
"site_name": article.meta_site_name,
"publish_date": article.publish_date,
"language": language, # article.meta_lang -> Not always reliable
"title": article.title,
"description": article.meta_description,
"content": article.text,
"valid_content": article.is_valid_body(),
"keywords": [k for k in set(article.keywords + article.meta_keywords) if k!=""],
"tags": article.tags,
"authors": article.authors,
"image_main_url": article.top_image, # article.meta_img
"images_url": article.images,
"videos_url": article.movies,
}
'''
# TODO: If exists, add tags article.meta_data.get("classification-tags", "").split(",")
if (dict_data["tags"] is None):
dict_data["tags"] = []
for k in article.meta_data.keys():
if ("tags" in k):
dict_data["tags"] += article.meta_data[k].split(",")
'''
# Sanity check
for k in dict_data.keys():
if (type(dict_data[k]) is list):
# Remove empty string, unquote special characters, e.g. "%20" -> " "
dict_data[k] = [ unquote(e) for e in dict_data[k] if e != "" ]
# NULL instead of empty list
if (len(dict_data[k]) == 0):
dict_data[k] = None
elif (type(dict_data[k]) is str):
# Unquote special characters
if (dict_data[k] is not None):
dict_data[k] = unquote(dict_data[k])
# NULL instead of empty string
if (dict_data[k] == ""):
dict_data[k] = None
return dict_data
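A hedged sketch of how callers are expected to interpret process_url() results, mirroring the handling in DB_Handler._process_single_url (the URL is illustrative):

data = process_url("https://example.com/some-article")
if data is None:
    pass  # fetch/parse error -> status ERROR
elif data.get("override_status") == "invalid":
    pass  # e.g. binary content -> status INVALID
elif data.get("url_canonical") and data["url"] != data["url_canonical"]:
    pass  # canonical URL differs -> status DUPLICATE
else:
    print(data["title"], data["language"], data["valid_content"])  # usable article data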