import os
import time
from urllib.parse import unquote

import langdetect
import newspaper
from django.core.cache import cache

from .logger import get_logger

logger = get_logger()

# Seed the language detector so results are deterministic across runs
langdetect.DetectorFactory.seed = 0


def get_with_protocol(url):
    # http:// -> https://
    url = url.replace("http://", "https://")
    # "" -> https://
    if not (url.startswith("https://")):
        url = "https://" + url
    return url


def get_url_host(url):
    # URL without protocol, first substring before '/'
    url_host = url.replace("https://", "").replace("http://", "").split("/")[0]
    return url_host


def url_host_slowdown(url, url_host_slowdown_seconds):
    ### Avoid sending too many requests to the same URL host in a short window
    # Get URL host
    url_host = get_url_host(url)
    # Recently processed URL host? -> Slow down required
    last_cached_timestamp = cache.get("process_{}".format(url_host), None)
    if last_cached_timestamp:
        # Time since this URL host was last processed (in seconds)
        time_since_last_processed = time.time() - last_cached_timestamp
        # Amount of time required to sleep
        slowdown_required = max(0, url_host_slowdown_seconds - time_since_last_processed)
        if slowdown_required > 0:
            logger.debug("Slow down (sleeping {:.2f}s) for URL host {}".format(slowdown_required, url_host))
            # Sleep
            time.sleep(slowdown_required)
    # About to process URL host, cache the current time
    cache.set("process_{}".format(url_host), time.time(), timeout=60 * 5)  # Expire after 5 minutes


def process_url(url):
    try:
        # Slow down if required, to avoid "too many requests" errors
        url_host_slowdown(url, url_host_slowdown_seconds=float(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
        # Download and parse the article
        article = newspaper.article(url)
    except newspaper.ArticleBinaryDataException:
        logger.warning("ArticleBinaryDataException for input URL {}".format(url))
        return {"override_status": "invalid"}
    except newspaper.ArticleException as e:
        # Too many requests? Cool down...
        if ("Status code 429" in str(e.args)):
            # TODO: cool down and retry once?, proxy/VPN, ...
            logger.debug("TODO: process_url Implement code 429")
        # Unavailable for legal reasons
        if ("Status code 451" in str(e.args)):
            # TODO: Bypass with VPN
            logger.debug("TODO: process_url Implement code 451")
        # Cloudflare protection?
        if ("Website protected with Cloudflare" in str(e.args)):
            logger.debug("TODO: process_url Implement bypass Cloudflare")
        # PerimeterX protection?
        if ("Website protected with PerimeterX" in str(e.args)):
            logger.debug("TODO: process_url Implement bypass PerimeterX")
        logger.debug("ArticleException for input URL {}\n{}".format(url, str(e.args)))
        return None
    except Exception as e:
        logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
        return None

    try:
        # Detect language on the merged title/description/text,
        # but only if there is enough text to be reliable
        content_merged = "\n".join([article.title, article.meta_description, article.text])
        if (len(content_merged) > int(os.getenv("FETCHER_LANGUAGE_DETECTION_MIN_CHAR", 100))):
            language = langdetect.detect(content_merged)
        else:
            language = None
    except Exception as e:
        logger.info("Could not detect language: {}\n{}".format(url, str(e)))
        language = None

    dict_data = {
        "url": url,
        "url_canonical": article.canonical_link,
        "url_host": article.source_url,
        "site_name": article.meta_site_name,
        "publish_date": article.publish_date,
        "language": language,  # article.meta_lang -> Not always reliable
        "title": article.title,
        "description": article.meta_description,
        "content": article.text,
        "valid_content": article.is_valid_body(),
        "keywords": [k for k in set(article.keywords + article.meta_keywords) if k != ""],
        "tags": article.tags,
        "authors": article.authors,
        "image_main_url": article.top_image,  # article.meta_img
        "images_url": article.images,
        "videos_url": article.movies,
    }

    '''
    # TODO: If exists, add tags article.meta_data.get("classification-tags", "").split(",")
    if (dict_data["tags"] is None):
        dict_data["tags"] = []
    for k in article.meta_data.keys():
        if ("tags" in k):
            dict_data["tags"] += article.meta_data[k].split(",")
    '''

    # Sanity check
    for k in dict_data.keys():
        if (type(dict_data[k]) is list):
            # Remove empty strings, unquote special characters, e.g. "%20" -> " "
            dict_data[k] = [unquote(e) for e in dict_data[k] if e != ""]
            # NULL instead of empty list
            if (len(dict_data[k]) == 0):
                dict_data[k] = None
        elif (type(dict_data[k]) is str):
            # Unquote special characters
            dict_data[k] = unquote(dict_data[k])
            # NULL instead of empty string
            if (dict_data[k] == ""):
                dict_data[k] = None

    return dict_data
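# A minimal smoke-test entry point, shown as a sketch: it assumes Django settings
# are already configured (so django.core.cache works, e.g. via
# DJANGO_SETTINGS_MODULE) and the example URL below is purely hypothetical.
if __name__ == "__main__":
    example_url = "example.com/some-article"  # hypothetical URL
    result = process_url(get_with_protocol(example_url))
    if result is None:
        logger.warning("Could not process {}".format(example_url))
    elif result.get("override_status") == "invalid":
        logger.warning("URL points to binary data, not an article: {}".format(example_url))
    else:
        logger.info("Fetched '{}' ({} chars, language={})".format(
            result["title"], len(result["content"] or ""), result["language"]))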