import os
import time
from urllib.parse import unquote

import langdetect
import newspaper
from django.core.cache import cache

from .logger import get_logger

logger = get_logger()

# Seed the language detector so results are deterministic across runs
langdetect.DetectorFactory.seed = 0


def get_with_protocol(url):
    # http:// -> https://
    url = url.replace("http://", "https://")
    # "" -> https://
    if not (url.startswith("https://")):
        url = "https://" + url
    return url


def get_url_host(url):
    # URL without protocol, first substring before '/'
    url_host = url.replace("https://", "").replace("http://", "").split("/")[0]
    return url_host


def url_host_slowdown(url, url_host_slowdown_seconds):
    ### Avoid sending too many requests to the same URL host in a short window
    # Get URL host
    url_host = get_url_host(url)
    # Recently processed URL host? -> Slow down required
    last_cached_timestamp = cache.get("process_{}".format(url_host), None)
    if last_cached_timestamp:
        # Time since this URL host was last processed (in seconds)
        time_since_last_processed = time.time() - last_cached_timestamp
        # Amount of time required to sleep
        slowdown_required = max(0, url_host_slowdown_seconds - time_since_last_processed)
        if slowdown_required > 0:
            logger.debug("Slow down (sleeping {:.2f}s) for URL host {}".format(slowdown_required, url_host))
            # Sleep
            time.sleep(slowdown_required)
    # About to process URL host, cache the current time
    cache.set("process_{}".format(url_host), time.time(), timeout=60 * 5)  # Expire after 5 minutes


def process_url(url):
    try:
        # Slow down if required, to avoid "too many requests" errors
        url_host_slowdown(url, url_host_slowdown_seconds=float(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
        # Download and parse the article
        article = newspaper.article(url)
    except newspaper.ArticleBinaryDataException:
        logger.warning("ArticleBinaryDataException for input URL {}".format(url))
        return {"override_status": "invalid"}
    except newspaper.ArticleException as e:
        # Too many requests? Cool down...
        if ("Status code 429" in str(e.args)):
            # TODO: cool down and retry once?, proxy/VPN, ...
            logger.debug("TODO: process_url Implement code 429")
        # Unavailable for legal reasons
        if ("Status code 451" in str(e.args)):
            # TODO: Bypass with VPN
            logger.debug("TODO: process_url Implement code 451")
        # Cloudflare protection?
        if ("Website protected with Cloudflare" in str(e.args)):
            logger.debug("TODO: process_url Implement bypass Cloudflare")
        # PerimeterX protection?
        if ("Website protected with PerimeterX" in str(e.args)):
            logger.debug("TODO: process_url Implement bypass PerimeterX")
        logger.debug("ArticleException for input URL {}\n{}".format(url, str(e.args)))
        return None
    except Exception as e:
        logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
        return None

    try:
        # Detect language on the merged title/description/text,
        # but only if there is enough text to be reliable
        content_merged = "\n".join([article.title, article.meta_description, article.text])
        if (len(content_merged) > int(os.getenv("FETCHER_LANGUAGE_DETECTION_MIN_CHAR", 100))):
            language = langdetect.detect(content_merged)
        else:
            language = None
    except Exception as e:
        logger.info("Could not detect language: {}\n{}".format(url, str(e)))
        language = None

    dict_data = {
        "url": url,
        "url_canonical": article.canonical_link,
        "url_host": article.source_url,
        "site_name": article.meta_site_name,
        "publish_date": article.publish_date,
        "language": language,  # article.meta_lang -> Not always reliable
        "title": article.title,
        "description": article.meta_description,
        "content": article.text,
        "valid_content": article.is_valid_body(),
        "keywords": [k for k in set(article.keywords + article.meta_keywords) if k != ""],
        "tags": article.tags,
        "authors": article.authors,
        "image_main_url": article.top_image,  # article.meta_img
        "images_url": article.images,
        "videos_url": article.movies,
    }

    '''
    # TODO: If exists, add tags article.meta_data.get("classification-tags", "").split(",")
    if (dict_data["tags"] is None):
        dict_data["tags"] = []
    for k in article.meta_data.keys():
        if ("tags" in k):
            dict_data["tags"] += article.meta_data[k].split(",")
    '''

    # Sanity check
    for k in dict_data.keys():
        if (type(dict_data[k]) is list):
            # Remove empty strings, unquote special characters, e.g. "%20" -> " "
            dict_data[k] = [unquote(e) for e in dict_data[k] if e != ""]
            # NULL instead of empty list
            if (len(dict_data[k]) == 0):
                dict_data[k] = None
        elif (type(dict_data[k]) is str):
            # Unquote special characters
            dict_data[k] = unquote(dict_data[k])
            # NULL instead of empty string
            if (dict_data[k] == ""):
                dict_data[k] = None

    return dict_data
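# A minimal smoke-test entry point, shown as a sketch: it assumes Django settings
# are already configured (so django.core.cache works, e.g. via
# DJANGO_SETTINGS_MODULE) and the example URL below is purely hypothetical.
if __name__ == "__main__":
    example_url = "example.com/some-article"  # hypothetical URL
    result = process_url(get_with_protocol(example_url))
    if result is None:
        logger.warning("Could not process {}".format(example_url))
    elif result.get("override_status") == "invalid":
        logger.warning("URL points to binary data, not an article: {}".format(example_url))
    else:
        logger.info("Fetched '{}' ({} chars, language={})".format(
            result["title"], len(result["content"] or ""), result["language"]))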