Dockerization, whitenoise serving static, refactor
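The diff below covers only the fetcher refactor; the "whitenoise serving static" part of the commit is not shown on this page. For context, a typical WhiteNoise setup in a Django settings.py looks roughly like the following sketch. The paths and the exact settings touched by this commit are assumptions, not taken from the diff:

# settings.py sketch (illustrative only, not the settings changed in this commit)
from pathlib import Path

BASE_DIR = Path(__file__).resolve().parent.parent

MIDDLEWARE = [
    "django.middleware.security.SecurityMiddleware",
    "whitenoise.middleware.WhiteNoiseMiddleware",  # WhiteNoise goes right after SecurityMiddleware
    # ... remaining middleware ...
]

STATIC_URL = "/static/"
STATIC_ROOT = BASE_DIR / "staticfiles"  # target of `collectstatic` (assumed path)

# Compressed, hashed static files served by WhiteNoise
# (older Django versions use STATICFILES_STORAGE instead of STORAGES)
STORAGES = {
    "default": {"BACKEND": "django.core.files.storage.FileSystemStorage"},
    "staticfiles": {"BACKEND": "whitenoise.storage.CompressedManifestStaticFilesStorage"},
}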
app_urls/fetcher/src/url_processor.py (normal file, 127 lines added)
@@ -0,0 +1,127 @@
import os
import time
from urllib.parse import unquote

import langdetect
import newspaper
from django.core.cache import cache

from .logger import get_logger

logger = get_logger()
langdetect.DetectorFactory.seed = 0  # Make language detection deterministic


def get_with_protocol(url):
    # http:// -> https://
    url = url.replace("http://", "https://")
    # "" -> https://
    if not url.startswith("https://"):
        url = "https://" + url
    return url


def get_url_host(url):
    # URL without protocol, first substring before '/'
    url_host = url.replace("https://", "").replace("http://", "").split("/")[0]
    return url_host


def url_host_slowdown(url, url_host_slowdown_seconds):
    # Avoid sending requests to the same URL host too frequently
    # Get URL host
    url_host = get_url_host(url)
    # Recently processed URL host? -> Slowdown required
    last_cached_timestamp = cache.get("process_{}".format(url_host).encode("utf-8"), None)
    if last_cached_timestamp:
        # Time since the URL host was last processed (in seconds)
        time_since_last_processed = time.time() - last_cached_timestamp
        # Amount of time required to sleep
        slowdown_required = max(0, url_host_slowdown_seconds - time_since_last_processed)
        logger.debug("Slow down (sleeping {:.2f}s) for URL host {}".format(slowdown_required, url_host))
        # Sleep
        time.sleep(slowdown_required)
    # About to process URL host, cache the current time
    cache.set("process_{}".format(url_host).encode("utf-8"), time.time(), timeout=60 * 5)  # Expire after 5 minutes


def process_url(url):
    try:
        # Slow down if required to avoid "too many requests" errors
        url_host_slowdown(url, url_host_slowdown_seconds=int(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
        # Process
        article = newspaper.article(url)
    except newspaper.ArticleBinaryDataException:
        logger.warning("ArticleBinaryDataException for input URL {}".format(url))
        return {"override_status": "invalid"}
    except newspaper.ArticleException as e:

        # Too many requests? Cool down...
        if "Status code 429" in str(e.args):
            # TODO: cool down and retry once?, proxy/VPN, ...
            logger.debug("TODO: process_url Implement code 429")
        # Unavailable for legal reasons
        if "Status code 451" in str(e.args):
            # TODO: Bypass with VPN
            logger.debug("TODO: process_url Implement code 451")
        # Cloudflare protection?
        if "Website protected with Cloudflare" in str(e.args):
            logger.debug("TODO: process_url Implement bypass Cloudflare")
        # PerimeterX protection?
        if "Website protected with PerimeterX" in str(e.args):
            logger.debug("TODO: process_url Implement bypass PerimeterX")

        logger.debug("ArticleException for input URL {}\n{}".format(url, str(e.args)))
        return None
    except Exception as e:
        logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
        return None

    try:
        content_merged = "\n".join([article.title, article.meta_description, article.text])
        if len(content_merged) > int(os.getenv("FETCHER_LANGUAGE_DETECTION_MIN_CHAR", 100)):
            language = langdetect.detect(content_merged)
        else:
            language = None
    except Exception as e:
        logger.info("Could not detect language: {}\n{}".format(url, str(e)))
        language = None

    dict_data = {
        "url": url,
        "url_canonical": article.canonical_link,
        "url_host": article.source_url,
        "site_name": article.meta_site_name,
        "publish_date": article.publish_date,
        "language": language,  # article.meta_lang -> Not always reliable
        "title": article.title,
        "description": article.meta_description,
        "content": article.text,
        "valid_content": article.is_valid_body(),
        "keywords": [k for k in set(article.keywords + article.meta_keywords) if k != ""],
        "tags": article.tags,
        "authors": article.authors,
        "image_main_url": article.top_image,  # article.meta_img
        "images_url": article.images,
        "videos_url": article.movies,
    }

    '''
    # TODO: If they exist, add tags from article.meta_data.get("classification-tags", "").split(",")
    if dict_data["tags"] is None:
        dict_data["tags"] = []
    for k in article.meta_data.keys():
        if "tags" in k:
            dict_data["tags"] += article.meta_data[k].split(",")
    '''

    # Sanity check
    for k in dict_data.keys():
        if type(dict_data[k]) is list:
            # Remove empty strings, unquote special characters, e.g. "%20" -> " "
            dict_data[k] = [unquote(e) for e in dict_data[k] if e != ""]
            # NULL instead of empty list
            if len(dict_data[k]) == 0:
                dict_data[k] = None
        elif type(dict_data[k]) is str:
            # Unquote special characters
            if dict_data[k] is not None:
                dict_data[k] = unquote(dict_data[k])
            # NULL instead of empty string
            if dict_data[k] == "":
                dict_data[k] = None

    return dict_data
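For reference, a minimal usage sketch of the new module follows. The import path, environment variable values, and example URL are illustrative assumptions (the module lives under app_urls/fetcher/src/ and relies on a configured Django cache backend); this caller is not part of the commit:

import os

# Assumed defaults; the fetcher reads these via os.getenv() as shown above
os.environ.setdefault("FETCHER_URL_HOST_SLEEP", "5")
os.environ.setdefault("FETCHER_LANGUAGE_DETECTION_MIN_CHAR", "100")

# Hypothetical import path; requires Django settings (cache backend) to be configured
from src.url_processor import get_with_protocol, process_url

url = get_with_protocol("example.com/some-article")  # -> "https://example.com/some-article"
result = process_url(url)

if result is None:
    print("Fetch or parse failed; see logs")
elif result.get("override_status") == "invalid":
    print("Binary content, URL marked invalid")
else:
    print(result["title"], result["language"], result["publish_date"])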