from gnews import GNews
import dateutil.parser
from datetime import datetime, timedelta
from .utils import remove_http_s, get_missing_kid_status  # NOTE: get_missing_kid_status is called below but was not imported; assuming it lives in .utils
import time
import random
import traceback
import requests
import json
import re
from bs4 import BeautifulSoup

from .logger import get_logger
logger = get_logger()


def get_published_date(article):
    try:
        """
        # Already fetched publish date information?
        if (publish_date_ is not None):
            return publish_date_
        """

        # List of potential publish dates
        potential_dates = []
        # Publish date is the best match
        potential_dates.append(article.publish_date)
        # Publish date metadata is the next best match
        potential_dates.append(article.meta_data.get('article', {}).get("published_time", None))
        # Iterate remaining metadata keys
        for key in article.meta_data.keys():
            if ("date" in key):
                potential_dates.append(article.meta_data[key])

        def invalid_date(p_date):
            # Article from the future? Allow up to today + 2 days
            today_plus_two = datetime.utcnow() + timedelta(days=2)
            return p_date.timestamp() > today_plus_two.timestamp()

        for date_ in potential_dates:
            # String date? Parse it
            if (isinstance(date_, str)):
                try:
                    date_ = dateutil.parser.parse(date_)
                except Exception as e:
                    logger.info("Invalid date found while parsing potential date: {} for URL: {}".format(date_, article.url))
                    date_ = None
            # Valid?
            if (date_ is not None) and (not invalid_date(date_)):
                return date_

        logger.debug("Article with no published date: {}".format(article.url))
        return None
    except Exception as e:
        logger.info("Error while retrieving published date for URL: {}".format(article.url))
        return None
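

# Note (illustrative): the potential dates collected above are either datetime objects or
# strings such as "YYYY-MM-DD" or "2022-10-03T20:54:17+00:00"; dateutil parses both, e.g.
#   dateutil.parser.parse("2022-10-03")                -> datetime(2022, 10, 3, 0, 0)
#   dateutil.parser.parse("2022-10-03T20:54:17+00:00") -> datetime(2022, 10, 3, 20, 54, 17, tzinfo=tzutc())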


def get_url_host(article_source_url, url):
    # https://www.blabla.com/blabla -> www.blabla.com
    if (article_source_url != ""):
        # Article source URL already extracted, save path if any
        return remove_http_s(article_source_url)  # .split("/")[0]
    else:
        return remove_http_s(url).split("/")[0]
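
# Illustrative examples (hypothetical URLs; remove_http_s is assumed to strip the scheme):
#   get_url_host("https://www.example.com", "unused")        -> "www.example.com"
#   get_url_host("", "https://www.example.com/some/article") -> "www.example.com"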


def get_status_pattern_matching(url, article_status, list_pattern_status_tuple):
    # Regex patterns update the status of "valid", "invalid" and "unknown" articles only;
    # statuses "raw", "duplicated" and "error" should remain the way they are.
    # Assumption: list of patterns sorted by importance
    if (article_status in ["valid", "invalid", "unknown"]):
        # Regular expression pattern matching: https://regexr.com/
        for regex_pattern, regex_priority, status_if_match in list_pattern_status_tuple:
            # Matching?
            matching = bool(re.match(regex_pattern, url))
            # Update article status: first matching pattern wins
            if (matching):
                if (status_if_match != article_status):
                    logger.debug("Regex pattern found, updating status from '{}' to '{}' for URL: {}".format(article_status, status_if_match, url))
                return status_if_match
    # Pattern matching not required or no match found, keep the original article status
    return article_status
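
# Expected shape of list_pattern_status_tuple (illustrative, hypothetical patterns):
#   [(r"https?://(www\.)?example\.com/live/.*", 1, "invalid"),
#    (r"https?://(www\.)?example\.com/news/.*", 2, "valid")]
# e.g. a "valid" article whose URL matches the first entry is downgraded to "invalid".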


def bypass_google_link(article_url):

    def bypass_google_consent(article_url):
        # Sample URL: https://consent.google.com/m?continue=https://news.google.com/rss/articles/CBMiMGh0dHBzOi8vd3d3Lm1pc3NpbmdraWRzLm9yZy9wb3N0ZXIvbmNtYy84NjAxMTkvMdIBAA?oc%3D5&gl=NL&m=0&pc=n&cm=2&hl=en-US&src=1
        article_url_no_consent = article_url.replace("https://consent.google.com/m?continue=", "")

        # https://stackoverflow.com/questions/76063646/how-can-i-have-redirection-link-from-google-news-link-using-requests
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
        }
        cookies = {'CONSENT': 'YES+cb.20220419-08-p0.cs+FX+111'}

        try:
            # Request
            r = requests.get(article_url_no_consent, headers=headers, cookies=cookies, timeout=300)
            # Decode
            soup = BeautifulSoup(r.text, 'html.parser')
            url_of_interest = soup.a['href']
        except Exception as e:
            logger.warning("Exception on request trying to G_bypass with headers: {}. {}".format(article_url_no_consent, str(e)))
            url_of_interest = None

        # Not able to bypass? (check None first to avoid a TypeError on the `in` tests)
        if (url_of_interest is None) or (url_of_interest == "") or ("support.google.com" in url_of_interest) or ("news.google.com" in url_of_interest):
            url_of_interest = None
        return url_of_interest

    def bypass_google_using_service(article_url):
        # e.g.: url = "https://news.google.com/articles/CBMiX2h0dHBzOi8vd3d3LmZveGJ1c2luZXNzLmNvbS9wb2xpdGljcy9kaXNuZXktc3Vlcy1mbG9yaWRhLWdvdi1yb24tZGVzYW50aXMtbG9zcy1zcGVjaWFsLWRpc3RyaWN00gEA?hl=en-US&gl=US&ceid=US%3Aen"
        # Endpoint defined before the try block so the except clause can always reference it
        gbypass_endpoint = "http://selenium_app:80/get_redirection"
        try:
            # Timeout: 5 minutes
            r = requests.post(gbypass_endpoint, json={"url": article_url}, timeout=300)
            # Decode
            redirect_url = json.loads(r.text).get("redirect_url", "")
        except Exception as e:
            logger.warning("Exception on request: {}. {}".format(gbypass_endpoint, str(e)))
            redirect_url = ""

        return redirect_url

    logger.debug("Starting bypass_google_link()")

    article_url_bypassed = None
    # Bypass using request
    if ("consent.google.com" in article_url):
        article_url_bypassed = bypass_google_consent(article_url)
    # Not bypassed yet? Bypass using the service
    if (article_url_bypassed is None):
        article_url_bypassed = bypass_google_using_service(article_url)

    # if (article_url_bypassed is None) or (article_url_bypassed == "") or ("news.google.com" in article_url_bypassed):
    if (article_url_bypassed == "") or (article_url_bypassed is None):
        # Empty URL returned by Gbypass
        logger.warning("Error while bypassing GNews for URL: {}".format(article_url))
        return None
    else:
        logger.debug("Correctly bypassed GNews, from URL: {} to redirect URL: {}".format(article_url, article_url_bypassed))
        return article_url_bypassed


def process_article(article_url, list_pattern_status_tuple, language="en"):
    # TODO:
    """
    https://github.com/fhamborg/news-please
    https://github.com/fhamborg/Giveme5W1H

    https://github.com/santhoshse7en/news-fetch
    """
    try:
        logger.debug("Starting process_article()")

        if ("news.google.com" in article_url) or ("consent.google.com" in article_url):
            # Bypass to get redirection
            article_url = bypass_google_link(article_url)
            # Error?
            if (article_url is None):
                return None, {}, "error"
        elif ("missingkids.org/poster" in article_url):
            # Get status
            article_status, url_canonical = get_missing_kid_status(article_url, return_canonical_url=True)
            article_elements = {
                "url_full": article_url,
                "url_canonical": url_canonical
            }
            return url_canonical, article_elements, article_status
        else:
            # Avoid "Too many requests" (feeds, ...)
            time.sleep(0.75)

        logger.debug("Processing: {}".format(article_url))

        # Default status unless something happens
        article_status = "valid"

        # Parse article
        # TODO: :param proxy: The proxy parameter is a dictionary with a single key-value pair. self._proxy = {'http': proxy, 'https': proxy} if proxy else None
        # TODO: Language per config
        article = GNews(language).get_full_article(url=article_url)

        # Article parsed?
        if (article is None) or (not article.is_parsed):
            logger.debug("Article not parsed: {}".format(article_url))
            return article_url, {}, "error"

        # Canonical link as main URL
        url_canonical = article.canonical_link
        # Empty canonical URL?
        if (article.canonical_link is None) or (article.canonical_link == ""):
            # URL with parameters? e.g. some zerohedge articles fetched via newspaper3k end with #comment-stream -> remove the extra parameter from the link
            if ("?" in article.url) or (article.url.endswith("#comment-stream")) or (article.url.endswith("#disqus_thread")):
                logger.debug("Article URL contains parameters, trying to clean URL: {}".format(article.url))
                try:
                    # Remove text after the parameter marker
                    url = article.url.split("?")[0]
                    # Remove comment anchors
                    url = url.replace("#comment-stream", "").replace("#disqus_thread", "")
                    # Re-fetch the article from the clean URL
                    article_attempt = GNews(language).get_full_article(url=url)
                    # Same title retrieved? Update article based on the clean URL
                    if (article_attempt is not None) and (article_attempt.title == article.title):
                        article = article_attempt
                except Exception as e:
                    logger.info("Article parsing of URL without parameters failed: {}".format(article.url))
            else:  # Default behaviour
                logger.debug("Article canonical link is empty, assuming URL=URL_CANONICAL: {}".format(article.url))

            # By default, URL same as canonical
            url_canonical = article.url

        elif (article.url != article.canonical_link):
            # If different, stick to canonical URL
            logger.debug("Article URL and canonical link are different: {} {}".format(article.url, article.canonical_link))
        else:
            # If same, continue...
            pass

        # Update config thresholds used to determine if content is valid
        article.config.MIN_WORD_COUNT = 150
        article.config.MIN_SENT_COUNT = 6

        # Valid URL?
        if (not article.is_valid_url()):
            logger.debug("Not a valid news article: {}".format(url_canonical))
            article_status = "invalid"
        # Is the article's body text long enough to meet standard article requirements?
        if (not article.is_valid_body()):
            logger.debug("Article body not valid: {}".format(url_canonical))
            article_status = "unknown"

        if (article.images != article.imgs):
            logger.debug("Article images and imgs are different: {} {}".format(article.images, article.imgs))

        # article.keywords, article.meta_keywords, article.summary
        # article.movies
        # article.top_image

        # Check if the article status needs to be updated
        article_status = get_status_pattern_matching(url_canonical, article_status, list_pattern_status_tuple)

        article_elements = {
            'url_full': article.url,  # https://www.breitbart.com/tech/2022/10/03/report-election-integrity-project-worked-with-feds-to-censor-news-sites-in-2020/
            'url_host': get_url_host(article.source_url, url_canonical),  # www.breitbart.com
            'title': article.title,  # Report: ‘Election Integrity’ Partnership Worked with Feds to Censor News Sites in 2020
            'description': article.meta_description,  # Coalition committed to respond in ‘early 2022’ but failed to do so, while Labor has not issued a full response since taking office
            'text': article.text,  # ${Article content}
            'published_date': get_published_date(article),  # Python datetime, obtained from "YYYY-MM-DD" or '2022-10-03T20:54:17+00:00'
            'authors': article.authors,  # ['Christopher Knaus']
            'language': article.meta_lang,  # en
            'tags': list(article.tags),  # ['Wide Open Border', '’My Son Hunter’ Movie', ...]
            'images': list(article.images),  # [URL_IMAGE_1, URL_IMAGE_2, ...]
            'url_canonical': url_canonical,  # Canonical URL (redirection)
            # 'html': article.html,  # HTML article
        }
        logger.debug("Processing OK: {}".format(url_canonical))
        return url_canonical, article_elements, article_status
    except Exception as e:
        logger.warning("Exception processing url: {}\n{}".format(article_url, traceback.format_exc()))
        return None, {}, "error"