# matitos_news/app_urls/api/obsolete_src/url_utils.py
from gnews import GNews
import dateutil.parser
from datetime import datetime, timedelta
from .utils import remove_http_s
import time
import random
import traceback
import requests
import json
import re
from bs4 import BeautifulSoup
from .logger import get_logger

logger = get_logger()


def get_published_date(article):
    try:
        """
        # Already fetched publish date information?
        if (publish_date_ is not None):
            return publish_date_
        """
        # List of potential publish dates
        potential_dates = []
        # Publish date is the best match
        potential_dates.append(article.publish_date)
        # Publish date metadata is the next best match
        potential_dates.append(article.meta_data.get('article', {}).get("published_time", None))
        # Iterate over the remaining metadata keys
        for key in article.meta_data.keys():
            if ("date" in key):
                potential_dates.append(article.meta_data[key])

        def invalid_date(p_date):
            # Today + 2 days: article from the future?
            today_plus_two = datetime.utcnow() + timedelta(days=2)
            return p_date.timestamp() > today_plus_two.timestamp()

        for date_ in potential_dates:
            # String date? Parse it
            if (type(date_) == str):
                try:
                    date_ = dateutil.parser.parse(date_)
                except Exception as e:
                    logger.info("Invalid date found while parsing potential date: {} for URL: {}".format(date_, article.url))
                    date_ = None
            # Valid?
            if (date_ is not None) and (not invalid_date(date_)):
                return date_
        logger.debug("Article with no published date: {}".format(article.url))
        return None
    except Exception as e:
        logger.info("Error while retrieving published date for URL: {}".format(article.url))
        return None
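

# Illustration of the string-parsing fallback in get_published_date(); this is
# standard dateutil behaviour, shown with a hypothetical timestamp:
#   dateutil.parser.parse('2022-10-03T20:54:17+00:00')
#   -> datetime.datetime(2022, 10, 3, 20, 54, 17, tzinfo=tzutc())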


def get_url_host(article_source_url, url):
    # https://www.blabla.com/blabla -> www.blabla.com
    if (article_source_url != ""):
        # Article source URL already extracted, keep the path if any
        return remove_http_s(article_source_url)  # .split("/")[0]
    else:
        return remove_http_s(url).split("/")[0]
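

# Illustrative inputs/outputs for get_url_host() (hypothetical URLs, assuming
# remove_http_s() strips a leading "http://" or "https://"):
#   get_url_host("https://www.blabla.com", "...") -> "www.blabla.com"
#   get_url_host("", "https://www.blabla.com/blabla") -> "www.blabla.com"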


def get_status_pattern_matching(url, article_status, list_pattern_status_tuple):
    # Regex pattern matching may update the status of "valid", "invalid" and "unknown" articles only;
    # the "raw", "duplicated" and "error" statuses should remain the way they are.
    # Assumption: the list of patterns is sorted by importance.
    if (article_status in ["valid", "invalid", "unknown"]):
        # Regular expression pattern matching: https://regexr.com/
        for regex_pattern, regex_priority, status_if_match in list_pattern_status_tuple:
            # Matching?
            matching = bool(re.match(regex_pattern, url))
            # Update article status
            if (matching):
                if (status_if_match != article_status):
                    logger.debug("Regex pattern found, updating status from '{}' to '{}' for URL: {}".format(article_status, status_if_match, url))
                return status_if_match
    # Pattern matching not required or no pattern matched: keep the original article status
    return article_status
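

# A minimal sketch of the expected shape of list_pattern_status_tuple, matching
# the (regex_pattern, regex_priority, status_if_match) unpacking above; the
# patterns themselves are hypothetical, not taken from the real configuration:
#   list_pattern_status_tuple = [
#       (r"^https?://[^/]+/(tag|category|author)/", 0, "invalid"),
#       (r"^https?://www\.blabla\.com/news/", 1, "valid"),
#   ]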


def bypass_google_link(article_url):
    def bypass_google_consent(article_url):
        # Sample URL: https://consent.google.com/m?continue=https://news.google.com/rss/articles/CBMiMGh0dHBzOi8vd3d3Lm1pc3NpbmdraWRzLm9yZy9wb3N0ZXIvbmNtYy84NjAxMTkvMdIBAA?oc%3D5&gl=NL&m=0&pc=n&cm=2&hl=en-US&src=1
        article_url_no_consent = article_url.replace("https://consent.google.com/m?continue=", "")
        # https://stackoverflow.com/questions/76063646/how-can-i-have-redirection-link-from-google-news-link-using-requests
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
        }
        cookies = {'CONSENT': 'YES+cb.20220419-08-p0.cs+FX+111'}
        try:
            # Request
            r = requests.get(article_url_no_consent, headers=headers, cookies=cookies, timeout=300)
            # Decode
            soup = BeautifulSoup(r.text, 'html.parser')
            url_of_interest = soup.a['href']
        except Exception as e:
            logger.warning("Exception on request trying to G_bypass with headers: {}. {}".format(article_url_no_consent, str(e)))
            url_of_interest = None
        # Not able to bypass?
        if (url_of_interest is None) or (url_of_interest == "") or ("support.google.com" in url_of_interest) or ("news.google.com" in url_of_interest):
            url_of_interest = None
        return url_of_interest

    def bypass_google_using_service(article_url):
        gbypass_endpoint = "http://selenium_app:80/get_redirection"
        try:
            # e.g.: url = "https://news.google.com/articles/CBMiX2h0dHBzOi8vd3d3LmZveGJ1c2luZXNzLmNvbS9wb2xpdGljcy9kaXNuZXktc3Vlcy1mbG9yaWRhLWdvdi1yb24tZGVzYW50aXMtbG9zcy1zcGVjaWFsLWRpc3RyaWN00gEA?hl=en-US&gl=US&ceid=US%3Aen"
            # Timeout: 5 minutes
            r = requests.post(gbypass_endpoint, json={"url": article_url}, timeout=300)
            # Decode
            redirect_url = json.loads(r.text).get("redirect_url", "")
        except Exception as e:
            logger.warning("Exception on request: {}. {}".format(gbypass_endpoint, str(e)))
            redirect_url = ""
        return redirect_url

    logger.debug("Starting bypass_google_link()")
    article_url_bypassed = None
    # Bypass using a plain request
    if ("consent.google.com" in article_url):
        article_url_bypassed = bypass_google_consent(article_url)
    # Not bypassed yet? Bypass using the Selenium service
    if (article_url_bypassed is None):
        article_url_bypassed = bypass_google_using_service(article_url)
    # if (article_url_bypassed is None) or (article_url_bypassed == "") or ("news.google.com" in article_url_bypassed):
    if (article_url_bypassed is None) or (article_url_bypassed == ""):
        # Empty URL returned by Gbypass
        logger.warning("Error while bypassing Gnews for URL: {}".format(article_url))
        return None
    else:
        logger.debug("Correctly bypassed GNews from URL: {} to redirect URL: {}".format(article_url, article_url_bypassed))
        return article_url_bypassed
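

# Usage sketch for bypass_google_link() (the Google News URL is abbreviated and
# hypothetical; the service fallback requires selenium_app to be reachable):
#   redirect_url = bypass_google_link("https://news.google.com/articles/CBMi...")
#   -> publisher URL on success, None on failure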


def process_article(article_url, list_pattern_status_tuple, language="en"):
    # TODO: consider related news extraction libraries:
    """
    https://github.com/fhamborg/news-please
    https://github.com/fhamborg/Giveme5W1H
    https://github.com/santhoshse7en/news-fetch
    """
    try:
        logger.debug("Starting process_article()")
        if ("news.google.com" in article_url) or ("consent.google.com" in article_url):
            # Bypass to get the redirection
            article_url = bypass_google_link(article_url)
            # Error?
            if (article_url is None):
                return None, {}, "error"
        elif ("missingkids.org/poster" in article_url):
            # Get status
            # NOTE: get_missing_kid_status is expected to be provided by a sibling module (its import is not part of this file)
            article_status, url_canonical = get_missing_kid_status(article_url, return_canonical_url=True)
            article_elements = {
                "url_full": article_url,
                "url_canonical": url_canonical
            }
            return url_canonical, article_elements, article_status
        else:
            # Avoid "Too many requests" errors (feeds, ...)
            time.sleep(0.75)
        logger.debug("Processing: {}".format(article_url))
        # Default status unless something happens
        article_status = "valid"
        # Parse article
        # TODO: :param proxy: The proxy parameter is a dictionary with a single key-value pair. self._proxy = {'http': proxy, 'https': proxy} if proxy else None
        # TODO: Language per config
        article = GNews(language).get_full_article(url=article_url)
        # Article parsed?
        if (article is None) or (not article.is_parsed):
            logger.debug("Article not parsed: {}".format(article_url))
            return article_url, {}, "error"
        # Canonical link as main URL
        url_canonical = article.canonical_link
        # Empty canonical URL?
        if (article.canonical_link is None) or (article.canonical_link == ""):
            # URL with parameters? e.g. some zerohedge news fetched with newspaper3k end with #comment-stream -> remove the extra parameter from the link
            if ("?" in article.url) or (article.url.endswith("#comment-stream")) or (article.url.endswith("#disqus_thread")):
                logger.debug("Article URL contains parameters, trying to clean URL: {}".format(article.url))
                try:
                    # Remove text after the parameter marker
                    url = article.url.split("?")[0]
                    # Remove comment anchors
                    url = url.replace("#comment-stream", "").replace("#disqus_thread", "")
                    # Re-fetch the article from the cleaned URL
                    article_attempt = GNews(language).get_full_article(url=url)
                    # Same title retrieved? Update the article based on the clean URL
                    if (article_attempt is not None) and (article_attempt.title == article.title):
                        article = article_attempt
                except Exception as e:
                    logger.info("Article parsing of URL without parameters failed: {}".format(article.url))
            else:  # Default behaviour
                logger.debug("Article canonical link is empty, assuming URL=URL_CANONICAL: {}".format(article.url))
            # By default, use the URL as the canonical URL
            url_canonical = article.url
        elif (article.url != article.canonical_link):
            # If different, stick to the canonical URL
            logger.debug("Article URL and canonical link are different: {} {}".format(article.url, article.canonical_link))
        else:
            # If same, continue...
            pass
        # Update config thresholds used to determine whether the content is valid
        article.config.MIN_WORD_COUNT = 150
        article.config.MIN_SENT_COUNT = 6
        # Valid URL?
        if (not article.is_valid_url()):
            logger.debug("Not a valid news article: {}".format(url_canonical))
            article_status = "invalid"
        # Is the article's body text long enough to meet standard article requirements?
        if (not article.is_valid_body()):
            logger.debug("Article body not valid: {}".format(url_canonical))
            article_status = "unknown"
        if (article.images != article.imgs):
            logger.debug("Article images and imgs are different: {} {}".format(article.images, article.imgs))
        # article.keywords, article.meta_keywords, article.summary
        # article.movies
        # article.top_image
        # Check if the article status needs to be updated
        article_status = get_status_pattern_matching(url_canonical, article_status, list_pattern_status_tuple)
        article_elements = {
            'url_full': article.url,  # https://www.breitbart.com/tech/2022/10/03/report-election-integrity-project-worked-with-feds-to-censor-news-sites-in-2020/
            'url_host': get_url_host(article.source_url, url_canonical),  # www.breitbart.com
            'title': article.title,  # Report: Election Integrity Partnership Worked with Feds to Censor News Sites in 2020
            'description': article.meta_description,  # Coalition committed to respond in early 2022 but failed to do so, while Labor has not issued a full response since taking office
            'text': article.text,  # ${Article content}
            'published_date': get_published_date(article),  # datetime.datetime, obtained from "YYYY-MM-DD" or '2022-10-03T20:54:17+00:00'
            'authors': article.authors,  # ['Christopher Knaus']
            'language': article.meta_lang,  # en
            'tags': list(article.tags),  # ['Wide Open Border', 'My Son Hunter Movie', ...]
            'images': list(article.images),  # [URL_IMAGE_1, URL_IMAGE_2, ...]
            'url_canonical': url_canonical,  # Canonical URL (redirection)
            # 'html': article.html,  # HTML article
        }
        logger.debug("Processing OK: {}".format(url_canonical))
        return url_canonical, article_elements, article_status
    except Exception as e:
        logger.warning("Exception processing url: {}\n{}".format(article_url, traceback.format_exc()))
        return None, {}, "error"
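

if __name__ == "__main__":
    # Minimal smoke test, a sketch only: the pattern list is hypothetical, and
    # the relative imports above mean this must be run as a module, e.g.
    # `python -m matitos_news.app_urls.api.obsolete_src.url_utils`.
    demo_patterns = [
        (r"^https?://[^/]+/(tag|category)/", 0, "invalid"),
    ]
    demo_url = "https://www.breitbart.com/tech/2022/10/03/report-election-integrity-project-worked-with-feds-to-censor-news-sites-in-2020/"
    url_canonical, elements, status = process_article(demo_url, demo_patterns)
    print(status, url_canonical)
    print(elements.get("title"))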