# matitos_news/app_urls/api/obsolete_src/url_utils.py
from gnews import GNews
import dateutil.parser
from datetime import datetime, timedelta
from .utils import remove_http_s
import time
import random
import traceback
import requests
import json
import re
from bs4 import BeautifulSoup
from .logger import get_logger

logger = get_logger()


def get_published_date(article):
    try:
        """
        # Already fetched publish date information?
        if (publish_date_ is not None):
            return publish_date_
        """
        # List of potential publish dates
        potential_dates = []
        # Publish date is the best match
        potential_dates.append(article.publish_date)
        # Publish date metadata is the next best match
        potential_dates.append(article.meta_data.get('article', {}).get("published_time", None))
        # Iterate over the remaining metadata keys
        for key in article.meta_data.keys():
            if ("date" in key):
                potential_dates.append(article.meta_data[key])

        def invalid_date(p_date):
            # Today + 2 days: article from the future?
            today_plus_two = datetime.utcnow() + timedelta(days=2)
            return p_date.timestamp() > today_plus_two.timestamp()

        for date_ in potential_dates:
            # String date? Parse it
            if (type(date_) == str):
                try:
                    date_ = dateutil.parser.parse(date_)
                except Exception as e:
                    logger.info("Invalid date found while parsing potential date: {} for URL: {}".format(date_, article.url))
                    date_ = None
            # Valid?
            if (date_ is not None) and (not invalid_date(date_)):
                return date_
        logger.debug("Article with no published date: {}".format(article.url))
        return None
    except Exception as e:
        logger.info("Error while retrieving published date for URL: {}".format(article.url))
        return None
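

# Illustration of the string-parsing fallback in get_published_date(); this is
# standard dateutil behaviour, shown with a hypothetical timestamp:
#   dateutil.parser.parse('2022-10-03T20:54:17+00:00')
#   -> datetime.datetime(2022, 10, 3, 20, 54, 17, tzinfo=tzutc())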


def get_url_host(article_source_url, url):
    # https://www.blabla.com/blabla -> www.blabla.com
    if (article_source_url != ""):
        # Article source URL already extracted, keep the path if any
        return remove_http_s(article_source_url)  # .split("/")[0]
    else:
        return remove_http_s(url).split("/")[0]
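

# Illustrative inputs/outputs for get_url_host() (hypothetical URLs, assuming
# remove_http_s() strips a leading "http://" or "https://"):
#   get_url_host("https://www.blabla.com", "...") -> "www.blabla.com"
#   get_url_host("", "https://www.blabla.com/blabla") -> "www.blabla.com"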


def get_status_pattern_matching(url, article_status, list_pattern_status_tuple):
    # Regex pattern matching may update the status of "valid", "invalid" and "unknown" articles only;
    # the "raw", "duplicated" and "error" statuses should remain the way they are.
    # Assumption: the list of patterns is sorted by importance.
    if (article_status in ["valid", "invalid", "unknown"]):
        # Regular expression pattern matching: https://regexr.com/
        for regex_pattern, regex_priority, status_if_match in list_pattern_status_tuple:
            # Matching?
            matching = bool(re.match(regex_pattern, url))
            # Update article status
            if (matching):
                if (status_if_match != article_status):
                    logger.debug("Regex pattern found, updating status from '{}' to '{}' for URL: {}".format(article_status, status_if_match, url))
                return status_if_match
    # Pattern matching not required or no pattern matched: keep the original article status
    return article_status
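

# A minimal sketch of the expected shape of list_pattern_status_tuple, matching
# the (regex_pattern, regex_priority, status_if_match) unpacking above; the
# patterns themselves are hypothetical, not taken from the real configuration:
#   list_pattern_status_tuple = [
#       (r"^https?://[^/]+/(tag|category|author)/", 0, "invalid"),
#       (r"^https?://www\.blabla\.com/news/", 1, "valid"),
#   ]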


def bypass_google_link(article_url):
    def bypass_google_consent(article_url):
        # Sample URL: https://consent.google.com/m?continue=https://news.google.com/rss/articles/CBMiMGh0dHBzOi8vd3d3Lm1pc3NpbmdraWRzLm9yZy9wb3N0ZXIvbmNtYy84NjAxMTkvMdIBAA?oc%3D5&gl=NL&m=0&pc=n&cm=2&hl=en-US&src=1
        article_url_no_consent = article_url.replace("https://consent.google.com/m?continue=", "")
        # https://stackoverflow.com/questions/76063646/how-can-i-have-redirection-link-from-google-news-link-using-requests
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
        }
        cookies = {'CONSENT': 'YES+cb.20220419-08-p0.cs+FX+111'}
        try:
            # Request
            r = requests.get(article_url_no_consent, headers=headers, cookies=cookies, timeout=300)
            # Decode
            soup = BeautifulSoup(r.text, 'html.parser')
            url_of_interest = soup.a['href']
        except Exception as e:
            logger.warning("Exception on request trying to G_bypass with headers: {}. {}".format(article_url_no_consent, str(e)))
            url_of_interest = None
        # Not able to bypass?
        if (url_of_interest is None) or (url_of_interest == "") or ("support.google.com" in url_of_interest) or ("news.google.com" in url_of_interest):
            url_of_interest = None
        return url_of_interest

    def bypass_google_using_service(article_url):
        gbypass_endpoint = "http://selenium_app:80/get_redirection"
        try:
            # e.g.: url = "https://news.google.com/articles/CBMiX2h0dHBzOi8vd3d3LmZveGJ1c2luZXNzLmNvbS9wb2xpdGljcy9kaXNuZXktc3Vlcy1mbG9yaWRhLWdvdi1yb24tZGVzYW50aXMtbG9zcy1zcGVjaWFsLWRpc3RyaWN00gEA?hl=en-US&gl=US&ceid=US%3Aen"
            # Timeout: 5 minutes
            r = requests.post(gbypass_endpoint, json={"url": article_url}, timeout=300)
            # Decode
            redirect_url = json.loads(r.text).get("redirect_url", "")
        except Exception as e:
            logger.warning("Exception on request: {}. {}".format(gbypass_endpoint, str(e)))
            redirect_url = ""
        return redirect_url

    logger.debug("Starting bypass_google_link()")
    article_url_bypassed = None
    # Bypass using a plain request
    if ("consent.google.com" in article_url):
        article_url_bypassed = bypass_google_consent(article_url)
    # Not bypassed yet? Bypass using the Selenium service
    if (article_url_bypassed is None):
        article_url_bypassed = bypass_google_using_service(article_url)
    # if (article_url_bypassed is None) or (article_url_bypassed == "") or ("news.google.com" in article_url_bypassed):
    if (article_url_bypassed is None) or (article_url_bypassed == ""):
        # Empty URL returned by Gbypass
        logger.warning("Error while bypassing Gnews for URL: {}".format(article_url))
        return None
    else:
        logger.debug("Correctly bypassed GNews from URL: {} to redirect URL: {}".format(article_url, article_url_bypassed))
        return article_url_bypassed
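

# Usage sketch for bypass_google_link() (the Google News URL is abbreviated and
# hypothetical; the service fallback requires selenium_app to be reachable):
#   redirect_url = bypass_google_link("https://news.google.com/articles/CBMi...")
#   -> publisher URL on success, None on failure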


def process_article(article_url, list_pattern_status_tuple, language="en"):
    # TODO: consider related news extraction libraries:
    """
    https://github.com/fhamborg/news-please
    https://github.com/fhamborg/Giveme5W1H
    https://github.com/santhoshse7en/news-fetch
    """
    try:
        logger.debug("Starting process_article()")
        if ("news.google.com" in article_url) or ("consent.google.com" in article_url):
            # Bypass to get the redirection
            article_url = bypass_google_link(article_url)
            # Error?
            if (article_url is None):
                return None, {}, "error"
        elif ("missingkids.org/poster" in article_url):
            # Get status
            # NOTE: get_missing_kid_status is expected to be provided by a sibling module (its import is not part of this file)
            article_status, url_canonical = get_missing_kid_status(article_url, return_canonical_url=True)
            article_elements = {
                "url_full": article_url,
                "url_canonical": url_canonical
            }
            return url_canonical, article_elements, article_status
        else:
            # Avoid "Too many requests" errors (feeds, ...)
            time.sleep(0.75)
        logger.debug("Processing: {}".format(article_url))
        # Default status unless something happens
        article_status = "valid"
        # Parse article
        # TODO: :param proxy: The proxy parameter is a dictionary with a single key-value pair. self._proxy = {'http': proxy, 'https': proxy} if proxy else None
        # TODO: Language per config
        article = GNews(language).get_full_article(url=article_url)
        # Article parsed?
        if (article is None) or (not article.is_parsed):
            logger.debug("Article not parsed: {}".format(article_url))
            return article_url, {}, "error"
        # Canonical link as main URL
        url_canonical = article.canonical_link
        # Empty canonical URL?
        if (article.canonical_link is None) or (article.canonical_link == ""):
            # URL with parameters? e.g. some zerohedge news fetched with newspaper3k end with #comment-stream -> remove the extra parameter from the link
            if ("?" in article.url) or (article.url.endswith("#comment-stream")) or (article.url.endswith("#disqus_thread")):
                logger.debug("Article URL contains parameters, trying to clean URL: {}".format(article.url))
                try:
                    # Remove text after the parameter marker
                    url = article.url.split("?")[0]
                    # Remove comment anchors
                    url = url.replace("#comment-stream", "").replace("#disqus_thread", "")
                    # Re-fetch the article from the cleaned URL
                    article_attempt = GNews(language).get_full_article(url=url)
                    # Same title retrieved? Update the article based on the clean URL
                    if (article_attempt is not None) and (article_attempt.title == article.title):
                        article = article_attempt
                except Exception as e:
                    logger.info("Article parsing of URL without parameters failed: {}".format(article.url))
            else:  # Default behaviour
                logger.debug("Article canonical link is empty, assuming URL=URL_CANONICAL: {}".format(article.url))
            # By default, use the URL as the canonical URL
            url_canonical = article.url
        elif (article.url != article.canonical_link):
            # If different, stick to the canonical URL
            logger.debug("Article URL and canonical link are different: {} {}".format(article.url, article.canonical_link))
        else:
            # If same, continue...
            pass
        # Update config thresholds used to determine whether the content is valid
        article.config.MIN_WORD_COUNT = 150
        article.config.MIN_SENT_COUNT = 6
        # Valid URL?
        if (not article.is_valid_url()):
            logger.debug("Not a valid news article: {}".format(url_canonical))
            article_status = "invalid"
        # Is the article's body text long enough to meet standard article requirements?
        if (not article.is_valid_body()):
            logger.debug("Article body not valid: {}".format(url_canonical))
            article_status = "unknown"
        if (article.images != article.imgs):
            logger.debug("Article images and imgs are different: {} {}".format(article.images, article.imgs))
        # article.keywords, article.meta_keywords, article.summary
        # article.movies
        # article.top_image
        # Check if the article status needs to be updated
        article_status = get_status_pattern_matching(url_canonical, article_status, list_pattern_status_tuple)
        article_elements = {
            'url_full': article.url,  # https://www.breitbart.com/tech/2022/10/03/report-election-integrity-project-worked-with-feds-to-censor-news-sites-in-2020/
            'url_host': get_url_host(article.source_url, url_canonical),  # www.breitbart.com
            'title': article.title,  # Report: Election Integrity Partnership Worked with Feds to Censor News Sites in 2020
            'description': article.meta_description,  # Coalition committed to respond in early 2022 but failed to do so, while Labor has not issued a full response since taking office
            'text': article.text,  # ${Article content}
            'published_date': get_published_date(article),  # datetime.datetime, obtained from "YYYY-MM-DD" or '2022-10-03T20:54:17+00:00'
            'authors': article.authors,  # ['Christopher Knaus']
            'language': article.meta_lang,  # en
            'tags': list(article.tags),  # ['Wide Open Border', 'My Son Hunter Movie', ...]
            'images': list(article.images),  # [URL_IMAGE_1, URL_IMAGE_2, ...]
            'url_canonical': url_canonical,  # Canonical URL (redirection)
            # 'html': article.html,  # HTML article
        }
        logger.debug("Processing OK: {}".format(url_canonical))
        return url_canonical, article_elements, article_status
    except Exception as e:
        logger.warning("Exception processing url: {}\n{}".format(article_url, traceback.format_exc()))
        return None, {}, "error"
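

if __name__ == "__main__":
    # Minimal smoke test, a sketch only: the pattern list is hypothetical, and
    # the relative imports above mean this must be run as a module, e.g.
    # `python -m matitos_news.app_urls.api.obsolete_src.url_utils`.
    demo_patterns = [
        (r"^https?://[^/]+/(tag|category)/", 0, "invalid"),
    ]
    demo_url = "https://www.breitbart.com/tech/2022/10/03/report-election-integrity-project-worked-with-feds-to-censor-news-sites-in-2020/"
    url_canonical, elements, status = process_article(demo_url, demo_patterns)
    print(status, url_canonical)
    print(elements.get("title"))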