Refactoring fetcher, working feeds and raw url writer

Luciano Gervasoni
2025-03-12 17:56:40 +01:00
parent e124dbc21a
commit 61c31ee9aa
24 changed files with 2085 additions and 194 deletions


@@ -0,0 +1,263 @@
from gnews import GNews
import dateutil.parser
from datetime import datetime, timedelta
from .utils import remove_http_s
import time
import random
import traceback
import requests
import json
import re
from bs4 import BeautifulSoup
from .logger import get_logger
logger = get_logger()
def get_published_date(article):
try:
"""
# Already fetched publish date information?
if (publish_date_ is not None):
return publish_date_
"""
# List of potential publish dates
potential_dates = []
# Publish date is the best match
potential_dates.append(article.publish_date)
# Publish date metadata is the following best match
potential_dates.append(article.meta_data.get('article', {}).get("published_time", None))
# Iterate remaining keys
for key in article.meta_data.keys():
if ("date" in key):
potential_dates.append(article.meta_data[key])
def invalid_date(p_date):
# Reject timestamps more than two days in the future (article from the future?)
today_plus_two = datetime.utcnow() + timedelta(days=2)
return p_date.timestamp() > today_plus_two.timestamp()
for date_ in potential_dates:
# String date? parse
if isinstance(date_, str):
try:
date_ = dateutil.parser.parse(date_)
except Exception as e:
logger.info("Invalid date found while parsing potential date: {} for URL: {}".format(date_, article.url))
date_ = None
# Valid?
if (date_ is not None) and (not invalid_date(date_)):
return date_
logger.debug("Article with no published date: {}".format(article.url))
return None
except Exception as e:
logger.info("Error while retrieving published date for URL: {}".format(article.url))
return None
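# Illustrative example (assumes a newspaper3k-style article whose meta_data
# carries an ISO-8601 published_time; the values below are hypothetical):
#
#   article.publish_date = None
#   article.meta_data = {"article": {"published_time": "2022-10-03T20:54:17+00:00"}}
#   get_published_date(article)  # -> datetime(2022, 10, 3, 20, 54, 17, tzinfo=tzutc())
#
# String candidates are parsed with dateutil.parser.parse(); anything more than
# two days in the future is rejected by invalid_date().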
def get_url_host(article_source_url, url):
# https://www.blabla.com/blabla -> www.blabla.com
if (article_source_url != ""):
# Article source URL already extracted, save path if any
return remove_http_s(article_source_url) # .split("/")[0]
else:
return remove_http_s(url).split("/")[0]
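# Illustrative calls (hypothetical URLs; assumes remove_http_s() strips the scheme):
#   get_url_host("https://www.blabla.com/blabla", "unused")  # -> "www.blabla.com/blabla" (path kept)
#   get_url_host("", "https://www.blabla.com/blabla")        # -> "www.blabla.com"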
def get_status_pattern_matching(url, article_status, list_pattern_status_tuple):
# Regex pattern to update status on "valid", "invalid", and "unknown" status only
# Status "raw", "duplicated" and "error" should remain the way they are
# Assumption: List of patterns sorted by importance
if (article_status in ["valid", "invalid", "unknown"]):
# Regular expression pattern matching: https://regexr.com/
for regex_pattern, regex_priority, status_if_match in list_pattern_status_tuple:
# Matching?
matching = bool(re.match(regex_pattern, url))
# Update article status
if (matching):
if (status_if_match != article_status):
logger.debug("Regex pattern found, updating status from '{}' to '{}' for URL: {}".format(article_status, status_if_match, url))
return status_if_match
# Pattern matching not required or not found, original article status
return article_status
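# Illustrative shape of list_pattern_status_tuple, sorted by priority
# (these patterns are hypothetical, not taken from the real config):
#
#   list_pattern_status_tuple = [
#       (r"^https?://(www\.)?missingkids\.org/poster/.*", 0, "valid"),
#       (r"^https?://(www\.)?youtube\.com/.*", 1, "invalid"),
#   ]
#   get_status_pattern_matching("https://www.youtube.com/watch?v=abc", "unknown",
#                               list_pattern_status_tuple)  # -> "invalid"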
def bypass_google_link(article_url):
def bypass_google_consent(article_url):
# Sample URL: https://consent.google.com/m?continue=https://news.google.com/rss/articles/CBMiMGh0dHBzOi8vd3d3Lm1pc3NpbmdraWRzLm9yZy9wb3N0ZXIvbmNtYy84NjAxMTkvMdIBAA?oc%3D5&gl=NL&m=0&pc=n&cm=2&hl=en-US&src=1
article_url_no_consent = article_url.replace("https://consent.google.com/m?continue=", "")
# https://stackoverflow.com/questions/76063646/how-can-i-have-redirection-link-from-google-news-link-using-requests
headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}
cookies = {'CONSENT': 'YES+cb.20220419-08-p0.cs+FX+111'}
try:
# Request
r = requests.get(article_url_no_consent, headers=headers, cookies=cookies, timeout=300)
# Decode
soup = BeautifulSoup(r.text, 'html.parser')
url_of_interest = soup.a['href']
except Exception as e:
logger.warning("Exception on request trying to G_bypass with headers: {}. {}".format(article_url_no_consent, str(e)))
url_of_interest = None
# Not able to bypass? (guard against None from a failed request before the substring checks)
if (url_of_interest is None) or (url_of_interest == "") or ("support.google.com" in url_of_interest) or ("news.google.com" in url_of_interest):
url_of_interest = None
return url_of_interest
def bypass_google_using_service(article_url):
try:
# e.g.: url = "https://news.google.com/articles/CBMiX2h0dHBzOi8vd3d3LmZveGJ1c2luZXNzLmNvbS9wb2xpdGljcy9kaXNuZXktc3Vlcy1mbG9yaWRhLWdvdi1yb24tZGVzYW50aXMtbG9zcy1zcGVjaWFsLWRpc3RyaWN00gEA?hl=en-US&gl=US&ceid=US%3Aen"
gbypass_endpoint = "http://selenium_app:80/get_redirection"
# Timeout: 5 minutes
r = requests.post(gbypass_endpoint, json={"url": article_url}, timeout=300)
# Decode
redirect_url = json.loads(r.text).get("redirect_url", "")
except Exception as e:
logger.warning("Exception on request: {}. {}".format(gbypass_endpoint, str(e)))
redirect_url = ""
return redirect_url
logger.debug("Starting gbypass_endpoint()")
article_url_bypassed = None
# Bypass using request
if ("consent.google.com" in article_url):
article_url_bypassed = bypass_google_consent(article_url)
# Not bypassed yet? Bypass using service
if (article_url_bypassed is None):
article_url_bypassed = bypass_google_using_service(article_url)
# if (article_url_bypassed is None) or (article_url_bypassed == "") or ("news.google.com" in article_url_bypassed):
if (article_url_bypassed == "") or (article_url_bypassed is None):
# Empty URL returned by Gbypass
logger.warning("Error while bypassing Gnews for URL: {}".format(article_url))
return None
else:
logger.debug("Correctly bypassed GNews to URL_redirect, from URL: {} {}".format(article_url_bypassed, article_url))
return article_url_bypassed
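# Illustrative usage (hypothetical Google News RSS link): resolve the redirect
# to the final publisher URL, trying the consent-cookie request first and then
# falling back to the selenium_app bypass service.
#
#   real_url = bypass_google_link("https://news.google.com/rss/articles/CBMi...")
#   if real_url is not None:
#       ...  # continue processing the redirected article URL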
def process_article(article_url, list_pattern_status_tuple, language="en"):
# TODO:
"""
https://github.com/fhamborg/news-please
https://github.com/fhamborg/Giveme5W1H
https://github.com/santhoshse7en/news-fetch
"""
try:
logger.debug("Starting process_article()")
if ("news.google.com" in article_url) or ("consent.google.com" in article_url):
# Bypass to get redirection
article_url = bypass_google_link(article_url)
# Error?
if (article_url is None):
return None, {}, "error"
elif ("missingkids.org/poster" in article_url):
# Get status
article_status, url_canonical = get_missing_kid_status(article_url, return_canonical_url=True)
article_elements = {
"url_full": article_url,
"url_canonical": url_canonical
}
return url_canonical, article_elements, article_status
else:
# Avoid Too many requests (feeds, ...)
time.sleep(0.75)
logger.debug("Processing: {}".format(article_url))
# Default status unless something happens
article_status = "valid"
# Parse article
# TODO: Support proxies; GNews accepts a proxy parameter: self._proxy = {'http': proxy, 'https': proxy} if proxy else None
# TODO: Language per config
article = GNews(language).get_full_article(url=article_url)
# Article parsed?
if (article is None) or (not article.is_parsed):
logger.debug("Article not parsed: {}".format(article_url))
return article_url, {}, "error"
# Canonical link as main URL
url_canonical = article.canonical_link
# Empty canonical URL?
if (article.canonical_link is None) or (article.canonical_link == ""):
# URL with parameters? e.g. some ZeroHedge articles fetched via newspaper3k end with #comment-stream -> remove the extra parameter from the link
if ("?" in article.url) or (article.url.endswith("#comment-stream")) or (article.url.endswith("#disqus_thread")):
logger.debug("Article URL contains parameters, trying to clean URL: {}".format(article.url))
try:
# Remove text after parameter call
url = article.url.split("?")[0]
# Remove comment-stream
url = url.replace("#comment-stream", "").replace("#disqus_thread", "")
# Article
article_attempt = GNews(language).get_full_article(url=url)
# Retrieving same title? Update article based on clean URL
if (article_attempt is not None) and (article_attempt.title == article.title):
article = article_attempt
except Exception as e:
logger.info("Article parsing of URL without parameters failed: {}".format(article.url))
else: # Default behaviour
logger.debug("Article canonical link is empty, assuming URL=URL_CANONICAL: {}".format(article.url))
# By default, URL same as canonical
url_canonical = article.url
elif (article.url != article.canonical_link):
# If different, stick to canonical URL
logger.debug("Article URL and canonical link are different: {} {}".format(article.url, article.canonical_link))
else:
# If same, continue...
pass
# Update config to determine if content is valid
article.config.MIN_WORD_COUNT = 150
article.config.MIN_SENT_COUNT = 6
# Valid URL?
if (not article.is_valid_url()):
logger.debug("Not a valid news article: {}".format(url_canonical))
article_status = "invalid"
# Is the article's body text long enough to meet standard article requirements?
if (not article.is_valid_body()):
logger.debug("Article body not valid: {}".format(url_canonical))
article_status = "unknown"
if (article.images != article.imgs):
logger.debug("Article images and imgs are different: {} {}".format(article.images, article.imgs))
# article.keywords, article.meta_keywords, article.summary
# article.movies
# article.top_image
# Check if article status needs to be updated
article_status = get_status_pattern_matching(url_canonical, article_status, list_pattern_status_tuple)
article_elements = {
'url_full': article.url, # https://www.breitbart.com/tech/2022/10/03/report-election-integrity-project-worked-with-feds-to-censor-news-sites-in-2020/
'url_host': get_url_host(article.source_url, url_canonical), # www.breitbart.com
'title': article.title, # Report: Election Integrity Partnership Worked with Feds to Censor News Sites in 2020
'description': article.meta_description, # Coalition committed to respond in early 2022 but failed to do so, while Labor has not issued a full response since taking office
'text': article.text, # ${Article content}
'published_date': get_published_date(article), # python.datetime format, obtained from "YYYY-MM-DD" or '2022-10-03T20:54:17+00:00'
'authors': article.authors, # ['Christopher Knaus']
'language': article.meta_lang, # en
'tags': list(article.tags), # ['Wide Open Border', 'My Son Hunter Movie', ...]
'images': list(article.images), # [URL_IMAGE_1, URL_IMAGE_2, ...]
'url_canonical': url_canonical, # Canonical URL (redirection)
# 'html': article.html, # HTML article
}
logger.debug("Processing OK: {}".format(url_canonical))
return url_canonical, article_elements, article_status
except Exception as e:
logger.warning("Exception processing url: {}\n{}".format(article_url, traceback.format_exc()))
return None, {}, "error"