from gnews import GNews
import dateutil.parser
from datetime import datetime, timedelta
from .utils import remove_http_s
import time
import random
import traceback
import requests
import json
import re
from bs4 import BeautifulSoup
from .logger import get_logger

logger = get_logger()


def get_published_date(article):
    try:
        """
        # Already fetched publish date information?
        if (publish_date_ is not None):
            return publish_date_
        """
        # List of potential publish dates
        potential_dates = []

        # Publish date is the best match
        potential_dates.append(article.publish_date)

        # Publish date metadata is the next best match
        potential_dates.append(article.meta_data.get('article', {}).get("published_time", None))

        # Iterate remaining metadata keys
        for key in article.meta_data.keys():
            if ("date" in key):
                potential_dates.append(article.meta_data[key])

        def invalid_date(p_date):
            # Today + 2 days of margin
            today_plus_two = datetime.utcnow() + timedelta(days=2)
            # Article from the future?
            return p_date.timestamp() > today_plus_two.timestamp()

        for date_ in potential_dates:
            # String date? Parse it
            if (type(date_) == str):
                try:
                    date_ = dateutil.parser.parse(date_)
                except Exception as e:
                    logger.info("Invalid date found while parsing potential date: {} for URL: {}".format(date_, article.url))
                    date_ = None

            # Valid?
            if (date_ is not None) and (not invalid_date(date_)):
                return date_

        logger.debug("Article with no published date: {}".format(article.url))
        return None

    except Exception as e:
        logger.info("Error while retrieving published date for URL: {}".format(article.url))
        return None


def get_url_host(article_source_url, url):
    # https://www.blabla.com/blabla -> www.blabla.com
    if (article_source_url != ""):
        # Article source URL already extracted, keep the path if any
        return remove_http_s(article_source_url)  # .split("/")[0]
    else:
        return remove_http_s(url).split("/")[0]


def get_status_pattern_matching(url, article_status, list_pattern_status_tuple):
    # Regex patterns update the status on "valid", "invalid" and "unknown" statuses only
    # Statuses "raw", "duplicated" and "error" should remain the way they are
    # Assumption: list of patterns sorted by importance
    if (article_status in ["valid", "invalid", "unknown"]):
        # Regular expression pattern matching: https://regexr.com/
        for regex_pattern, regex_priority, status_if_match in list_pattern_status_tuple:
            # Matching?
            matching = bool(re.match(regex_pattern, url))

            # Update article status
            if (matching):
                if (status_if_match != article_status):
                    logger.debug("Regex pattern found, updating status from '{}' to '{}' for URL: {}".format(article_status, status_if_match, url))
                return status_if_match

    # Pattern matching not required or no pattern found, keep the original article status
    return article_status


def bypass_google_link(article_url):

    def bypass_google_consent(article_url):
        # Sample URL: https://consent.google.com/m?continue=https://news.google.com/rss/articles/CBMiMGh0dHBzOi8vd3d3Lm1pc3NpbmdraWRzLm9yZy9wb3N0ZXIvbmNtYy84NjAxMTkvMdIBAA?oc%3D5&gl=NL&m=0&pc=n&cm=2&hl=en-US&src=1
        article_url_no_consent = article_url.replace("https://consent.google.com/m?continue=", "")

        # https://stackoverflow.com/questions/76063646/how-can-i-have-redirection-link-from-google-news-link-using-requests
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
        }
        cookies = {'CONSENT': 'YES+cb.20220419-08-p0.cs+FX+111'}

        try:
            # Request
            r = requests.get(article_url_no_consent, headers=headers, cookies=cookies, timeout=300)
            # Decode
            soup = BeautifulSoup(r.text, 'html.parser')
            url_of_interest = soup.a['href']
        except Exception as e:
            logger.warning("Exception on request trying to G_bypass with headers: {}. {}".format(article_url_no_consent, str(e)))
            url_of_interest = None

        # Not able to bypass?
        if (url_of_interest is None) or (url_of_interest == "") or ("support.google.com" in url_of_interest) or ("news.google.com" in url_of_interest):
            url_of_interest = None

        return url_of_interest

    def bypass_google_using_service(article_url):
        try:
            # e.g.: url = "https://news.google.com/articles/CBMiX2h0dHBzOi8vd3d3LmZveGJ1c2luZXNzLmNvbS9wb2xpdGljcy9kaXNuZXktc3Vlcy1mbG9yaWRhLWdvdi1yb24tZGVzYW50aXMtbG9zcy1zcGVjaWFsLWRpc3RyaWN00gEA?hl=en-US&gl=US&ceid=US%3Aen"
            gbypass_endpoint = "http://selenium_app:80/get_redirection"
            # Timeout: 5 minutes
            r = requests.post(gbypass_endpoint, json={"url": article_url}, timeout=300)
            # Decode
            redirect_url = json.loads(r.text).get("redirect_url", "")
        except Exception as e:
            logger.warning("Exception on request: {}. {}".format(gbypass_endpoint, str(e)))
            redirect_url = ""

        return redirect_url

    logger.debug("Starting bypass_google_link()")

    article_url_bypassed = None

    # Bypass using request
    if ("consent.google.com" in article_url):
        article_url_bypassed = bypass_google_consent(article_url)

    # Not bypassed yet? Bypass using service
    if (article_url_bypassed is None):
        article_url_bypassed = bypass_google_using_service(article_url)

    # if (article_url_bypassed is None) or (article_url_bypassed == "") or ("news.google.com" in article_url_bypassed):
    if (article_url_bypassed == "") or (article_url_bypassed is None):
        # Empty URL returned by Gbypass
        logger.warning("Error while bypassing Gnews for URL: {}".format(article_url))
        return None
    else:
        logger.debug("Correctly bypassed GNews to URL_redirect, from URL: {} {}".format(article_url_bypassed, article_url))
        return article_url_bypassed


def process_article(article_url, list_pattern_status_tuple, language="en"):
    # TODO:
    """
    https://github.com/fhamborg/news-please
    https://github.com/fhamborg/Giveme5W1H
    https://github.com/santhoshse7en/news-fetch
    """
    try:
        logger.debug("Starting process_article()")

        if ("news.google.com" in article_url) or ("consent.google.com" in article_url):
            # Bypass to get redirection
            article_url = bypass_google_link(article_url)

        # Error?
        if (article_url is None):
            return None, {}, "error"

        elif ("missingkids.org/poster" in article_url):
            # Get status
            article_status, url_canonical = get_missing_kid_status(article_url, return_canonical_url=True)

            article_elements = {
                "url_full": article_url,
                "url_canonical": url_canonical
            }

            return url_canonical, article_elements, article_status

        else:
            # Avoid "Too many requests" (feeds, ...)
            time.sleep(0.75)

            logger.debug("Processing: {}".format(article_url))

            # Default status unless something happens
            article_status = "valid"

            # Parse article
            # TODO: :param proxy: The proxy parameter is a dictionary with a single key-value pair. self._proxy = {'http': proxy, 'https': proxy} if proxy else None
            # TODO: Language per config
            article = GNews(language).get_full_article(url=article_url)

            # Article parsed?
            if (article is None) or (not article.is_parsed):
                logger.debug("Article not parsed: {}".format(article_url))
                return article_url, {}, "error"

            # Canonical link as main URL
            url_canonical = article.canonical_link

            # Empty canonical URL?
            if (article.canonical_link is None) or (article.canonical_link == ""):
                # URL with parameters? e.g. some zerohedge news fetched from newspaper3k end with #comment-stream -> remove the extra parameters from the link
                if ("?" in article.url) or (article.url.endswith("#comment-stream")) or (article.url.endswith("#disqus_thread")):
                    logger.debug("Article URL contains parameters, trying to clean URL: {}".format(article.url))
                    try:
                        # Remove text after the parameter marker
                        url = article.url.split("?")[0]
                        # Remove comment-stream / disqus anchors
                        url = url.replace("#comment-stream", "").replace("#disqus_thread", "")
                        # Re-fetch the article with the cleaned URL
                        article_attempt = GNews(language).get_full_article(url=url)
                        # Retrieving same title? Update article based on clean URL
                        if (article_attempt is not None) and (article_attempt.title == article.title):
                            article = article_attempt
                    except Exception as e:
                        logger.info("Article parsing of URL without parameters failed: {}".format(article.url))
                else:
                    # Default behaviour
                    logger.debug("Article canonical link is empty, assuming URL=URL_CANONICAL: {}".format(article.url))

                # By default, URL same as canonical
                url_canonical = article.url

            elif (article.url != article.canonical_link):
                # If different, stick to canonical URL
                logger.debug("Article URL and canonical link are different: {} {}".format(article.url, article.canonical_link))
            else:
                # If same, continue...
                pass

            # Update config to determine if content is valid
            article.config.MIN_WORD_COUNT = 150
            article.config.MIN_SENT_COUNT = 6

            # Valid URL?
            if (not article.is_valid_url()):
                logger.debug("Not a valid news article: {}".format(url_canonical))
                article_status = "invalid"

            # Is the article's body text long enough to meet standard article requirements?
            if (not article.is_valid_body()):
                logger.debug("Article body not valid: {}".format(url_canonical))
                article_status = "unknown"

            if (article.images != article.imgs):
                logger.debug("Article images and imgs are different: {} {}".format(article.images, article.imgs))

            # article.keywords, article.meta_keywords, article.summary
            # article.movies
            # article.top_image

            # Check if article status needs to be updated
            article_status = get_status_pattern_matching(url_canonical, article_status, list_pattern_status_tuple)

            article_elements = {
                'url_full': article.url,                                      # https://www.breitbart.com/tech/2022/10/03/report-election-integrity-project-worked-with-feds-to-censor-news-sites-in-2020/
                'url_host': get_url_host(article.source_url, url_canonical),  # www.breitbart.com
                'title': article.title,                                       # Report: ‘Election Integrity’ Partnership Worked with Feds to Censor News Sites in 2020
                'description': article.meta_description,                      # Coalition committed to respond in ‘early 2022’ but failed to do so, while Labor has not issued a full response since taking office
                'text': article.text,                                         # ${Article content}
                'published_date': get_published_date(article),                # python.datetime format, obtained from "YYYY-MM-DD" or '2022-10-03T20:54:17+00:00'
                'authors': article.authors,                                   # ['Christopher Knaus']
                'language': article.meta_lang,                                # en
                'tags': list(article.tags),                                   # ['Wide Open Border', '’My Son Hunter’ Movie', ...]
                'images': list(article.images),                               # [URL_IMAGE_1, URL_IMAGE_2, ...]
                'url_canonical': url_canonical,                               # Canonical URL (redirection)
                # 'html': article.html,                                       # HTML article
            }

            logger.debug("Processing OK: {}".format(url_canonical))

            return url_canonical, article_elements, article_status

    except Exception as e:
        logger.warning("Exception processing url: {}\n{}".format(article_url, traceback.format_exc()))
        return None, {}, "error"
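

# Illustrative usage sketch (an assumption, not part of the original module): it shows
# the shape of the (regex_pattern, regex_priority, status_if_match) tuples that
# get_status_pattern_matching() and process_article() unpack. The example patterns,
# URLs and printed call below are hypothetical. Because of the relative imports at the
# top of this file, this would have to be run as a package module (python -m ...).
if __name__ == "__main__":
    example_patterns = [
        (r"^https?://(www\.)?missingkids\.org/poster/.*", 1, "valid"),   # hypothetical pattern
        (r"^https?://[^/]+/tag/.*", 2, "invalid"),                       # hypothetical pattern
    ]

    # Offline check of the regex-based status helper: expected to print "valid"
    print(get_status_pattern_matching("https://www.missingkids.org/poster/NCMC/12345/1", "unknown", example_patterns))

    # Full pipeline call (performs network requests); placeholder URL, kept commented out
    # url_canonical, article_elements, article_status = process_article(
    #     "https://example.com/some-news-article", example_patterns, language="en")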