Refactoring fetcher, working feeds and raw url writer

Luciano Gervasoni
2025-03-12 17:56:40 +01:00
parent e124dbc21a
commit 61c31ee9aa
24 changed files with 2085 additions and 194 deletions


@@ -0,0 +1,263 @@
from gnews import GNews
import dateutil.parser
from datetime import datetime, timedelta
from .utils import remove_http_s
import time
import random
import traceback
import requests
import json
import re
from bs4 import BeautifulSoup
from .logger import get_logger
logger = get_logger()
def get_published_date(article):
try:
"""
# Already fetched publish date information?
if (publish_date_ is not None):
return publish_date_
"""
# List of potential publish dates
potential_dates = []
# Publish date is the best match
potential_dates.append(article.publish_date)
# Publish date metadata is the following best match
potential_dates.append(article.meta_data.get('article', {}).get("published_time", None))
# Iterate remaining keys
for key in article.meta_data.keys():
if ("date" in key):
potential_dates.append(article.meta_data[key])
def invalid_date(p_date):
# Reject timestamps more than two days in the future (article from the future?)
today_plus_two = datetime.utcnow() + timedelta(days=2)
return p_date.timestamp() > today_plus_two.timestamp()
for date_ in potential_dates:
# String date? parse
if isinstance(date_, str):
try:
date_ = dateutil.parser.parse(date_)
except Exception as e:
logger.info("Invalid date found while parsing potential date: {} for URL: {}".format(date_, article.url))
date_ = None
# Valid?
if (date_ is not None) and (not invalid_date(date_)):
return date_
logger.debug("Article with no published date: {}".format(article.url))
return None
except Exception as e:
logger.info("Error while retrieving published date for URL: {}".format(article.url))
return None
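# Illustrative example (assumes a newspaper3k-style article whose meta_data
# carries an ISO-8601 published_time; the values below are hypothetical):
#
#   article.publish_date = None
#   article.meta_data = {"article": {"published_time": "2022-10-03T20:54:17+00:00"}}
#   get_published_date(article)  # -> datetime(2022, 10, 3, 20, 54, 17, tzinfo=tzutc())
#
# String candidates are parsed with dateutil.parser.parse(); anything more than
# two days in the future is rejected by invalid_date().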
def get_url_host(article_source_url, url):
# https://www.blabla.com/blabla -> www.blabla.com
if (article_source_url != ""):
# Article source URL already extracted, save path if any
return remove_http_s(article_source_url) # .split("/")[0]
else:
return remove_http_s(url).split("/")[0]
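# Illustrative calls (hypothetical URLs; assumes remove_http_s() strips the scheme):
#   get_url_host("https://www.blabla.com/blabla", "unused")  # -> "www.blabla.com/blabla" (path kept)
#   get_url_host("", "https://www.blabla.com/blabla")        # -> "www.blabla.com"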
def get_status_pattern_matching(url, article_status, list_pattern_status_tuple):
# Regex pattern to update status on "valid", "invalid", and "unknown" status only
# Status "raw", "duplicated" and "error" should remain the way they are
# Assumption: List of patterns sorted by importance
if (article_status in ["valid", "invalid", "unknown"]):
# Regular expression pattern matching: https://regexr.com/
for regex_pattern, regex_priority, status_if_match in list_pattern_status_tuple:
# Matching?
matching = bool(re.match(regex_pattern, url))
# Update article status
if (matching):
if (status_if_match != article_status):
logger.debug("Regex pattern found, updating status from '{}' to '{}' for URL: {}".format(article_status, status_if_match, url))
return status_if_match
# Pattern matching not required or not found, original article status
return article_status
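# Illustrative shape of list_pattern_status_tuple, sorted by priority
# (these patterns are hypothetical, not taken from the real config):
#
#   list_pattern_status_tuple = [
#       (r"^https?://(www\.)?missingkids\.org/poster/.*", 0, "valid"),
#       (r"^https?://(www\.)?youtube\.com/.*", 1, "invalid"),
#   ]
#   get_status_pattern_matching("https://www.youtube.com/watch?v=abc", "unknown",
#                               list_pattern_status_tuple)  # -> "invalid"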
def bypass_google_link(article_url):
def bypass_google_consent(article_url):
# Sample URL: https://consent.google.com/m?continue=https://news.google.com/rss/articles/CBMiMGh0dHBzOi8vd3d3Lm1pc3NpbmdraWRzLm9yZy9wb3N0ZXIvbmNtYy84NjAxMTkvMdIBAA?oc%3D5&gl=NL&m=0&pc=n&cm=2&hl=en-US&src=1
article_url_no_consent = article_url.replace("https://consent.google.com/m?continue=", "")
# https://stackoverflow.com/questions/76063646/how-can-i-have-redirection-link-from-google-news-link-using-requests
headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}
cookies = {'CONSENT': 'YES+cb.20220419-08-p0.cs+FX+111'}
try:
# Request
r = requests.get(article_url_no_consent, headers=headers, cookies=cookies, timeout=300)
# Decode
soup = BeautifulSoup(r.text, 'html.parser')
url_of_interest = soup.a['href']
except Exception as e:
logger.warning("Exception on request trying to G_bypass with headers: {}. {}".format(article_url_no_consent, str(e)))
url_of_interest = None
# Not able to bypass? (guard against None from a failed request before the substring checks)
if (url_of_interest is None) or (url_of_interest == "") or ("support.google.com" in url_of_interest) or ("news.google.com" in url_of_interest):
url_of_interest = None
return url_of_interest
def bypass_google_using_service(article_url):
try:
# e.g.: url = "https://news.google.com/articles/CBMiX2h0dHBzOi8vd3d3LmZveGJ1c2luZXNzLmNvbS9wb2xpdGljcy9kaXNuZXktc3Vlcy1mbG9yaWRhLWdvdi1yb24tZGVzYW50aXMtbG9zcy1zcGVjaWFsLWRpc3RyaWN00gEA?hl=en-US&gl=US&ceid=US%3Aen"
gbypass_endpoint = "http://selenium_app:80/get_redirection"
# Timeout: 5 minutes
r = requests.post(gbypass_endpoint, json={"url": article_url}, timeout=300)
# Decode
redirect_url = json.loads(r.text).get("redirect_url", "")
except Exception as e:
logger.warning("Exception on request: {}. {}".format(gbypass_endpoint, str(e)))
redirect_url = ""
return redirect_url
logger.debug("Starting gbypass_endpoint()")
article_url_bypassed = None
# Bypass using request
if ("consent.google.com" in article_url):
article_url_bypassed = bypass_google_consent(article_url)
# Not bypassed yet? Bypass using service
if (article_url_bypassed is None):
article_url_bypassed = bypass_google_using_service(article_url)
# if (article_url_bypassed is None) or (article_url_bypassed == "") or ("news.google.com" in article_url_bypassed):
if (article_url_bypassed == "") or (article_url_bypassed is None):
# Empty URL returned by Gbypass
logger.warning("Error while bypassing Gnews for URL: {}".format(article_url))
return None
else:
logger.debug("Correctly bypassed GNews to URL_redirect, from URL: {} {}".format(article_url_bypassed, article_url))
return article_url_bypassed
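# Illustrative usage (hypothetical Google News RSS link): resolve the redirect
# to the final publisher URL, trying the consent-cookie request first and then
# falling back to the selenium_app bypass service.
#
#   real_url = bypass_google_link("https://news.google.com/rss/articles/CBMi...")
#   if real_url is not None:
#       ...  # continue processing the redirected article URL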
def process_article(article_url, list_pattern_status_tuple, language="en"):
# TODO:
"""
https://github.com/fhamborg/news-please
https://github.com/fhamborg/Giveme5W1H
https://github.com/santhoshse7en/news-fetch
"""
try:
logger.debug("Starting process_article()")
if ("news.google.com" in article_url) or ("consent.google.com" in article_url):
# Bypass to get redirection
article_url = bypass_google_link(article_url)
# Error?
if (article_url is None):
return None, {}, "error"
elif ("missingkids.org/poster" in article_url):
# Get status
article_status, url_canonical = get_missing_kid_status(article_url, return_canonical_url=True)
article_elements = {
"url_full": article_url,
"url_canonical": url_canonical
}
return url_canonical, article_elements, article_status
else:
# Avoid Too many requests (feeds, ...)
time.sleep(0.75)
logger.debug("Processing: {}".format(article_url))
# Default status unless something happens
article_status = "valid"
# Parse article
# TODO: Support proxies; GNews accepts a proxy parameter: self._proxy = {'http': proxy, 'https': proxy} if proxy else None
# TODO: Language per config
article = GNews(language).get_full_article(url=article_url)
# Article parsed?
if (article is None) or (not article.is_parsed):
logger.debug("Article not parsed: {}".format(article_url))
return article_url, {}, "error"
# Canonical link as main URL
url_canonical = article.canonical_link
# Empty canonical URL?
if (article.canonical_link is None) or (article.canonical_link == ""):
# URL with parameters? e.g. some ZeroHedge articles fetched via newspaper3k end with #comment-stream -> remove the extra parameter from the link
if ("?" in article.url) or (article.url.endswith("#comment-stream")) or (article.url.endswith("#disqus_thread")):
logger.debug("Article URL contains parameters, trying to clean URL: {}".format(article.url))
try:
# Remove text after parameter call
url = article.url.split("?")[0]
# Remove comment-stream
url = url.replace("#comment-stream", "").replace("#disqus_thread", "")
# Article
article_attempt = GNews(language).get_full_article(url=url)
# Retrieving same title? Update article based on clean URL
if (article_attempt is not None) and (article_attempt.title == article.title):
article = article_attempt
except Exception as e:
logger.info("Article parsing of URL without parameters failed: {}".format(article.url))
else: # Default behaviour
logger.debug("Article canonical link is empty, assuming URL=URL_CANONICAL: {}".format(article.url))
# By default, URL same as canonical
url_canonical = article.url
elif (article.url != article.canonical_link):
# If different, stick to canonical URL
logger.debug("Article URL and canonical link are different: {} {}".format(article.url, article.canonical_link))
else:
# If same, continue...
pass
# Update config to determine if content is valid
article.config.MIN_WORD_COUNT = 150
article.config.MIN_SENT_COUNT = 6
# Valid URL?
if (not article.is_valid_url()):
logger.debug("Not a valid news article: {}".format(url_canonical))
article_status = "invalid"
# Is the article's body text long enough to meet standard article requirements?
if (not article.is_valid_body()):
logger.debug("Article body not valid: {}".format(url_canonical))
article_status = "unknown"
if (article.images != article.imgs):
logger.debug("Article images and imgs are different: {} {}".format(article.images, article.imgs))
# article.keywords, article.meta_keywords, article.summary
# article.movies
# article.top_image
# Check if article status needs to be updated
article_status = get_status_pattern_matching(url_canonical, article_status, list_pattern_status_tuple)
article_elements = {
'url_full': article.url, # https://www.breitbart.com/tech/2022/10/03/report-election-integrity-project-worked-with-feds-to-censor-news-sites-in-2020/
'url_host': get_url_host(article.source_url, url_canonical), # www.breitbart.com
'title': article.title, # Report: Election Integrity Partnership Worked with Feds to Censor News Sites in 2020
'description': article.meta_description, # Coalition committed to respond in early 2022 but failed to do so, while Labor has not issued a full response since taking office
'text': article.text, # ${Article content}
'published_date': get_published_date(article), # python.datetime format, obtained from "YYYY-MM-DD" or '2022-10-03T20:54:17+00:00'
'authors': article.authors, # ['Christopher Knaus']
'language': article.meta_lang, # en
'tags': list(article.tags), # ['Wide Open Border', 'My Son Hunter Movie', ...]
'images': list(article.images), # [URL_IMAGE_1, URL_IMAGE_2, ...]
'url_canonical': url_canonical, # Canonical URL (redirection)
# 'html': article.html, # HTML article
}
logger.debug("Processing OK: {}".format(url_canonical))
return url_canonical, article_elements, article_status
except Exception as e:
logger.warning("Exception processing url: {}\n{}".format(article_url, traceback.format_exc()))
return None, {}, "error"