app_fetcher/src/db_utils.py (new file, 456 lines)
@@ -0,0 +1,456 @@
|
||||
import psycopg
|
||||
import redis
|
||||
import traceback
|
||||
import random
|
||||
import requests
|
||||
import json
|
||||
import os
|
||||
from .url_utils import process_article
|
||||
import logging
|
||||
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
|
||||
logger = logging.getLogger("news_fetcher")
|
||||
|
||||
# TODO: URL_DB_HANDLER, _get_search_list, _get_url_host, _get_url_host_list, ...
|
||||
# The rest, elsewhere
|
||||
|
||||
class URL_DB_Writer():
|
||||
def __init__(self, db_connect_info, redis_connect_info):
|
||||
logger.debug("Initializing URL DB writer")
|
||||
self.db_connect_info = db_connect_info
|
||||
self.redis_instance = redis.Redis(host=redis_connect_info.get("redis_host"), port=redis_connect_info.get("redis_port"))
|
||||
self.redis_expiry_seconds = redis_connect_info.get("expiry_seconds", 172800) # Default: 48 hours
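# Expected redis_connect_info keys (as used here): "redis_host", "redis_port", optional "expiry_seconds"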
|
||||
|
||||
try:
|
||||
self.redis_instance.ping()
|
||||
logger.debug("Succesfully pinged Redis")
|
||||
except Exception as e:
|
||||
logger.warning("Error trying to ping Redis: {}".format(str(e)))
|
||||
|
||||
def get_urls_count(self, last_minutes_check):
|
||||
#####################
|
||||
### Get number of URLs within last X minutes
|
||||
#####################
|
||||
try:
|
||||
# Query count of recent URLs
|
||||
with psycopg.connect(self.db_connect_info) as conn:
|
||||
# Open cursor
|
||||
cursor = conn.cursor()
|
||||
num_urls = cursor.execute("SELECT COUNT(*) FROM URLS WHERE ts_fetch >= current_timestamp - interval '{} minutes';".format(last_minutes_check)).fetchone()[0]
|
||||
except Exception as e:
|
||||
logger.warning("Error updating URLs status: {}".format(str(e)))
|
||||
num_urls = None
|
||||
return num_urls
|
||||
|
||||
def _format(self, values):
|
||||
# Replace single quote ' with ''. Based on https://stackoverflow.com/a/12320729
|
||||
# String -> 'string', Int -> '1' (string-based), None -> NULL (no quotes for pgSQL to interpret Null value)
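# Example (illustrative): _format(("O'Brien", None, 3)) -> "('O''Brien', NULL, '3')"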
|
||||
if (type(values) == list) or (type(values) == tuple):
|
||||
insert_args = "(" + ", ".join([ "NULL" if v is None else "'" + str(v).replace("'", "''") + "'" for v in values]) + ")"
|
||||
elif (type(values) == str):
|
||||
insert_args = "({})".format( "NULL" if values is None else "'" + values.replace("'", "''") + "'" )
|
||||
else:
|
||||
logger.warning("Error formatting input values: {}".format(values))
|
||||
assert False
|
||||
return insert_args
|
||||
|
||||
def _get_cached_canonical_url(self, url):
|
||||
### Redis: was this URL processed recently? -> Avoids bumping the SERIAL counter and spares redundant DB work
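# Redis maps url -> canonical_url (set after a successful DB write in _write_small_batch); a hit means the URL was handled recently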
|
||||
try:
|
||||
filter_url = self.redis_instance.get(url)
|
||||
if (filter_url is not None):
|
||||
filter_url = filter_url.decode("utf-8")
|
||||
except Exception as e:
|
||||
logger.warning("Exception querying Redis: {}".format(str(e)))
|
||||
filter_url = None
|
||||
return filter_url
|
||||
|
||||
def _update_urls_status(self, dict_status_ids):
|
||||
#####################
|
||||
### Update status for an array of URL IDs
|
||||
#####################
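# Expected input shape (illustrative ids): {"valid": [11, 12], "invalid": [13]} maps each new status to the URL ids to update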
|
||||
try:
|
||||
# Update
|
||||
with psycopg.connect(self.db_connect_info) as conn:
|
||||
# Open cursor
|
||||
cursor = conn.cursor()
|
||||
# Autocommit at end of transaction (Atomic insert of URLs and sources)
|
||||
with conn.transaction() as tx:
|
||||
for key_status, value_ids in dict_status_ids.items():
|
||||
cursor.execute("UPDATE URLS SET status='{}' WHERE id IN ({});".format(key_status, ",".join([str(v) for v in value_ids])))
|
||||
except Exception as e:
|
||||
logger.warning("Error updating URLs status: {}".format(str(e)))
|
||||
|
||||
def _get_missing_kids_urls(self, num_urls=None):
|
||||
#####################
|
||||
### Get list of Missing Kids URLs
|
||||
#####################
|
||||
try:
|
||||
missing_kids_ids_and_urls = []
|
||||
if (num_urls is None):
|
||||
limit = 500
|
||||
else:
|
||||
limit = num_urls
|
||||
offset = 0
|
||||
with psycopg.connect(self.db_connect_info) as conn:
|
||||
# Open cursor
|
||||
cursor = conn.cursor()
|
||||
while True:
|
||||
# Query
|
||||
missing_kids_ids_and_urls_query = cursor.execute("SELECT id, url, status FROM URLS WHERE url LIKE '%missingkids.org/poster%' ORDER BY ts_fetch DESC LIMIT {} OFFSET {};".format(limit, offset)).fetchall()
|
||||
# Finished?
|
||||
if (len(missing_kids_ids_and_urls_query) == 0):
|
||||
break
|
||||
# Extend
|
||||
missing_kids_ids_and_urls = missing_kids_ids_and_urls + missing_kids_ids_and_urls_query
|
||||
# Offset
|
||||
offset += len(missing_kids_ids_and_urls_query)
|
||||
# Stop?
|
||||
if (num_urls is not None) and (len(missing_kids_ids_and_urls) >= num_urls):
|
||||
break
|
||||
|
||||
except Exception as e:
|
||||
logger.warning("Error getting Missing Kids URLs: {}".format(str(e)))
|
||||
missing_kids_ids_and_urls = []
|
||||
return missing_kids_ids_and_urls
|
||||
|
||||
def _get_error_urls(self, num_urls=None):
|
||||
#####################
|
||||
### Get list of URLs with status 'error'
|
||||
#####################
|
||||
try:
|
||||
error_urls = []
|
||||
if (num_urls is None):
|
||||
limit = 500
|
||||
else:
|
||||
limit = num_urls
|
||||
offset = 0
|
||||
with psycopg.connect(self.db_connect_info) as conn:
|
||||
# Open cursor
|
||||
cursor = conn.cursor()
|
||||
while True:
|
||||
# Query
|
||||
error_urls_query = cursor.execute("SELECT id, url FROM URLS WHERE status='error' ORDER BY ts_fetch DESC LIMIT {} OFFSET {};".format(limit, offset)).fetchall()
|
||||
# Finished?
|
||||
if (len(error_urls_query) == 0):
|
||||
break
|
||||
# Extend
|
||||
error_urls = error_urls + error_urls_query
|
||||
# Offset
|
||||
offset += len(error_urls_query)
|
||||
# Stop?
|
||||
if (num_urls is not None) and (len(error_urls) >= num_urls):
|
||||
break
|
||||
|
||||
except Exception as e:
|
||||
logger.warning("Error getting Error URLs: {}".format(str(e)))
|
||||
error_urls = []
|
||||
return error_urls
|
||||
|
||||
def _decode_urls(self, urls_fetched, list_domains_to_filter, list_pattern_status_tuple): # TODO: language for urls_fetched...
|
||||
"""
|
||||
# TODO: REFACTOR
|
||||
For each input url
|
||||
|
||||
Already processed?
|
||||
-> Update on Redis expire time
|
||||
-> Associate to source
|
||||
Not processed? Get main URL:
|
||||
-> URL Canonical valid?
|
||||
-> Rely on this as main URL
|
||||
-> URL Canonical not valid?
|
||||
-> Use input url, unless it's a news.google.com link
|
||||
-> If news.google.com link, filter out. REDIS?
|
||||
Main URL processing:
|
||||
-> Update in REDIS, association url -> url_canonical
|
||||
-> url != url_canonical: Add in duplicate table
|
||||
If both != news.google.com
|
||||
"""
|
||||
|
||||
# URLs to insert, URLs duplicated association, URL to Canonical form
|
||||
list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical = [], [], {}
|
||||
|
||||
# URL VS CANONICAL:
|
||||
# News URL returned: https://news.google.com/articles/CBMifmh0dHBzOi8vd3d3LmJyZWl0YmFydC5jb20vMm5kLWFtZW5kbWVudC8yMDIzLzA0LzAzL2dvdi1kZXNhbnRpcy1zaWducy1iaWxsLW1ha2luZy1mbG9yaWRhLXRoZS0yNnRoLWNvbnN0aXR1dGlvbmFsLWNhcnJ5LXN0YXRlL9IBAA?hl=en-US&gl=US&ceid=US%3Aen
|
||||
# Corresponds to canonical URL: https://www.breitbart.com/2nd-amendment/2023/04/03/gov-desantis-signs-bill-making-florida-the-26th-constitutional-carry-state/
|
||||
|
||||
for url in urls_fetched:
|
||||
# Domain to filter? Input url
|
||||
filter_due_to_domain = False
|
||||
for domain_to_filter in list_domains_to_filter:
|
||||
if (domain_to_filter in url):
|
||||
logger.debug("Domain filter applied based on {} for input URL: {}".format(domain_to_filter, url))
|
||||
filter_due_to_domain = True
|
||||
if (filter_due_to_domain):
|
||||
continue
|
||||
|
||||
# URL processed recently? -> Filter and avoid increasing SERIAL counter & efficiency of DB
|
||||
cached_canonical_url = self._get_cached_canonical_url(url)
|
||||
if (cached_canonical_url is not None):
|
||||
# Even if url processed, need to add url_canonical to list_filtered_urls, so as to associate search source to canonical URL (canonical is the main URL entry)
|
||||
dict_full_urls_to_canonical[url] = cached_canonical_url # X -> Y
|
||||
# If url has been processed, so was its canonical form
|
||||
logger.debug("Filtering out already inserted (processed) URL and its canonical form: {} {}".format(url, cached_canonical_url))
|
||||
continue
|
||||
|
||||
# Process TODO: Add language...
|
||||
url_canonical, article_elements, article_status = process_article(url, list_pattern_status_tuple)
|
||||
# TODO: Store article_elements information to insert into OS after inserted into DB (and therefore having associated url_id)
|
||||
|
||||
# Could not retrieve redirection for news.google.com based URL? Continue (avoid inserting in DB)
|
||||
if (url_canonical is None) and ("news.google.com" in url):
|
||||
logger.debug("Filtering empty canonical link for base URL based on news.google.com: {}".format(url))
|
||||
continue
|
||||
# Canonical URL still news.google.com? Continue (avoid inserting in DB)
|
||||
if (url_canonical is not None) and ("news.google.com" in url_canonical):
|
||||
logger.debug("Filtering canonical news.google.com based URL: {}".format(url_canonical))
|
||||
continue
|
||||
|
||||
# Domain to filter? Input canonical_url
|
||||
filter_due_to_domain = False
|
||||
for domain_to_filter in list_domains_to_filter:
|
||||
if (url_canonical is not None) and (domain_to_filter in url_canonical):
|
||||
filter_due_to_domain = True
|
||||
if (filter_due_to_domain):
|
||||
logger.info("Filtering due to domain input URL, Canonical_URL: {} {}".format(url, url_canonical))
|
||||
continue
|
||||
|
||||
if (url_canonical is None) or (article_status == "error"):
|
||||
logger.debug("Processing failed for URL: {}".format(url))
|
||||
# Still insert URL with "error"? -> If processed later, might have inconsistent sources (url vs url_canonical). Only store if not news.google.com based
|
||||
if ("news.google.com" in url) or ("consent.google.com" in url):
|
||||
logging.debug("Not able to process Google News link, skipping: {}".format(url))
|
||||
else:
|
||||
dict_full_urls_to_canonical[url] = url # X -> X
|
||||
list_insert_url_tuple_args.append( (url, article_status) )
|
||||
continue
|
||||
|
||||
# URL was not processed (not sure canonical yet). Generate URL_CANONICAL <-> URL_ORIGINAL association if they're different
|
||||
if (url_canonical != url):
|
||||
list_tuple_canonical_duplicate_urls.append( (url_canonical, url) )
|
||||
# Dict: url -> canonical (update association)
|
||||
dict_full_urls_to_canonical[url] = url_canonical # X -> Y or X
|
||||
|
||||
# Canonical URL processed recently? -> Filter and avoid increasing SERIAL counter & efficiency of DB
|
||||
if (self._get_cached_canonical_url(url_canonical) is not None):
|
||||
# Canonical URL was already processed
|
||||
logger.debug("Filtering out already inserted (processed) URL canonical: {}".format(url_canonical))
|
||||
else:
|
||||
# Insert url_canonical to DB formatted
|
||||
list_insert_url_tuple_args.append( (url_canonical, article_status) )
|
||||
# Canonical URL different? Process
|
||||
if (url_canonical != url):
|
||||
if ("news.google.com" in url) or ("consent.google.com" in url):
|
||||
logging.debug("Not adding google.news.com based link, skipping: {}".format(url))
|
||||
else:
|
||||
# Fetched url -> duplicate (using canonical as main link)
|
||||
article_status = "duplicate"
|
||||
# Insert url (non-canonical) to DB formatted
|
||||
list_insert_url_tuple_args.append( (url, article_status) )
|
||||
|
||||
return list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical
|
||||
|
||||
def _insert_urls(self, cursor, list_insert_url_tuple_args):
|
||||
#####################
|
||||
### Insert URLs with status
|
||||
#####################
|
||||
if (len(list_insert_url_tuple_args) > 0):
|
||||
insert_args = ', '.join( [ self._format(t) for t in list_insert_url_tuple_args] )
|
||||
# Insert. (url_1, status_1), (url_2, status_2), ...
|
||||
sql_code = "INSERT INTO URLS {} VALUES {} ON CONFLICT (url) DO NOTHING;".format("(url, status)", insert_args)
|
||||
# logger.debug("SQL CODE: {}".format(sql_code))
|
||||
c = cursor.execute(sql_code)
|
||||
# NOTE: Not using "RETURNING id" since previously inserted URLs are not returned (ON CONFLICT)
|
||||
# https://stackoverflow.com/questions/35949877/how-to-include-excluded-rows-in-returning-from-insert-on-conflict/35953488#35953488
|
||||
|
||||
def _insert_urls_duplicated(self, cursor, list_tuple_canonical_duplicate_urls):
|
||||
#####################
|
||||
### Insert duplicated URLs
|
||||
#####################
|
||||
if (len(list_tuple_canonical_duplicate_urls) > 0):
|
||||
# Flatten, format, set to remove duplicates
|
||||
args_duplicated_urls_set = "(" + ', '.join( set( [ "'" + str(y).replace("'", "''") + "'" for x in list_tuple_canonical_duplicate_urls for y in x] ) ) + ")"
|
||||
|
||||
# Dict: url -> id
|
||||
dict_url_to_id = {}
|
||||
# Get url -> id association to populate duplicated URLs
|
||||
for (id_, url_) in cursor.execute("SELECT id, url FROM URLS WHERE url IN {};".format(args_duplicated_urls_set)).fetchall():
|
||||
dict_url_to_id[url_] = id_
|
||||
|
||||
# Convert tuples (url_canonical, url) -> (id_url_canonical, id_url) to insert in DB
|
||||
# ORIGINAL CODE. Issue, might not have found association to all urls
|
||||
### list_tuple_canonical_duplicate_urls_ids = [ (dict_url_to_id[t[0]], dict_url_to_id[t[1]]) for t in list_tuple_canonical_duplicate_urls]
|
||||
|
||||
list_tuple_canonical_duplicate_urls_ids = []
|
||||
for (url_1, url_2) in list_tuple_canonical_duplicate_urls:
|
||||
id_url_1, id_url_2 = dict_url_to_id.get(url_1), dict_url_to_id.get(url_2)
|
||||
if (id_url_1 is None) or (id_url_2 is None):
|
||||
logger.debug("Skipping duplicate association due to no url -> id_url mapping available for tuple: {} {}".format(url_1, url_2))
|
||||
else:
|
||||
list_tuple_canonical_duplicate_urls_ids.append( (id_url_1, id_url_2) )
|
||||
|
||||
if (len(list_tuple_canonical_duplicate_urls_ids) > 0):
|
||||
insert_args = ', '.join( [ self._format(t) for t in list_tuple_canonical_duplicate_urls_ids] )
|
||||
# Insert. (id_url_canonical_1, id_url_1), ...
|
||||
sql_code = "INSERT INTO URLS_DUPLICATE {} VALUES {} ON CONFLICT DO NOTHING;".format("(id_url_canonical, id_url_duplicated)", insert_args)
|
||||
# logger.debug("SQL CODE: {}".format(sql_code))
|
||||
c = cursor.execute(sql_code)
|
||||
|
||||
def _get_pattern_status_list(self):
|
||||
#####################
|
||||
### Get list of (pattern, priority, status) tuples for status overrides
|
||||
#####################
|
||||
# TODO: Cache on redis and query once every N hours? ...
|
||||
try:
|
||||
with psycopg.connect(self.db_connect_info) as conn:
|
||||
# Open cursor
|
||||
cursor = conn.cursor()
|
||||
# TODO: Cache on Redis
|
||||
list_pattern_status = cursor.execute("SELECT pattern, priority, status FROM STATUS_PATTERN_MATCHING;").fetchall()
|
||||
except Exception as e:
|
||||
logger.warning("Error getting pattern status list: {}".format(str(e)))
|
||||
list_pattern_status = []
|
||||
return list_pattern_status
|
||||
|
||||
def _get_domains_to_filter(self):
|
||||
#####################
|
||||
### Get list of domains to filter
|
||||
#####################
|
||||
# TODO: Cache on redis and query once every N hours? ...
|
||||
try:
|
||||
with psycopg.connect(self.db_connect_info) as conn:
|
||||
# Open cursor
|
||||
cursor = conn.cursor()
|
||||
# TODO: Cache on Redis
|
||||
sites_to_filter = [e[0] for e in cursor.execute("SELECT url_host FROM WEBSITE_TO_FILTER;").fetchall() ]
|
||||
except Exception as e:
|
||||
logger.warning("Error getting domains to filter: {}".format(str(e)))
|
||||
sites_to_filter = []
|
||||
return sites_to_filter
|
||||
|
||||
def _get_cached_source_id(self, source):
|
||||
### Redis: source id cached recently? -> Avoids a DB lookup for the source id
|
||||
try:
|
||||
source_id = self.redis_instance.get(source)
|
||||
if (source_id is not None):
|
||||
source_id = source_id.decode("utf-8")
|
||||
except Exception as e:
|
||||
logger.warning("Exception querying Redis: {}".format(str(e)))
|
||||
source_id = None
|
||||
return source_id
|
||||
|
||||
def _get_source_id(self, cursor, source):
|
||||
#####################
|
||||
### Get source corresponding id
|
||||
#####################
|
||||
# Cached?
|
||||
id_source = self._get_cached_source_id(source)
|
||||
if (id_source is None):
|
||||
c = cursor.execute("SELECT id FROM SOURCE WHERE source='{}'".format(source.replace("'", "''"))).fetchone()
|
||||
if (c is None) or (len(c) == 0):
|
||||
# Source does not exist, insert and get id
|
||||
c = cursor.execute("INSERT INTO SOURCE (source) VALUES ('{}') RETURNING id;".format(source.replace("'", "''"))).fetchone()
|
||||
# Decode source id
|
||||
id_source = c[0]
|
||||
# Cache
|
||||
self.redis_instance.set(source, id_source, ex=self.redis_expiry_seconds)
|
||||
return id_source
|
||||
|
||||
def _get_urls_id(self, cursor, urls_full):
|
||||
#####################
|
||||
### Get id of inserted and filtered URLs
|
||||
#####################
|
||||
# TODO: Cache url -> url_id, url_canonical
|
||||
if (len(urls_full) == 0):
|
||||
return []
|
||||
# Get inserted and filtered URL ids (unnested). Filtered URLs are also retrieved since they might have been fetched from a new source
|
||||
in_inserted_filtered_urls = "(" + ', '.join(["'" + u.replace("'", "''") + "'" for u in urls_full]) + ")"
|
||||
id_urls_related = [ i[0] for i in cursor.execute("SELECT id FROM URLS WHERE url IN {};".format(in_inserted_filtered_urls)).fetchall() ]
|
||||
return id_urls_related
|
||||
|
||||
def _insert_urls_source(self, cursor, id_urls_related, id_source):
|
||||
#####################
|
||||
### Insert URL sources: (id_url_1, id_source), (id_url_2, id_source), ...
|
||||
#####################
|
||||
if (len(id_urls_related) == 0) or (id_source is None):
|
||||
return
|
||||
columns = "(id_url, id_source)"
|
||||
insert_args = ', '.join( [ self._format([id_url, id_source]) for id_url in id_urls_related ] )
|
||||
# Insert
|
||||
sql_code = "INSERT INTO URLS_SOURCE {} VALUES {} ON CONFLICT DO NOTHING;".format(columns, insert_args)
|
||||
# logger.debug("SQL CODE: {}".format(sql_code))
|
||||
c = cursor.execute(sql_code)
|
||||
|
||||
def write_batch(self, urls_fetched, source):
|
||||
# Chunks of 50 elements
|
||||
n = 50
|
||||
# Divide in small chunks
|
||||
urls_fetched_chunks = [urls_fetched[i:i + n] for i in range(0, len(urls_fetched), n)]
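# e.g. 120 URLs -> chunks of 50, 50 and 20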
|
||||
# Process
|
||||
for urls_fetched_chunk_i in urls_fetched_chunks:
|
||||
self._write_small_batch(urls_fetched_chunk_i, source)
|
||||
|
||||
def _write_small_batch(self, urls_fetched, source):
|
||||
try:
|
||||
logger.info("Fetched #{} URLs, source: {}".format(len(urls_fetched), source))
|
||||
|
||||
if (len(urls_fetched) == 0):
|
||||
logger.debug("Empty batch of urls (not writing to DB) for source: {}".format(source))
|
||||
return
|
||||
|
||||
# Shuffle URLs to avoid many consecutive requests to the same host (reduces the chance of being blocked)
|
||||
random.shuffle(urls_fetched)
|
||||
|
||||
# Get list of domains to filter
|
||||
list_domains_to_filter = self._get_domains_to_filter()
|
||||
# Get list of (pattern, priority, status) tuples to override status if required
|
||||
list_pattern_status_tuple = self._get_pattern_status_list()
|
||||
# Sort pattern tuples by priority
|
||||
list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True)
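# Descending sort so higher-priority patterns are evaluated first (url_utils assumes the list is sorted by importance)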
|
||||
|
||||
# Process URLs to update DB
|
||||
list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical = self._decode_urls(urls_fetched, list_domains_to_filter, list_pattern_status_tuple)
|
||||
# Full set of URL and its canonical form (to associate them to a search), both to insert and filter
|
||||
urls_full = set(dict_full_urls_to_canonical.keys()).union( set(dict_full_urls_to_canonical.values()) )
|
||||
|
||||
# Insert
|
||||
with psycopg.connect(self.db_connect_info) as conn:
|
||||
# Open cursor
|
||||
cursor = conn.cursor()
|
||||
# Autocommit at end of transaction (Atomic insert of URLs and sources)
|
||||
with conn.transaction() as tx:
|
||||
# Insert processed URLs
|
||||
self._insert_urls(cursor, list_insert_url_tuple_args)
|
||||
# Insert URLs duplicated (canonical != fetched url)
|
||||
self._insert_urls_duplicated(cursor, list_tuple_canonical_duplicate_urls)
|
||||
|
||||
# Get source id in DB
|
||||
id_source = self._get_source_id(cursor, source)
|
||||
# Get IDs of all related URLs
|
||||
id_urls_related = self._get_urls_id(cursor, urls_full)
|
||||
# Insert search source associated to URLs
|
||||
self._insert_urls_source(cursor, id_urls_related, id_source)
|
||||
|
||||
# Update Redis status of inserted and filtered URLs after writing to DB
|
||||
for url, url_canonical in dict_full_urls_to_canonical.items():
|
||||
try:
|
||||
# Set with updated expiry time
|
||||
self.redis_instance.set(url, url_canonical, ex=self.redis_expiry_seconds)
|
||||
if (url != url_canonical):
|
||||
self.redis_instance.set(url_canonical, url_canonical, ex=self.redis_expiry_seconds)
|
||||
except Exception as e:
|
||||
logger.warning("Exception running set in Redis: {}".format(str(e)))
|
||||
|
||||
if (len(list_insert_url_tuple_args) > 0):
|
||||
try:
|
||||
webhook_token = os.environ.get("CLIQ_WEBHOOK_TOKEN")
|
||||
endpoint_message = "https://cliq.zoho.com/api/v2/channelsbyname/urlretrievalbot/message?zapikey={}".format(webhook_token)
|
||||
|
||||
payload = json.dumps({"text": "Fetched #{} new URLs, source: {}".format(len(list_insert_url_tuple_args), source) })
|
||||
r = requests.post(endpoint_message, data=payload)
|
||||
except Exception as e:
|
||||
logger.warning("Webhook failed: {}".format(str(e)))
|
||||
|
||||
logger.debug("URL DB write finished")
|
||||
except Exception as e:
|
||||
logger.warning( "Exception writing to URL_DB:\n{}".format(traceback.format_exc()) )
|
||||
logger.debug( "Exception --- List of URLs: {}".format(str(urls_fetched)) )
|
||||
app_fetcher/src/fetcher_status.py (new file, 39 lines)
@@ -0,0 +1,39 @@
|
||||
from .db_utils import URL_DB_Writer
|
||||
import json
|
||||
import logging
|
||||
import requests
|
||||
import os
|
||||
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
|
||||
logger = logging.getLogger("news_fetcher")
|
||||
|
||||
class FetcherStatus():
|
||||
def __init__(self, db_connect_info, redis_connect_info, last_minutes_check) -> None:
|
||||
self.db_connect_info = db_connect_info
|
||||
self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info)
|
||||
self.last_minutes_check = last_minutes_check
|
||||
|
||||
def check_warning(self):
|
||||
try:
|
||||
logger.info("Starting fetcher check for last minutes {}".format(self.last_minutes_check))
|
||||
|
||||
# Get number of URLs
|
||||
num_urls = self.db_writer.get_urls_count(last_minutes_check=self.last_minutes_check)
|
||||
logger.debug("Fetched #URLs {} during the last {} minutes".format(num_urls, self.last_minutes_check))
|
||||
|
||||
webhook_token = os.environ.get("CLIQ_WEBHOOK_TOKEN")
|
||||
endpoint_message = "https://cliq.zoho.com/api/v2/channelsbyname/urlfetchwarnings/message?zapikey={}".format(webhook_token)
|
||||
|
||||
if (num_urls is None):
|
||||
try:
|
||||
payload = json.dumps({"text": "[WARNING] Error on query to DB"})
|
||||
r = requests.post(endpoint_message, data=payload)
|
||||
except Exception as e:
|
||||
logger.warning("Webhook failed: {}".format(str(e)))
|
||||
elif (num_urls == 0):
|
||||
try:
|
||||
payload = json.dumps({"text": "[WARNING] No URLs fetched for {} minutes".format(self.last_minutes_check) })
|
||||
r = requests.post(endpoint_message, data=payload)
|
||||
except Exception as e:
|
||||
logger.warning("Webhook failed: {}".format(str(e)))
|
||||
except Exception as e:
|
||||
logger.warning("Exception in UpdateErrorURLs.run(): {}".format(str(e)))
|
||||
app_fetcher/src/google_bypass.py (new file, 27 lines)
@@ -0,0 +1,27 @@
|
||||
import requests
|
||||
import json
|
||||
import logging
|
||||
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
|
||||
logger = logging.getLogger("news_fetcher")
|
||||
|
||||
class GoogleByPass():
|
||||
def __init__(self) -> None:
|
||||
pass
|
||||
|
||||
def bypass_google_urls(self, list_urls):
|
||||
if (len(list_urls) == 0):
|
||||
return []
|
||||
|
||||
try:
|
||||
# Endpoint
|
||||
gbypass_endpoint = "http://selenium_app:80/get_redirection"
|
||||
# Timeout: 20 minutes
|
||||
timeout = 60*20
|
||||
r = requests.post(gbypass_endpoint, json={"list_urls": list_urls}, timeout=timeout)
|
||||
# Decode
|
||||
list_urls_redirections = json.loads(r.text).get("list_urls_redirections", [])
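# Expected response shape (as consumed here): {"list_urls_redirections": ["<redirected url>", ...]}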
|
||||
except Exception as e:
|
||||
logger.warning("Exception on request: {}. {}".format(gbypass_endpoint, str(e)))
|
||||
list_urls_redirections = []
|
||||
|
||||
return list_urls_redirections
|
||||
app_fetcher/src/missing_kids_status.py (new file, 69 lines)
@@ -0,0 +1,69 @@
|
||||
import requests
|
||||
from .db_utils import URL_DB_Writer
|
||||
from .url_utils import get_missing_kid_status
|
||||
import time
|
||||
import logging
|
||||
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
|
||||
logger = logging.getLogger("news_fetcher")
|
||||
|
||||
class MissingKidsStatus():
|
||||
def __init__(self, db_connect_info, redis_connect_info, num_urls) -> None:
|
||||
self.num_urls = num_urls
|
||||
self.db_connect_info = db_connect_info
|
||||
self.redis_connect_info = redis_connect_info
|
||||
self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info)
|
||||
|
||||
def update_missing_kids_status(self):
|
||||
try:
|
||||
logger.info("Starting updating status to Missing Kids URLs, limit #URLs: {}".format(self.num_urls))
|
||||
# List of URLs
|
||||
list_ids_and_urls = self.db_writer._get_missing_kids_urls(self.num_urls)
|
||||
# Dict: status -> IDs to update to new status
|
||||
dict_status_ids, dict_status_urls = {}, {}
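# Status updates are flushed to the DB in batches of flush_every (20) processed URLs to keep UPDATE statements small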
|
||||
# Check URLs with invalid status?
|
||||
skip_invalid_check = False
|
||||
|
||||
flush_every, flush_current = 20, 0
|
||||
# Iterate URLs
|
||||
for (id, url, current_status) in list_ids_and_urls:
|
||||
# Skip duplicate URLs
|
||||
if (current_status == "duplicate"):
|
||||
continue
|
||||
# Skip invalid URLs?
|
||||
if (skip_invalid_check):
|
||||
if (current_status == "invalid"):
|
||||
continue
|
||||
|
||||
# Get status
|
||||
new_status = get_missing_kid_status(url)
|
||||
# Different? Update
|
||||
if (current_status != new_status):
|
||||
# Extend array
|
||||
dict_status_ids[new_status] = dict_status_ids.get(new_status, []) + [id]
|
||||
# Debugging dict
|
||||
dict_status_urls[new_status] = dict_status_urls.get(new_status, []) + [url]
|
||||
# +1 processed
|
||||
flush_current += 1
|
||||
|
||||
# Flush batch?
|
||||
if (flush_every == flush_current):
|
||||
logger.info("Updating status to Missing Kids URLs: {}".format(dict_status_urls))
|
||||
# Update DB
|
||||
self.db_writer._update_urls_status(dict_status_ids)
|
||||
# Reset
|
||||
flush_current = 0
|
||||
dict_status_ids, dict_status_urls = {}, {}
|
||||
|
||||
# Flush remaining batch
|
||||
if (flush_current > 0):
|
||||
logger.info("Updating status to Missing Kids URLs: {}".format(dict_status_urls))
|
||||
# Update DB
|
||||
self.db_writer._update_urls_status(dict_status_ids)
|
||||
# Reset
|
||||
flush_current = 0
|
||||
dict_status_ids, dict_status_urls = {}, {}
|
||||
|
||||
logger.info("Finished updating status to Missing Kids URLs")
|
||||
except Exception as e:
|
||||
logger.warning("Exception in MissingKidsStatus.run(): {}".format(str(e)))
|
||||
|
||||
app_fetcher/src/news_feed.py (new file, 60 lines)
@@ -0,0 +1,60 @@
|
||||
from .db_utils import URL_DB_Writer
|
||||
import feedparser
|
||||
import dateutil.parser
|
||||
import psycopg
|
||||
import logging
|
||||
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
|
||||
logger = logging.getLogger("news_fetcher")
|
||||
|
||||
class NewsFeed():
|
||||
def __init__(self, db_connect_info, redis_connect_info) -> None:
|
||||
logger.debug("Initializing News feed")
|
||||
self.db_connect_info = db_connect_info
|
||||
self.redis_connect_info = redis_connect_info
|
||||
|
||||
def _get_feed_urls(self):
|
||||
try:
|
||||
with psycopg.connect(self.db_connect_info) as conn:
|
||||
list_url_feeds = conn.execute("SELECT rss_feed FROM FEED;").fetchall()
|
||||
# Decode (tuple with 1 element)
|
||||
list_url_feeds = [l[0] for l in list_url_feeds]
|
||||
except Exception as e:
|
||||
logger.warning("Exception fetching RSS sites: " + str(e))
|
||||
list_url_feeds = []
|
||||
return list_url_feeds
|
||||
|
||||
def run(self):
|
||||
try:
|
||||
logger.debug("Starting NewsFeed.run()")
|
||||
|
||||
# Get feeds
|
||||
list_url_feeds = self._get_feed_urls()
|
||||
logger.debug("Fetching news from feeds: {}".format(str(list_url_feeds)))
|
||||
|
||||
# Process via RSS feeds
|
||||
for url_feed in list_url_feeds:
|
||||
# Initialize
|
||||
urls_fetched, urls_publish_date = [], []
|
||||
# Fetch feeds
|
||||
feeds = feedparser.parse(url_feed)
|
||||
# Parse
|
||||
for f in feeds.get("entries", []):
|
||||
# Get URL
|
||||
url = f.get("link", None)
|
||||
# Process?
|
||||
if (url is not None):
|
||||
# Available publish date?
|
||||
publish_date = f.get("published", None)
|
||||
if (publish_date is not None):
|
||||
publish_date = dateutil.parser.parse(publish_date)
|
||||
urls_publish_date.append(publish_date)
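# NOTE: publish dates are collected here but not yet passed to URL_DB_Writer.write_batch below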
|
||||
# URL
|
||||
urls_fetched.append(url)
|
||||
|
||||
# URL fetching source
|
||||
source = "feed {}".format(url_feed)
|
||||
# Write to DB
|
||||
db_writer = URL_DB_Writer(self.db_connect_info, self.redis_connect_info)
|
||||
db_writer.write_batch(urls_fetched, source)
|
||||
except Exception as e:
|
||||
logger.warning("Exception in NewsFeed.run(): {}".format(str(e)))
|
||||
app_fetcher/src/news_missing_kids.py (new file, 40 lines)
@@ -0,0 +1,40 @@
|
||||
from .db_utils import URL_DB_Writer
|
||||
import requests
|
||||
import json
|
||||
import logging
|
||||
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
|
||||
logger = logging.getLogger("news_fetcher")
|
||||
|
||||
class NewsMissingKids():
|
||||
def __init__(self, db_connect_info, redis_connect_info, num_pages) -> None:
|
||||
logger.debug("Initializing News MissingKids")
|
||||
self.db_connect_info = db_connect_info
|
||||
self.redis_connect_info = redis_connect_info
|
||||
self.num_pages = num_pages
|
||||
|
||||
def run(self):
|
||||
try:
|
||||
logger.debug("Starting NewsMissingKids.run()")
|
||||
try:
|
||||
# Missing kids fetching endpoint, parameter number of pages to fetch
|
||||
missingkids_fetch_endpoint = "http://selenium_app:80/get_missing_kids/?pages={}".format(self.num_pages)
|
||||
# Timeout
|
||||
if (self.num_pages > 15):
|
||||
timeout = 60*90 # 1.5h
|
||||
else:
|
||||
timeout = 60*5 # 5 min
|
||||
# Request
|
||||
r = requests.get(missingkids_fetch_endpoint, timeout=timeout)
|
||||
# Decode
|
||||
urls_fetched = json.loads(r.text).get("list_urls", [])
|
||||
except Exception as e:
|
||||
logger.warning("Timeout on request: {}. {}".format(missingkids_fetch_endpoint, str(e)))
|
||||
urls_fetched = []
|
||||
|
||||
# URL fetching source
|
||||
source = "missingkids fetcher"
|
||||
# Write to DB
|
||||
db_writer = URL_DB_Writer(self.db_connect_info, self.redis_connect_info)
|
||||
db_writer.write_batch(urls_fetched, source)
|
||||
except Exception as e:
|
||||
logger.warning("Exception in NewsMissingKids.run(): {}".format(str(e)))
|
||||
app_fetcher/src/news_parsing.py (new file, 58 lines)
@@ -0,0 +1,58 @@
|
||||
from .db_utils import URL_DB_Writer
|
||||
import newspaper
|
||||
import psycopg
|
||||
import logging
|
||||
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
|
||||
logger = logging.getLogger("news_fetcher")
|
||||
|
||||
class NewsSiteParsing():
|
||||
def __init__(self, db_connect_info, redis_connect_info) -> None:
|
||||
logger.debug("Initializing News SiteParsing newspaper3k")
|
||||
self.db_connect_info = db_connect_info
|
||||
self.redis_connect_info = redis_connect_info
|
||||
|
||||
def _get_url_hosts(self):
|
||||
try:
|
||||
with psycopg.connect(self.db_connect_info) as conn:
|
||||
list_url_hosts = conn.execute("SELECT url_host FROM WEBSITE_OF_INTEREST;").fetchall()
|
||||
# Decode (tuple with 1 element)
|
||||
list_url_hosts = [l[0] for l in list_url_hosts]
|
||||
except Exception as e:
|
||||
logger.warning("Exception fetching RSS sites: " + str(e))
|
||||
list_url_hosts = []
|
||||
return list_url_hosts
|
||||
|
||||
def _postprocess(self, article_urls):
|
||||
return [url.replace("#comment-stream", "") for url in article_urls]
|
||||
|
||||
def run(self):
|
||||
try:
|
||||
logger.debug("Starting NewsSiteParsing.run() for {}")
|
||||
|
||||
# Get URL hosts
|
||||
list_url_hosts = self._get_url_hosts()
|
||||
logger.info("Fetching news by parsing URL hosts: {}".format(str(list_url_hosts)))
|
||||
|
||||
# Process newspaper3k build method
|
||||
for url_host_feed in list_url_hosts:
|
||||
# Protocol
|
||||
if not (url_host_feed.startswith("http")):
|
||||
url_host_feed_formatted = "https://" + url_host_feed
|
||||
else:
|
||||
url_host_feed_formatted = url_host_feed
|
||||
|
||||
logger.debug("Fetching newspaper3k parsing based on URL: {}".format(url_host_feed_formatted))
|
||||
# Source object
|
||||
url_host_built = newspaper.build(url_host_feed_formatted)
|
||||
# Get articles URL list
|
||||
urls_fetched = url_host_built.article_urls()
|
||||
# Post-processing
|
||||
urls_fetched = self._postprocess(urls_fetched)
|
||||
|
||||
# URL fetching source
|
||||
source = "newspaper3k {}".format(url_host_feed)
|
||||
# Write to DB
|
||||
db_writer = URL_DB_Writer(self.db_connect_info, self.redis_connect_info)
|
||||
db_writer.write_batch(urls_fetched, source)
|
||||
except Exception as e:
|
||||
logger.warning("Exception in NewsSiteParsing.run(): {}".format(str(e)))
|
||||
app_fetcher/src/news_search.py (new file, 181 lines)
@@ -0,0 +1,181 @@
|
||||
from .db_utils import URL_DB_Writer
|
||||
import psycopg
|
||||
from .utils import get_searxng_instances
|
||||
from .search_sources import FetcherDuckDuckGo, FetcherGNews, FetcherGoogleNews, FetcherSearxNews, FetcherPreSearch
|
||||
from threading import Thread
|
||||
import time
|
||||
import random
|
||||
import logging
|
||||
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
|
||||
logger = logging.getLogger("news_fetcher")
|
||||
|
||||
class NewsSearch():
|
||||
def __init__(self, db_connect_info, redis_connect_info, full=True) -> None:
|
||||
logger.debug("Initializing News feed")
|
||||
self.db_connect_info = db_connect_info
|
||||
self.redis_connect_info = redis_connect_info
|
||||
self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info)
|
||||
self.full_search = full
|
||||
|
||||
def _get_url_host_list(self):
|
||||
try:
|
||||
with psycopg.connect(self.db_connect_info) as conn:
|
||||
# List of URL host
|
||||
list_url_host = [l[0] for l in conn.execute("SELECT url_host FROM WEBSITE_OF_INTEREST;").fetchall()]
|
||||
# Clean http / https from URLs
|
||||
list_url_host = [l.replace("https://", "").replace("http://", "") for l in list_url_host]
|
||||
# Clean last slash if exists
|
||||
list_url_host = [ l if not l.endswith("/") else l[:-1] for l in list_url_host]
|
||||
except Exception as e:
|
||||
logger.warning("Exception fetching URL host list: " + str(e))
|
||||
list_url_host = []
|
||||
return list_url_host
|
||||
|
||||
def _get_search_list(self):
|
||||
try:
|
||||
with psycopg.connect(self.db_connect_info) as conn:
|
||||
# List of keyword searches
|
||||
list_search_text = [l[0] for l in conn.execute("SELECT keyword_search FROM SEARCH;").fetchall()]
|
||||
except Exception as e:
|
||||
logger.warning("Exception fetching searches list: " + str(e))
|
||||
list_search_text = []
|
||||
return list_search_text
|
||||
|
||||
def _run_fetching(self, search_text):
|
||||
logger.debug("Starting _run_fetching() for {}".format(search_text))
|
||||
|
||||
# Initialize DB Writer
|
||||
db_writer = URL_DB_Writer(self.db_connect_info, self.redis_connect_info)
|
||||
|
||||
# Common parameters
|
||||
lang, region = "en", "US"
|
||||
|
||||
### PreSearch
|
||||
dict_params_news = {"search": search_text}
|
||||
FetcherPreSearch(**dict_params_news).fetch_articles(db_writer)
|
||||
|
||||
### DuckDuckGo
|
||||
period = "d"
|
||||
dict_params_news = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "news", "period": period}
|
||||
FetcherDuckDuckGo(**dict_params_news).fetch_articles(db_writer)
|
||||
dict_params_general = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "general", "period": period}
|
||||
FetcherDuckDuckGo(**dict_params_general).fetch_articles(db_writer)
|
||||
|
||||
if (self.full_search):
|
||||
# Avoid site:{} search due to G-Bypass required time
|
||||
if ("site:" not in search_text):
|
||||
### GNews
|
||||
dict_params = {"search": search_text, "lang": "wt", "region": "wt", "period": period}
|
||||
FetcherGNews(**dict_params).fetch_articles(db_writer)
|
||||
|
||||
### GoogleNews
|
||||
dict_params_news = {"search": search_text, "lang": lang, "region": region, "search_category": "news", "period": period}
|
||||
FetcherGoogleNews(**dict_params_news).fetch_articles(db_writer)
|
||||
# dict_params_general = {"search": search_text, "lang": lang, "region": region, "search_category": "general", "period": period}
|
||||
|
||||
|
||||
'''
|
||||
# Method run concurrently, minimize overlapping
|
||||
time.sleep(random.uniform(1, 15))
|
||||
list_threads = []
|
||||
|
||||
def run_search(FetcherObject, dict_params):
|
||||
# Initialize DB Writer
|
||||
db_writer = URL_DB_Writer(self.db_connect_info, self.redis_connect_info)
|
||||
# Fetch and write to DB
|
||||
FetcherObject(**dict_params).fetch_articles(db_writer)
|
||||
|
||||
"""
|
||||
### SearxNG
|
||||
period = "day"
|
||||
for searx_instance in get_searxng_instances():
|
||||
dict_params_news = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "news", "period": period}
|
||||
dict_params_general = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "general", "period": period}
|
||||
# Append thread
|
||||
list_threads.append( Thread(target=run_search, args=(FetcherSearxNews, dict_params_news, )) )
|
||||
list_threads.append( Thread(target=run_search, args=(FetcherSearxNews, dict_params_general, )) )
|
||||
"""
|
||||
|
||||
### PreSearch
|
||||
dict_params_news = {"search": search_text}
|
||||
list_threads.append( Thread(target=run_search, args=(FetcherPreSearch, dict_params_news, )) )
|
||||
|
||||
### DuckDuckGo
|
||||
period = "d"
|
||||
dict_params_news = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "news", "period": period}
|
||||
dict_params_general = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "general", "period": period}
|
||||
# Append thread
|
||||
list_threads.append( Thread(target=run_search, args=(FetcherDuckDuckGo, dict_params_news, )) )
|
||||
list_threads.append( Thread(target=run_search, args=(FetcherDuckDuckGo, dict_params_general, )) )
|
||||
|
||||
if (self.full_search):
|
||||
# Avoid site:{} search due to G-Bypass required time
|
||||
if ("site:" not in search_text):
|
||||
### GNews
|
||||
for period in ["1d"]: # ["1d", "6h"]:
|
||||
dict_params = {"search": search_text, "lang": "wt", "region": "wt", "period": period}
|
||||
# Append thread
|
||||
list_threads.append( Thread(target=run_search, args=(FetcherGNews, dict_params, )) )
|
||||
|
||||
### GoogleNews
|
||||
for period in ["1d"]: # ["1d", "6h"]:
|
||||
# News
|
||||
dict_params_news = {"search": search_text, "lang": lang, "region": region, "search_category": "news", "period": period}
|
||||
list_threads.append( Thread(target=run_search, args=(FetcherGoogleNews, dict_params_news, )) )
|
||||
if False:
|
||||
dict_params_general = {"search": search_text, "lang": lang, "region": region, "search_category": "general", "period": period}
|
||||
list_threads.append( Thread(target=run_search, args=(FetcherGoogleNews, dict_params_general, )) )
|
||||
|
||||
# Run
|
||||
MULTITHREADED = False
|
||||
logger.debug("Fetching threads starting")
|
||||
if MULTITHREADED:
|
||||
for t in list_threads:
|
||||
t.start()
|
||||
# Join
|
||||
for t in list_threads:
|
||||
t.join()
|
||||
else:
|
||||
for t in list_threads:
|
||||
t.start()
|
||||
t.join()
|
||||
logger.debug("Fetching threads finished")
|
||||
'''
|
||||
logger.debug("Finished _run_fetching()")
|
||||
|
||||
def run(self):
|
||||
try:
|
||||
logger.info("Fetching text searches & URL hosts of interest")
|
||||
|
||||
# Get text searches of interest
|
||||
list_search_text_of_interest = self._get_search_list()
|
||||
|
||||
# Get URL host of interest
|
||||
list_url_host = self._get_url_host_list()
|
||||
# Get text searches for URL hosts
|
||||
list_search_text_url_host = ["site:{}".format(l) for l in list_url_host]
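# e.g. a host "example.com" becomes the query "site:example.com" (illustrative), searched like any other keyword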
|
||||
|
||||
MULTITHREADED = False
|
||||
if MULTITHREADED:
|
||||
# Run fetching
|
||||
list_fetching_threads = []
|
||||
for search_text in list_search_text_of_interest + list_search_text_url_host:
|
||||
logger.debug("Fetching news for search: {}".format(search_text))
|
||||
# Append thread
|
||||
list_fetching_threads.append( Thread(target=self._run_fetching, args=(search_text, )) )
|
||||
|
||||
# Run
|
||||
for t in list_fetching_threads:
|
||||
t.start()
|
||||
# Join
|
||||
for t in list_fetching_threads:
|
||||
t.join()
|
||||
else:
|
||||
for search_text in list_search_text_of_interest + list_search_text_url_host:
|
||||
logger.debug("Fetching news for search: {}".format(search_text))
|
||||
self._run_fetching(search_text)
|
||||
|
||||
logger.info("Finished fetching text searches & URL hosts of interest")
|
||||
except Exception as e:
|
||||
logger.warning("Exception in NewsSearch.run(): {}".format(str(e)))
|
||||
|
||||
app_fetcher/src/search_sources.py (new file, 321 lines)
@@ -0,0 +1,321 @@
|
||||
from duckduckgo_search import DDGS
|
||||
from gnews import GNews
|
||||
from GoogleNews import GoogleNews
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import os
|
||||
import time
|
||||
import json
|
||||
import numpy as np
|
||||
import random
|
||||
from .user_agents import user_agents_list
|
||||
from .google_bypass import GoogleByPass
|
||||
from abc import ABC, abstractmethod
|
||||
import logging
|
||||
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
|
||||
logger = logging.getLogger("news_fetcher")
|
||||
|
||||
|
||||
|
||||
# Generic fetcher (fetches articles, writes to DB)
|
||||
class FetcherAbstract(ABC):
|
||||
@abstractmethod
|
||||
def _fetch(self):
|
||||
pass
|
||||
|
||||
def fetch_articles(self, db_writer):
|
||||
logger.debug("Starting fetch() for {}".format(self.name))
|
||||
# Fetch articles
|
||||
list_news = self._fetch()
|
||||
logger.info("Found #{} articles for search: {}".format(len(list_news), self.name))
|
||||
# Write to DB
|
||||
db_writer.write_batch(list_news, self.name)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
class FetcherPreSearch(FetcherAbstract):
|
||||
def __init__(self, search):
|
||||
"""
|
||||
# period ->
|
||||
- h = hours (eg: 12h)
|
||||
- d = days (eg: 7d)
|
||||
- m = months (eg: 6m)
|
||||
- y = years (eg: 1y)
|
||||
"""
|
||||
self.search = search
|
||||
self.period = "1d" # TODO Fixed for the moment
|
||||
# self.lang = lang
|
||||
# self.region = region
|
||||
search_category = "news"
|
||||
self.name = "presearch {} {} {}".format(search, search_category, self.period)
|
||||
|
||||
def _fetch(self):
|
||||
try:
|
||||
# PreSearch fetching endpoint, parameter search keyword
|
||||
presearch_fetch_endpoint = "http://selenium_app:80/fetch_presearch/?search_keyword={}".format(self.search)
|
||||
# Timeout: 15 minutes
|
||||
r = requests.get(presearch_fetch_endpoint, timeout=900)
|
||||
# Decode
|
||||
list_news = json.loads(r.text).get("list_urls", [])
|
||||
except Exception as e:
|
||||
logger.warning("Timeout on request: {}. {}".format(presearch_fetch_endpoint, str(e)))
|
||||
list_news = []
|
||||
return list_news
|
||||
|
||||
|
||||
|
||||
class FetcherGNews(FetcherAbstract):
|
||||
def __init__(self, search, period, lang="en", region="US"):
|
||||
"""
|
||||
# period ->
|
||||
- h = hours (eg: 12h)
|
||||
- d = days (eg: 7d)
|
||||
- m = months (eg: 6m)
|
||||
- y = years (eg: 1y)
|
||||
"""
|
||||
self.search = search
|
||||
self.period = period
|
||||
self.lang = lang
|
||||
self.region = region
|
||||
search_category = "news"
|
||||
self.name = "gnews {} {} {} {}".format(search, search_category, period, "{}-{}".format(lang, region))
|
||||
|
||||
def _fetch(self):
|
||||
try:
|
||||
list_dict_news = GNews(self.lang, self.region, period=self.period).get_news(self.search)
|
||||
# Decode
|
||||
list_news = []
|
||||
for l in list_dict_news:
|
||||
list_news.append(l.get("url"))
|
||||
except Exception as e:
|
||||
logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
|
||||
list_news = []
|
||||
|
||||
# Bypass Google links
|
||||
list_news_redirections = GoogleByPass().bypass_google_urls(list_news)
|
||||
|
||||
return list_news_redirections
|
||||
|
||||
class FetcherGoogleNews(FetcherAbstract):
|
||||
def __init__(self, search, search_category="news", period="1d", lang="en", region="US"):
|
||||
assert(search_category in ["news", "general"])
|
||||
|
||||
self.lang = lang
|
||||
self.region = region
|
||||
self.period = period
|
||||
self.search_category = search_category
|
||||
self.search = search
|
||||
self.name = "googlenews {} {} {} {}".format(search, search_category, period, "{}-{}".format(lang, region))
|
||||
|
||||
def _fetch(self):
|
||||
try:
|
||||
# Initialize
|
||||
g = GoogleNews(encode="utf-8", period=self.period, lang=self.lang, region=self.region)
|
||||
g.enableException(True)
|
||||
|
||||
if (self.search_category == "general"):
|
||||
set_links = set()
|
||||
# Search
|
||||
g.search(self.search)
|
||||
|
||||
# Iterate pages
|
||||
MAX_ITER_PAGES = 15
|
||||
for i in range(MAX_ITER_PAGES):
|
||||
time.sleep(random.uniform(1, 1.5))
|
||||
num_before = len(set_links)
|
||||
|
||||
# Get page
|
||||
try:
|
||||
links = g.page_at(i)
|
||||
except Exception as e:
|
||||
logger.warning("Exception fetching page in GoogleNews {}: {}".format(self.name, str(e)))
|
||||
break
|
||||
# Links
|
||||
for l in links:
|
||||
# '/url?esrc=s&q=&rct=j&sa=U&url=https://www.breitbart.com/news/scent-of-luxury-indias-jasmine-infuses-global-perfume/&ved=2ahUKEwjOybGSiN-AAxX1gv0HHfqSBpMQxfQBegQICBAC&usg=AOvVaw06GdoHyzPbIopUaEuUSQPQ'
|
||||
url = l.get("link").split("url=")[-1]
|
||||
set_links.add(url)
|
||||
|
||||
num_after = len(set_links)
|
||||
|
||||
# Finished?
|
||||
if (num_before == num_after):
|
||||
logger.debug("Iterated {} pages on GoogleNews general search".format(i))
|
||||
break
|
||||
# To list
|
||||
list_news = list(set_links)
|
||||
elif (self.search_category == "news"):
|
||||
# Search
|
||||
g.get_news(self.search)
|
||||
# Fetch
|
||||
list_news = g.get_links()
|
||||
|
||||
except Exception as e:
|
||||
logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
|
||||
list_news = []
|
||||
|
||||
# Bypass Google links
|
||||
list_news_redirections = GoogleByPass().bypass_google_urls(list_news)
|
||||
|
||||
return list_news_redirections
|
||||
|
||||
class FetcherDuckDuckGo(FetcherAbstract):
|
||||
def __init__(self, search, search_category, period, lang="wt", region="wt"):
|
||||
assert(search_category in ["news", "general"])
|
||||
assert(period in ["d", "w", "m", "y"])
|
||||
self.search = search
|
||||
self.search_category = search_category
|
||||
self.period = period
|
||||
self.lang_region = "{}-{}".format(lang, region)
|
||||
self.name = "duckduckgo {} {} {} {}".format(search, search_category, "1{}".format(period), region)
|
||||
|
||||
def _fetch(self):
|
||||
try:
|
||||
list_news = []
|
||||
with DDGS(timeout=10) as ddgs:
|
||||
if (self.search_category == "general"):
|
||||
generator_links = ddgs.text(keywords=self.search, timelimit=self.period, region=self.lang_region)
|
||||
elif (self.search_category == "news"):
|
||||
generator_links = ddgs.news(keywords=self.search, timelimit=self.period, region=self.lang_region)
|
||||
|
||||
for l in generator_links:
|
||||
list_news.append( l.get("url", l.get("href")) )
|
||||
|
||||
except Exception as e:
|
||||
logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
|
||||
list_news = []
|
||||
return list_news
|
||||
|
||||
|
||||
class FetcherSearxNews(FetcherAbstract):
|
||||
def __init__(self, search="child abuse", searx_instance="https://serx.ml/", lang="en", region="US", search_category="news", period="day"):
|
||||
assert(search_category in ["news", "general"])
|
||||
assert(period in [None, "day", "week", "month", "year"])
|
||||
# Random header (minimize prob of web-scrapping detection)
|
||||
self.headers = {
|
||||
'User-agent': str(np.random.choice(user_agents_list)),
|
||||
'Accept-Encoding': 'gzip, deflate',
|
||||
'Accept': '*/*',
|
||||
'Connection': 'keep-alive',
|
||||
}
|
||||
""" # Optional header
|
||||
self.headers = {
|
||||
'User-agent': str(np.random.choice(user_agents_list)),
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
|
||||
'Connection': 'keep-alive',
|
||||
'Upgrade-Insecure-Requests': '1',
|
||||
'TE': 'trailers',
|
||||
'Sec-Fetch-Site': 'cross-site',
|
||||
'Sec-Fetch-Mode': 'navigate',
|
||||
'Sec-Fetch-Dest': 'document',
|
||||
}
|
||||
"""
|
||||
self.search = search
|
||||
self.searx_instance = searx_instance
|
||||
self.lang_region = "{}-{}".format(lang, region)
|
||||
self.search_category = search_category
|
||||
self.period = period
|
||||
self.t_sleep_lower, self.t_sleep_higher = 0.5, 1.5
|
||||
self.request_timeout = 240
|
||||
|
||||
period_name_mapping = {
|
||||
None: "no_date_range",
|
||||
"day": "1d",
|
||||
"week": "1w",
|
||||
"month": "1m",
|
||||
"year": "1y",
|
||||
}
|
||||
self.name = "searxng {} {} {} {} {}".format(searx_instance.replace("https://", "").replace("/", ""), search, search_category, period_name_mapping[period], self.lang_region)
|
||||
logger.info("SearX - Initialized SearX fetcher: {}".format(self.name))
|
||||
|
||||
def _request_and_decode(self, url_search):
|
||||
# Initial random time sleep (minimize chance of getting blocked)
|
||||
time.sleep(random.uniform(self.t_sleep_lower, self.t_sleep_higher))
|
||||
# Request
|
||||
logger.debug("SearX - Searching: {}".format(url_search))
|
||||
try:
|
||||
r = requests.get(url_search, headers=self.headers, timeout=self.request_timeout)
|
||||
except Exception as e:
|
||||
logger.warning("SearX - Exception in request: {}".format(url_search), "\n", str(e))
|
||||
return []
|
||||
|
||||
if (r.status_code == 200):
|
||||
# Status code Ok
|
||||
pass
|
||||
elif (r.status_code == 429):
|
||||
# TooManyRequests, "Rate limit exceeded"
|
||||
logger.warning("SearX {} - Too many requests while running: {}. Request output: {}".format(self.name, r.url, r.text))
|
||||
return []
|
||||
elif (r.status_code != 200):
|
||||
logger.warning("SearX {} - Status code: {}. Request output: {}".format(self.name, r.status_code, r.text))
|
||||
return []
|
||||
else:
|
||||
logger.debug("SearX - Status code: {}".format(r.status_code))
|
||||
|
||||
# Decode request
|
||||
soup = BeautifulSoup(r.text, 'html.parser')
|
||||
page_url_set = set()
|
||||
# h3 links
|
||||
for elem in soup.find_all('h3'):
|
||||
# Get url
|
||||
url = elem.find('a').get('href')
|
||||
page_url_set.add(url)
|
||||
return page_url_set
|
||||
|
||||
def _get_news_list(self):
|
||||
############################################################
|
||||
# Domain & search parameter
|
||||
search_domain = os.path.join(self.searx_instance, "search?q=")
|
||||
# Search keywords
|
||||
search_formatted = self.search.replace(" ", "+").replace(":", "%3A")
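# e.g. "child abuse site:example.com" -> "child+abuse+site%3Aexample.com" (illustrative)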
|
||||
# Period formatted
|
||||
period_formatted = "&time_range={}".format(self.period) if self.period is not None else ""
|
||||
# Search parameters
|
||||
search_parameters = "&category_{}=on&language={}{}".format(self.search_category, self.lang_region, period_formatted)
|
||||
# Combined url search
|
||||
url_search_nopage = "{}{}{}".format(search_domain, search_formatted, search_parameters)
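# Resulting query (illustrative instance): https://searx.example/search?q=child+abuse&category_news=on&language=en-US&time_range=day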
|
||||
############################################################
|
||||
|
||||
# Request and decode on page=1
|
||||
url_set = self._request_and_decode(url_search_nopage)
|
||||
# No results?
|
||||
if (len(url_set) == 0):
|
||||
logger.warning("SearX {} - Empty results on search: {}".format(self.name, url_search_nopage))
|
||||
return []
|
||||
|
||||
# Iterate pages
|
||||
search_numpage = 2
|
||||
while True:
|
||||
# Combine url search with page number
|
||||
url_search_with_page = "{}&pageno={}".format(url_search_nopage, search_numpage)
|
||||
# Request and decode on page=X
|
||||
url_set_i = self._request_and_decode(url_search_with_page)
|
||||
|
||||
# Length before merging
|
||||
length_current = len(url_set)
|
||||
# Merge
|
||||
url_set = url_set.union(url_set_i)
|
||||
# Length after merging
|
||||
length_merged = len(url_set)
|
||||
|
||||
# No new elements?
|
||||
if (length_current == length_merged):
|
||||
logger.debug("SearX {} - Finished processing search, #pages: {}".format(self.name, search_numpage))
|
||||
break
|
||||
# Next page
|
||||
search_numpage += 1
|
||||
|
||||
return list(url_set)
|
||||
|
||||
def _fetch(self):
|
||||
try:
|
||||
# Fetch news
|
||||
list_news = self._get_news_list()
|
||||
except Exception as e:
|
||||
logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
|
||||
list_news = []
|
||||
return list_news
|
||||
app_fetcher/src/url_status.py (new file, 63 lines)
@@ -0,0 +1,63 @@
|
||||
from .db_utils import URL_DB_Writer
from .url_utils import process_article
import logging
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")


class UpdateErrorURLs():
    def __init__(self, db_connect_info, redis_connect_info, num_urls) -> None:
        self.num_urls = num_urls
        self.db_connect_info = db_connect_info
        self.redis_connect_info = redis_connect_info
        self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info)

    def update_error_urls_status(self):
        try:
            logger.info("Starting status update for URLs marked as error, limit #URLs: {}".format(self.num_urls))
            # List of URLs with status 'error'
            list_ids_and_urls = self.db_writer._get_error_urls(self.num_urls)
            # Current status
            current_status = "error"
            # Dict: status -> IDs to update to new status
            dict_status_ids, dict_status_urls = {}, {}

            # Get list of (pattern, priority, status) tuples to override status if required
            list_pattern_status_tuple = self.db_writer._get_pattern_status_list()
            # Sort pattern tuples by priority
            list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True)

            flush_every, flush_current = 20, 0
            # Iterate URLs
            for (id, url) in list_ids_and_urls:
                # Get status
                url_canonical, article_elements, new_status = process_article(url, list_pattern_status_tuple)
                # Different? Update
                if (current_status != new_status):
                    # Extend array
                    dict_status_ids[new_status] = dict_status_ids.get(new_status, []) + [id]
                    # Debugging dict
                    dict_status_urls[new_status] = dict_status_urls.get(new_status, []) + [url]
                # +1 processed
                flush_current += 1

                # Flush batch?
                if (flush_every == flush_current):
                    logger.info("Updating status of URLs previously marked as error: {}".format(dict_status_urls))
                    # Update DB
                    self.db_writer._update_urls_status(dict_status_ids)
                    # Reset
                    flush_current = 0
                    dict_status_ids, dict_status_urls = {}, {}

            # Flush remaining batch
            if (flush_current > 0):
                logger.info("Updating status of URLs previously marked as error: {}".format(dict_status_urls))
                # Update DB
                self.db_writer._update_urls_status(dict_status_ids)
                # Reset
                flush_current = 0
                dict_status_ids, dict_status_urls = {}, {}

            logger.info("Finished updating status of URLs marked as error")
        except Exception as e:
            logger.warning("Exception in UpdateErrorURLs.update_error_urls_status(): {}".format(str(e)))
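# A minimal usage sketch for the class above. The connection values are
# placeholders, not the project's real configuration:
if __name__ == "__main__":
    db_connect_info = "host=localhost port=5432 dbname=news user=news password=secret"  # hypothetical conninfo string
    redis_connect_info = {"redis_host": "localhost", "redis_port": 6379}                 # hypothetical Redis settings
    updater = UpdateErrorURLs(db_connect_info, redis_connect_info, num_urls=100)
    updater.update_error_urls_status()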
289
app_fetcher/src/url_utils.py
Normal file
@@ -0,0 +1,289 @@
from gnews import GNews
import dateutil.parser
from datetime import datetime, timedelta
from .utils import remove_http_s
import time
import random
import traceback
import requests
import json
import re
from bs4 import BeautifulSoup

import logging
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")


def get_published_date(article):
    try:
        """
        # Already fetched publish date information?
        if (publish_date_ is not None):
            return publish_date_
        """

        # List of potential publish dates
        potential_dates = []
        # Publish date is the best match
        potential_dates.append(article.publish_date)
        # Publish date metadata is the next best match
        potential_dates.append(article.meta_data.get('article', {}).get("published_time", None))
        # Iterate remaining keys
        for key in article.meta_data.keys():
            if ("date" in key):
                potential_dates.append(article.meta_data[key])

        def invalid_date(p_date):
            # Today + 2 days, article from the future?
            today_plus_two = datetime.utcnow() + timedelta(days=2)
            # Article from the future?
            return p_date.timestamp() > today_plus_two.timestamp()

        for date_ in potential_dates:
            # String date? parse
            if (type(date_) == str):
                try:
                    date_ = dateutil.parser.parse(date_)
                except Exception as e:
                    logger.info("Invalid date found while parsing potential date: {} for URL: {}".format(date_, article.url))
                    date_ = None
            # Valid?
            if (date_ is not None) and (not invalid_date(date_)):
                return date_

        logger.debug("Article with no published date: {}".format(article.url))
        return None
    except Exception as e:
        logger.info("Error while retrieving published date for URL: {}".format(article.url))
        return None

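# A small, hedged illustration of the date handling above: metadata dates arrive
# as strings in several formats, and anything more than two days in the future is
# treated as implausible. The sample value is made up for the example:
def _example_parse_meta_date():
    example_meta_date = "2022-10-03T20:54:17+00:00"
    parsed = dateutil.parser.parse(example_meta_date)
    cutoff = datetime.utcnow() + timedelta(days=2)
    # True only for "articles from the future", which get_published_date skips
    return parsed.timestamp() > cutoff.timestamp()
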
def get_url_host(article_source_url, url):
    # https://www.blabla.com/blabla -> www.blabla.com
    if (article_source_url != ""):
        # Article source URL already extracted, keep path if any
        return remove_http_s(article_source_url)  # .split("/")[0]
    else:
        return remove_http_s(url).split("/")[0]


def get_status_pattern_matching(url, article_status, list_pattern_status_tuple):
    # Regex pattern to update status on "valid", "invalid", and "unknown" status only
    # Status "raw", "duplicated" and "error" should remain the way they are
    # Assumption: List of patterns sorted by importance
    if (article_status in ["valid", "invalid", "unknown"]):
        # Regular expression pattern matching: https://regexr.com/
        for regex_pattern, regex_priority, status_if_match in list_pattern_status_tuple:
            # Matching?
            matching = bool(re.match(regex_pattern, url))
            # Update article status
            if (matching):
                if (status_if_match != article_status):
                    logger.debug("Regex pattern found, updating status from '{}' to '{}' for URL: {}".format(article_status, status_if_match, url))
                return status_if_match
    # Pattern matching not required or not found, original article status
    return article_status

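# A hedged example of the (pattern, priority, status) tuples consumed above.
# The patterns and priorities are invented for illustration; the real ones come
# from the database via URL_DB_Writer:
def _example_pattern_override():
    list_pattern_status_tuple = [
        (r"https://www\.example\.com/tag/.*", 20, "invalid"),  # e.g. tag/index pages are not articles
        (r"https://www\.example\.com/.*", 10, "valid"),
    ]
    # Highest priority first, as get_status_pattern_matching() expects
    list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True)
    # Returns "invalid": the higher-priority, more specific pattern overrides the current "valid" status
    return get_status_pattern_matching("https://www.example.com/tag/politics", "valid", list_pattern_status_tuple)
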
def get_missing_kid_status(url, return_canonical_url=False):
    # Sleep
    time.sleep(0.75)
    try:
        # Request
        r = requests.get(url, timeout=300)
        # Status code
        status_code = r.status_code
        # Final URL after redirects, used as canonical URL
        url_canonical = r.url
    except Exception as e:
        logger.warning("Exception on get URL status request: {}. {}".format(url, str(e)))
        status_code = None
        url_canonical = url

    if (status_code == 200):
        status = "valid"
    elif (status_code == 404):
        status = "invalid"
    else:
        status = "unknown"

    logger.debug("Missing Kid URL {} status: {}".format(url, status))
    if (return_canonical_url):
        return status, url_canonical
    else:
        return status

def bypass_google_link(article_url):

    def bypass_google_consent(article_url):
        # Sample URL: https://consent.google.com/m?continue=https://news.google.com/rss/articles/CBMiMGh0dHBzOi8vd3d3Lm1pc3NpbmdraWRzLm9yZy9wb3N0ZXIvbmNtYy84NjAxMTkvMdIBAA?oc%3D5&gl=NL&m=0&pc=n&cm=2&hl=en-US&src=1
        article_url_no_consent = article_url.replace("https://consent.google.com/m?continue=", "")

        # https://stackoverflow.com/questions/76063646/how-can-i-have-redirection-link-from-google-news-link-using-requests
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
        }
        cookies = {'CONSENT': 'YES+cb.20220419-08-p0.cs+FX+111'}

        try:
            # Request
            r = requests.get(article_url_no_consent, headers=headers, cookies=cookies, timeout=300)
            # Decode
            soup = BeautifulSoup(r.text, 'html.parser')
            url_of_interest = soup.a['href']
        except Exception as e:
            logger.warning("Exception on request trying to G_bypass with headers: {}. {}".format(article_url_no_consent, str(e)))
            url_of_interest = None

        # Not able to bypass? (also covers the None case from the except branch above)
        if (url_of_interest is None) or (url_of_interest == "") or ("support.google.com" in url_of_interest) or ("news.google.com" in url_of_interest):
            url_of_interest = None
        return url_of_interest

    def bypass_google_using_service(article_url):
        try:
            # e.g.: url = "https://news.google.com/articles/CBMiX2h0dHBzOi8vd3d3LmZveGJ1c2luZXNzLmNvbS9wb2xpdGljcy9kaXNuZXktc3Vlcy1mbG9yaWRhLWdvdi1yb24tZGVzYW50aXMtbG9zcy1zcGVjaWFsLWRpc3RyaWN00gEA?hl=en-US&gl=US&ceid=US%3Aen"
            gbypass_endpoint = "http://selenium_app:80/get_redirection"
            # Timeout: 5 minutes
            r = requests.post(gbypass_endpoint, json={"url": article_url}, timeout=300)
            # Decode
            redirect_url = json.loads(r.text).get("redirect_url", "")
        except Exception as e:
            logger.warning("Exception on request: {}. {}".format(gbypass_endpoint, str(e)))
            redirect_url = ""

        return redirect_url

    logger.debug("Starting bypass_google_link()")

    article_url_bypassed = None
    # Bypass using request
    if ("consent.google.com" in article_url):
        article_url_bypassed = bypass_google_consent(article_url)
    # Not bypassed yet? Bypass using service
    if (article_url_bypassed is None):
        article_url_bypassed = bypass_google_using_service(article_url)

    # if (article_url_bypassed is None) or (article_url_bypassed == "") or ("news.google.com" in article_url_bypassed):
    if (article_url_bypassed == "") or (article_url_bypassed is None):
        # Empty URL returned by Gbypass
        logger.warning("Error while bypassing Gnews for URL: {}".format(article_url))
        return None
    else:
        logger.debug("Bypassed Google News link {} to redirect URL {}".format(article_url, article_url_bypassed))
        return article_url_bypassed

def process_article(article_url, list_pattern_status_tuple, language="en"):
    # TODO:
    """
    https://github.com/fhamborg/news-please
    https://github.com/fhamborg/Giveme5W1H
    https://github.com/santhoshse7en/news-fetch
    """
    try:
        logger.debug("Starting process_article()")

        if ("news.google.com" in article_url) or ("consent.google.com" in article_url):
            # Bypass to get redirection
            article_url = bypass_google_link(article_url)
            # Error?
            if (article_url is None):
                return None, {}, "error"
        elif ("missingkids.org/poster" in article_url):
            # Get status
            article_status, url_canonical = get_missing_kid_status(article_url, return_canonical_url=True)
            article_elements = {
                "url_full": article_url,
                "url_canonical": url_canonical
            }
            return url_canonical, article_elements, article_status
        else:
            # Avoid Too many requests (feeds, ...)
            time.sleep(0.75)

        logger.debug("Processing: {}".format(article_url))

        # Default status unless something happens
        article_status = "valid"

        # Parse article
        # TODO: :param proxy: The proxy parameter is a dictionary with a single key-value pair. self._proxy = {'http': proxy, 'https': proxy} if proxy else None
        # TODO: Language per config
        article = GNews(language).get_full_article(url=article_url)

        # Article parsed?
        if (article is None) or (not article.is_parsed):
            logger.debug("Article not parsed: {}".format(article_url))
            return article_url, {}, "error"

        # Canonical link as main URL
        url_canonical = article.canonical_link
        # Empty canonical URL?
        if (article.canonical_link is None) or (article.canonical_link == ""):
            # URL with parameters? e.g. some zerohedge news fetched from newspaper3k end with #comment-stream -> Remove extra parameter in link
            if ("?" in article.url) or (article.url.endswith("#comment-stream")) or (article.url.endswith("#disqus_thread")):
                logger.debug("Article URL contains parameters, trying to clean URL: {}".format(article.url))
                try:
                    # Remove text after parameter call
                    url = article.url.split("?")[0]
                    # Remove comment-stream
                    url = url.replace("#comment-stream", "").replace("#disqus_thread", "")
                    # Article
                    article_attempt = GNews(language).get_full_article(url=url)
                    # Retrieving same title? Update article based on clean URL
                    if (article_attempt is not None) and (article_attempt.title == article.title):
                        article = article_attempt
                except Exception as e:
                    logger.info("Article parsing of URL without parameters failed: {}".format(article.url))
            else:  # Default behaviour
                logger.debug("Article canonical link is empty, assuming URL=URL_CANONICAL: {}".format(article.url))

            # By default, URL same as canonical
            url_canonical = article.url

        elif (article.url != article.canonical_link):
            # If different, stick to canonical URL
            logger.debug("Article URL and canonical link are different: {} {}".format(article.url, article.canonical_link))
        else:
            # If same, continue...
            pass

        # Update config to determine if content is valid
        article.config.MIN_WORD_COUNT = 150
        article.config.MIN_SENT_COUNT = 6

        # Valid URL?
        if (not article.is_valid_url()):
            logger.debug("Not a valid news article: {}".format(url_canonical))
            article_status = "invalid"
        # Is the article's body text long enough to meet standard article requirements?
        if (not article.is_valid_body()):
            logger.debug("Article body not valid: {}".format(url_canonical))
            article_status = "unknown"

        if (article.images != article.imgs):
            logger.debug("Article images and imgs are different: {} {}".format(article.images, article.imgs))

        # article.keywords, article.meta_keywords, article.summary
        # article.movies
        # article.top_image

        # Check if article status needs to be updated
        article_status = get_status_pattern_matching(url_canonical, article_status, list_pattern_status_tuple)

        article_elements = {
            'url_full': article.url,  # https://www.breitbart.com/tech/2022/10/03/report-election-integrity-project-worked-with-feds-to-censor-news-sites-in-2020/
            'url_host': get_url_host(article.source_url, url_canonical),  # www.breitbart.com
            'title': article.title,  # Report: ‘Election Integrity’ Partnership Worked with Feds to Censor News Sites in 2020
            'description': article.meta_description,  # Coalition committed to respond in ‘early 2022’ but failed to do so, while Labor has not issued a full response since taking office
            'text': article.text,  # ${Article content}
            'published_date': get_published_date(article),  # python.datetime format, obtained from "YYYY-MM-DD" or '2022-10-03T20:54:17+00:00'
            'authors': article.authors,  # ['Christopher Knaus']
            'language': article.meta_lang,  # en
            'tags': list(article.tags),  # ['Wide Open Border', '’My Son Hunter’ Movie', ...]
            'images': list(article.images),  # [URL_IMAGE_1, URL_IMAGE_2, ...]
            'url_canonical': url_canonical,  # Canonical URL (redirection)
            # 'html': article.html,  # HTML article
        }
        logger.debug("Processing OK: {}".format(url_canonical))
        return url_canonical, article_elements, article_status
    except Exception as e:
        logger.warning("Exception processing url: {}\n{}".format(article_url, traceback.format_exc()))
        return None, {}, "error"
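# A hedged usage sketch of process_article(). The URL and the single pattern
# tuple are illustrative placeholders, not values from the project:
if __name__ == "__main__":
    _patterns = [(r"https://www\.example\.com/tag/.*", 10, "invalid")]
    url_canonical, article_elements, article_status = process_article(
        "https://www.example.com/2022/10/03/some-news-story/", _patterns)
    print(article_status, url_canonical, list(article_elements.keys()))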
64
app_fetcher/src/user_agents.py
Normal file
@@ -0,0 +1,64 @@
# https://techblog.willshouse.com/2012/01/03/most-common-user-agents/

user_agents_list = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/111.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; rv:111.0) Gecko/20100101 Firefox/111.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0",
    "Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.44",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 OPR/96.0.0.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 OPR/97.0.0.0",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48",
    "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.34",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.39",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.2 Safari/605.1.15",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; rv:112.0) Gecko/20100101 Firefox/112.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.51",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/112.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/110.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0",
    "Mozilla/5.0 (X11; CrOS x86_64 14541.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.0.2246 Yowser/2.5 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.6.1 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 6.1; rv:102.0) Gecko/20100101 Goanna/6.0 Firefox/102.0 PaleMoon/32.0.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.41",
    "Mozilla/5.0 (Windows NT 10.0; rv:110.0) Gecko/20100101 Firefox/110.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 YaBrowser/23.1.5.708 Yowser/2.5 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
]
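# One plausible way this list might be consumed (a hedged sketch, not the
# project's actual call site): pick a random user agent per request so that
# outgoing requests do not all share one client signature. The URL is a placeholder.
import random
import requests

def _example_fetch_with_random_ua(url="https://www.example.com/"):
    headers = {"User-Agent": random.choice(user_agents_list)}
    return requests.get(url, headers=headers, timeout=30)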
33
app_fetcher/src/utils.py
Normal file
@@ -0,0 +1,33 @@

def remove_http_s(url):
    # Strip only the leading scheme, so an embedded "https://" later in the URL is left untouched
    url = url[len("https://"):] if url.startswith("https://") else url
    url = url[len("http://"):] if url.startswith("http://") else url
    return url


def is_valid_url(url):
    if (url.startswith("https://")):
        return True
    else:
        return False


def get_searxng_instances():
    # SearxNG instances: https://searx.space/
    searx_instances = set()
    searx_instances.add("https://searx.work/")
    searx_instances.add("https://search.ononoki.org/")
    searx_instances.add("https://searxng.nicfab.eu/")
    searx_instances.add("https://searx.be/")

    # searx_instances.add("https://searx.fmac.xyz/")
    # searx_instances.add("https://northboot.xyz/")  # FIX

    # searx_instances.add("https://serx.ml/")  # Offline
    # searx_instances.add("https://searx.ru/")
    # searx_instances.add("https://searx.sp-codes.de/")
    # searx_instances.add("https://searxng.nicfab.eu/")
    # searx_instances.add("https://s.frlt.one/")
    # searx_instances.add("https://search.sapti.me/")

    # To list
    list_searx_instances = list(searx_instances)
    return list_searx_instances
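# A small, hedged demonstration of the helpers above (URLs are illustrative):
if __name__ == "__main__":
    print(remove_http_s("https://www.example.com/path"))  # -> www.example.com/path
    print(is_valid_url("http://www.example.com/"))         # -> False: only https:// counts as valid
    print(get_searxng_instances())                         # -> list of instance base URLs (set order, not stable)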