import psycopg import redis import traceback import random import requests import json import os from .url_utils import process_article import logging logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s') logger = logging.getLogger("news_fetcher") # TODO: URL_DB_HANDLER, _get_search_list, _get_url_host, _get_url_host_list, ... # The rest, elsewhere class URL_DB_Writer(): def __init__(self, db_connect_info, redis_connect_info): logger.debug("Initializing URL DB writer") self.db_connect_info = db_connect_info self.redis_instance = redis.Redis(host=redis_connect_info.get("redis_host"), port=redis_connect_info.get("redis_port")) self.redis_expiry_seconds = redis_connect_info.get("expiry_seconds", 172800) # Default: 48 hours try: self.redis_instance.ping() logger.debug("Succesfully pinged Redis") except Exception as e: logger.warning("Error trying to ping Redis: {}".format(str(e))) def get_urls_count(self, last_minutes_check): ##################### ### Get number of URLs within last X minutes ##################### try: # Update with psycopg.connect(self.db_connect_info) as conn: # Open cursor cursor = conn.cursor() num_urls = cursor.execute("SELECT COUNT(*) FROM URLS WHERE ts_fetch >= current_timestamp - interval '{} minutes';".format(last_minutes_check)).fetchone()[0] except Exception as e: logger.warning("Error updating URLs status: {}".format(str(e))) num_urls = None return num_urls def _format(self, values): # Repalce single quote ' with ''. Based on https://stackoverflow.com/a/12320729 # String -> 'string', Int -> '1' (string-based), None -> NULL (no quotes for pgSQL to interpret Null value) if (type(values) == list) or (type(values) == tuple): insert_args = "(" + ", ".join([ "NULL" if v is None else "'" + str(v).replace("'", "''") + "'" for v in values]) + ")" elif (type(values) == str): insert_args = "({})".format( "NULL" if values is None else "'" + values.replace("'", "''") + "'" ) else: logger.warning("Error formatting input values: {}".format(values)) assert False return insert_args def _get_cached_canonical_url(self, url): ### Redis: URL processed recently? -> Avoid increasing SERIAL counter & efficiency of DB try: filter_url = self.redis_instance.get(url) if (filter_url is not None): filter_url = filter_url.decode("utf-8") except Exception as e: logger.warning("Exception querying Redis: {}".format(str(e))) filter_url = None return filter_url def _update_urls_status(self, dict_status_ids): ##################### ### Update status to array of URL IDs ##################### try: # Update with psycopg.connect(self.db_connect_info) as conn: # Open cursor cursor = conn.cursor() # Autocommit at end of transaction (Atomic insert of URLs and sources) with conn.transaction() as tx: for key_status, value_ids in dict_status_ids.items(): cursor.execute("UPDATE URLS SET status='{}' WHERE id IN ({});".format(key_status, ",".join([str(v) for v in value_ids]))) except Exception as e: logger.warning("Error updating URLs status: {}".format(str(e))) def _get_missing_kids_urls(self, num_urls=None): ##################### ### Get list of Missing Kids URLs ##################### try: missing_kids_ids_and_urls = [] if (num_urls is None): limit = 500 else: limit = num_urls offset = 0 with psycopg.connect(self.db_connect_info) as conn: # Open cursor cursor = conn.cursor() while True: # Query missing_kids_ids_and_urls_query = cursor.execute("SELECT id, url, status FROM URLS WHERE url LIKE '%missingkids.org/poster%' ORDER BY ts_fetch DESC LIMIT {} OFFSET {};".format(limit, offset)).fetchall() # Finished? if (len(missing_kids_ids_and_urls_query) == 0): break # Extend missing_kids_ids_and_urls = missing_kids_ids_and_urls + missing_kids_ids_and_urls_query # Offset offset += len(missing_kids_ids_and_urls_query) # Stop? if (num_urls is not None) and (len(missing_kids_ids_and_urls) >= num_urls): break except Exception as e: logger.warning("Error getting Missing Kids URLs: {}".format(str(e))) missing_kids_ids_and_urls = [] return missing_kids_ids_and_urls def _get_error_urls(self, num_urls=None): ##################### ### Get list of Missing Kids URLs ##################### try: error_urls = [] if (num_urls is None): limit = 500 else: limit = num_urls offset = 0 with psycopg.connect(self.db_connect_info) as conn: # Open cursor cursor = conn.cursor() while True: # Query error_urls_query = cursor.execute("SELECT id, url FROM URLS WHERE status='error' ORDER BY ts_fetch DESC LIMIT {} OFFSET {};".format(limit, offset)).fetchall() # Finished? if (len(error_urls_query) == 0): break # Extend error_urls = error_urls + error_urls_query # Offset offset += len(error_urls_query) # Stop? if (num_urls is not None) and (len(error_urls) >= num_urls): break except Exception as e: logger.warning("Error getting Error URLs: {}".format(str(e))) error_urls = [] return error_urls def _decode_urls(self, urls_fetched, list_domains_to_filter, list_pattern_status_tuple): # TODO: language for urls_fetched... """ # TODO: REFACTOR For each input url Already processed? -> Update on Redis expire time -> Associate to source Not processed? Get main URL: -> URL Canonical valid? -> Rely on this as main URL -> URL Canonical not valid? -> Use input url, unless it's a news.google.com link -> If news.google.com link, filter out. REDIS? Main URL processing: -> Update in REDIS, association url -> url_canonical -> url != url_canonical: Add in duplicate table If both != news.google.com """ # URLs to insert, URLs duplicated association, URL to Canonical form list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical = [], [], {} # URL VS CANONICAL: # News URL returned: https://news.google.com/articles/CBMifmh0dHBzOi8vd3d3LmJyZWl0YmFydC5jb20vMm5kLWFtZW5kbWVudC8yMDIzLzA0LzAzL2dvdi1kZXNhbnRpcy1zaWducy1iaWxsLW1ha2luZy1mbG9yaWRhLXRoZS0yNnRoLWNvbnN0aXR1dGlvbmFsLWNhcnJ5LXN0YXRlL9IBAA?hl=en-US&gl=US&ceid=US%3Aen # Corresponds to canonical URL: https://www.breitbart.com/2nd-amendment/2023/04/03/gov-desantis-signs-bill-making-florida-the-26th-constitutional-carry-state/ for url in urls_fetched: # Domain to filter? Input url filter_due_to_domain = False for domain_to_filter in list_domains_to_filter: if (domain_to_filter in url): logger.debug("Domain filter applied based on {} for input URL: {}".format(domain_to_filter, url)) filter_due_to_domain = True if (filter_due_to_domain): continue # URL processed recently? -> Filter and avoid increasing SERIAL counter & efficiency of DB cached_canonical_url = self._get_cached_canonical_url(url) if (cached_canonical_url is not None): # Even if url processed, need to add url_canonical to list_filtered_urls, so as to associate search source to canonical URL (canonical is the main URL entry) dict_full_urls_to_canonical[url] = cached_canonical_url # X -> Y # If url has been processed, so was its canonical form logger.debug("Filtering out already inserted (processed) URL and its canonical form: {} {}".format(url, cached_canonical_url)) continue # Process TODO: Add language... url_canonical, article_elements, article_status = process_article(url, list_pattern_status_tuple) # TODO: Store article_elements information to insert into OS after inserted into DB (and therefore having associated url_id) # Could not retrieve redirection for news.google.com based URL? Continue (avoid inserting in DB) if (url_canonical is None) and ("news.google.com" in url): logger.debug("Filtering empty canonical link for base URL based on news.google.com: {}".format(url)) continue # Canonical URL still news.google.com? Continue (avoid inserting in DB) if (url_canonical is not None) and ("news.google.com" in url_canonical): logger.debug("Filtering canonical news.google.com based URL: {}".format(url_canonical)) continue # Domain to filter? Input canonical_url filter_due_to_domain = False for domain_to_filter in list_domains_to_filter: if (url_canonical is not None) and (domain_to_filter in url_canonical): filter_due_to_domain = True if (filter_due_to_domain): logger.info("Filtering due to domain input URL, Canonical_URL: {} {}".format(url, url_canonical)) continue if (url_canonical is None) or (article_status == "error"): logger.debug("Processing failed for URL: {}".format(url)) # Still insert URL with "error"? -> If processed later, might have inconsistent sources (url vs url_canonical). Only store if not news.google.com based if ("news.google.com" in url) or ("consent.google.com" in url): logging.debug("Not able to process Google News link, skipping: {}".format(url)) else: dict_full_urls_to_canonical[url] = url # X -> X list_insert_url_tuple_args.append( (url, article_status) ) continue # URL was not processed (not sure canonical yet). Generate URL_CANONICAL <-> URL_ORIGINAL association if they're different if (url_canonical != url): list_tuple_canonical_duplicate_urls.append( (url_canonical, url) ) # Dict: url -> canonical (update association) dict_full_urls_to_canonical[url] = url_canonical # X -> Y or X # Canonical URL processed recently? -> Filter and avoid increasing SERIAL counter & efficiency of DB if (self._get_cached_canonical_url(url_canonical) is not None): # Canonical URL was already processed logger.debug("Filtering out already inserted (processed) URL canonical: {}".format(url_canonical)) else: # Insert url_canonical to DB formatted list_insert_url_tuple_args.append( (url_canonical, article_status) ) # Canonical URL different? Process if (url_canonical != url): if ("news.google.com" in url) or ("consent.google.com" in url): logging.debug("Not adding google.news.com based link, skipping: {}".format(url)) else: # Fetched url -> duplicate (using canonical as main link) article_status = "duplicate" # Insert url (non-canonical) to DB formatted list_insert_url_tuple_args.append( (url, article_status) ) return list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical def _insert_urls(self, cursor, list_insert_url_tuple_args): ##################### ### Insert URLs with status ##################### if (len(list_insert_url_tuple_args) > 0): insert_args = ', '.join( [ self._format(t) for t in list_insert_url_tuple_args] ) # Insert. (url_1, status_1), (url_2, status_2), ... sql_code = "INSERT INTO URLS {} VALUES {} ON CONFLICT (url) DO NOTHING;".format("(url, status)", insert_args) # logger.debug("SQL CODE: {}".format(sql_code)) c = cursor.execute(sql_code) # NOTE: Not using "RETURNING id" since previously inserted URLs are not returned (ON CONFLICT) # https://stackoverflow.com/questions/35949877/how-to-include-excluded-rows-in-returning-from-insert-on-conflict/35953488#35953488 def _insert_urls_duplicated(self, cursor, list_tuple_canonical_duplicate_urls): ##################### ### Insert duplicated URLs ##################### if (len(list_tuple_canonical_duplicate_urls) > 0): # Flatten, format, set to remove duplicates args_duplicated_urls_set = "(" + ', '.join( set( [ "'" + str(y).replace("'", "''") + "'" for x in list_tuple_canonical_duplicate_urls for y in x] ) ) + ")" # Dict: url -> id dict_url_to_id = {} # Get url -> id association to populate duplicated URLs for (id_, url_) in cursor.execute("SELECT id, url FROM URLS WHERE url IN {};".format(args_duplicated_urls_set)).fetchall(): dict_url_to_id[url_] = id_ # Convert tuples (url_canonical, url) -> (id_url_canonical, id_url) to insert in DB # ORIGINAL CODE. Issue, might not have found association to all urls ### list_tuple_canonical_duplicate_urls_ids = [ (dict_url_to_id[t[0]], dict_url_to_id[t[1]]) for t in list_tuple_canonical_duplicate_urls] list_tuple_canonical_duplicate_urls_ids = [] for (url_1, url_2) in list_tuple_canonical_duplicate_urls: id_url_1, id_url_2 = dict_url_to_id.get(url_1), dict_url_to_id.get(url_2) if (id_url_1 is None) or (id_url_2 is None): logger.debug("Skipping duplicate association due to no url -> id_url mapping available for tuple: {} {}".format(url_1, url_2)) else: list_tuple_canonical_duplicate_urls_ids.append( (id_url_1, id_url_2) ) if (len(list_tuple_canonical_duplicate_urls_ids) > 0): insert_args = ', '.join( [ self._format(t) for t in list_tuple_canonical_duplicate_urls_ids] ) # Insert. (id_url_canonical_1, id_url_1), ... sql_code = "INSERT INTO URLS_DUPLICATE {} VALUES {} ON CONFLICT DO NOTHING;".format("(id_url_canonical, id_url_duplicated)", insert_args) # logger.debug("SQL CODE: {}".format(sql_code)) c = cursor.execute(sql_code) def _get_pattern_status_list(self): ##################### ### Get list of domains to filter ##################### # TODO: Cache on redis and query once every N hours? ... try: with psycopg.connect(self.db_connect_info) as conn: # Open cursor cursor = conn.cursor() # TODO: Cache on Redis list_pattern_status = cursor.execute("SELECT pattern, priority, status FROM STATUS_PATTERN_MATCHING;").fetchall() except Exception as e: logger.warning("Error getting pattern status list: {}".format(str(e))) list_pattern_status = [] return list_pattern_status def _get_domains_to_filter(self): ##################### ### Get list of domains to filter ##################### # TODO: Cache on redis and query once every N hours? ... try: with psycopg.connect(self.db_connect_info) as conn: # Open cursor cursor = conn.cursor() # TODO: Cache on Redis sites_to_filter = [e[0] for e in cursor.execute("SELECT url_host FROM WEBSITE_TO_FILTER;").fetchall() ] except Exception as e: logger.warning("Error getting domains to filter: {}".format(str(e))) sites_to_filter = [] return sites_to_filter def _get_cached_source_id(self, source): ### Redis: URL processed recently? -> Avoid increasing SERIAL counter & efficiency of DB try: source_id = self.redis_instance.get(source) if (source_id is not None): source_id = source_id.decode("utf-8") except Exception as e: logger.warning("Exception querying Redis: {}".format(str(e))) source_id = None return source_id def _get_source_id(self, cursor, source): ##################### ### Get source corresponding id ##################### # Cached? id_source = self._get_cached_source_id(source) if (id_source is None): c = cursor.execute("SELECT id FROM SOURCE WHERE source='{}'".format(source.replace("'", "''"))).fetchone() if (c is None) or (len(c) == 0): # Source does not exist, insert and get id c = cursor.execute("INSERT INTO SOURCE (source) VALUES ('{}') RETURNING id;".format(source.replace("'", "''"))).fetchone() # Decode source id id_source = c[0] # Cache self.redis_instance.set(source, id_source, ex=self.redis_expiry_seconds) return id_source def _get_urls_id(self, cursor, urls_full): ##################### ### Get id of inserted and filtered URLs ##################### # TODO: Cache url -> url_id, url_canonical if (len(urls_full) == 0): return [] # Get inserted and filtered URL ids (unnested). Filtered URLs are also retrieved since they might have been fetched from a new source in_inserted_filtered_urls = "(" + ', '.join(["'" + u.replace("'", "''") + "'" for u in urls_full]) + ")" id_urls_related = [ i[0] for i in cursor.execute("SELECT id FROM URLS WHERE url IN {};".format(in_inserted_filtered_urls)).fetchall() ] return id_urls_related def _insert_urls_source(self, cursor, id_urls_related, id_source): ##################### ### Insert URL sources: (id_url_1, id_source), (id_url_2, id_source), ... ##################### if (len(id_urls_related) == 0) or (id_source is None): return columns = "(id_url, id_source)" insert_args = ', '.join( [ self._format([id_url, id_source]) for id_url in id_urls_related ] ) # Insert sql_code = "INSERT INTO URLS_SOURCE {} VALUES {} ON CONFLICT DO NOTHING;".format(columns, insert_args) # logger.debug("SQL CODE: {}".format(sql_code)) c = cursor.execute(sql_code) def write_batch(self, urls_fetched, source): # Chunks of 50 elements n = 50 # Divide in small chunks urls_fetched_chunks = [urls_fetched[i:i + n] for i in range(0, len(urls_fetched), n)] # Process for urls_fetched_chunk_i in urls_fetched_chunks: self._write_small_batch(urls_fetched_chunk_i, source) def _write_small_batch(self, urls_fetched, source): try: logger.info("Fetched #{} URLs, source: {}".format(len(urls_fetched), source)) if (len(urls_fetched) == 0): logger.debug("Empty batch of urls (not writing to DB) for source: {}".format(source)) return # Shuffle URLs to reduce continuous URLs of same URL host (minimize chance of being blocked for too many continuous requests) random.shuffle(urls_fetched) # Get list of domains to filter list_domains_to_filter = self._get_domains_to_filter() # Get list of (pattern, priority, status) tuples to override status if required list_pattern_status_tuple = self._get_pattern_status_list() # Sort pattern tuples by priority list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True) # Process URLs to update DB list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical = self._decode_urls(urls_fetched, list_domains_to_filter, list_pattern_status_tuple) # Full set of URL and its canonical form (to associate them to a search), both to insert and filter urls_full = set(dict_full_urls_to_canonical.keys()).union( set(dict_full_urls_to_canonical.values()) ) # Insert with psycopg.connect(self.db_connect_info) as conn: # Open cursor cursor = conn.cursor() # Autocommit at end of transaction (Atomic insert of URLs and sources) with conn.transaction() as tx: # Insert processed URLs self._insert_urls(cursor, list_insert_url_tuple_args) # Insert URLs duplicated (canonical != fetched url) self._insert_urls_duplicated(cursor, list_tuple_canonical_duplicate_urls) # Get source id in DB id_source = self._get_source_id(cursor, source) # Get IDs of all related URLs id_urls_related = self._get_urls_id(cursor, urls_full) # Insert search source associated to URLs self._insert_urls_source(cursor, id_urls_related, id_source) # Update Redis status of inserted and filtered URLs after writing to DB for url, url_canonical in dict_full_urls_to_canonical.items(): try: # Set with updated expiry time self.redis_instance.set(url, url_canonical, ex=self.redis_expiry_seconds) if (url != url_canonical): self.redis_instance.set(url_canonical, url_canonical, ex=self.redis_expiry_seconds) except Exception as e: logger.warning("Exception running set in Redis: {}".format(str(e))) if (len(list_insert_url_tuple_args) > 0): try: webhook_token = os.environ.get("CLIQ_WEBHOOK_TOKEN") endpoint_message = "https://cliq.zoho.com/api/v2/channelsbyname/urlretrievalbot/message?zapikey={}".format(webhook_token) payload = json.dumps({"text": "Fetched #{} new URLs, source: {}".format(len(list_insert_url_tuple_args), source) }) r = requests.post(endpoint_message, data=payload) except Exception as e: logger.warning("Webhook failed: {}".format(str(e))) logger.debug("URL DB write finished") except Exception as e: logger.warning( "Exception writing to URL_DB:\n{}".format(traceback.format_exc()) ) logger.debug( "Exception --- List of URLs: {}".format(str(urls_fetched)) )