matitos_news/app_fetcher/src/db_utils.py
Luciano Gervasoni 54ebd58070 Url content
2025-03-07 00:34:46 +01:00

import psycopg
import redis
import traceback
import random
import requests
import json
import os
from .url_utils import process_article
import logging
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
# TODO: URL_DB_HANDLER, _get_search_list, _get_url_host, _get_url_host_list, ...
# The rest, elsewhere
class URL_DB_Writer():
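    """Write batches of fetched news URLs to PostgreSQL, caching recently processed URLs in Redis."""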
def __init__(self, db_connect_info, redis_connect_info):
logger.debug("Initializing URL DB writer")
self.db_connect_info = db_connect_info
self.redis_instance = redis.Redis(host=redis_connect_info.get("redis_host"), port=redis_connect_info.get("redis_port"))
self.redis_expiry_seconds = redis_connect_info.get("expiry_seconds", 172800) # Default: 48 hours
try:
self.redis_instance.ping()
logger.debug("Succesfully pinged Redis")
except Exception as e:
logger.warning("Error trying to ping Redis: {}".format(str(e)))
def get_urls_count(self, last_minutes_check):
#####################
### Get number of URLs within last X minutes
#####################
try:
            # Query the count of recently fetched URLs
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
num_urls = cursor.execute("SELECT COUNT(*) FROM URLS WHERE ts_fetch >= current_timestamp - interval '{} minutes';".format(last_minutes_check)).fetchone()[0]
except Exception as e:
logger.warning("Error updating URLs status: {}".format(str(e)))
num_urls = None
return num_urls
def _format(self, values):
        # Replace single quote ' with ''. Based on https://stackoverflow.com/a/12320729
        # String -> 'string', Int -> '1' (string-based), None -> NULL (no quotes, so pgSQL interprets a NULL value)
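        # e.g. (illustrative values) _format(("O'Brien", None, 3)) -> "('O''Brien', NULL, '3')"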
if (type(values) == list) or (type(values) == tuple):
insert_args = "(" + ", ".join([ "NULL" if v is None else "'" + str(v).replace("'", "''") + "'" for v in values]) + ")"
        elif (type(values) == str):
            insert_args = "({})".format("'" + values.replace("'", "''") + "'")
        else:
            logger.warning("Error formatting input values: {}".format(values))
            raise TypeError("Unsupported type for SQL value formatting: {}".format(type(values)))
return insert_args
def _get_cached_canonical_url(self, url):
### Redis: URL processed recently? -> Avoid increasing SERIAL counter & efficiency of DB
try:
filter_url = self.redis_instance.get(url)
if (filter_url is not None):
filter_url = filter_url.decode("utf-8")
except Exception as e:
logger.warning("Exception querying Redis: {}".format(str(e)))
filter_url = None
return filter_url
def _update_urls_status(self, dict_status_ids):
#####################
### Update status to array of URL IDs
#####################
try:
# Update
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
# Autocommit at end of transaction (Atomic insert of URLs and sources)
with conn.transaction() as tx:
for key_status, value_ids in dict_status_ids.items():
cursor.execute("UPDATE URLS SET status='{}' WHERE id IN ({});".format(key_status, ",".join([str(v) for v in value_ids])))
except Exception as e:
logger.warning("Error updating URLs status: {}".format(str(e)))
def _get_missing_kids_urls(self, num_urls=None):
#####################
### Get list of Missing Kids URLs
#####################
try:
missing_kids_ids_and_urls = []
if (num_urls is None):
limit = 500
else:
limit = num_urls
offset = 0
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
while True:
# Query
missing_kids_ids_and_urls_query = cursor.execute("SELECT id, url, status FROM URLS WHERE url LIKE '%missingkids.org/poster%' ORDER BY ts_fetch DESC LIMIT {} OFFSET {};".format(limit, offset)).fetchall()
# Finished?
if (len(missing_kids_ids_and_urls_query) == 0):
break
# Extend
missing_kids_ids_and_urls = missing_kids_ids_and_urls + missing_kids_ids_and_urls_query
# Offset
offset += len(missing_kids_ids_and_urls_query)
# Stop?
if (num_urls is not None) and (len(missing_kids_ids_and_urls) >= num_urls):
break
except Exception as e:
logger.warning("Error getting Missing Kids URLs: {}".format(str(e)))
missing_kids_ids_and_urls = []
return missing_kids_ids_and_urls
def _get_error_urls(self, num_urls=None):
#####################
        ### Get list of URLs with status 'error'
#####################
try:
error_urls = []
if (num_urls is None):
limit = 500
else:
limit = num_urls
offset = 0
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
while True:
# Query
error_urls_query = cursor.execute("SELECT id, url FROM URLS WHERE status='error' ORDER BY ts_fetch DESC LIMIT {} OFFSET {};".format(limit, offset)).fetchall()
# Finished?
if (len(error_urls_query) == 0):
break
# Extend
error_urls = error_urls + error_urls_query
# Offset
offset += len(error_urls_query)
# Stop?
if (num_urls is not None) and (len(error_urls) >= num_urls):
break
except Exception as e:
logger.warning("Error getting Error URLs: {}".format(str(e)))
error_urls = []
return error_urls
def _decode_urls(self, urls_fetched, list_domains_to_filter, list_pattern_status_tuple): # TODO: language for urls_fetched...
"""
# TODO: REFACTOR
For each input url
Already processed?
-> Update on Redis expire time
-> Associate to source
Not processed? Get main URL:
-> URL Canonical valid?
-> Rely on this as main URL
-> URL Canonical not valid?
-> Use input url, unless it's a news.google.com link
-> If news.google.com link, filter out. REDIS?
Main URL processing:
-> Update in REDIS, association url -> url_canonical
-> url != url_canonical: Add in duplicate table
If both != news.google.com
"""
# URLs to insert, URLs duplicated association, URL to Canonical form
list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical = [], [], {}
# URL VS CANONICAL:
# News URL returned: https://news.google.com/articles/CBMifmh0dHBzOi8vd3d3LmJyZWl0YmFydC5jb20vMm5kLWFtZW5kbWVudC8yMDIzLzA0LzAzL2dvdi1kZXNhbnRpcy1zaWducy1iaWxsLW1ha2luZy1mbG9yaWRhLXRoZS0yNnRoLWNvbnN0aXR1dGlvbmFsLWNhcnJ5LXN0YXRlL9IBAA?hl=en-US&gl=US&ceid=US%3Aen
# Corresponds to canonical URL: https://www.breitbart.com/2nd-amendment/2023/04/03/gov-desantis-signs-bill-making-florida-the-26th-constitutional-carry-state/
for url in urls_fetched:
# Domain to filter? Input url
filter_due_to_domain = False
for domain_to_filter in list_domains_to_filter:
if (domain_to_filter in url):
logger.debug("Domain filter applied based on {} for input URL: {}".format(domain_to_filter, url))
filter_due_to_domain = True
if (filter_due_to_domain):
continue
# URL processed recently? -> Filter and avoid increasing SERIAL counter & efficiency of DB
cached_canonical_url = self._get_cached_canonical_url(url)
if (cached_canonical_url is not None):
# Even if url processed, need to add url_canonical to list_filtered_urls, so as to associate search source to canonical URL (canonical is the main URL entry)
dict_full_urls_to_canonical[url] = cached_canonical_url # X -> Y
# If url has been processed, so was its canonical form
logger.debug("Filtering out already inserted (processed) URL and its canonical form: {} {}".format(url, cached_canonical_url))
continue
# Process TODO: Add language...
url_canonical, article_elements, article_status = process_article(url, list_pattern_status_tuple)
# TODO: Store article_elements information to insert into OS after inserted into DB (and therefore having associated url_id)
# Could not retrieve redirection for news.google.com based URL? Continue (avoid inserting in DB)
if (url_canonical is None) and ("news.google.com" in url):
logger.debug("Filtering empty canonical link for base URL based on news.google.com: {}".format(url))
continue
# Canonical URL still news.google.com? Continue (avoid inserting in DB)
if (url_canonical is not None) and ("news.google.com" in url_canonical):
logger.debug("Filtering canonical news.google.com based URL: {}".format(url_canonical))
continue
# Domain to filter? Input canonical_url
filter_due_to_domain = False
for domain_to_filter in list_domains_to_filter:
if (url_canonical is not None) and (domain_to_filter in url_canonical):
filter_due_to_domain = True
if (filter_due_to_domain):
logger.info("Filtering due to domain input URL, Canonical_URL: {} {}".format(url, url_canonical))
continue
if (url_canonical is None) or (article_status == "error"):
logger.debug("Processing failed for URL: {}".format(url))
# Still insert URL with "error"? -> If processed later, might have inconsistent sources (url vs url_canonical). Only store if not news.google.com based
if ("news.google.com" in url) or ("consent.google.com" in url):
logging.debug("Not able to process Google News link, skipping: {}".format(url))
else:
dict_full_urls_to_canonical[url] = url # X -> X
list_insert_url_tuple_args.append( (url, article_status) )
continue
# URL was not processed (not sure canonical yet). Generate URL_CANONICAL <-> URL_ORIGINAL association if they're different
if (url_canonical != url):
list_tuple_canonical_duplicate_urls.append( (url_canonical, url) )
# Dict: url -> canonical (update association)
dict_full_urls_to_canonical[url] = url_canonical # X -> Y or X
# Canonical URL processed recently? -> Filter and avoid increasing SERIAL counter & efficiency of DB
if (self._get_cached_canonical_url(url_canonical) is not None):
# Canonical URL was already processed
logger.debug("Filtering out already inserted (processed) URL canonical: {}".format(url_canonical))
else:
# Insert url_canonical to DB formatted
list_insert_url_tuple_args.append( (url_canonical, article_status) )
# Canonical URL different? Process
if (url_canonical != url):
if ("news.google.com" in url) or ("consent.google.com" in url):
logging.debug("Not adding google.news.com based link, skipping: {}".format(url))
else:
# Fetched url -> duplicate (using canonical as main link)
article_status = "duplicate"
# Insert url (non-canonical) to DB formatted
list_insert_url_tuple_args.append( (url, article_status) )
return list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical
def _insert_urls(self, cursor, list_insert_url_tuple_args):
#####################
### Insert URLs with status
#####################
if (len(list_insert_url_tuple_args) > 0):
insert_args = ', '.join( [ self._format(t) for t in list_insert_url_tuple_args] )
# Insert. (url_1, status_1), (url_2, status_2), ...
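            # e.g. INSERT INTO URLS (url, status) VALUES ('https://example.com/a', 'fetched') ON CONFLICT (url) DO NOTHING;  (illustrative url/status values)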
sql_code = "INSERT INTO URLS {} VALUES {} ON CONFLICT (url) DO NOTHING;".format("(url, status)", insert_args)
# logger.debug("SQL CODE: {}".format(sql_code))
c = cursor.execute(sql_code)
# NOTE: Not using "RETURNING id" since previously inserted URLs are not returned (ON CONFLICT)
# https://stackoverflow.com/questions/35949877/how-to-include-excluded-rows-in-returning-from-insert-on-conflict/35953488#35953488
def _insert_urls_duplicated(self, cursor, list_tuple_canonical_duplicate_urls):
#####################
### Insert duplicated URLs
#####################
if (len(list_tuple_canonical_duplicate_urls) > 0):
# Flatten, format, set to remove duplicates
args_duplicated_urls_set = "(" + ', '.join( set( [ "'" + str(y).replace("'", "''") + "'" for x in list_tuple_canonical_duplicate_urls for y in x] ) ) + ")"
# Dict: url -> id
dict_url_to_id = {}
# Get url -> id association to populate duplicated URLs
for (id_, url_) in cursor.execute("SELECT id, url FROM URLS WHERE url IN {};".format(args_duplicated_urls_set)).fetchall():
dict_url_to_id[url_] = id_
# Convert tuples (url_canonical, url) -> (id_url_canonical, id_url) to insert in DB
# ORIGINAL CODE. Issue, might not have found association to all urls
### list_tuple_canonical_duplicate_urls_ids = [ (dict_url_to_id[t[0]], dict_url_to_id[t[1]]) for t in list_tuple_canonical_duplicate_urls]
list_tuple_canonical_duplicate_urls_ids = []
for (url_1, url_2) in list_tuple_canonical_duplicate_urls:
id_url_1, id_url_2 = dict_url_to_id.get(url_1), dict_url_to_id.get(url_2)
if (id_url_1 is None) or (id_url_2 is None):
logger.debug("Skipping duplicate association due to no url -> id_url mapping available for tuple: {} {}".format(url_1, url_2))
else:
list_tuple_canonical_duplicate_urls_ids.append( (id_url_1, id_url_2) )
if (len(list_tuple_canonical_duplicate_urls_ids) > 0):
insert_args = ', '.join( [ self._format(t) for t in list_tuple_canonical_duplicate_urls_ids] )
# Insert. (id_url_canonical_1, id_url_1), ...
sql_code = "INSERT INTO URLS_DUPLICATE {} VALUES {} ON CONFLICT DO NOTHING;".format("(id_url_canonical, id_url_duplicated)", insert_args)
# logger.debug("SQL CODE: {}".format(sql_code))
c = cursor.execute(sql_code)
def _get_pattern_status_list(self):
#####################
        ### Get list of (pattern, priority, status) tuples
#####################
# TODO: Cache on redis and query once every N hours? ...
try:
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
# TODO: Cache on Redis
list_pattern_status = cursor.execute("SELECT pattern, priority, status FROM STATUS_PATTERN_MATCHING;").fetchall()
except Exception as e:
logger.warning("Error getting pattern status list: {}".format(str(e)))
list_pattern_status = []
return list_pattern_status
def _get_domains_to_filter(self):
#####################
### Get list of domains to filter
#####################
# TODO: Cache on redis and query once every N hours? ...
try:
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
# TODO: Cache on Redis
sites_to_filter = [e[0] for e in cursor.execute("SELECT url_host FROM WEBSITE_TO_FILTER;").fetchall() ]
except Exception as e:
logger.warning("Error getting domains to filter: {}".format(str(e)))
sites_to_filter = []
return sites_to_filter
def _get_cached_source_id(self, source):
        ### Redis: source already cached? -> Avoid extra DB queries when resolving the source id
try:
source_id = self.redis_instance.get(source)
if (source_id is not None):
source_id = source_id.decode("utf-8")
except Exception as e:
logger.warning("Exception querying Redis: {}".format(str(e)))
source_id = None
return source_id
def _get_source_id(self, cursor, source):
#####################
### Get source corresponding id
#####################
# Cached?
id_source = self._get_cached_source_id(source)
if (id_source is None):
c = cursor.execute("SELECT id FROM SOURCE WHERE source='{}'".format(source.replace("'", "''"))).fetchone()
if (c is None) or (len(c) == 0):
# Source does not exist, insert and get id
c = cursor.execute("INSERT INTO SOURCE (source) VALUES ('{}') RETURNING id;".format(source.replace("'", "''"))).fetchone()
# Decode source id
id_source = c[0]
            # Cache (ignore Redis failures so the enclosing DB transaction still succeeds)
            try:
                self.redis_instance.set(source, id_source, ex=self.redis_expiry_seconds)
            except Exception as e:
                logger.warning("Exception running set in Redis: {}".format(str(e)))
return id_source
def _get_urls_id(self, cursor, urls_full):
#####################
### Get id of inserted and filtered URLs
#####################
# TODO: Cache url -> url_id, url_canonical
if (len(urls_full) == 0):
return []
# Get inserted and filtered URL ids (unnested). Filtered URLs are also retrieved since they might have been fetched from a new source
in_inserted_filtered_urls = "(" + ', '.join(["'" + u.replace("'", "''") + "'" for u in urls_full]) + ")"
id_urls_related = [ i[0] for i in cursor.execute("SELECT id FROM URLS WHERE url IN {};".format(in_inserted_filtered_urls)).fetchall() ]
return id_urls_related
def _insert_urls_source(self, cursor, id_urls_related, id_source):
#####################
### Insert URL sources: (id_url_1, id_source), (id_url_2, id_source), ...
#####################
if (len(id_urls_related) == 0) or (id_source is None):
return
columns = "(id_url, id_source)"
insert_args = ', '.join( [ self._format([id_url, id_source]) for id_url in id_urls_related ] )
# Insert
sql_code = "INSERT INTO URLS_SOURCE {} VALUES {} ON CONFLICT DO NOTHING;".format(columns, insert_args)
# logger.debug("SQL CODE: {}".format(sql_code))
c = cursor.execute(sql_code)
def write_batch(self, urls_fetched, source):
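        """Write fetched URLs to the DB in chunks of 50, associating each URL to the given source."""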
# Chunks of 50 elements
n = 50
# Divide in small chunks
urls_fetched_chunks = [urls_fetched[i:i + n] for i in range(0, len(urls_fetched), n)]
# Process
for urls_fetched_chunk_i in urls_fetched_chunks:
self._write_small_batch(urls_fetched_chunk_i, source)
def _write_small_batch(self, urls_fetched, source):
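        """Process one chunk of URLs: filter and canonicalize them, insert them atomically into the DB,
        refresh the Redis cache and optionally notify a webhook about newly inserted URLs."""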
try:
logger.info("Fetched #{} URLs, source: {}".format(len(urls_fetched), source))
if (len(urls_fetched) == 0):
logger.debug("Empty batch of urls (not writing to DB) for source: {}".format(source))
return
            # Shuffle URLs to avoid consecutive requests to the same host (reduces the chance of being blocked)
random.shuffle(urls_fetched)
# Get list of domains to filter
list_domains_to_filter = self._get_domains_to_filter()
# Get list of (pattern, priority, status) tuples to override status if required
list_pattern_status_tuple = self._get_pattern_status_list()
# Sort pattern tuples by priority
list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True)
# Process URLs to update DB
list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical = self._decode_urls(urls_fetched, list_domains_to_filter, list_pattern_status_tuple)
# Full set of URL and its canonical form (to associate them to a search), both to insert and filter
urls_full = set(dict_full_urls_to_canonical.keys()).union( set(dict_full_urls_to_canonical.values()) )
# Insert
with psycopg.connect(self.db_connect_info) as conn:
# Open cursor
cursor = conn.cursor()
# Autocommit at end of transaction (Atomic insert of URLs and sources)
with conn.transaction() as tx:
# Insert processed URLs
self._insert_urls(cursor, list_insert_url_tuple_args)
# Insert URLs duplicated (canonical != fetched url)
self._insert_urls_duplicated(cursor, list_tuple_canonical_duplicate_urls)
# Get source id in DB
id_source = self._get_source_id(cursor, source)
# Get IDs of all related URLs
id_urls_related = self._get_urls_id(cursor, urls_full)
# Insert search source associated to URLs
self._insert_urls_source(cursor, id_urls_related, id_source)
# Update Redis status of inserted and filtered URLs after writing to DB
for url, url_canonical in dict_full_urls_to_canonical.items():
try:
# Set with updated expiry time
self.redis_instance.set(url, url_canonical, ex=self.redis_expiry_seconds)
if (url != url_canonical):
self.redis_instance.set(url_canonical, url_canonical, ex=self.redis_expiry_seconds)
except Exception as e:
logger.warning("Exception running set in Redis: {}".format(str(e)))
if (len(list_insert_url_tuple_args) > 0):
try:
webhook_token = os.environ.get("CLIQ_WEBHOOK_TOKEN")
endpoint_message = "https://cliq.zoho.com/api/v2/channelsbyname/urlretrievalbot/message?zapikey={}".format(webhook_token)
payload = json.dumps({"text": "Fetched #{} new URLs, source: {}".format(len(list_insert_url_tuple_args), source) })
                    r = requests.post(endpoint_message, data=payload, timeout=10)  # Timeout so a slow webhook cannot block the fetcher
except Exception as e:
logger.warning("Webhook failed: {}".format(str(e)))
logger.debug("URL DB write finished")
except Exception as e:
logger.warning( "Exception writing to URL_DB:\n{}".format(traceback.format_exc()) )
logger.debug( "Exception --- List of URLs: {}".format(str(urls_fetched)) )