diff --git a/1-DB.ipynb b/1-DB.ipynb index 17fb87e..edcfd28 100644 --- a/1-DB.ipynb +++ b/1-DB.ipynb @@ -118,14 +118,23 @@ " title TEXT,\n", " description TEXT,\n", " content TEXT,\n", + " valid_content BOOLEAN,\n", " language CHAR(2), -- ISO 639-1 Code\n", + " keywords TEXT[],\n", " tags TEXT[],\n", " authors TEXT[],\n", - " image_urls TEXT[]\n", + " image_main TEXT,\n", + " images_url TEXT[],\n", + " videos_url TEXT[],\n", + " url_host TEXT, -- www.breitbart.com\n", + " site_name TEXT -- Breitbart News\n", " );\n", " CREATE INDEX idx_tags ON URL_CONTENT USING GIN(tags);\n", " CREATE INDEX idx_authors ON URL_CONTENT USING GIN(authors);\n", + " CREATE INDEX idx_date_published ON URL_CONTENT (date_published);\n", + " CREATE INDEX idx_valid_content ON URL_CONTENT (valid_content);\n", " CREATE INDEX idx_language ON URL_CONTENT (language);\n", + " CREATE INDEX idx_url_host ON URL_CONTENT (url_host);\n", " \"\"\")\n", "\n", " # Feeds\n", diff --git a/A_Development.ipynb b/A_Development.ipynb index bee90df..ffc3b57 100644 --- a/A_Development.ipynb +++ b/A_Development.ipynb @@ -101,95 +101,11 @@ "outputs": [], "source": [] }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "('https://foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert',\n", - " 'foxnews.com')" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# !pip install trafilatura trafilatura[all] cchardet\n", - "import courlan\n", - "url = \"https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n", - "url = \"https://foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n", - "courlan.check_url(url)" - ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [], - "source": [ - "import newspaper\n", - "\n", - "article = newspaper.article(url)" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "datetime.datetime(2025, 3, 4, 4, 0, 31, tzinfo=tzoffset(None, -18000))" - ] - }, - "execution_count": 49, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "article.publish_date" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], "source": [ "# !pip install trafilatura\n", "import trafilatura\n", @@ -197,9 +113,18 @@ "\n", "url = \"https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n", "# url = \"https://www.missingkids.org/poster/USVA/VA25-0820/1\"\n", + "url = \"https://www.bloomberg.com/news/articles/2025-03-12/eu-launches-metals-tariff-retaliation-on-26-billion-of-us-goods\"\n", "\n", "# Fetch\n", - "doc = trafilatura.fetch_url(url)\n", + "doc = trafilatura.fetch_url(url)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": 
[ "# Content & metadata\n", "metadata = trafilatura.extract_metadata(doc)\n", "content = trafilatura.extract(doc)" @@ -207,40 +132,9 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'author': 'Audrey Conklin',\n", - " 'body': ,\n", - " 'categories': [],\n", - " 'comments': None,\n", - " 'commentsbody': ,\n", - " 'date': '2025-03-03',\n", - " 'description': \"Disgraced parenting blogger and mom of six Ruby Franke's \"\n", - " '\"power\" and public image\" allowed her crimes against her '\n", - " 'children to go \"unchecked,\" according to a defense attorney.',\n", - " 'filedate': '2025-03-08',\n", - " 'fingerprint': None,\n", - " 'hostname': 'foxnews.com',\n", - " 'id': None,\n", - " 'image': 'https://static.foxnews.com/foxnews.com/content/uploads/2024/03/967e1c1b-Franke.jpg',\n", - " 'language': None,\n", - " 'license': None,\n", - " 'pagetype': 'article',\n", - " 'raw_text': None,\n", - " 'sitename': 'Fox News',\n", - " 'tags': [],\n", - " 'text': None,\n", - " 'title': \"Utah mommy blogger Ruby Franke's power, public image allowed child \"\n", - " \"abuse to go 'unchecked': expert\",\n", - " 'url': 'https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert'}\n" - ] - } - ], + "outputs": [], "source": [ "pprint(metadata.as_dict())" ] @@ -263,85 +157,50 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 18.6 ms, sys: 40 μs, total: 18.7 ms\n", - "Wall time: 18 ms\n" - ] - }, - { - "data": { - "text/plain": [ - "'en'" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "'''\n", - "!pip install lingua-language-detector\n", - "import lingua\n", - "ld = lingua.LanguageDetectorBuilder.from_all_languages().build()\n", - "l = ld.detect_language_of(content)\n", - "'''\n", + "# !pip install newspaper4k\n", "# !pip install langdetect \n", + "import newspaper\n", "import langdetect\n", "langdetect.DetectorFactory.seed = 0\n", - "langdetect.detect(content)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# !pip install newspaper4k" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import newspaper\n", "\n", + "\n", + "\n", + "# url = \"https://www.missingkids.org/poster/USVA/VA25-0820/1\"\n", + "#url = \"https://www.waff.com/2025/03/11/colbert-heights-high-school-employee-arrested-child-abuse/\"\n", + "\n", + "\n", + "\n", + "#url = \"https://www.bloomberg.com/news/articles/2025-03-12/eu-launches-metals-tariff-retaliation-on-26-billion-of-us-goods\"\n", + "\n", + "\n", + "url = \"https://apnews.com/article/canada-trump-us-tariffs-steel-2517a6a2baf0596cb1a43d3a7d1e7939\"\n", "url = \"https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n", - "url = 
\"https://www.missingkids.org/poster/USVA/VA25-0820/1\"\n", + "#url = \"https://www.ft.com/content/6d7c6915-4ceb-43fc-9896-590036b12a87\"\n", + "#url = \"https://www.lanacion.com.ar/politica/milei-en-bahia-blanca-un-viaje-sorpresa-para-frenar-las-criticas-y-mostrar-cercania-nid12032025/\"\n", + "#url = \"https://www.missingkids.org/poster/NCMC/2043547/1\"\n", "\n", - "article = newspaper.article(url)\n", + "try:\n", + " article = newspaper.article(url)\n", + "except newspaper.ArticleException as e:\n", + " print(\"ArticleException: {}\".format(str(e)))\n", + "except Exception as e:\n", + " print(\"Err: {}\".format(str(e)))\n", "\n", - "url_photo = set([i for i in article.images if \"api.missingkids.org/photographs\" in i])" + "# url_photo = set([i for i in article.images if \"api.missingkids.org/photographs\" in i])\n", + "# article.is_valid_url(), article.is_parsed, article.is_media_news(), article.is_valid_body()\n", + "article.meta_data\n", + "\n" ] }, { @@ -351,6 +210,13 @@ "outputs": [], "source": [] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, diff --git a/app_urls/README.md b/app_urls/README.md index 330c3d4..2704272 100644 --- a/app_urls/README.md +++ b/app_urls/README.md @@ -3,6 +3,36 @@ conda create -n matitos_urls python=3.12 conda activate matitos_urls pip install django psycopg[binary] django-rq +pip install feedparser python-dateutil newspaper4k lxml[html_clean] +``` + +* From automated inspectdb +``` +# 1) Inspect DB, generate models.py +python manage.py inspectdb + +# 2) models.py, within class Urls, add: + + class STATUS_ENUM(models.TextChoices): + RAW = "raw" + ERROR = "error" + VALID = "valid" + UNKNOWN = "unknown" + INVALID = "invalid" + DUPLICATE = "duplicate" + +# Update status + status = models.TextField(choices=STATUS_ENUM, default=STATUS_ENUM.RAW) # This field type is a guess. + +# To class Meta, add default ordering + class Meta: + managed = False + db_table = 'urls' # db_table = '{}_urls'.format(project_name) + ordering = ["-ts_fetch"] + +# Fields default: + ts_fetch = models.DateTimeField(auto_now_add=True) + status = models.TextField(default='raw') # This field type is a guess. ``` * Environment variables @@ -25,10 +55,20 @@ python manage.py makemigrations python manage.py migrate --fake ``` - +* Deploy ``` # Server python manage.py runserver + # Worker python manage.py rqworker default +while true; do python manage.py rqworker default --burst; sleep 5; done ``` + +* Utils +``` +python manage.py rqstats +python manage.py rqstats --interval=1 # Refreshes every second +python manage.py rqstats --json # Output as JSON +python manage.py rqstats --yaml # Output as YAML +``` \ No newline at end of file diff --git a/app_urls/api/models.py b/app_urls/api/models.py index 7dbc48b..ed1575b 100644 --- a/app_urls/api/models.py +++ b/app_urls/api/models.py @@ -55,8 +55,8 @@ class UrlContent(models.Model): class Urls(models.Model): url = models.TextField(unique=True) - ts_fetch = models.DateTimeField() - status = models.TextField() # This field type is a guess. + ts_fetch = models.DateTimeField(auto_now_add=True) + status = models.TextField(default='raw') # This field type is a guess. 
class Meta: managed = False diff --git a/app_urls/api/obsolete_src/db_utils.py b/app_urls/api/obsolete_src/db_utils.py new file mode 100644 index 0000000..eca1a73 --- /dev/null +++ b/app_urls/api/obsolete_src/db_utils.py @@ -0,0 +1,502 @@ +import psycopg +import redis +import traceback +import random +import requests +import json +import os +from .url_utils import process_article +from .logger import get_logger +logger = get_logger() + +# TODO: URL_DB_HANDLER, _get_search_list, _get_url_host, _get_url_host_list, ... +# The rest, elsewhere + +class DB_Handler(): + def __init__(self, db_connect_info, redis_connect_info): + logger.debug("Initializing URL DB writer") + self.db_connect_info = db_connect_info + self.redis_instance = redis.Redis(host=redis_connect_info.get("host"), port=redis_connect_info.get("port")) + self.redis_expiry_seconds = redis_connect_info.get("expiry_seconds", 172800) # Default: 48 hours + + try: + self.redis_instance.ping() + logger.debug("Successfully pinged Redis") + except Exception as e: + logger.warning("Error trying to ping Redis: {}".format(str(e))) + + def get_urls_count(self, last_minutes_check): + ##################### + ### Get number of URLs within last X minutes + ##################### + try: + # Count + with psycopg.connect(self.db_connect_info) as conn: + # Open cursor + cursor = conn.cursor() + num_urls = cursor.execute("SELECT COUNT(*) FROM URLS WHERE ts_fetch >= current_timestamp - interval '{} minutes';".format(last_minutes_check)).fetchone()[0] + except Exception as e: + logger.warning("Error getting URLs count: {}".format(str(e))) + num_urls = None + return num_urls + + def _get_url_host_list(self): + try: + with psycopg.connect(self.db_connect_info) as conn: + # List of URL host + list_url_host = [l[0] for l in conn.execute("SELECT url_host FROM WEBSITE_OF_INTEREST;").fetchall()] + # Clean http / https from URLs + list_url_host = [l.replace("https://", "").replace("http://", "") for l in list_url_host] + # Clean last slash if exists + list_url_host = [ l if not l.endswith("/") else l[:-1] for l in list_url_host] + except Exception as e: + logger.warning("Exception fetching URL host list: " + str(e)) + list_url_host = [] + return list_url_host + + def _get_search_list(self): + try: + with psycopg.connect(self.db_connect_info) as conn: + # List of keyword searches + list_search_text = [l[0] for l in conn.execute("SELECT keyword_search FROM SEARCH;").fetchall()] + except Exception as e: + logger.warning("Exception fetching searches list: " + str(e)) + list_search_text = [] + return list_search_text + + def _get_feed_urls(self): + try: + with psycopg.connect(self.db_connect_info) as conn: + list_url_feeds = conn.execute("SELECT rss_feed FROM FEED;").fetchall() + # Decode (tuple with 1 element) + list_url_feeds = [l[0] for l in list_url_feeds] + except Exception as e: + logger.warning("Exception fetching RSS feeds: " + str(e)) + list_url_feeds = [] + return list_url_feeds + + def _get_url_hosts(self): + try: + with psycopg.connect(self.db_connect_info) as conn: + list_url_hosts = conn.execute("SELECT url_host FROM WEBSITE_OF_INTEREST;").fetchall() + # Decode (tuple with 1 element) + list_url_hosts = [l[0] for l in list_url_hosts] + except Exception as e: + logger.warning("Exception fetching URL hosts: " + str(e)) + list_url_hosts = [] + return list_url_hosts + + def _format(self, values): + # Replace single quote ' with ''. 
Based on https://stackoverflow.com/a/12320729 + # String -> 'string', Int -> '1' (string-based), None -> NULL (no quotes for pgSQL to interpret Null value) + if (type(values) == list) or (type(values) == tuple): + insert_args = "(" + ", ".join([ "NULL" if v is None else "'" + str(v).replace("'", "''") + "'" for v in values]) + ")" + elif (type(values) == str): + insert_args = "({})".format( "NULL" if values is None else "'" + values.replace("'", "''") + "'" ) + else: + logger.warning("Error formatting input values: {}".format(values)) + assert False + return insert_args + + def _get_cached_canonical_url(self, url): + ### Redis: URL processed recently? -> Avoid increasing SERIAL counter & efficiency of DB + try: + filter_url = self.redis_instance.get(url) + if (filter_url is not None): + filter_url = filter_url.decode("utf-8") + except Exception as e: + logger.warning("Exception querying Redis: {}".format(str(e))) + filter_url = None + return filter_url + + def _update_urls_status(self, dict_status_ids): + ##################### + ### Update status to array of URL IDs + ##################### + try: + # Update + with psycopg.connect(self.db_connect_info) as conn: + # Open cursor + cursor = conn.cursor() + # Autocommit at end of transaction (Atomic insert of URLs and sources) + with conn.transaction() as tx: + for key_status, value_ids in dict_status_ids.items(): + cursor.execute("UPDATE URLS SET status='{}' WHERE id IN ({});".format(key_status, ",".join([str(v) for v in value_ids]))) + except Exception as e: + logger.warning("Error updating URLs status: {}".format(str(e))) + + def _get_missing_kids_urls(self, num_urls=None): + ##################### + ### Get list of Missing Kids URLs + ##################### + try: + missing_kids_ids_and_urls = [] + if (num_urls is None): + limit = 500 + else: + limit = num_urls + offset = 0 + with psycopg.connect(self.db_connect_info) as conn: + # Open cursor + cursor = conn.cursor() + while True: + # Query + missing_kids_ids_and_urls_query = cursor.execute("SELECT id, url, status FROM URLS WHERE url LIKE '%missingkids.org/poster%' ORDER BY ts_fetch DESC LIMIT {} OFFSET {};".format(limit, offset)).fetchall() + # Finished? + if (len(missing_kids_ids_and_urls_query) == 0): + break + # Extend + missing_kids_ids_and_urls = missing_kids_ids_and_urls + missing_kids_ids_and_urls_query + # Offset + offset += len(missing_kids_ids_and_urls_query) + # Stop? + if (num_urls is not None) and (len(missing_kids_ids_and_urls) >= num_urls): + break + + except Exception as e: + logger.warning("Error getting Missing Kids URLs: {}".format(str(e))) + missing_kids_ids_and_urls = [] + return missing_kids_ids_and_urls + + def _get_error_urls(self, num_urls=None): + ##################### + ### Get list of Missing Kids URLs + ##################### + try: + error_urls = [] + if (num_urls is None): + limit = 500 + else: + limit = num_urls + offset = 0 + with psycopg.connect(self.db_connect_info) as conn: + # Open cursor + cursor = conn.cursor() + while True: + # Query + error_urls_query = cursor.execute("SELECT id, url FROM URLS WHERE status='error' ORDER BY ts_fetch DESC LIMIT {} OFFSET {};".format(limit, offset)).fetchall() + # Finished? + if (len(error_urls_query) == 0): + break + # Extend + error_urls = error_urls + error_urls_query + # Offset + offset += len(error_urls_query) + # Stop? 
+ if (num_urls is not None) and (len(error_urls) >= num_urls): + break + + except Exception as e: + logger.warning("Error getting Error URLs: {}".format(str(e))) + error_urls = [] + return error_urls + + def _decode_urls(self, urls_fetched, list_domains_to_filter, list_pattern_status_tuple): # TODO: language for urls_fetched... + """ + # TODO: REFACTOR + For each input url + + Already processed? + -> Update on Redis expire time + -> Associate to source + Not processed? Get main URL: + -> URL Canonical valid? + -> Rely on this as main URL + -> URL Canonical not valid? + -> Use input url, unless it's a news.google.com link + -> If news.google.com link, filter out. REDIS? + Main URL processing: + -> Update in REDIS, association url -> url_canonical + -> url != url_canonical: Add in duplicate table + If both != news.google.com + """ + + # URLs to insert, URLs duplicated association, URL to Canonical form + list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical = [], [], {} + + # URL VS CANONICAL: + # News URL returned: https://news.google.com/articles/CBMifmh0dHBzOi8vd3d3LmJyZWl0YmFydC5jb20vMm5kLWFtZW5kbWVudC8yMDIzLzA0LzAzL2dvdi1kZXNhbnRpcy1zaWducy1iaWxsLW1ha2luZy1mbG9yaWRhLXRoZS0yNnRoLWNvbnN0aXR1dGlvbmFsLWNhcnJ5LXN0YXRlL9IBAA?hl=en-US&gl=US&ceid=US%3Aen + # Corresponds to canonical URL: https://www.breitbart.com/2nd-amendment/2023/04/03/gov-desantis-signs-bill-making-florida-the-26th-constitutional-carry-state/ + + for url in urls_fetched: + # Domain to filter? Input url + filter_due_to_domain = False + for domain_to_filter in list_domains_to_filter: + if (domain_to_filter in url): + logger.debug("Domain filter applied based on {} for input URL: {}".format(domain_to_filter, url)) + filter_due_to_domain = True + if (filter_due_to_domain): + continue + + # URL processed recently? -> Filter and avoid increasing SERIAL counter & efficiency of DB + cached_canonical_url = self._get_cached_canonical_url(url) + if (cached_canonical_url is not None): + # Even if url processed, need to add url_canonical to list_filtered_urls, so as to associate search source to canonical URL (canonical is the main URL entry) + dict_full_urls_to_canonical[url] = cached_canonical_url # X -> Y + # If url has been processed, so was its canonical form + logger.debug("Filtering out already inserted (processed) URL and its canonical form: {} {}".format(url, cached_canonical_url)) + continue + + # Process TODO: Add language... + url_canonical, article_elements, article_status = process_article(url, list_pattern_status_tuple) + # TODO: Store article_elements information to insert into OS after inserted into DB (and therefore having associated url_id) + + # Could not retrieve redirection for news.google.com based URL? Continue (avoid inserting in DB) + if (url_canonical is None) and ("news.google.com" in url): + logger.debug("Filtering empty canonical link for base URL based on news.google.com: {}".format(url)) + continue + # Canonical URL still news.google.com? Continue (avoid inserting in DB) + if (url_canonical is not None) and ("news.google.com" in url_canonical): + logger.debug("Filtering canonical news.google.com based URL: {}".format(url_canonical)) + continue + + # Domain to filter? 
Input canonical_url + filter_due_to_domain = False + for domain_to_filter in list_domains_to_filter: + if (url_canonical is not None) and (domain_to_filter in url_canonical): + filter_due_to_domain = True + if (filter_due_to_domain): + logger.info("Filtering due to domain input URL, Canonical_URL: {} {}".format(url, url_canonical)) + continue + + if (url_canonical is None) or (article_status == "error"): + logger.debug("Processing failed for URL: {}".format(url)) + # Still insert URL with "error"? -> If processed later, might have inconsistent sources (url vs url_canonical). Only store if not news.google.com based + if ("news.google.com" in url) or ("consent.google.com" in url): + logger.debug("Not able to process Google News link, skipping: {}".format(url)) + else: + dict_full_urls_to_canonical[url] = url # X -> X + list_insert_url_tuple_args.append( (url, article_status) ) + continue + + # URL was not processed (not sure canonical yet). Generate URL_CANONICAL <-> URL_ORIGINAL association if they're different + if (url_canonical != url): + list_tuple_canonical_duplicate_urls.append( (url_canonical, url) ) + # Dict: url -> canonical (update association) + dict_full_urls_to_canonical[url] = url_canonical # X -> Y or X + + # Canonical URL processed recently? -> Filter and avoid increasing SERIAL counter & efficiency of DB + if (self._get_cached_canonical_url(url_canonical) is not None): + # Canonical URL was already processed + logger.debug("Filtering out already inserted (processed) URL canonical: {}".format(url_canonical)) + else: + # Insert url_canonical to DB formatted + list_insert_url_tuple_args.append( (url_canonical, article_status) ) + # Canonical URL different? Process + if (url_canonical != url): + if ("news.google.com" in url) or ("consent.google.com" in url): + logger.debug("Not adding news.google.com based link, skipping: {}".format(url)) + else: + # Fetched url -> duplicate (using canonical as main link) + article_status = "duplicate" + # Insert url (non-canonical) to DB formatted + list_insert_url_tuple_args.append( (url, article_status) ) + + return list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical + + def _insert_urls(self, cursor, list_insert_url_tuple_args): + ##################### + ### Insert URLs with status + ##################### + if (len(list_insert_url_tuple_args) > 0): + insert_args = ', '.join( [ self._format(t) for t in list_insert_url_tuple_args] ) + # Insert. (url_1, status_1), (url_2, status_2), ... 
+ sql_code = "INSERT INTO URLS {} VALUES {} ON CONFLICT (url) DO NOTHING;".format("(url, status)", insert_args) + # logger.debug("SQL CODE: {}".format(sql_code)) + c = cursor.execute(sql_code) + # NOTE: Not using "RETURNING id" since previously inserted URLs are not returned (ON CONFLICT) + # https://stackoverflow.com/questions/35949877/how-to-include-excluded-rows-in-returning-from-insert-on-conflict/35953488#35953488 + + def _insert_urls_duplicated(self, cursor, list_tuple_canonical_duplicate_urls): + ##################### + ### Insert duplicated URLs + ##################### + if (len(list_tuple_canonical_duplicate_urls) > 0): + # Flatten, format, set to remove duplicates + args_duplicated_urls_set = "(" + ', '.join( set( [ "'" + str(y).replace("'", "''") + "'" for x in list_tuple_canonical_duplicate_urls for y in x] ) ) + ")" + + # Dict: url -> id + dict_url_to_id = {} + # Get url -> id association to populate duplicated URLs + for (id_, url_) in cursor.execute("SELECT id, url FROM URLS WHERE url IN {};".format(args_duplicated_urls_set)).fetchall(): + dict_url_to_id[url_] = id_ + + # Convert tuples (url_canonical, url) -> (id_url_canonical, id_url) to insert in DB + # ORIGINAL CODE. Issue, might not have found association to all urls + ### list_tuple_canonical_duplicate_urls_ids = [ (dict_url_to_id[t[0]], dict_url_to_id[t[1]]) for t in list_tuple_canonical_duplicate_urls] + + list_tuple_canonical_duplicate_urls_ids = [] + for (url_1, url_2) in list_tuple_canonical_duplicate_urls: + id_url_1, id_url_2 = dict_url_to_id.get(url_1), dict_url_to_id.get(url_2) + if (id_url_1 is None) or (id_url_2 is None): + logger.debug("Skipping duplicate association due to no url -> id_url mapping available for tuple: {} {}".format(url_1, url_2)) + else: + list_tuple_canonical_duplicate_urls_ids.append( (id_url_1, id_url_2) ) + + if (len(list_tuple_canonical_duplicate_urls_ids) > 0): + insert_args = ', '.join( [ self._format(t) for t in list_tuple_canonical_duplicate_urls_ids] ) + # Insert. (id_url_canonical_1, id_url_1), ... + sql_code = "INSERT INTO URLS_DUPLICATE {} VALUES {} ON CONFLICT DO NOTHING;".format("(id_url_canonical, id_url_duplicated)", insert_args) + # logger.debug("SQL CODE: {}".format(sql_code)) + c = cursor.execute(sql_code) + + def _get_pattern_status_list(self): + ##################### + ### Get list of domains to filter + ##################### + # TODO: Cache on redis and query once every N hours? ... + try: + with psycopg.connect(self.db_connect_info) as conn: + # Open cursor + cursor = conn.cursor() + # TODO: Cache on Redis + list_pattern_status = cursor.execute("SELECT pattern, priority, status FROM STATUS_PATTERN_MATCHING;").fetchall() + except Exception as e: + logger.warning("Error getting pattern status list: {}".format(str(e))) + list_pattern_status = [] + return list_pattern_status + + def _get_domains_to_filter(self): + ##################### + ### Get list of domains to filter + ##################### + # TODO: Cache on redis and query once every N hours? ... + try: + with psycopg.connect(self.db_connect_info) as conn: + # Open cursor + cursor = conn.cursor() + # TODO: Cache on Redis + sites_to_filter = [e[0] for e in cursor.execute("SELECT url_host FROM WEBSITE_TO_FILTER;").fetchall() ] + except Exception as e: + logger.warning("Error getting domains to filter: {}".format(str(e))) + sites_to_filter = [] + return sites_to_filter + + def _get_cached_source_id(self, source): + ### Redis: URL processed recently? 
-> Avoid increasing SERIAL counter & efficiency of DB + try: + source_id = self.redis_instance.get(source) + if (source_id is not None): + source_id = source_id.decode("utf-8") + except Exception as e: + logger.warning("Exception querying Redis: {}".format(str(e))) + source_id = None + return source_id + + def _get_source_id(self, cursor, source): + ##################### + ### Get source corresponding id + ##################### + # Cached? + id_source = self._get_cached_source_id(source) + if (id_source is None): + c = cursor.execute("SELECT id FROM SOURCE WHERE source='{}'".format(source.replace("'", "''"))).fetchone() + if (c is None) or (len(c) == 0): + # Source does not exist, insert and get id + c = cursor.execute("INSERT INTO SOURCE (source) VALUES ('{}') RETURNING id;".format(source.replace("'", "''"))).fetchone() + # Decode source id + id_source = c[0] + # Cache + logger.debug("Caching source id: {} -> {}".format(source, id_source)) + self.redis_instance.set(source, id_source, ex=self.redis_expiry_seconds) + return id_source + + def _get_urls_id(self, cursor, urls_full): + ##################### + ### Get id of inserted and filtered URLs + ##################### + # TODO: Cache url -> url_id, url_canonical + if (len(urls_full) == 0): + return [] + # Get inserted and filtered URL ids (unnested). Filtered URLs are also retrieved since they might have been fetched from a new source + in_inserted_filtered_urls = "(" + ', '.join(["'" + u.replace("'", "''") + "'" for u in urls_full]) + ")" + id_urls_related = [ i[0] for i in cursor.execute("SELECT id FROM URLS WHERE url IN {};".format(in_inserted_filtered_urls)).fetchall() ] + return id_urls_related + + def _insert_urls_source(self, cursor, id_urls_related, id_source): + ##################### + ### Insert URL sources: (id_url_1, id_source), (id_url_2, id_source), ... 
+ ##################### + if (len(id_urls_related) == 0) or (id_source is None): + return + columns = "(id_url, id_source)" + insert_args = ', '.join( [ self._format([id_url, id_source]) for id_url in id_urls_related ] ) + # Insert + sql_code = "INSERT INTO URLS_SOURCE {} VALUES {} ON CONFLICT DO NOTHING;".format(columns, insert_args) + # logger.debug("SQL CODE: {}".format(sql_code)) + c = cursor.execute(sql_code) + + def write_batch(self, urls_fetched, source): + # Chunks of 50 elements + n = 50 + # Divide in small chunks + urls_fetched_chunks = [urls_fetched[i:i + n] for i in range(0, len(urls_fetched), n)] + # Process + for urls_fetched_chunk_i in urls_fetched_chunks: + self._write_small_batch(urls_fetched_chunk_i, source) + + def _write_small_batch(self, urls_fetched, source): + try: + logger.info("Fetched #{} URLs, source: {}".format(len(urls_fetched), source)) + + if (len(urls_fetched) == 0): + logger.debug("Empty batch of urls (not writing to DB) for source: {}".format(source)) + return + + # Shuffle URLs to reduce continuous URLs of same URL host (minimize chance of being blocked for too many continuous requests) + random.shuffle(urls_fetched) + + # Get list of domains to filter + list_domains_to_filter = self._get_domains_to_filter() + # Get list of (pattern, priority, status) tuples to override status if required + list_pattern_status_tuple = self._get_pattern_status_list() + # Sort pattern tuples by priority + list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True) + + # Process URLs to update DB + list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical = self._decode_urls(urls_fetched, list_domains_to_filter, list_pattern_status_tuple) + # Full set of URL and its canonical form (to associate them to a search), both to insert and filter + urls_full = set(dict_full_urls_to_canonical.keys()).union( set(dict_full_urls_to_canonical.values()) ) + + # Insert + with psycopg.connect(self.db_connect_info) as conn: + # Open cursor + cursor = conn.cursor() + # Autocommit at end of transaction (Atomic insert of URLs and sources) + with conn.transaction() as tx: + # Insert processed URLs + self._insert_urls(cursor, list_insert_url_tuple_args) + # Insert URLs duplicated (canonical != fetched url) + self._insert_urls_duplicated(cursor, list_tuple_canonical_duplicate_urls) + + # Get source id in DB + id_source = self._get_source_id(cursor, source) + # Get IDs of all related URLs + id_urls_related = self._get_urls_id(cursor, urls_full) + # Insert search source associated to URLs + self._insert_urls_source(cursor, id_urls_related, id_source) + + # Update Redis status of inserted and filtered URLs after writing to DB + for url, url_canonical in dict_full_urls_to_canonical.items(): + try: + # Set with updated expiry time + self.redis_instance.set(url, url_canonical, ex=self.redis_expiry_seconds) + if (url != url_canonical): + self.redis_instance.set(url_canonical, url_canonical, ex=self.redis_expiry_seconds) + except Exception as e: + logger.warning("Exception running set in Redis: {}".format(str(e))) + + if (len(list_insert_url_tuple_args) > 0): + try: + webhook_token = os.environ.get("CLIQ_WEBHOOK_TOKEN") + endpoint_message = "https://cliq.zoho.com/api/v2/channelsbyname/urlretrievalbot/message?zapikey={}".format(webhook_token) + + payload = json.dumps({"text": "Fetched #{} new URLs, source: {}".format(len(list_insert_url_tuple_args), source) }) + r = requests.post(endpoint_message, data=payload) + except Exception as e: + logger.warning("Webhook 
failed: {}".format(str(e))) + + logger.debug("URL DB write finished") + except Exception as e: + logger.warning( "Exception writing to URL_DB:\n{}".format(traceback.format_exc()) ) + logger.debug( "Exception --- List of URLs: {}".format(str(urls_fetched)) ) \ No newline at end of file diff --git a/app_urls/api/obsolete_src/fetch_feed.py b/app_urls/api/obsolete_src/fetch_feed.py new file mode 100644 index 0000000..dc12736 --- /dev/null +++ b/app_urls/api/obsolete_src/fetch_feed.py @@ -0,0 +1,48 @@ +from .db_utils import DB_Handler +import feedparser +import dateutil.parser +from .logger import get_logger +logger = get_logger() + +class FetchFeeds(): + def __init__(self, db_handler: DB_Handler) -> None: + logger.debug("Initializing News feed") + self.db_handler = db_handler + + def run(self): + try: + logger.debug("Starting NewsFeed.run()") + # Get feeds + list_url_feeds = self.db_handler._get_feed_urls() + logger.debug("Fetching news from feeds: {}".format(str(list_url_feeds))) + + # Process via RSS feeds + for url_feed in list_url_feeds: + # Initialize + urls_fetched, urls_publish_date = [], [] + # Fetch feeds + feeds = feedparser.parse(url_feed) + # Parse + for f in feeds.get("entries", []): + # Get URL + url = f.get("link", None) + # Process? + if (url is not None): + # Available publish date? + publish_date_parsed = f.get("published_parsed") + if (publish_date_parsed is None): + publish_date = f.get("published", None) + if (publish_date is not None): + publish_date_parsed = dateutil.parser.parse(publish_date) + + # Published date + urls_publish_date.append(publish_date_parsed) + # URL + urls_fetched.append(url) + + # URL fetching source + source = "feed {}".format(url_feed) + # Write to DB + self.db_handler.write_batch(urls_fetched, source) + except Exception as e: + logger.warning("Exception in NewsFeed.run(): {}".format(str(e))) diff --git a/app_urls/api/obsolete_src/fetch_parser.py b/app_urls/api/obsolete_src/fetch_parser.py new file mode 100644 index 0000000..c3a73cb --- /dev/null +++ b/app_urls/api/obsolete_src/fetch_parser.py @@ -0,0 +1,45 @@ +from .db_utils import DB_Handler +import newspaper +from .logger import get_logger +logger = get_logger() + +class FetchParser(): + def __init__(self, db_handler: DB_Handler) -> None: + logger.debug("Initializing News SiteParsing newspaper4k") + self.db_handler = db_handler + + # TODO: MOVE LOGIC ELSEWHERE! + def _postprocess(self, article_urls): + return [url.replace("#comment-stream", "") for url in article_urls] + + def run(self): + try: + logger.debug("Starting NewsSiteParsing.run()") + + # Get URL hosts + list_url_hosts = self.db_handler._get_url_hosts() + logger.info("Fetching news by parsing URL hosts: {}".format(str(list_url_hosts))) + + # Process newspaper4k build method + for url_host_feed in list_url_hosts: + # Protocol + if not (url_host_feed.startswith("http")): + url_host_feed_formatted = "https://" + url_host_feed + else: + url_host_feed_formatted = url_host_feed + + logger.debug("Fetching newspaper4k parsing based on URL: {}".format(url_host_feed_formatted)) + # Source object + url_host_built = newspaper.build(url_host_feed_formatted) + # Get articles URL list + urls_fetched = url_host_built.article_urls() + # TODO: MOVE! 
+ # Post-processing + urls_fetched = self._postprocess(urls_fetched) + + # URL fetching source + source = "newspaper4k {}".format(url_host_feed) + # Write to DB + self.db_handler.write_batch(urls_fetched, source) + except Exception as e: + logger.warning("Exception in NewsSiteParsing.run(): {}".format(str(e))) \ No newline at end of file diff --git a/app_urls/api/obsolete_src/fetch_search.py b/app_urls/api/obsolete_src/fetch_search.py new file mode 100644 index 0000000..9d6a17d --- /dev/null +++ b/app_urls/api/obsolete_src/fetch_search.py @@ -0,0 +1,73 @@ +from .db_utils import DB_Handler +from .utils import get_searxng_instances +from .fetch_search_sources import FetcherDuckDuckGo, FetcherGNews, FetcherGoogleNews, FetcherSearxNews, FetcherPreSearch +from .logger import get_logger +logger = get_logger() + +class FetchSearcher(): + def __init__(self, db_handler: DB_Handler, full=True) -> None: + logger.debug("Initializing News feed") + self.db_handler = db_handler + self.full_search = full + + def _run_fetching(self, search_text): + logger.debug("Starting _run_fetching() for {}".format(search_text)) + + # Common parameters + lang, region = "en", "US" + + ### PreSearch + dict_params_news = {"search": search_text} + FetcherPreSearch(**dict_params_news).fetch_articles(self.db_handler) + + ### DuckDuckGo + period = "d" + dict_params_news = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "news", "period": period} + FetcherDuckDuckGo(**dict_params_news).fetch_articles(self.db_handler) + dict_params_general = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "general", "period": period} + FetcherDuckDuckGo(**dict_params_general).fetch_articles(self.db_handler) + + if (self.full_search): + # Avoid site:{} search due to G-Bypass required time + if ("site:" not in search_text): + ### GNews + dict_params = {"search": search_text, "lang": "wt", "region": "wt", "period": period} + FetcherGNews(**dict_params).fetch_articles(self.db_handler) + + ### GoogleNews + dict_params_news = {"search": search_text, "lang": lang, "region": region, "search_category": "news", "period": period} + FetcherGoogleNews(**dict_params_news).fetch_articles(self.db_handler) + # dict_params_general = {"search": search_text, "lang": lang, "region": region, "search_category": "general", "period": period} + + if False: + ### SearxNG + period = "day" + for searx_instance in get_searxng_instances(): + dict_params_news = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "news", "period": period} + dict_params_general = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "general", "period": period} + # Append thread + FetcherSearxNews(**dict_params_news).fetch_articles(self.db_handler) + FetcherSearxNews(**dict_params_general).fetch_articles(self.db_handler) + + logger.debug("Finished _run_fetching()") + + def run(self): + try: + logger.info("Fetching text searches & URL hosts of interest") + + # Get text searches of interest + list_search_text_of_interest = self.db_handler._get_search_list() + + # Get URL host of interest + list_url_host = self.db_handler._get_url_host_list() + # Get text searches for URL hosts + list_search_text_url_host = ["site:{}".format(l) for l in list_url_host] + + for search_text in list_search_text_of_interest + list_search_text_url_host: + logger.debug("Fetching news for search: {}".format(search_text)) + self._run_fetching(search_text) + + 
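Each backend invoked from _run_fetching() above follows the same contract: subclass FetcherAbstract (defined in fetch_search_sources.py below), set self.name, and implement _fetch() returning a list of URLs; the shared fetch_articles() then hands the result to db_handler.write_batch(). A minimal sketch of what an extra backend could look like — the class name and RSS endpoint are invented for illustration:

```
# Hypothetical backend; assumes it sits next to fetch_search_sources.py in the same package
import feedparser
from .fetch_search_sources import FetcherAbstract
from .logger import get_logger
logger = get_logger()

class FetcherExampleRss(FetcherAbstract):
    def __init__(self, search):
        self.search = search
        # fetch_articles() uses self.name as the source label written to the DB
        self.name = "example_rss {}".format(search)

    def _fetch(self):
        try:
            # Any list of URLs works; fetch_articles() takes care of write_batch()
            feed = feedparser.parse("https://example.com/rss?q={}".format(self.search))
            list_news = [e.get("link") for e in feed.get("entries", []) if e.get("link")]
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self.name, str(e)))
            list_news = []
        return list_news
```

_run_fetching() could then call FetcherExampleRss(search_text).fetch_articles(self.db_handler) exactly like the other fetchers.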
logger.info("Finished fetching text searches & URL hosts of interest") + except Exception as e: + logger.warning("Exception in NewsSearch.run(): {}".format(str(e))) + \ No newline at end of file diff --git a/app_urls/api/obsolete_src/fetch_search_sources.py b/app_urls/api/obsolete_src/fetch_search_sources.py new file mode 100644 index 0000000..25813b5 --- /dev/null +++ b/app_urls/api/obsolete_src/fetch_search_sources.py @@ -0,0 +1,384 @@ +from duckduckgo_search import DDGS +from gnews import GNews +from GoogleNews import GoogleNews + +import requests +from bs4 import BeautifulSoup +import os +import time +import json +import numpy as np +import random +from .google_bypass import GoogleByPass +from abc import ABC, abstractmethod +from .logger import get_logger +logger = get_logger() + + + +# Generic fetcher (fetches articles, writes to DB) +class FetcherAbstract(ABC): + @abstractmethod + def _fetch(self): + pass + + def fetch_articles(self, db_writer): + logger.debug("Starting fetch() for {}".format(self.name)) + # Fetch articles + list_news = self._fetch() + logger.info("Found #{} articles for search: {}".format(len(list_news), self.name)) + # Write to DB + db_writer.write_batch(list_news, self.name) + +# https://techblog.willshouse.com/2012/01/03/most-common-user-agents/ + +user_agents_list = [ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/111.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; rv:111.0) Gecko/20100101 Firefox/111.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", + "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0", + "Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.44", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54", + "Mozilla/5.0 
(Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 OPR/96.0.0.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 OPR/97.0.0.0", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48", + "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.34", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.39", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.2 Safari/605.1.15", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; rv:112.0) Gecko/20100101 Firefox/112.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.51", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/112.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", + "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/110.0", + "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0", + "Mozilla/5.0 (X11; CrOS x86_64 14541.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.0.2246 Yowser/2.5 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.6.1 Safari/605.1.15", + "Mozilla/5.0 (Windows NT 6.1; rv:102.0) Gecko/20100101 
Goanna/6.0 Firefox/102.0 PaleMoon/32.0.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.41", + "Mozilla/5.0 (Windows NT 10.0; rv:110.0) Gecko/20100101 Firefox/110.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 YaBrowser/23.1.5.708 Yowser/2.5 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" +] + + + + + +class FetcherPreSearch(FetcherAbstract): + def __init__(self, search): + """ + # period -> + - h = hours (eg: 12h) + - d = days (eg: 7d) + - m = months (eg: 6m) + - y = years (eg: 1y) + """ + self.search = search + self.period = "1d" # TODO Fixed for the moment + # self.lang = lang + # self.region = region + search_category = "news" + self.name = "presearch {} {} {}".format(search, search_category, self.period) + + def _fetch(self): + try: + # PreSearch fetching endpoint, parameter search keyword + presearch_fetch_endpoint = "http://selenium_app:80/fetch_presearch/?search_keyword={}".format(self.search) + # Timeout: 15 minutes + r = requests.get(presearch_fetch_endpoint, timeout=900) + # Decode + list_news = json.loads(r.text).get("list_urls", []) + except Exception as e: + logger.warning("Timeout on request: {}. {}".format(presearch_fetch_endpoint, str(e))) + list_news = [] + return list_news + + + +class FetcherGNews(FetcherAbstract): + def __init__(self, search, period, lang="en", region="US"): + """ + # period -> + - h = hours (eg: 12h) + - d = days (eg: 7d) + - m = months (eg: 6m) + - y = years (eg: 1y) + """ + self.search = search + self.period = period + self.lang = lang + self.region = region + search_category = "news" + self.name = "gnews {} {} {} {}".format(search, search_category, period, "{}-{}".format(lang, region)) + + def _fetch(self): + try: + list_dict_news = GNews(self.lang, self.region, period=self.period).get_news(self.search) + # Decode + list_news = [] + for l in list_dict_news: + list_news.append(l.get("url")) + except Exception as e: + logger.warning("Exception fetching {}: {}".format(self.name, str(e))) + list_news = [] + + # Bypass Google links + list_news_redirections = GoogleByPass().bypass_google_urls(list_news) + + return list_news_redirections + +class FetcherGoogleNews(FetcherAbstract): + def __init__(self, search, search_category="news", period="1d", lang="en", region="US"): + assert(search_category in ["news", "general"]) + + self.lang = lang + self.region = region + self.period = period + self.search_category = search_category + self.search = search + self.name = "googlenews {} {} {} {}".format(search, search_category, period, "{}-{}".format(lang, region)) + + def _fetch(self): + try: + # Initialize + g = GoogleNews(encode="utf-8", period=self.period, lang=self.lang, region=self.region) + g.enableException(True) + + if (self.search_category == "general"): + set_links = set() + # Search + g.search(self.search) + + # Iterate pages + MAX_ITER_PAGES = 15 + for i in range(MAX_ITER_PAGES): + time.sleep(random.uniform(1, 1.5)) + num_before = len(set_links) + + # Get page + try: + links = g.page_at(i) + except Exception as e: + logger.warning("Exception fetching page in GoogleNews {}: {}".format(self.name, str(e))) + break + # Links + for l in 
links: + # '/url?esrc=s&q=&rct=j&sa=U&url=https://www.breitbart.com/news/scent-of-luxury-indias-jasmine-infuses-global-perfume/&ved=2ahUKEwjOybGSiN-AAxX1gv0HHfqSBpMQxfQBegQICBAC&usg=AOvVaw06GdoHyzPbIopUaEuUSQPQ' + url = l.get("link").split("url=")[-1] + set_links.add(url) + + num_after = len(set_links) + + # Finished? + if (num_before == num_after): + logger.debug("Iterated {} pages on GoogleNews general search".format(i)) + break + # To list + list_news = list(set_links) + elif (self.search_category == "news"): + # Search + g.get_news(self.search) + # Fetch + list_news = g.get_links() + + except Exception as e: + logger.warning("Exception fetching {}: {}".format(self.name, str(e))) + list_news = [] + + # Bypass Google links + list_news_redirections = GoogleByPass().bypass_google_urls(list_news) + + return list_news_redirections + +class FetcherDuckDuckGo(FetcherAbstract): + def __init__(self, search, search_category, period, lang="wt", region="wt"): + assert(search_category in ["news", "general"]) + assert(period in ["d", "w", "m", "y"]) + self.search = search + self.search_category = search_category + self.period = period + self.lang_region = "{}-{}".format(lang, region) + self.name = "duckduckgo {} {} {} {}".format(search, search_category, "1{}".format(period), region) + + def _fetch(self): + try: + list_news = [] + with DDGS(timeout=10) as ddgs: + if (self.search_category == "general"): + generator_links = ddgs.text(keywords=self.search, timelimit=self.period, region=self.lang_region) + elif (self.search_category == "news"): + generator_links = ddgs.news(keywords=self.search, timelimit=self.period, region=self.lang_region) + + for l in generator_links: + list_news.append( l.get("url", l.get("href")) ) + + except Exception as e: + logger.warning("Exception fetching {}: {}".format(self.name, str(e))) + list_news = [] + return list_news + + +class FetcherSearxNews(FetcherAbstract): + def __init__(self, search="child abuse", searx_instance="https://serx.ml/", lang="en", region="US", search_category="news", period="day"): + assert(search_category in ["news", "general"]) + assert(period in [None, "day", "week", "month", "year"]) + # Random header (minimize prob of web-scrapping detection) + self.headers = { + 'User-agent': str(np.random.choice(user_agents_list)), + 'Accept-Encoding': 'gzip, deflate', + 'Accept': '*/*', + 'Connection': 'keep-alive', + } + """ # Optional header + self.headers = { + 'User-agent': str(np.random.choice(user_agents_list)), + 'Accept-Encoding': 'gzip, deflate, br', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', + 'Connection': 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + 'TE': 'trailers', + 'Sec-Fetch-Site': 'cross-site', + 'Sec-Fetch-Mode': 'navigate', + 'Sec-Fetch-Dest': 'document', + } + """ + self.search = search + self.searx_instance = searx_instance + self.lang_region = "{}-{}".format(lang, region) + self.search_category = search_category + self.period = period + self.t_sleep_lower, self.t_sleep_higher = 0.5, 1.5 + self.request_timeout = 240 + + period_name_mapping = { + None: "no_date_range", + "day": "1d", + "week": "1w", + "month": "1m", + "year": "1y", + } + self.name = "searxng {} {} {} {} {}".format(searx_instance.replace("https://", "").replace("/", ""), search, search_category, period_name_mapping[period], self.lang_region) + logger.info("SearX - Initialized SearX fetcher: {}".format(self.name)) + + def _request_and_decode(self, url_search): + # Initial random time sleep (minimize chance of 
getting blocked) + time.sleep(random.uniform(self.t_sleep_lower, self.t_sleep_higher)) + # Request + logger.debug("SearX - Searching: {}".format(url_search)) + try: + r = requests.get(url_search, headers=self.headers, timeout=self.request_timeout) + except Exception as e: + logger.warning("SearX - Exception in request: {}. {}".format(url_search, str(e))) + return [] + + if (r.status_code == 200): + # Status code Ok + logger.debug("SearX - Status code: {}".format(r.status_code)) + elif (r.status_code == 429): + # TooManyRequests, "Rate limit exceeded" + logger.warning("SearX {} - Too many requests while running: {}. Request output: {}".format(self.name, r.url, r.text)) + return [] + else: + logger.warning("SearX {} - Status code: {}. Request output: {}".format(self.name, r.status_code, r.text)) + return [] + + # Decode request + soup = BeautifulSoup(r.text, 'html.parser') + page_url_set = set() + # h3 links + for elem in soup.find_all('h3'): + # Get url + url = elem.find('a').get('href') + page_url_set.add(url) + return page_url_set + + def _get_news_list(self): + ############################################################ + # Domain & search parameter + search_domain = os.path.join(self.searx_instance, "search?q=") + # Search keywords + search_formatted = self.search.replace(" ", "+").replace(":", "%3A") + # Period formatted + period_formatted = "&time_range={}".format(self.period) if self.period is not None else "" + # Search parameters + search_parameters = "&category_{}=on&language={}{}".format(self.search_category, self.lang_region, period_formatted) + # Combined url search + url_search_nopage = "{}{}{}".format(search_domain, search_formatted, search_parameters) + ############################################################ + + # Request and decode on page=1 + url_set = self._request_and_decode(url_search_nopage) + # No results? + if (len(url_set) == 0): + logger.warning("SearX {} - Empty results on search: {}".format(self.name, url_search_nopage)) + return [] + + # Iterate pages + search_numpage = 2 + while True: + # Combine url search with page number + url_search_with_page = "{}&pageno={}".format(url_search_nopage, search_numpage) + # Request and decode on page=X + url_set_i = self._request_and_decode(url_search_with_page) + + # Length before merging + length_current = len(url_set) + # Merge + url_set = url_set.union(url_set_i) + # Length after merging + length_merged = len(url_set) + + # No new elements? 
+ if (length_current == length_merged): + logger.debug("SearX {} - Finished processing search, #pages: {}".format(self.name, search_numpage)) + break + # Next page + search_numpage += 1 + + return list(url_set) + + def _fetch(self): + try: + # Fetch news + list_news = self._get_news_list() + except Exception as e: + logger.warning("Exception fetching {}: {}".format(self.name, str(e))) + list_news = [] + return list_news diff --git a/app_urls/api/obsolete_src/google_bypass.py b/app_urls/api/obsolete_src/google_bypass.py new file mode 100644 index 0000000..6e34e72 --- /dev/null +++ b/app_urls/api/obsolete_src/google_bypass.py @@ -0,0 +1,26 @@ +import requests +import json +from .logger import get_logger +logger = get_logger() + +class GoogleByPass(): + def __init__(self) -> None: + pass + + def bypass_google_urls(self, list_urls): + if (len(list_urls) == 0): + return [] + + try: + # Endpoint + gbypass_endpoint = "http://selenium_app:80/get_redirection" + # Timeout: 20 minutes + timeout = 60*20 + r = requests.post(gbypass_endpoint, json={"list_urls": list_urls}, timeout=timeout) + # Decode + list_urls_redirections = json.loads(r.text).get("list_urls_redirections", []) + except Exception as e: + logger.warning("Exception on request: {}. {}".format(gbypass_endpoint, str(e))) + list_urls_redirections = [] + + return list_urls_redirections diff --git a/app_urls/api/obsolete_src/logger.py b/app_urls/api/obsolete_src/logger.py new file mode 100644 index 0000000..83f00b3 --- /dev/null +++ b/app_urls/api/obsolete_src/logger.py @@ -0,0 +1,22 @@ +import logging +import logging.handlers + +import os +os.makedirs("logs", exist_ok=True) + +logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s') +logger = logging.getLogger("news_fetcher") +logger.setLevel(logging.INFO) + +# To file log: INFO / WARNING / ERROR +fh = logging.handlers.RotatingFileHandler(filename="logs/log_app_fetcher.log", mode="a", maxBytes=10000000, backupCount=4) +fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s')) +logger.addHandler(fh) + +# To file log: WARNING / ERROR +fh_ = logging.handlers.RotatingFileHandler(filename="logs/log_app_fetcher_error.log", mode="a", maxBytes=10000000, backupCount=1) +fh_.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s')) +fh_.setLevel(logging.WARNING) +logger.addHandler(fh_) + +def get_logger(): + return logger diff --git a/app_urls/api/obsolete_src/missing_kids_fetch.py b/app_urls/api/obsolete_src/missing_kids_fetch.py new file mode 100644 index 0000000..ea92cb7 --- /dev/null +++ b/app_urls/api/obsolete_src/missing_kids_fetch.py @@ -0,0 +1,36 @@ +from .db_utils import DB_Handler +import requests +import json +from .logger import get_logger +logger = get_logger() + +class MissingKidsFetch(): + def __init__(self, db_handler: DB_Handler, num_pages) -> None: + logger.debug("Initializing News MissingKids") + self.db_handler = db_handler + self.num_pages = num_pages + self.missingkids_fetch_endpoint = "http://selenium_app:80/get_missing_kids/?pages={}" + + def run(self): + try: + logger.debug("Starting NewsMissingKids.run()") + try: + # Timeout + if (self.num_pages > 15): + timeout = 60*90 # 1.5h + else: + timeout = 60*5 # 5 min + # Request + r = requests.get(self.missingkids_fetch_endpoint.format(self.num_pages), timeout=timeout) + # Decode + urls_fetched = json.loads(r.text).get("list_urls", []) + except Exception as e: + logger.warning("Exception on request: {}. {}".format(self.missingkids_fetch_endpoint, str(e))) + urls_fetched = [] + + # URL fetching source + source = "missingkids fetcher" + # Write to DB + self.db_handler.write_batch(urls_fetched, source) + except Exception as e: + logger.warning("Exception in NewsMissingKids.run(): {}".format(str(e))) diff --git a/app_urls/api/obsolete_src/missing_kids_status.py b/app_urls/api/obsolete_src/missing_kids_status.py new file mode 100644 index 0000000..df0768a --- /dev/null +++ b/app_urls/api/obsolete_src/missing_kids_status.py @@ -0,0 +1,98 @@ +from .db_utils import URL_DB_Writer +from .url_utils import get_missing_kid_status +from .logger import get_logger +logger = get_logger() + + +def get_missing_kid_status(url, return_canonical_url=False): + import time + import requests + + # Sleep + time.sleep(0.75) + try: + # Request + r = requests.get(url, timeout=300) + # Decode + status_code = r.status_code + # Canonical URL removing parameters + url_canonical = r.url + except Exception as e: + logger.warning("Exception on get URL status request: {}. {}".format(url, str(e))) + status_code = None + url_canonical = url + + if (status_code == 200): + status = "valid" + elif (status_code == 404): + status = "invalid" + else: + status = "unknown" + + logger.debug("Missing Kid URL {} status: {}".format(url, status)) + if (return_canonical_url): + return status, url_canonical + else: + return status + +class MissingKidsStatus(): + def __init__(self, db_connect_info, redis_connect_info, num_urls) -> None: + self.num_urls = num_urls + self.db_connect_info = db_connect_info + self.redis_connect_info = redis_connect_info + self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info) + + def update_missing_kids_status(self): + try: + logger.info("Starting updating status to Missing Kids URLs, limit #URLs: {}".format(self.num_urls)) + # List of URLs + list_ids_and_urls = self.db_writer._get_missing_kids_urls(self.num_urls) + # Dict: status -> IDs to update to new status + dict_status_ids, dict_status_urls = {}, {} + # Check URLs with invalid status? + skip_invalid_check = False + + flush_every, flush_current = 20, 0 + # Iterate URLs + for (id, url, current_status) in list_ids_and_urls: + # Skip duplicate URLs + if (current_status == "duplicate"): + continue + # Skip invalid URLs? + if (skip_invalid_check): + if (current_status == "invalid"): + continue + + # Get status + new_status = get_missing_kid_status(url) + # Different? Update + if (current_status != new_status): + # Extend array + dict_status_ids[new_status] = dict_status_ids.get(new_status, []) + [id] + # Debugging dict + dict_status_urls[new_status] = dict_status_urls.get(new_status, []) + [url] + # +1 processed + flush_current += 1 + + # Flush batch? 
+ if (flush_every == flush_current): + logger.info("Updating status to Missing Kids URLs: {}".format(dict_status_urls)) + # Update DB + self.db_writer._update_urls_status(dict_status_ids) + # Reset + flush_current = 0 + dict_status_ids, dict_status_urls = {}, {} + + # Flush remaining batch + if (flush_current > 0): + logger.info("Updating status to Missing Kids URLs: {}".format(dict_status_urls)) + # Update DB + self.db_writer._update_urls_status(dict_status_ids) + # Reset + flush_current = 0 + dict_status_ids, dict_status_urls = {}, {} + + logger.info("Finished updating status to Missing Kids URLs") + except Exception as e: + logger.warning("Exception in MissingKidsStatus.run(): {}".format(str(e))) + \ No newline at end of file diff --git a/app_urls/api/obsolete_src/url_status.py b/app_urls/api/obsolete_src/url_status.py new file mode 100644 index 0000000..3948417 --- /dev/null +++ b/app_urls/api/obsolete_src/url_status.py @@ -0,0 +1,62 @@ +from .db_utils import URL_DB_Writer +from .url_utils import process_article +from .logger import get_logger +logger = get_logger() + +class UpdateErrorURLs(): + def __init__(self, db_connect_info, redis_connect_info, num_urls) -> None: + self.num_urls = num_urls + self.db_connect_info = db_connect_info + self.redis_connect_info = redis_connect_info + self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info) + + def update_error_urls_status(self): + try: + logger.info("Starting updating status to URLs with error, limit #URLs: {}".format(self.num_urls)) + # List of URLs with status 'error' + list_ids_and_urls = self.db_writer._get_error_urls(self.num_urls) + # Current status + current_status = "error" + # Dict: status -> IDs to update to new status + dict_status_ids, dict_status_urls = {}, {} + + # Get list of (pattern, priority, status) tuples to override status if required + list_pattern_status_tuple = self.db_writer._get_pattern_status_list() + # Sort pattern tuples by priority + list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True) + + flush_every, flush_current = 20, 0 + # Iterate URLs + for (id, url) in list_ids_and_urls: + # Get status + url_canonical, article_elements, new_status = process_article(url, list_pattern_status_tuple) + # Different? Update + if (current_status != new_status): + # Extend array + dict_status_ids[new_status] = dict_status_ids.get(new_status, []) + [id] + # Debugging dict + dict_status_urls[new_status] = dict_status_urls.get(new_status, []) + [url] + # +1 processed + flush_current += 1 + + # Flush batch? 
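The dict_status_ids.get(new_status, []) + [id] idiom grows one list of IDs per new status; collections.defaultdict(list) expresses the same grouping a little more directly, as in this small sketch (illustrative only, with made-up IDs and statuses):

from collections import defaultdict

# Hypothetical (id, new_status) pairs produced by re-checking error URLs.
changes = [(1, "valid"), (2, "invalid"), (3, "valid"), (4, "unknown")]

dict_status_ids = defaultdict(list)
for url_id, new_status in changes:
    dict_status_ids[new_status].append(url_id)      # status -> [ids], same shape as above

print(dict(dict_status_ids))   # {'valid': [1, 3], 'invalid': [2], 'unknown': [4]}

Either way, the grouped IDs are flushed to the database once flush_every of them have accumulated, which is what the check below does.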
+ if (flush_every == flush_current): + logger.info("Updating status to URLs with error: {}".format(dict_status_urls)) + # Update DB + self.db_writer._update_urls_status(dict_status_ids) + # Reset + flush_current = 0 + dict_status_ids, dict_status_urls = {}, {} + + # Flush remaining batch + if (flush_current > 0): + logger.info("Updating status to URLs with error: {}".format(dict_status_urls)) + # Update DB + self.db_writer._update_urls_status(dict_status_ids) + # Reset + flush_current = 0 + dict_status_ids, dict_status_urls = {}, {} + + logger.info("Finished updating status to URLs with error") + except Exception as e: + logger.warning("Exception in UpdateErrorURLs.run(): {}".format(str(e))) diff --git a/app_urls/api/obsolete_src/url_utils.py b/app_urls/api/obsolete_src/url_utils.py new file mode 100644 index 0000000..ca57cb4 --- /dev/null +++ b/app_urls/api/obsolete_src/url_utils.py @@ -0,0 +1,263 @@ +from gnews import GNews +import dateutil.parser +from datetime import datetime, timedelta +from .utils import remove_http_s +import time +import random +import traceback +import requests +import json +import re +from bs4 import BeautifulSoup + +from .logger import get_logger +logger = get_logger() + +def get_published_date(article): + try: + """ + # Already fetched publish date information? + if (publish_date_ is not None): + return publish_date_ + """ + + # List of potential publish dates + potential_dates = [] + # Publish date is the best match + potential_dates.append(article.publish_date) + # Publish date metadata is the following best match + potential_dates.append(article.meta_data.get('article', {}).get("published_time", None)) + # Iterate remaining keys + for key in article.meta_data.keys(): + if ("date" in key): + potential_dates.append(article.meta_data[key]) + + def invalid_date(p_date): + # Today + 2 days, article from the future? + today_plus_two = datetime.utcnow() + timedelta(days=2) + # Article from the future? + return p_date.timestamp() > today_plus_two.timestamp() + + for date_ in potential_dates: + # String date? parse + if (type(date_) == str): + try: + date_ = dateutil.parser.parse(date_) + except Exception as e: + logger.info("Invalid date found while parsing potential date: {} for URL: {}".format(date_, article.url)) + date_ = None + # Valid? + if (date_ is not None) and (not invalid_date(date_)): + return date_ + + logger.debug("Article with no published date: {}".format(article.url)) + return None + except Exception as e: + logger.info("Error while retrieving published date for URL: {}".format(article.url)) + return None + +def get_url_host(article_source_url, url): + # https://www.blabla.com/blabla -> www.blabla.com + if (article_source_url != ""): + # Article source URL already extracted, save path if any + return remove_http_s(article_source_url) # .split("/")[0] + else: + return remove_http_s(url).split("/")[0] + +def get_status_pattern_matching(url, article_status, list_pattern_status_tuple): + # Regex pattern to update status on "valid", "invalid", and "unknown" status only + # Status "raw", "duplicated" and "error" should remain the way they are + # Assumption: List of patterns sorted by importance + if (article_status in ["valid", "invalid", "unknown"]): + # Regular expression pattern matching: https://regexr.com/ + for regex_pattern, regex_priority, status_if_match in list_pattern_status_tuple: + # Matching? 
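re.match anchors at the start of the string, so a pattern only overrides the status when it matches the URL from its beginning; patterns are tried in descending priority order and the first hit wins. A small self-contained sketch of that behaviour with hypothetical patterns (not taken from the project's database):

import re

# (pattern, priority, status_if_match) - hypothetical examples for illustration.
patterns = [
    (r"https?://(www\.)?example\.com/tag/.*", 5, "invalid"),
    (r"https?://(www\.)?example\.com/.*", 1, "valid"),
]
patterns.sort(key=lambda tup: tup[1], reverse=True)   # highest priority first

def override_status(url, status):
    if status not in ("valid", "invalid", "unknown"):
        return status                                 # raw/duplicate/error stay untouched
    for pattern, _priority, status_if_match in patterns:
        if re.match(pattern, url):                    # first (highest-priority) match wins
            return status_if_match
    return status

print(override_status("https://example.com/tag/politics", "valid"))    # invalid
print(override_status("https://example.com/news/story-1", "unknown"))  # valid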
+ matching = bool(re.match(regex_pattern, url)) + # Update article status + if (matching): + if (status_if_match != article_status): + logger.debug("Regex pattern found, updating status from '{}' to '{}' for URL: {}".format(article_status, status_if_match, url)) + return status_if_match + # Pattern matching not required or not found, original article status + return article_status + + + +def bypass_google_link(article_url): + + def bypass_google_consent(article_url): + # Sample URL: https://consent.google.com/m?continue=https://news.google.com/rss/articles/CBMiMGh0dHBzOi8vd3d3Lm1pc3NpbmdraWRzLm9yZy9wb3N0ZXIvbmNtYy84NjAxMTkvMdIBAA?oc%3D5&gl=NL&m=0&pc=n&cm=2&hl=en-US&src=1 + article_url_no_consent = article_url.replace("https://consent.google.com/m?continue=", "") + + # https://stackoverflow.com/questions/76063646/how-can-i-have-redirection-link-from-google-news-link-using-requests + headers = { + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36' + } + cookies = {'CONSENT': 'YES+cb.20220419-08-p0.cs+FX+111'} + + try: + # Request + r = requests.get(article_url_no_consent, headers=headers, cookies=cookies, timeout=300) + # Decode + soup = BeautifulSoup(r.text, 'html.parser') + url_of_interest = soup.a['href'] + except Exception as e: + logger.warning("Exception on request trying to G_bypass with headers: {}. {}".format(article_url_no_consent, str(e))) + url_of_interest = None + + # Not able to bypass? + if (url_of_interest == "") or ("support.google.com" in url_of_interest) or ("news.google.com" in url_of_interest): + url_of_interest = None + return url_of_interest + + def bypass_google_using_service(article_url): + try: + # e.g.: url = "https://news.google.com/articles/CBMiX2h0dHBzOi8vd3d3LmZveGJ1c2luZXNzLmNvbS9wb2xpdGljcy9kaXNuZXktc3Vlcy1mbG9yaWRhLWdvdi1yb24tZGVzYW50aXMtbG9zcy1zcGVjaWFsLWRpc3RyaWN00gEA?hl=en-US&gl=US&ceid=US%3Aen" + gbypass_endpoint = "http://selenium_app:80/get_redirection" + # Timeout: 5 minutes + r = requests.post(gbypass_endpoint, json={"url": article_url}, timeout=300) + # Decode + redirect_url = json.loads(r.text).get("redirect_url", "") + except Exception as e: + logger.warning("Exception on request: {}. {}".format(gbypass_endpoint, str(e))) + redirect_url = "" + + return redirect_url + + logger.debug("Starting gbypass_endpoint()") + + article_url_bypassed = None + # Bypass using request + if ("consent.google.com" in article_url): + article_url_bypassed = bypass_google_consent(article_url) + # Not bypassed yet? 
Bypass using service + if (article_url_bypassed is None): + article_url_bypassed = bypass_google_using_service(article_url) + + # if (article_url_bypassed is None) or (article_url_bypassed == "") or ("news.google.com" in article_url_bypassed): + if (article_url_bypassed == "") or (article_url_bypassed is None): + # Empty URL returned by Gbypass + logger.warning("Error while bypassing Gnews for URL: {}".format(article_url)) + return None + else: + logger.debug("Correctly bypassed GNews to URL_redirect, from URL: {} {}".format(article_url_bypassed, article_url)) + return article_url_bypassed + +def process_article(article_url, list_pattern_status_tuple, language="en"): + # TODO: + """ + https://github.com/fhamborg/news-please + https://github.com/fhamborg/Giveme5W1HQwer123$ + + https://github.com/santhoshse7en/news-fetch + """ + try: + logger.debug("Starting process_article()") + + if ("news.google.com" in article_url) or ("consent.google.com" in article_url): + # Bypass to get redirection + article_url = bypass_google_link(article_url) + # Error? + if (article_url is None): + return None, {}, "error" + elif ("missingkids.org/poster" in article_url): + # Get status + article_status, url_canonical = get_missing_kid_status(article_url, return_canonical_url=True) + article_elements = { + "url_full": article_url, + "url_canonical": url_canonical + } + return url_canonical, article_elements, article_status + else: + # Avoid Too many requests (feeds, ...) + time.sleep(0.75) + + logger.debug("Processing: {}".format(article_url)) + + # Default status unless something happens + article_status = "valid" + + # Parse article + # TODO: :param proxy: The proxy parameter is a dictionary with a single key-value pair. self._proxy = {'http': proxy, 'https': proxy} if proxy else None + # TODO: Language per config + article = GNews(language).get_full_article(url=article_url) + + # Article parsed? + if (article is None) or (not article.is_parsed): + logger.debug("Article not parsed: {}".format(article_url)) + return article_url, {}, "error" + + # Canonical link as main URL + url_canonical = article.canonical_link + # Empty canonical URL? + if (article.canonical_link is None) or (article.canonical_link == ""): + # URL with parameters? e.g. some zerohedge news fetched from newspaper3k end with #comment-stream -> Remove extra parameter in link + if ("?" in article.url) or (article.url.endswith("#comment-stream")) or (article.url.endswith("#disqus_thread")): + logger.debug("Article URL contains parameters, trying to clean URL: {}".format(article.url)) + try: + # Remove text after parameter call + url = article.url.split("?")[0] + # Remove comment-stream + url = url.replace("#comment-stream", "").replace("#disqus_thread", "") + # Article + article_attempt = GNews(language).get_full_article(url=url) + # Retrieving same title? Update article based on clean URL + if (article_attempt is not None) and (article_attempt.title == article.title): + article = article_attempt + except Exception as e: + logger.info("Article parsing of URL without parameters failed: {}".format(article.url)) + else: # Default behaviour + logger.debug("Article canonical link is empty, assuming URL=URL_CANONICAL: {}".format(article.url)) + + # By default, URL same as canonical + url_canonical = article.url + + elif (article.url != article.canonical_link): + # If different, stick to canonical URL + logger.debug("Article URL and canonical link are different: {} {}".format(article.url, article.canonical_link)) + else: + # If same, continue... 
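Earlier in this branch, URLs carrying query strings or comment anchors (e.g. #comment-stream) are cleaned by hand before retrying the parse; urllib.parse can express the same normalisation a little more robustly. A small illustrative sketch, not the diff's own behaviour:

from urllib.parse import urlsplit, urlunsplit

def strip_url_extras(url):
    """Drop the query string and fragment, keeping scheme, host and path."""
    scheme, netloc, path, _query, _fragment = urlsplit(url)
    return urlunsplit((scheme, netloc, path, "", ""))

print(strip_url_extras(
    "https://www.zerohedge.com/markets/some-story?utm_source=feed#comment-stream"
))   # https://www.zerohedge.com/markets/some-story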
+ pass + + # Update config to determine if content is valid + article.config.MIN_WORD_COUNT = 150 + article.config.MIN_SENT_COUNT = 6 + + # Valid URL? + if (not article.is_valid_url()): + logger.debug("Not a valid news article: {}".format(url_canonical)) + article_status = "invalid" + # Is the article's body text is long enough to meet standard article requirements? + if (not article.is_valid_body()): + logger.debug("Article body not valid: {}".format(url_canonical)) + article_status = "unknown" + + if (article.images != article.imgs): + logger.debug("Article images and imgs are different: {} {}".format(article.images, article.imgs)) + + # article.keywords, article.meta_keywords, article.summary + # article.movies + # article.top_image + + # Check if article status needs to be updated + article_status = get_status_pattern_matching(url_canonical, article_status, list_pattern_status_tuple) + + article_elements = { + 'url_full': article.url, # https://www.breitbart.com/tech/2022/10/03/report-election-integrity-project-worked-with-feds-to-censor-news-sites-in-2020/ + 'url_host': get_url_host(article.source_url, url_canonical), # www.breitbart.com + 'title': article.title, # Report: ‘Election Integrity’ Partnership Worked with Feds to Censor News Sites in 2020 + 'description': article.meta_description, # Coalition committed to respond in ‘early 2022’ but failed to do so, while Labor has not issued a full response since taking office + 'text': article.text, # ${Article content} + 'published_date': get_published_date(article), # python.datetime format, obtained from "YYYY-MM-DD" or '2022-10-03T20:54:17+00:00' + 'authors': article.authors, # ['Christopher Knaus'] + 'language': article.meta_lang, # en + 'tags': list(article.tags), # ['Wide Open Border', '’My Son Hunter’ Movie', ...] + 'images': list(article.images), # [URL_IMAGE_1, URL_IMAGE_2, ...] 
+ 'url_canonical': url_canonical, # Canonical URL (redirection) + # 'html': article.html, # HTML article + } + logger.debug("Processing OK: {}".format(url_canonical)) + return url_canonical, article_elements, article_status + except Exception as e: + logger.warning("Exception processing url: {}\n{}".format(article_url, traceback.format_exc())) + return None, {}, "error" \ No newline at end of file diff --git a/app_urls/api/obsolete_src/utils.py b/app_urls/api/obsolete_src/utils.py new file mode 100644 index 0000000..76ae07a --- /dev/null +++ b/app_urls/api/obsolete_src/utils.py @@ -0,0 +1,33 @@ + +def remove_http_s(url): + url = url.replace("https://", "") if url.startswith("https://") else url + url = url.replace("http://", "") if url.startswith("http://") else url + return url + +def is_valid_url(url): + if (url.startswith("https://")): + return True + else: + return False + +def get_searxng_instances(): + # SearxNG instances: https://searx.space/ + searx_instances = set() + searx_instances.add("https://searx.work/") + searx_instances.add("https://search.ononoki.org/") + searx_instances.add("https://searxng.nicfab.eu/") + searx_instances.add("https://searx.be/") + + # searx_instances.add("https://searx.fmac.xyz/") + # searx_instances.add("https://northboot.xyz/") # FIX + + # searx_instances.add("https://serx.ml/") # Offline + # searx_instances.add("https://searx.ru/") + # searx_instances.add("https://searx.sp-codes.de/") + # searx_instances.add("https://searxng.nicfab.eu/") + # searx_instances.add("https://s.frlt.one/") + # searx_instances.add("https://search.sapti.me/") + + # To list + list_searx_instances = list(searx_instances) + return list_searx_instances \ No newline at end of file diff --git a/app_urls/api/src/db_utils.py b/app_urls/api/src/db_utils.py new file mode 100644 index 0000000..76415ed --- /dev/null +++ b/app_urls/api/src/db_utils.py @@ -0,0 +1,190 @@ +from ..models import Urls, UrlContent, UrlsSource, Source, WebsiteToFilter, StatusPatternMatching +from .url_processor import process_url +from django.utils import timezone +from django.core.cache import cache +import hashlib +from datetime import timedelta +import re +import traceback +from .logger import get_logger +logger = get_logger() + +class DB_Handler(): + def __init__(self): + logger.debug("Initializing URL DB Handler") + + def _get_safe_cache_key(self, raw_key): + """Generate a safe cache key using an MD5 hash""" + return hashlib.md5(raw_key.encode()).hexdigest() + + def _cache_key(self, cache_key, cache_timeout=86400): + cache.set(self._get_safe_cache_key(cache_key), True, timeout=cache_timeout) + + def _is_cached_key(self, cache_key): + # Returns True if cached + return cache.get(self._get_safe_cache_key(cache_key)) is not None + + def insert_raw_urls(self, urls, source): + try: + logger.debug("Inserting raw URLs") + # Empty? + if (len(urls) == 0): + logger.debug("Empty batch of urls (not writing to DB) for source: {}".format(source)) + return + + url_object_to_insert = [] + # Per URL + for url in urls: + ### Already processed URL? 
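Deduplication relies on Django's cache framework (Redis-backed): every seen URL, and every (source, URL) pair, is stored under an MD5-hashed key with a 24-hour timeout, so already-processed URLs can be skipped without a database round-trip. A minimal standalone sketch of the same keying scheme, with a plain dict standing in for django.core.cache.cache (illustrative only):

import hashlib

def safe_cache_key(raw_key):
    """Hash arbitrary text (long URLs, spaces) into a fixed-length, cache-safe key."""
    return hashlib.md5(raw_key.encode()).hexdigest()

seen = {}   # stands in for django.core.cache.cache in this sketch

def mark_seen(raw_key):
    seen[safe_cache_key(raw_key)] = True         # cache.set(key, True, timeout=86400)

def is_seen(raw_key):
    return safe_cache_key(raw_key) in seen       # cache.get(key) is not None

url = "https://www.example.com/news/some-long-article-url"
mark_seen(url)                                    # URL itself
mark_seen("feed https://example.com/rss" + url)   # (source, URL) pair
print(is_seen(url), is_seen("https://www.example.com/other"))   # True False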
+ if (self._is_cached_key(url)): + logger.debug("Already cached URL: {}".format(url)) + + if (self._is_cached_key("{}{}".format(source, url))): + logger.debug("Already cached (source, URL): {} {}".format(source, url)) + else: + ### Insert source + # Get the source (create if not exists) + source_obj, created = Source.objects.get_or_create(source=source) + # Get URL ID + url_obj = Urls.objects.get(url=url) + # Create (id_source, id_url) + UrlsSource.objects.create(id_source=source_obj.id, id_url=url_obj.id) + else: + # Add object to insert + url_object_to_insert.append(Urls(url=url)) + + ### Bulk insert URLs, ignore conflicts if a url exists + bulk_created_obj = Urls.objects.bulk_create(url_object_to_insert, ignore_conflicts=True) + # Insert or update cache + for url in urls: + self._cache_key(url) + self._cache_key("{}{}".format(source, url)) + + logger.info("Inserted #{} raw URLs".format(len(url_object_to_insert))) + + except Exception as e: + logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc())) + + def _get_status_pattern_matching(self, url, article_status, list_pattern_status_tuple): + # Sort pattern tuples by priority. (pattern, priority, status) + list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True) + + # Regex pattern to update status on "valid", "invalid", and "unknown" status only + # Status "raw", "duplicated" and "error" should remain the way they are + # Assumption: List of patterns sorted by importance + if (article_status in ["valid", "invalid", "unknown"]): + # Regular expression pattern matching: https://regexr.com/ + for regex_pattern, regex_priority, status_if_match in list_pattern_status_tuple: + # Matching? Update article status + if bool(re.match(regex_pattern, url)): + if (status_if_match != article_status): + logger.debug("Regex pattern found, updating status from '{}' to '{}' for URL: {}".format(article_status, status_if_match, url)) + return status_if_match + # Pattern matching not required or not found, original article status + return article_status + + def process_raw_urls(self, time_delta=timedelta(days=1), batch_size=50): + try: + logger.debug("Processing raw URLs") + + # Get list of domains to filter + list_domains_to_filter = WebsiteToFilter.objects.values_list('url_host', flat=True) + # Get list of (pattern, priority, status) tuples to override status if required + list_pattern_status_tuple = list(StatusPatternMatching.objects.values_list("pattern", "priority", "status")) + + # Fetched during last 24 hours + time_delta_ts = timezone.now() - time_delta + # Get batch of URLs, status='raw' and fetched X days ago + raw_urls = Urls.objects.filter(status='raw', ts_fetch__gte=time_delta_ts)[:batch_size] + # List of objects to bulk update + updating_urls = [] + + # Per URL + for obj_url in raw_urls: + + ##### Any domain to filter included in URL? 
-> Invalid + if (any([d in obj_url.url for d in list_domains_to_filter])): + logger.debug("Domain filter applied to input URL: {}".format(obj_url.url)) + # Update status + obj_url.status = 'invalid' + # Append to bulk update + updating_urls.append(obj_url) + # Next URL + continue + + ##### Process URL + try: + # Get data + dict_url_data = process_url(obj_url.url) + # Not none or handle as exception + assert(dict_url_data is not None) + except Exception as e: + logger.debug("Error processing URL: {}\n{}".format(obj_url.url, str(e))) + # Update status + obj_url.status = 'error' + # Append to bulk update + updating_urls.append(obj_url) + # Next URL + continue + + ##### Canonical URL different? -> Duplicate + if (dict_url_data.get("url") != dict_url_data.get("url_canonical")): + # Update status + obj_url.status = 'duplicate' + # Append to bulk update + updating_urls.append(obj_url) + + # Get or create URL with canonical form + obj_url_canonical = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical")) + # Associate same sources to url -> url_canonical + + # Get the sources id associated to obj_url.id + url_sources = UrlsSource.objects.filter(id_url=obj_url.id) + for url_source_obj in url_sources: + # Associate same sources to url_canonical (it might already exist) + UrlsSource.objects.get_or_create(id_source=url_source_obj.id_source, id_url=obj_url_canonical.id) + # Next URL + continue + + ##### Valid URL + # Update status + obj_url.status = 'valid' + # Append to bulk update + updating_urls.append(obj_url) + # Create extracted URL data + UrlContent.objects.create_or_update( + id_url=obj_url.id, + date_published=dict_url_data.get("publish_date"), + title=dict_url_data.get("title"), + description=dict_url_data.get("description"), + content=dict_url_data.get("content"), + valid_content=dict_url_data.get("valid_content"), + language=dict_url_data.get("language"), + keywords=dict_url_data.get("keywords"), + tags=dict_url_data.get("tags"), + authors=dict_url_data.get("authors"), + image_main=dict_url_data.get("image_main"), + images_url=dict_url_data.get("images_url"), + videos_url=dict_url_data.get("videos_url"), + url_host=dict_url_data.get("url_host"), + site_name=dict_url_data.get("site_name"), + ) + + + ##### Override status if pattern matching? + for obj_url in updating_urls: + # Check if article status needs to be updated with pattern matching + status_pattern_matching = self._get_status_pattern_matching(obj_url.url, obj_url.status, list_pattern_status_tuple) + # Update status? 
+ if (status_pattern_matching != obj_url.status): + logger.debug("Pattern matching, overriding with status {} for URL: {}".format(status_pattern_matching, obj_url.url)) + # Update, no need to append to updating_urls, already included + obj_url.status = status_pattern_matching + + # Bulk update + Urls.objects.bulk_update(updating_urls, ['status']) + + logger.debug("Finished processing raw URLs") + except Exception as e: + logger.warning("Exception processing raw URLs: {}\n{}".format(e, traceback.format_exc())) + diff --git a/app_urls/api/src/fetch_feed.py b/app_urls/api/src/fetch_feed.py new file mode 100644 index 0000000..a0e98a0 --- /dev/null +++ b/app_urls/api/src/fetch_feed.py @@ -0,0 +1,50 @@ +from .db_utils import DB_Handler +from ..models import Feed +import feedparser +import dateutil +import traceback +from .logger import get_logger +logger = get_logger() + +class FetchFeeds(): + def __init__(self) -> None: + logger.debug("Initializing News feed") + + def run(self): + try: + logger.debug("Starting NewsFeed.run()") + + # Get feeds + list_url_feeds = list(Feed.objects.values_list('rss_feed', flat=True)) + logger.debug("Fetching news from feeds: {}".format(list_url_feeds)) + + # Process via RSS feeds + for url_feed in list_url_feeds: + # Initialize + urls_fetched, urls_publish_date = [], [] + # Fetch feeds + feeds = feedparser.parse(url_feed) + # Parse + for f in feeds.get("entries", []): + # Get URL + url = f.get("link", None) + # Process? + if (url is not None): + # Available publish date? + publish_date_parsed = f.get("published_parsed") + if (publish_date_parsed is None): + publish_date = f.get("published", None) + if (publish_date is not None): + publish_date_parsed = dateutil.parser.parse(publish_date) + + # Published date + urls_publish_date.append(publish_date_parsed) + # URL + urls_fetched.append(url) + + # URL fetching source + source = "feed {}".format(url_feed) + # Write to DB + DB_Handler().insert_raw_urls(urls_fetched, source) + except Exception as e: + logger.warning("Exception in NewsFeed.run(): {}\n{}".format(e, traceback.format_exc())) diff --git a/app_urls/api/src/logger.py b/app_urls/api/src/logger.py new file mode 100644 index 0000000..c2cae1d --- /dev/null +++ b/app_urls/api/src/logger.py @@ -0,0 +1,22 @@ +import logging + +import os +os.makedirs("logs", exist_ok=True) + +logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s') +logger = logging.getLogger("news_fetcher") +logger.setLevel(logging.DEBUG) + +# To file log: INFO / WARNING / ERROR +fh = logging.handlers.RotatingFileHandler(filename="logs/log_app_fetcher.log", mode="a", maxBytes=10000000, backupCount=4) +fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s')) +logger.addHandler(fh) + +# To file log: WARNING / ERROR +fh_ = logging.handlers.RotatingFileHandler(filename="logs/log_app_fetcher_error.log", mode="a", maxBytes=10000000, backupCount=1) +fh_.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s')) +fh_.setLevel(logging.WARNING) +logger.addHandler(fh_) + +def get_logger(): + return logger diff --git a/app_urls/api/src/url_processor.py b/app_urls/api/src/url_processor.py new file mode 100644 index 0000000..bebf948 --- /dev/null +++ b/app_urls/api/src/url_processor.py @@ -0,0 +1,60 @@ +from .logger import get_logger +logger = get_logger() +import newspaper +# pip install langdetect +#import langdetect +#langdetect.DetectorFactory.seed = 0 + +def process_url(url): + try: + # Process + article = newspaper.article(url) + except 
newspaper.ArticleException as e: + logger.warning("ArticleException for input URL {}\n{}".format(url, str(e))) + return None + except Exception as e: + logger.warning("Exception for input URL {}\n{}".format(url, str(e))) + return None + + dict_data = { + "url": url, + "url_canonical": article.canonical_link, + "url_host": article.source_url, + "site_name": article.meta_site_name, + "publish_date": article.publish_date, + "language": article.meta_lang, # langdetect.detect(article.text) + "title": article.title, + "description": article.meta_description, + "content": article.text, + "valid_content": article.is_valid_body(), + "keywords": [k for k in set(article.keywords + article.meta_keywords) if k!=""], + "tags": article.tags, + "authors": article.authors, + "image_main": article.top_image, # article.meta_img + "images": article.images, + "videos": article.videos, + } + + ''' + # TODO: If exists, add tags article.meta_data.get("classification-tags", "").split(",") + if (dict_data["tags"] is None): + dict_data["tags"] = [] + for k in article.meta_data.keys(): + if ("tags" in k): + dict_data["tags"] += article.meta_data[k].split(",") + ''' + + # Sanity check + for k in dict_data.keys(): + if (type(k) is list): + # Remove empty string + dict_data[k] = [ e for e in dict_data[k] if e != "" ] + # NULL instead of empty list + if (len(dict_data[k]) == 0): + dict_data[k] = None + else: + # NULL instead of empty string + if (dict_data[k] == ""): + dict_data[k] = None + + return dict_data \ No newline at end of file diff --git a/app_urls/api/tasks.py b/app_urls/api/tasks.py index 0ae27de..4248caa 100644 --- a/app_urls/api/tasks.py +++ b/app_urls/api/tasks.py @@ -1,13 +1,65 @@ from django_rq import job -import time + +from .src.fetch_feed import FetchFeeds +from .src.db_utils import DB_Handler +''' +from src.fetch_parser import FetchParser +from src.fetch_search import FetchSearcher +from src.missing_kids_fetch import MissingKidsFetch +from src.missing_kids_status import MissingKidsStatus +from src.url_status import UpdateErrorURLs +from src.db_utils import DB_Handler +from src.credentials import db_connect_info, redis_connect_info + +# DB Handler +db_handler = DB_Handler(db_connect_info, redis_connect_info) +''' + import logging logger = logging.getLogger(__name__) + @job -def task_1(message): - logger.info("Message: {}".format(message)) +def background_task(process_type: str): + logger.info("Task triggered: {}".format(process_type)) + try: - time.sleep(5) # Simulate a long-running task - print(f"Task completed: {message}") + FetchFeeds().run() + + # DB_Handler().process_raw_urls() + + + ''' + if (process_type == "fetch_feeds"): + FetchFeeds(db_handler).run() + + elif (process_type == "fetch_parser"): + FetchParser(db_handler).run() + + elif (process_type == "search") or (process_type == "search_full"): + FetchSearcher(cred.db_connect_info, cred.redis_connect_info, full=True).run() + elif (process_type == "search_reduced"): + FetchSearcher(cred.db_connect_info, cred.redis_connect_info, full=False).run() + + # Selenium based + elif (process_type == "fetch_missing_kids_reduced"): + MissingKidsFetch(db_handler, num_pages=4).run() + elif (process_type == "fetch_missing_kids_full"): + MissingKidsFetch(db_handler, num_pages=100000).run() + + elif (process_type == "update_missing_kids_status_reduced"): + MissingKidsStatus(cred.db_connect_info, cred.redis_connect_info, num_urls=50).update_missing_kids_status() + elif (process_type == "update_missing_kids_status_full"): + 
MissingKidsStatus(cred.db_connect_info, cred.redis_connect_info, num_urls=None).update_missing_kids_status() + + elif (process_type == "update_error_urls"): + UpdateErrorURLs(cred.db_connect_info, cred.redis_connect_info, num_urls=100).update_error_urls_status() + + else: + logger.error("Task error, unknown type: {}".format(process_type)) + return + ''' + + logger.info("Task completed: {}".format(process_type)) except Exception as e: logger.error(e) diff --git a/app_urls/api/urls.py b/app_urls/api/urls.py index ed574fd..e78f4ba 100644 --- a/app_urls/api/urls.py +++ b/app_urls/api/urls.py @@ -2,5 +2,5 @@ from django.urls import path from .views import trigger_task urlpatterns = [ - path('trigger_task/', trigger_task, name='trigger_task') + path('fetch', trigger_task, name='trigger_task') ] diff --git a/app_urls/api/views.py b/app_urls/api/views.py index aa1d802..a72201d 100644 --- a/app_urls/api/views.py +++ b/app_urls/api/views.py @@ -1,10 +1,10 @@ import django_rq from django.http import JsonResponse -from .tasks import task_1 +from .tasks import background_task def trigger_task(request): """View that enqueues a task.""" queue = django_rq.get_queue('default') # Get the default queue - job = queue.enqueue(task_1, "Hello from Django RQ!") + job = queue.enqueue(background_task, "Hello from Django RQ!") return JsonResponse({"message": "Task has been enqueued!", "job_id": job.id}) diff --git a/app_urls/core/settings.py b/app_urls/core/settings.py index 7e96cb7..e5eed47 100644 --- a/app_urls/core/settings.py +++ b/app_urls/core/settings.py @@ -91,6 +91,16 @@ DATABASES = { } } +CACHES = { + "default": { + "BACKEND": "django.core.cache.backends.redis.RedisCache", + "LOCATION": "redis://{}:{}".format( + os.environ.get("REDIS_HOST", "localhost"), + os.environ.get("REDIS_PORT", 6379) + ), + } +} + RQ_QUEUES = { 'default': { 'HOST': os.environ.get("REDIS_HOST", "localhost"),