diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6fa80cc --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +__pycache__/ +*.pyc +**/credentials.py diff --git a/1-DB.ipynb b/1-DB.ipynb index bc72ed1..efc3e13 100644 --- a/1-DB.ipynb +++ b/1-DB.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -11,16 +11,40 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "db_postgres\n", + "\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 0/0\n", + " ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n", + "\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 1/1\u001b[0m\n", + " \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.2s \u001b[0m\n", + "\u001b[?25h" + ] + } + ], + "source": [ + "!docker rm -f db_postgres; docker compose -f docker/docker-compose.yml up -d ; sleep 10" + ] + }, + { + "cell_type": "code", + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ - "INSERT_TABLES = False\n", - "INSERT_SAMPLE_DATA = False\n", + "INSERT_TABLES = True\n", + "INSERT_SAMPLE_DATA = True\n", "\n", "import psycopg\n", "connection_info = \"host={} port={} user={} password={} dbname={}\".format(\"localhost\", \"5432\", \"supermatitos\", \"supermatitos\", \"matitos\")\n", "\n", + "from datetime import datetime, timezone\n", + "\n", "\n", "if INSERT_TABLES:\n", " # Connect to an existing database\n", @@ -87,14 +111,14 @@ " \n", " \n", " CREATE TABLE URL_CONTENT (\n", - " id_url INTEGER REFERENCES URLS(id),\n", - " date_published TIMESTAMPTZ NOT NULL DEFAULT NOW(),\n", + " id_url INTEGER PRIMARY KEY REFERENCES URLS(id),\n", + " date_published TIMESTAMPTZ DEFAULT NOW(),\n", " title TEXT,\n", " description TEXT,\n", " content TEXT,\n", " tags TEXT[],\n", " authors TEXT[],\n", - " image_urls TEXT[],\n", + " image_urls TEXT[]\n", " );\n", " CREATE INDEX idx_tags ON URL_CONTENT USING GIN(tags);\n", " CREATE INDEX idx_authors ON URL_CONTENT USING GIN(authors);\n", @@ -119,7 +143,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -137,7 +161,6 @@ " cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.dw.com/en/trauma-how-child-abuse-victims-deal-with-parenthood/a-71833895', 'valid')\")\n", " cur.execute(\"INSERT INTO URLS (url, status) values ('https://nypost.com/2025/03/06/us-news/colorado-day-care-worker-hit-with-51-charges-of-child-abuse-harassment-for-slapping-toddler/', 'valid')\")\n", " cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.fox35orlando.com/news/tavares-police-florida-boys-10-9-abused-sheer-brutality', 'valid')\")\n", - " # Invalid\n", " cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.google.com', 'invalid')\")\n", "\n", " cur.execute(\"INSERT INTO SOURCE (source) values ('news.google.com')\")\n", @@ -162,14 +185,157 @@ " \n", " # Long URLs \n", " cur.execute(\"INSERT INTO URLS (url, status) values ('www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf_23dj9sdgj9sdgj8sdf8ds8f.html', 'invalid')\".format(j))\n", - " cur.execute(\"INSERT INTO URLS (url, status) values ('www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf.html', 'invalid')\".format(j))" + " cur.execute(\"INSERT INTO URLS (url, status) values 
('www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf.html', 'invalid')\".format(j))\n", + "\n", + " # URL Content\n", + " content = \"Bla Bla Bla!!!\"*25\n", + " cur.execute(\"INSERT INTO URL_CONTENT (id_url, date_published, title, description, content, tags, authors, image_urls) values (%s, %s, 'Mommy blogger turned child abuser', %s, 'Hello there!', %s, %s, %s)\", (1, datetime.now(tz=timezone.utc), content, [\"child abuse\", \"social media\"], [\"Audrey Conklin\"], [\"https://a57.foxnews.com/static.foxnews.com/foxnews.com/content/uploads/2023/08/1440/810/image-58.jpg?ve=1&tl=1\"]))" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\t urls\n", + "[(1,\n", + " 'https://www.foxnews.com/us/husband-ruby-franke-utah-mommy-blogger-convicted-child-abuse-regrets-wifes-fall-fame',\n", + " datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", + " 'valid'),\n", + " (2,\n", + " 'https://www.bbc.com/news/articles/ckg843y8y7no',\n", + " datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", + " 'valid'),\n", + " (3,\n", + " 'https://www.wilx.com/2025/03/05/lenawee-county-man-arrested-possessing-child-abuse-material/',\n", + " datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", + " 'valid'),\n", + " (4,\n", + " 'https://www.dw.com/en/trauma-how-child-abuse-victims-deal-with-parenthood/a-71833895',\n", + " datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", + " 'valid'),\n", + " (5,\n", + " 'https://nypost.com/2025/03/06/us-news/colorado-day-care-worker-hit-with-51-charges-of-child-abuse-harassment-for-slapping-toddler/',\n", + " datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", + " 'valid'),\n", + " (6,\n", + " 'https://www.fox35orlando.com/news/tavares-police-florida-boys-10-9-abused-sheer-brutality',\n", + " datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", + " 'valid'),\n", + " (7,\n", + " 'https://www.google.com',\n", + " datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", + " 'invalid'),\n", + " (8,\n", + " 'www.super_0.org',\n", + " datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", + " 'invalid'),\n", + " (9,\n", + " 'www.super_1.org',\n", + " datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", + " 'invalid'),\n", + " (10,\n", + " 'www.super_2.org',\n", + " datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", + " 'invalid'),\n", + " (11,\n", + " 'www.super_3.org',\n", + " datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", + " 'invalid'),\n", + " (12,\n", + " 'www.super_4.org',\n", + " datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", + " 'invalid'),\n", + " (13,\n", + " 'www.super_5.org',\n", + " datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", + " 'invalid'),\n", + " (14,\n", + " 'www.super_6.org',\n", + " datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", + " 'invalid'),\n", + " (15,\n", + " 'www.super_7.org',\n", + " 
datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", + " 'invalid'),\n", + " (16,\n", + " 'www.super_8.org',\n", + " datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", + " 'invalid'),\n", + " (17,\n", + " 'www.super_9.org',\n", + " datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", + " 'invalid'),\n", + " (18,\n", + " 'www.super_10.org',\n", + " datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", + " 'invalid'),\n", + " (19,\n", + " 'www.super_11.org',\n", + " datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", + " 'invalid'),\n", + " (20,\n", + " 'www.super_12.org',\n", + " datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", + " 'invalid'),\n", + " (21,\n", + " 'www.super_13.org',\n", + " datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", + " 'invalid'),\n", + " (22,\n", + " 'www.super_14.org',\n", + " datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", + " 'invalid'),\n", + " (23,\n", + " 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf_23dj9sdgj9sdgj8sdf8ds8f.html',\n", + " datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", + " 'invalid'),\n", + " (24,\n", + " 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf.html',\n", + " datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", + " 'invalid')]\n", + "\t urls_duplicate\n", + "[]\n", + "\t feed\n", + "[(1,\n", + " 'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC')]\n", + "\t website_of_interest\n", + "[(1, 'www.unicef.org')]\n", + "\t search\n", + "[(1, 'child abuse')]\n", + "\t urls_source\n", + "[(1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (1, 2), (2, 2), (3, 2)]\n", + "\t source\n", + "[(1, 'news.google.com'), (2, 'qwant.com')]\n", + "\t website_to_filter\n", + "[(1, 'yewtu.be'),\n", + " (2, 'twitter.com'),\n", + " (3, 'libreddit.de'),\n", + " (4, 'youtube.com'),\n", + " (5, 'tiktok.com'),\n", + " (6, 'radio.foxnews.com')]\n", + "\t status_pattern_matching\n", + "[('.*missingkids.org/poster/.*', 50, 'valid')]\n", + "\t url_content\n", + "[(1,\n", + " datetime.datetime(2025, 3, 6, 23, 4, 37, 654130, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", + " 'Mommy blogger turned child abuser',\n", + " 'Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla '\n", + " 'Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla '\n", + " 'Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla '\n", + " 'Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla '\n", + " 'Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!',\n", + " 'Hello there!',\n", + " ['child abuse', 'social media'],\n", + " ['Audrey Conklin'],\n", + " ['https://a57.foxnews.com/static.foxnews.com/foxnews.com/content/uploads/2023/08/1440/810/image-58.jpg?ve=1&tl=1'])]\n" + ] + } + ], "source": [ "from pprint import pprint\n", "\n", @@ -188,8 +354,22 @@ } ], "metadata": { + "kernelspec": { + "display_name": "matitos", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + 
"file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" } }, "nbformat": 4, diff --git a/README.md b/README.md index 6b7f3e9..f1764f3 100644 --- a/README.md +++ b/README.md @@ -7,9 +7,12 @@ pip install ipykernel django requests ollama psycopg[binary] # openai # Development -* web_app +* app_web ``` + # 1) Change models.py +python manage.py inspectdb + # 2) python manage.py makemigrations # 3) @@ -23,7 +26,7 @@ python manage.py migrate --fake-initial python manage.py createsuperuser ``` -* Image generation +* app_img_gen ``` docker build -t image_generation . docker run --rm -it -p 12343:80 image_generation diff --git a/app_fetcher/Dockerfile b/app_fetcher/Dockerfile new file mode 100644 index 0000000..be3153e --- /dev/null +++ b/app_fetcher/Dockerfile @@ -0,0 +1,16 @@ +FROM continuumio/miniconda3:23.10.0-1 + +# App repository +COPY . /opt/app/ + +RUN conda install -c conda-forge curl +RUN pip install --no-cache-dir --upgrade "psycopg[binary]" git+https://github.com/ranahaani/GNews.git GoogleNews duckduckgo_search newspaper3k numpy beautifulsoup4 requests feedparser pytz redis fastapi uvicorn fastapi-utils lxml[html_clean] +RUN pip freeze +# GoogleNews-1.6.10 Pillow-10.1.0 PyYAML-6.0.1 aiofiles-23.2.1 anyio-3.7.1 beautifulsoup4-4.9.3 bs4-0.0.1 click-8.1.7 cssselect-1.2.0 dateparser-1.2.0 dnspython-1.16.0 duckduckgo_search-3.9.8 fastapi-0.104.1 fastapi-utils-0.2.1 feedfinder2-0.0.4 feedparser-6.0.10 filelock-3.13.1 gnews-0.3.6 greenlet-3.0.1 h11-0.14.0 h2-4.1.0 hpack-4.0.0 httpcore-1.0.2 httpx-0.25.2 hyperframe-6.0.1 jieba3k-0.35.1 joblib-1.3.2 lxml-4.9.3 newspaper3k-0.2.8 nltk-3.8.1 numpy-1.26.2 psycopg-3.1.13 psycopg-binary-3.1.13 pydantic-1.10.13 pymongo-3.12.3 python-dateutil-2.8.2 python-dotenv-0.19.2 pytz-2023.3.post1 redis-5.0.1 regex-2023.10.3 requests-2.26.0 requests-file-1.5.1 sgmllib3k-1.0.0 six-1.16.0 sniffio-1.3.0 socksio-1.0.0 soupsieve-2.5 sqlalchemy-1.4.50 starlette-0.27.0 tinysegmenter-0.3 tldextract-5.1.1 typing-extensions-4.8.0 tzlocal-5.2 uvicorn-0.24.0.post1 + +WORKDIR /opt/app + +CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "80"] + +# docker build -t fetch_app . +# docker run --rm --name container_fetch_app fetch_app diff --git a/app_fetcher/README.md b/app_fetcher/README.md new file mode 100644 index 0000000..fad9c10 --- /dev/null +++ b/app_fetcher/README.md @@ -0,0 +1,12 @@ +# Fetcher + +* Fetcher app + - Contains several endpoints to perform a specific fetching type task + - For more details, check in [app.py](app.py) /{fetch_type} + +* Build and run + - Important: To be deployed with other micro-services, [docker-compose.yml](../docker-compose.yml) +``` +docker build -t fetch_app . 
+docker run --rm --name container_fetch_app fetch_app +``` diff --git a/app_fetcher/app.py b/app_fetcher/app.py new file mode 100644 index 0000000..77a8084 --- /dev/null +++ b/app_fetcher/app.py @@ -0,0 +1,91 @@ +import src.credentials as cred +import logging +from logging.handlers import RotatingFileHandler +logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s') +logger = logging.getLogger("news_fetcher") +logger.setLevel(logging.INFO) + +import os +os.makedirs("logs", exist_ok=True) + +# To file log +fh = RotatingFileHandler(filename="logs/log_app_fetcher.log", mode="a", maxBytes=10000000, backupCount=4) +fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s')) +logger.addHandler(fh) + +# To file log: WARNING / ERROR +fh_ = RotatingFileHandler(filename="logs/log_app_fetcher_error.log", mode="a", maxBytes=10000000, backupCount=1) +fh_.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s')) +fh_.setLevel(logging.WARNING) +logger.addHandler(fh_) + +logger.info("Environment: {}".format(cred.ENVIRONMENT)) + +################################################################################################## +from src.news_feed import NewsFeed +from src.news_parsing import NewsSiteParsing +from src.news_search import NewsSearch +from src.news_missing_kids import NewsMissingKids +from src.missing_kids_status import MissingKidsStatus +from src.url_status import UpdateErrorURLs +from src.fetcher_status import FetcherStatus + +from fastapi import FastAPI, BackgroundTasks +# import requests +# from fastapi_utils.tasks import repeat_every +# import time +# time.sleep(10) +# import gc + +app = FastAPI() + +@app.get("/") +def hello_world(): + return {"message": "OK"} + +@app.get("/{fetch_type}") +async def fetch(background_tasks: BackgroundTasks, fetch_type: str): + # Concurrent job running + logger.info("Triggered fetch: {}".format(fetch_type)) + + if (fetch_type == "feeds"): + task_run = NewsFeed(cred.db_connect_info, cred.redis_connect_info).run + elif (fetch_type == "parser"): + task_run = NewsSiteParsing(cred.db_connect_info, cred.redis_connect_info).run + elif (fetch_type == "fetch_missing_kids_reduced"): + task_run = NewsMissingKids(cred.db_connect_info, cred.redis_connect_info, num_pages=4).run + elif (fetch_type == "fetch_missing_kids_full"): + task_run = NewsMissingKids(cred.db_connect_info, cred.redis_connect_info, num_pages=100000).run + elif (fetch_type == "search") or (fetch_type == "search_full"): + task_run = NewsSearch(cred.db_connect_info, cred.redis_connect_info, full=True).run + elif (fetch_type == "search_reduced"): + task_run = NewsSearch(cred.db_connect_info, cred.redis_connect_info, full=False).run + elif (fetch_type == "update_missing_kids_status_reduced"): + task_run = MissingKidsStatus(cred.db_connect_info, cred.redis_connect_info, num_urls=50).update_missing_kids_status + elif (fetch_type == "update_missing_kids_status_full"): + task_run = MissingKidsStatus(cred.db_connect_info, cred.redis_connect_info, num_urls=None).update_missing_kids_status + elif (fetch_type == "update_error_urls"): + task_run = UpdateErrorURLs(cred.db_connect_info, cred.redis_connect_info, num_urls=100).update_error_urls_status + elif (fetch_type == "fetch_warning_check"): + task_run = FetcherStatus(cred.db_connect_info, cred.redis_connect_info, last_minutes_check=180).check_warning + else: + return {"message": "ERROR. 
Unknown fetcher type!"} + + # Run task + background_tasks.add_task(task_run) + # Return message + return {"message": "Started fetching {}: Ok".format(fetch_type)} + +################################################################################################## + +########################### +''' +@app.on_event("startup") +def verify_db() -> None: + logger.info("Testing DB connection") + import psycopg + with psycopg.connect(cred.db_connect_info) as conn: + url_test_msg = "Num URLs: {}".format(conn.execute("SELECT COUNT(*) FROM URLS;").fetchall()) + logger.info(url_test_msg) +''' +########################### diff --git a/app_fetcher/src/db_utils.py b/app_fetcher/src/db_utils.py new file mode 100644 index 0000000..1eb8cea --- /dev/null +++ b/app_fetcher/src/db_utils.py @@ -0,0 +1,456 @@ +import psycopg +import redis +import traceback +import random +import requests +import json +import os +from .url_utils import process_article +import logging +logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s') +logger = logging.getLogger("news_fetcher") + +# TODO: URL_DB_HANDLER, _get_search_list, _get_url_host, _get_url_host_list, ... +# The rest, elsewhere + +class URL_DB_Writer(): + def __init__(self, db_connect_info, redis_connect_info): + logger.debug("Initializing URL DB writer") + self.db_connect_info = db_connect_info + self.redis_instance = redis.Redis(host=redis_connect_info.get("redis_host"), port=redis_connect_info.get("redis_port")) + self.redis_expiry_seconds = redis_connect_info.get("expiry_seconds", 172800) # Default: 48 hours + + try: + self.redis_instance.ping() + logger.debug("Succesfully pinged Redis") + except Exception as e: + logger.warning("Error trying to ping Redis: {}".format(str(e))) + + def get_urls_count(self, last_minutes_check): + ##################### + ### Get number of URLs within last X minutes + ##################### + try: + # Update + with psycopg.connect(self.db_connect_info) as conn: + # Open cursor + cursor = conn.cursor() + num_urls = cursor.execute("SELECT COUNT(*) FROM URLS WHERE ts_fetch >= current_timestamp - interval '{} minutes';".format(last_minutes_check)).fetchone()[0] + except Exception as e: + logger.warning("Error updating URLs status: {}".format(str(e))) + num_urls = None + return num_urls + + def _format(self, values): + # Repalce single quote ' with ''. Based on https://stackoverflow.com/a/12320729 + # String -> 'string', Int -> '1' (string-based), None -> NULL (no quotes for pgSQL to interpret Null value) + if (type(values) == list) or (type(values) == tuple): + insert_args = "(" + ", ".join([ "NULL" if v is None else "'" + str(v).replace("'", "''") + "'" for v in values]) + ")" + elif (type(values) == str): + insert_args = "({})".format( "NULL" if values is None else "'" + values.replace("'", "''") + "'" ) + else: + logger.warning("Error formatting input values: {}".format(values)) + assert False + return insert_args + + def _get_cached_canonical_url(self, url): + ### Redis: URL processed recently? 
-> Avoid increasing SERIAL counter & efficiency of DB + try: + filter_url = self.redis_instance.get(url) + if (filter_url is not None): + filter_url = filter_url.decode("utf-8") + except Exception as e: + logger.warning("Exception querying Redis: {}".format(str(e))) + filter_url = None + return filter_url + + def _update_urls_status(self, dict_status_ids): + ##################### + ### Update status to array of URL IDs + ##################### + try: + # Update + with psycopg.connect(self.db_connect_info) as conn: + # Open cursor + cursor = conn.cursor() + # Autocommit at end of transaction (Atomic insert of URLs and sources) + with conn.transaction() as tx: + for key_status, value_ids in dict_status_ids.items(): + cursor.execute("UPDATE URLS SET status='{}' WHERE id IN ({});".format(key_status, ",".join([str(v) for v in value_ids]))) + except Exception as e: + logger.warning("Error updating URLs status: {}".format(str(e))) + + def _get_missing_kids_urls(self, num_urls=None): + ##################### + ### Get list of Missing Kids URLs + ##################### + try: + missing_kids_ids_and_urls = [] + if (num_urls is None): + limit = 500 + else: + limit = num_urls + offset = 0 + with psycopg.connect(self.db_connect_info) as conn: + # Open cursor + cursor = conn.cursor() + while True: + # Query + missing_kids_ids_and_urls_query = cursor.execute("SELECT id, url, status FROM URLS WHERE url LIKE '%missingkids.org/poster%' ORDER BY ts_fetch DESC LIMIT {} OFFSET {};".format(limit, offset)).fetchall() + # Finished? + if (len(missing_kids_ids_and_urls_query) == 0): + break + # Extend + missing_kids_ids_and_urls = missing_kids_ids_and_urls + missing_kids_ids_and_urls_query + # Offset + offset += len(missing_kids_ids_and_urls_query) + # Stop? + if (num_urls is not None) and (len(missing_kids_ids_and_urls) >= num_urls): + break + + except Exception as e: + logger.warning("Error getting Missing Kids URLs: {}".format(str(e))) + missing_kids_ids_and_urls = [] + return missing_kids_ids_and_urls + + def _get_error_urls(self, num_urls=None): + ##################### + ### Get list of Missing Kids URLs + ##################### + try: + error_urls = [] + if (num_urls is None): + limit = 500 + else: + limit = num_urls + offset = 0 + with psycopg.connect(self.db_connect_info) as conn: + # Open cursor + cursor = conn.cursor() + while True: + # Query + error_urls_query = cursor.execute("SELECT id, url FROM URLS WHERE status='error' ORDER BY ts_fetch DESC LIMIT {} OFFSET {};".format(limit, offset)).fetchall() + # Finished? + if (len(error_urls_query) == 0): + break + # Extend + error_urls = error_urls + error_urls_query + # Offset + offset += len(error_urls_query) + # Stop? + if (num_urls is not None) and (len(error_urls) >= num_urls): + break + + except Exception as e: + logger.warning("Error getting Error URLs: {}".format(str(e))) + error_urls = [] + return error_urls + + def _decode_urls(self, urls_fetched, list_domains_to_filter, list_pattern_status_tuple): # TODO: language for urls_fetched... + """ + # TODO: REFACTOR + For each input url + + Already processed? + -> Update on Redis expire time + -> Associate to source + Not processed? Get main URL: + -> URL Canonical valid? + -> Rely on this as main URL + -> URL Canonical not valid? + -> Use input url, unless it's a news.google.com link + -> If news.google.com link, filter out. REDIS? 
+ Main URL processing: + -> Update in REDIS, association url -> url_canonical + -> url != url_canonical: Add in duplicate table + If both != news.google.com + """ + + # URLs to insert, URLs duplicated association, URL to Canonical form + list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical = [], [], {} + + # URL VS CANONICAL: + # News URL returned: https://news.google.com/articles/CBMifmh0dHBzOi8vd3d3LmJyZWl0YmFydC5jb20vMm5kLWFtZW5kbWVudC8yMDIzLzA0LzAzL2dvdi1kZXNhbnRpcy1zaWducy1iaWxsLW1ha2luZy1mbG9yaWRhLXRoZS0yNnRoLWNvbnN0aXR1dGlvbmFsLWNhcnJ5LXN0YXRlL9IBAA?hl=en-US&gl=US&ceid=US%3Aen + # Corresponds to canonical URL: https://www.breitbart.com/2nd-amendment/2023/04/03/gov-desantis-signs-bill-making-florida-the-26th-constitutional-carry-state/ + + for url in urls_fetched: + # Domain to filter? Input url + filter_due_to_domain = False + for domain_to_filter in list_domains_to_filter: + if (domain_to_filter in url): + logger.debug("Domain filter applied based on {} for input URL: {}".format(domain_to_filter, url)) + filter_due_to_domain = True + if (filter_due_to_domain): + continue + + # URL processed recently? -> Filter and avoid increasing SERIAL counter & efficiency of DB + cached_canonical_url = self._get_cached_canonical_url(url) + if (cached_canonical_url is not None): + # Even if url processed, need to add url_canonical to list_filtered_urls, so as to associate search source to canonical URL (canonical is the main URL entry) + dict_full_urls_to_canonical[url] = cached_canonical_url # X -> Y + # If url has been processed, so was its canonical form + logger.debug("Filtering out already inserted (processed) URL and its canonical form: {} {}".format(url, cached_canonical_url)) + continue + + # Process TODO: Add language... + url_canonical, article_elements, article_status = process_article(url, list_pattern_status_tuple) + # TODO: Store article_elements information to insert into OS after inserted into DB (and therefore having associated url_id) + + # Could not retrieve redirection for news.google.com based URL? Continue (avoid inserting in DB) + if (url_canonical is None) and ("news.google.com" in url): + logger.debug("Filtering empty canonical link for base URL based on news.google.com: {}".format(url)) + continue + # Canonical URL still news.google.com? Continue (avoid inserting in DB) + if (url_canonical is not None) and ("news.google.com" in url_canonical): + logger.debug("Filtering canonical news.google.com based URL: {}".format(url_canonical)) + continue + + # Domain to filter? Input canonical_url + filter_due_to_domain = False + for domain_to_filter in list_domains_to_filter: + if (url_canonical is not None) and (domain_to_filter in url_canonical): + filter_due_to_domain = True + if (filter_due_to_domain): + logger.info("Filtering due to domain input URL, Canonical_URL: {} {}".format(url, url_canonical)) + continue + + if (url_canonical is None) or (article_status == "error"): + logger.debug("Processing failed for URL: {}".format(url)) + # Still insert URL with "error"? -> If processed later, might have inconsistent sources (url vs url_canonical). Only store if not news.google.com based + if ("news.google.com" in url) or ("consent.google.com" in url): + logging.debug("Not able to process Google News link, skipping: {}".format(url)) + else: + dict_full_urls_to_canonical[url] = url # X -> X + list_insert_url_tuple_args.append( (url, article_status) ) + continue + + # URL was not processed (not sure canonical yet). 
Generate URL_CANONICAL <-> URL_ORIGINAL association if they're different + if (url_canonical != url): + list_tuple_canonical_duplicate_urls.append( (url_canonical, url) ) + # Dict: url -> canonical (update association) + dict_full_urls_to_canonical[url] = url_canonical # X -> Y or X + + # Canonical URL processed recently? -> Filter and avoid increasing SERIAL counter & efficiency of DB + if (self._get_cached_canonical_url(url_canonical) is not None): + # Canonical URL was already processed + logger.debug("Filtering out already inserted (processed) URL canonical: {}".format(url_canonical)) + else: + # Insert url_canonical to DB formatted + list_insert_url_tuple_args.append( (url_canonical, article_status) ) + # Canonical URL different? Process + if (url_canonical != url): + if ("news.google.com" in url) or ("consent.google.com" in url): + logging.debug("Not adding google.news.com based link, skipping: {}".format(url)) + else: + # Fetched url -> duplicate (using canonical as main link) + article_status = "duplicate" + # Insert url (non-canonical) to DB formatted + list_insert_url_tuple_args.append( (url, article_status) ) + + return list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical + + def _insert_urls(self, cursor, list_insert_url_tuple_args): + ##################### + ### Insert URLs with status + ##################### + if (len(list_insert_url_tuple_args) > 0): + insert_args = ', '.join( [ self._format(t) for t in list_insert_url_tuple_args] ) + # Insert. (url_1, status_1), (url_2, status_2), ... + sql_code = "INSERT INTO URLS {} VALUES {} ON CONFLICT (url) DO NOTHING;".format("(url, status)", insert_args) + # logger.debug("SQL CODE: {}".format(sql_code)) + c = cursor.execute(sql_code) + # NOTE: Not using "RETURNING id" since previously inserted URLs are not returned (ON CONFLICT) + # https://stackoverflow.com/questions/35949877/how-to-include-excluded-rows-in-returning-from-insert-on-conflict/35953488#35953488 + + def _insert_urls_duplicated(self, cursor, list_tuple_canonical_duplicate_urls): + ##################### + ### Insert duplicated URLs + ##################### + if (len(list_tuple_canonical_duplicate_urls) > 0): + # Flatten, format, set to remove duplicates + args_duplicated_urls_set = "(" + ', '.join( set( [ "'" + str(y).replace("'", "''") + "'" for x in list_tuple_canonical_duplicate_urls for y in x] ) ) + ")" + + # Dict: url -> id + dict_url_to_id = {} + # Get url -> id association to populate duplicated URLs + for (id_, url_) in cursor.execute("SELECT id, url FROM URLS WHERE url IN {};".format(args_duplicated_urls_set)).fetchall(): + dict_url_to_id[url_] = id_ + + # Convert tuples (url_canonical, url) -> (id_url_canonical, id_url) to insert in DB + # ORIGINAL CODE. 
Issue, might not have found association to all urls + ### list_tuple_canonical_duplicate_urls_ids = [ (dict_url_to_id[t[0]], dict_url_to_id[t[1]]) for t in list_tuple_canonical_duplicate_urls] + + list_tuple_canonical_duplicate_urls_ids = [] + for (url_1, url_2) in list_tuple_canonical_duplicate_urls: + id_url_1, id_url_2 = dict_url_to_id.get(url_1), dict_url_to_id.get(url_2) + if (id_url_1 is None) or (id_url_2 is None): + logger.debug("Skipping duplicate association due to no url -> id_url mapping available for tuple: {} {}".format(url_1, url_2)) + else: + list_tuple_canonical_duplicate_urls_ids.append( (id_url_1, id_url_2) ) + + if (len(list_tuple_canonical_duplicate_urls_ids) > 0): + insert_args = ', '.join( [ self._format(t) for t in list_tuple_canonical_duplicate_urls_ids] ) + # Insert. (id_url_canonical_1, id_url_1), ... + sql_code = "INSERT INTO URLS_DUPLICATE {} VALUES {} ON CONFLICT DO NOTHING;".format("(id_url_canonical, id_url_duplicated)", insert_args) + # logger.debug("SQL CODE: {}".format(sql_code)) + c = cursor.execute(sql_code) + + def _get_pattern_status_list(self): + ##################### + ### Get list of domains to filter + ##################### + # TODO: Cache on redis and query once every N hours? ... + try: + with psycopg.connect(self.db_connect_info) as conn: + # Open cursor + cursor = conn.cursor() + # TODO: Cache on Redis + list_pattern_status = cursor.execute("SELECT pattern, priority, status FROM STATUS_PATTERN_MATCHING;").fetchall() + except Exception as e: + logger.warning("Error getting pattern status list: {}".format(str(e))) + list_pattern_status = [] + return list_pattern_status + + def _get_domains_to_filter(self): + ##################### + ### Get list of domains to filter + ##################### + # TODO: Cache on redis and query once every N hours? ... + try: + with psycopg.connect(self.db_connect_info) as conn: + # Open cursor + cursor = conn.cursor() + # TODO: Cache on Redis + sites_to_filter = [e[0] for e in cursor.execute("SELECT url_host FROM WEBSITE_TO_FILTER;").fetchall() ] + except Exception as e: + logger.warning("Error getting domains to filter: {}".format(str(e))) + sites_to_filter = [] + return sites_to_filter + + def _get_cached_source_id(self, source): + ### Redis: URL processed recently? -> Avoid increasing SERIAL counter & efficiency of DB + try: + source_id = self.redis_instance.get(source) + if (source_id is not None): + source_id = source_id.decode("utf-8") + except Exception as e: + logger.warning("Exception querying Redis: {}".format(str(e))) + source_id = None + return source_id + + def _get_source_id(self, cursor, source): + ##################### + ### Get source corresponding id + ##################### + # Cached? + id_source = self._get_cached_source_id(source) + if (id_source is None): + c = cursor.execute("SELECT id FROM SOURCE WHERE source='{}'".format(source.replace("'", "''"))).fetchone() + if (c is None) or (len(c) == 0): + # Source does not exist, insert and get id + c = cursor.execute("INSERT INTO SOURCE (source) VALUES ('{}') RETURNING id;".format(source.replace("'", "''"))).fetchone() + # Decode source id + id_source = c[0] + # Cache + self.redis_instance.set(source, id_source, ex=self.redis_expiry_seconds) + return id_source + + def _get_urls_id(self, cursor, urls_full): + ##################### + ### Get id of inserted and filtered URLs + ##################### + # TODO: Cache url -> url_id, url_canonical + if (len(urls_full) == 0): + return [] + # Get inserted and filtered URL ids (unnested). 
Filtered URLs are also retrieved since they might have been fetched from a new source + in_inserted_filtered_urls = "(" + ', '.join(["'" + u.replace("'", "''") + "'" for u in urls_full]) + ")" + id_urls_related = [ i[0] for i in cursor.execute("SELECT id FROM URLS WHERE url IN {};".format(in_inserted_filtered_urls)).fetchall() ] + return id_urls_related + + def _insert_urls_source(self, cursor, id_urls_related, id_source): + ##################### + ### Insert URL sources: (id_url_1, id_source), (id_url_2, id_source), ... + ##################### + if (len(id_urls_related) == 0) or (id_source is None): + return + columns = "(id_url, id_source)" + insert_args = ', '.join( [ self._format([id_url, id_source]) for id_url in id_urls_related ] ) + # Insert + sql_code = "INSERT INTO URLS_SOURCE {} VALUES {} ON CONFLICT DO NOTHING;".format(columns, insert_args) + # logger.debug("SQL CODE: {}".format(sql_code)) + c = cursor.execute(sql_code) + + def write_batch(self, urls_fetched, source): + # Chunks of 50 elements + n = 50 + # Divide in small chunks + urls_fetched_chunks = [urls_fetched[i:i + n] for i in range(0, len(urls_fetched), n)] + # Process + for urls_fetched_chunk_i in urls_fetched_chunks: + self._write_small_batch(urls_fetched_chunk_i, source) + + def _write_small_batch(self, urls_fetched, source): + try: + logger.info("Fetched #{} URLs, source: {}".format(len(urls_fetched), source)) + + if (len(urls_fetched) == 0): + logger.debug("Empty batch of urls (not writing to DB) for source: {}".format(source)) + return + + # Shuffle URLs to reduce continuous URLs of same URL host (minimize chance of being blocked for too many continuous requests) + random.shuffle(urls_fetched) + + # Get list of domains to filter + list_domains_to_filter = self._get_domains_to_filter() + # Get list of (pattern, priority, status) tuples to override status if required + list_pattern_status_tuple = self._get_pattern_status_list() + # Sort pattern tuples by priority + list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True) + + # Process URLs to update DB + list_insert_url_tuple_args, list_tuple_canonical_duplicate_urls, dict_full_urls_to_canonical = self._decode_urls(urls_fetched, list_domains_to_filter, list_pattern_status_tuple) + # Full set of URL and its canonical form (to associate them to a search), both to insert and filter + urls_full = set(dict_full_urls_to_canonical.keys()).union( set(dict_full_urls_to_canonical.values()) ) + + # Insert + with psycopg.connect(self.db_connect_info) as conn: + # Open cursor + cursor = conn.cursor() + # Autocommit at end of transaction (Atomic insert of URLs and sources) + with conn.transaction() as tx: + # Insert processed URLs + self._insert_urls(cursor, list_insert_url_tuple_args) + # Insert URLs duplicated (canonical != fetched url) + self._insert_urls_duplicated(cursor, list_tuple_canonical_duplicate_urls) + + # Get source id in DB + id_source = self._get_source_id(cursor, source) + # Get IDs of all related URLs + id_urls_related = self._get_urls_id(cursor, urls_full) + # Insert search source associated to URLs + self._insert_urls_source(cursor, id_urls_related, id_source) + + # Update Redis status of inserted and filtered URLs after writing to DB + for url, url_canonical in dict_full_urls_to_canonical.items(): + try: + # Set with updated expiry time + self.redis_instance.set(url, url_canonical, ex=self.redis_expiry_seconds) + if (url != url_canonical): + self.redis_instance.set(url_canonical, url_canonical, ex=self.redis_expiry_seconds) + except Exception as e: 
+ logger.warning("Exception running set in Redis: {}".format(str(e))) + + if (len(list_insert_url_tuple_args) > 0): + try: + webhook_token = os.environ.get("CLIQ_WEBHOOK_TOKEN") + endpoint_message = "https://cliq.zoho.com/api/v2/channelsbyname/urlretrievalbot/message?zapikey={}".format(webhook_token) + + payload = json.dumps({"text": "Fetched #{} new URLs, source: {}".format(len(list_insert_url_tuple_args), source) }) + r = requests.post(endpoint_message, data=payload) + except Exception as e: + logger.warning("Webhook failed: {}".format(str(e))) + + logger.debug("URL DB write finished") + except Exception as e: + logger.warning( "Exception writing to URL_DB:\n{}".format(traceback.format_exc()) ) + logger.debug( "Exception --- List of URLs: {}".format(str(urls_fetched)) ) \ No newline at end of file diff --git a/app_fetcher/src/fetcher_status.py b/app_fetcher/src/fetcher_status.py new file mode 100644 index 0000000..b16b56b --- /dev/null +++ b/app_fetcher/src/fetcher_status.py @@ -0,0 +1,39 @@ +from .db_utils import URL_DB_Writer +import json +import logging +import requests +import os +logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s') +logger = logging.getLogger("news_fetcher") + +class FetcherStatus(): + def __init__(self, db_connect_info, redis_connect_info, last_minutes_check) -> None: + self.db_connect_info = db_connect_info + self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info) + self.last_minutes_check = last_minutes_check + + def check_warning(self): + try: + logger.info("Starting fetcher check for last minutes {}".format(self.last_minutes_check)) + + # Get number of URLs + num_urls = self.db_writer.get_urls_count(last_minutes_check=self.last_minutes_check) + logger.debug("Fetched #URLs {} during the last {} minutes".format(num_urls, self.last_minutes_check)) + + webhook_token = os.environ.get("CLIQ_WEBHOOK_TOKEN") + endpoint_message = "https://cliq.zoho.com/api/v2/channelsbyname/urlfetchwarnings/message?zapikey={}".format(webhook_token) + + if (num_urls is None): + try: + payload = json.dumps({"text": "[WARNING] Error on query to DB"}) + r = requests.post(endpoint_message, data=payload) + except Exception as e: + logger.warning("Webhook failed: {}".format(str(e))) + elif (num_urls == 0): + try: + payload = json.dumps({"text": "[WARNING] No URLs fetched for {} minutes".format(self.last_minutes_check) }) + r = requests.post(endpoint_message, data=payload) + except Exception as e: + logger.warning("Webhook failed: {}".format(str(e))) + except Exception as e: + logger.warning("Exception in UpdateErrorURLs.run(): {}".format(str(e))) diff --git a/app_fetcher/src/google_bypass.py b/app_fetcher/src/google_bypass.py new file mode 100644 index 0000000..3264425 --- /dev/null +++ b/app_fetcher/src/google_bypass.py @@ -0,0 +1,27 @@ +import requests +import json +import logging +logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s') +logger = logging.getLogger("news_fetcher") + +class GoogleByPass(): + def __init__(self) -> None: + pass + + def bypass_google_urls(self, list_urls): + if (len(list_urls) == 0): + return [] + + try: + # Endpoint + gbypass_endpoint = "http://selenium_app:80/get_redirection" + # Timeout: 20 minutes + timeout = 60*20 + r = requests.post(gbypass_endpoint, json={"list_urls": list_urls}, timeout=timeout) + # Decode + list_urls_redirections = json.loads(r.text).get("list_urls_redirections", []) + except Exception as e: + logger.warning("Exception on request: {}. 
{}".format(gbypass_endpoint, str(e))) + list_urls_redirections = [] + + return list_urls_redirections diff --git a/app_fetcher/src/missing_kids_status.py b/app_fetcher/src/missing_kids_status.py new file mode 100644 index 0000000..b1ba7c1 --- /dev/null +++ b/app_fetcher/src/missing_kids_status.py @@ -0,0 +1,69 @@ +import requests +from .db_utils import URL_DB_Writer +from .url_utils import get_missing_kid_status +import time +import logging +logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s') +logger = logging.getLogger("news_fetcher") + +class MissingKidsStatus(): + def __init__(self, db_connect_info, redis_connect_info, num_urls) -> None: + self.num_urls = num_urls + self.db_connect_info = db_connect_info + self.redis_connect_info = redis_connect_info + self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info) + + def update_missing_kids_status(self): + try: + logger.info("Starting updating status to Missing Kids URLs, limit #URLs: {}".format(self.num_urls)) + # List of URLs + list_ids_and_urls = self.db_writer._get_missing_kids_urls(self.num_urls) + # Dict: status -> IDs to update to new status + dict_status_ids, dict_status_urls = {}, {} + # Check URLs with invalid status? + skip_invalid_check = False + + flush_every, flush_current = 20, 0 + # Iterate URLs + for (id, url, current_status) in list_ids_and_urls: + # Skip duplicate URLs + if (current_status == "duplicate"): + continue + # Skip invalid URLs? + if (skip_invalid_check): + if (current_status == "invalid"): + continue + + # Get status + new_status = get_missing_kid_status(url) + # Different? Update + if (current_status != new_status): + # Extend array + dict_status_ids[new_status] = dict_status_ids.get(new_status, []) + [id] + # Debugging dict + dict_status_urls[new_status] = dict_status_urls.get(new_status, []) + [url] + # +1 processed + flush_current += 1 + + # Flush batch? 
+ if (flush_every == flush_current): + logger.info("Updating status to Missing Kids URLs: {}".format(dict_status_urls)) + # Update DB + self.db_writer._update_urls_status(dict_status_ids) + # Reset + flush_current = 0 + dict_status_ids, dict_status_urls = {}, {} + + # Flush remaining batch + if (flush_current > 0): + logger.info("Updating status to Missing Kids URLs: {}".format(dict_status_urls)) + # Update DB + self.db_writer._update_urls_status(dict_status_ids) + # Reset + flush_current = 0 + dict_status_ids, dict_status_urls = {}, {} + + logger.info("Finished updating status to Missing Kids URLs") + except Exception as e: + logger.warning("Exception in MissingKidsStatus.run(): {}".format(str(e))) + \ No newline at end of file diff --git a/app_fetcher/src/news_feed.py b/app_fetcher/src/news_feed.py new file mode 100644 index 0000000..c035562 --- /dev/null +++ b/app_fetcher/src/news_feed.py @@ -0,0 +1,60 @@ +from .db_utils import URL_DB_Writer +import feedparser +import dateutil +import psycopg +import logging +logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s') +logger = logging.getLogger("news_fetcher") + +class NewsFeed(): + def __init__(self, db_connect_info, redis_connect_info) -> None: + logger.debug("Initializing News feed") + self.db_connect_info = db_connect_info + self.redis_connect_info = redis_connect_info + + def _get_feed_urls(self): + try: + with psycopg.connect(self.db_connect_info) as conn: + list_url_feeds = conn.execute("SELECT rss_feed FROM FEED;").fetchall() + # Decode (tuple with 1 element) + list_url_feeds = [l[0] for l in list_url_feeds] + except Exception as e: + logger.warning("Exception fetching RSS sites: " + str(e)) + list_url_feeds = [] + return list_url_feeds + + def run(self): + try: + logger.debug("Starting NewsFeed.run()") + + # Get feeds + list_url_feeds = self._get_feed_urls() + logger.debug("Fetching news from feeds: {}".format(str(list_url_feeds))) + + # Process via RSS feeds + for url_feed in list_url_feeds: + # Initialize + urls_fetched, urls_publish_date = [], [] + # Fetch feeds + feeds = feedparser.parse(url_feed) + # Parse + for f in feeds.get("entries", []): + # Get URL + url = f.get("link", None) + # Process? + if (url is not None): + # Available publish date? 
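+                    # Parse the entry's "published" field with dateutil when the feed provides one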
+ publish_date = f.get("published", None) + if (publish_date is not None): + publish_date = dateutil.parser.parse(publish_date) + urls_publish_date.append(publish_date) + # URL + urls_fetched.append(url) + + # URL fetching source + source = "feed {}".format(url_feed) + # Write to DB + db_writer = URL_DB_Writer(self.db_connect_info, self.redis_connect_info) + db_writer.write_batch(urls_fetched, source) + except Exception as e: + logger.warning("Exception in NewsFeed.run(): {}".format(str(e))) diff --git a/app_fetcher/src/news_missing_kids.py b/app_fetcher/src/news_missing_kids.py new file mode 100644 index 0000000..c5da856 --- /dev/null +++ b/app_fetcher/src/news_missing_kids.py @@ -0,0 +1,40 @@ +from .db_utils import URL_DB_Writer +import requests +import json +import logging +logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s') +logger = logging.getLogger("news_fetcher") + +class NewsMissingKids(): + def __init__(self, db_connect_info, redis_connect_info, num_pages) -> None: + logger.debug("Initializing News MissingKids") + self.db_connect_info = db_connect_info + self.redis_connect_info = redis_connect_info + self.num_pages = num_pages + + def run(self): + try: + logger.debug("Starting NewsMissingKids.run()") + try: + # Missing kids fetching endpoint, parameter number of pages to fetch + missingkids_fetch_endpoint = "http://selenium_app:80/get_missing_kids/?pages={}".format(self.num_pages) + # Timeout + if (self.num_pages > 15): + timeout = 60*90 # 1.5h + else: + timeout = 60*5 # 5 min + # Request + r = requests.get(missingkids_fetch_endpoint, timeout=timeout) + # Decode + urls_fetched = json.loads(r.text).get("list_urls", []) + except Exception as e: + logger.warning("Timeout on request: {}. {}".format(missingkids_fetch_endpoint, str(e))) + urls_fetched = [] + + # URL fetching source + source = "missingkids fetcher" + # Write to DB + db_writer = URL_DB_Writer(self.db_connect_info, self.redis_connect_info) + db_writer.write_batch(urls_fetched, source) + except Exception as e: + logger.warning("Exception in NewsMissingKids.run(): {}".format(str(e))) diff --git a/app_fetcher/src/news_parsing.py b/app_fetcher/src/news_parsing.py new file mode 100644 index 0000000..217158c --- /dev/null +++ b/app_fetcher/src/news_parsing.py @@ -0,0 +1,58 @@ +from .db_utils import URL_DB_Writer +import newspaper +import psycopg +import logging +logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s') +logger = logging.getLogger("news_fetcher") + +class NewsSiteParsing(): + def __init__(self, db_connect_info, redis_connect_info) -> None: + logger.debug("Initializing News SiteParsing newspaper3k") + self.db_connect_info = db_connect_info + self.redis_connect_info = redis_connect_info + + def _get_url_hosts(self): + try: + with psycopg.connect(self.db_connect_info) as conn: + list_url_hosts = conn.execute("SELECT url_host FROM WEBSITE_OF_INTEREST;").fetchall() + # Decode (tuple with 1 element) + list_url_hosts = [l[0] for l in list_url_hosts] + except Exception as e: + logger.warning("Exception fetching RSS sites: " + str(e)) + list_url_hosts = [] + return list_url_hosts + + def _postprocess(self, article_urls): + return [url.replace("#comment-stream", "") for url in article_urls] + + def run(self): + try: + logger.debug("Starting NewsSiteParsing.run() for {}") + + # Get feeds + list_url_hosts = self._get_url_hosts() + logger.info("Fetching news by parsing URL hosts: {}".format(str(list_url_hosts))) + + # Process newspaper3k build method + for 
url_host_feed in list_url_hosts: + # Protocol + if not (url_host_feed.startswith("http")): + url_host_feed_formatted = "https://" + url_host_feed + else: + url_host_feed_formatted = url_host_feed + + logger.debug("Fetching newspaper3k parsing based on URL: {}".format(url_host_feed_formatted)) + # Source object + url_host_built = newspaper.build(url_host_feed_formatted) + # Get articles URL list + urls_fetched = url_host_built.article_urls() + # Post-processing + urls_fetched = self._postprocess(urls_fetched) + + # URL fetching source + source = "newspaper3k {}".format(url_host_feed) + # Write to DB + db_writer = URL_DB_Writer(self.db_connect_info, self.redis_connect_info) + db_writer.write_batch(urls_fetched, source) + except Exception as e: + logger.warning("Exception in NewsSiteParsing.run(): {}".format(str(e))) \ No newline at end of file diff --git a/app_fetcher/src/news_search.py b/app_fetcher/src/news_search.py new file mode 100644 index 0000000..024bdf8 --- /dev/null +++ b/app_fetcher/src/news_search.py @@ -0,0 +1,181 @@ +from .db_utils import URL_DB_Writer +import psycopg +from .utils import get_searxng_instances +from .search_sources import FetcherDuckDuckGo, FetcherGNews, FetcherGoogleNews, FetcherSearxNews, FetcherPreSearch +from threading import Thread +import time +import random +import logging +logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s') +logger = logging.getLogger("news_fetcher") + +class NewsSearch(): + def __init__(self, db_connect_info, redis_connect_info, full=True) -> None: + logger.debug("Initializing News feed") + self.db_connect_info = db_connect_info + self.redis_connect_info = redis_connect_info + self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info) + self.full_search = full + + def _get_url_host_list(self): + try: + with psycopg.connect(self.db_connect_info) as conn: + # List of URL host + list_url_host = [l[0] for l in conn.execute("SELECT url_host FROM WEBSITE_OF_INTEREST;").fetchall()] + # Clean http / https from URLs + list_url_host = [l.replace("https://", "").replace("http://", "") for l in list_url_host] + # Clean last slash if exists + list_url_host = [ l if not l.endswith("/") else l[:-1] for l in list_url_host] + except Exception as e: + logger.warning("Exception fetching URL host list: " + str(e)) + list_url_host = [] + return list_url_host + + def _get_search_list(self): + try: + with psycopg.connect(self.db_connect_info) as conn: + # List of keyword searches + list_search_text = [l[0] for l in conn.execute("SELECT keyword_search FROM SEARCH;").fetchall()] + except Exception as e: + logger.warning("Exception fetching searches list: " + str(e)) + list_search_text = [] + return list_search_text + + def _run_fetching(self, search_text): + logger.debug("Starting _run_fetching() for {}".format(search_text)) + + # Initialize DB Writer + db_writer = URL_DB_Writer(self.db_connect_info, self.redis_connect_info) + + # Common parameters + lang, region = "en", "US" + + ### PreSearch + dict_params_news = {"search": search_text} + FetcherPreSearch(**dict_params_news).fetch_articles(db_writer) + + ### DuckDuckGo + period = "d" + dict_params_news = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "news", "period": period} + FetcherDuckDuckGo(**dict_params_news).fetch_articles(db_writer) + dict_params_general = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "general", "period": period} + FetcherDuckDuckGo(**dict_params_general).fetch_articles(db_writer) + + if 
(self.full_search): + # Avoid site:{} search due to G-Bypass required time + if ("site:" not in search_text): + ### GNews + dict_params = {"search": search_text, "lang": "wt", "region": "wt", "period": period} + FetcherGNews(**dict_params).fetch_articles(db_writer) + + ### GoogleNews + dict_params_news = {"search": search_text, "lang": lang, "region": region, "search_category": "news", "period": period} + FetcherGoogleNews(**dict_params_news).fetch_articles(db_writer) + # dict_params_general = {"search": search_text, "lang": lang, "region": region, "search_category": "general", "period": period} + + + ''' + # Method run concurrently, minimize overlapping + time.sleep(random.uniform(1, 15)) + list_threads = [] + + def run_search(FetcherObject, dict_params): + # Initialize DB Writer + db_writer = URL_DB_Writer(self.db_connect_info, self.redis_connect_info) + # Fetch and write to DB + FetcherObject(**dict_params).fetch_articles(db_writer) + + """ + ### SearxNG + period = "day" + for searx_instance in get_searxng_instances(): + dict_params_news = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "news", "period": period} + dict_params_general = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "general", "period": period} + # Append thread + list_threads.append( Thread(target=run_search, args=(FetcherSearxNews, dict_params_news, )) ) + list_threads.append( Thread(target=run_search, args=(FetcherSearxNews, dict_params_general, )) ) + """ + + ### PreSearch + dict_params_news = {"search": search_text} + list_threads.append( Thread(target=run_search, args=(FetcherPreSearch, dict_params_news, )) ) + + ### DuckDuckGo + period = "d" + dict_params_news = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "news", "period": period} + dict_params_general = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "general", "period": period} + # Append thread + list_threads.append( Thread(target=run_search, args=(FetcherDuckDuckGo, dict_params_news, )) ) + list_threads.append( Thread(target=run_search, args=(FetcherDuckDuckGo, dict_params_general, )) ) + + if (self.full_search): + # Avoid site:{} search due to G-Bypass required time + if ("site:" not in search_text): + ### GNews + for period in ["1d"]: # ["1d", "6h"]: + dict_params = {"search": search_text, "lang": "wt", "region": "wt", "period": period} + # Append thread + list_threads.append( Thread(target=run_search, args=(FetcherGNews, dict_params, )) ) + + ### GoogleNews + for period in ["1d"]: # ["1d", "6h"]: + # News + dict_params_news = {"search": search_text, "lang": lang, "region": region, "search_category": "news", "period": period} + list_threads.append( Thread(target=run_search, args=(FetcherGoogleNews, dict_params_news, )) ) + if False: + dict_params_general = {"search": search_text, "lang": lang, "region": region, "search_category": "general", "period": period} + list_threads.append( Thread(target=run_search, args=(FetcherGoogleNews, dict_params_general, )) ) + + # Run + MULTITHREADED = False + logger.debug("Fetching threads starting") + if MULTITHREADED: + for t in list_threads: + t.start() + # Join + for t in list_threads: + t.join() + else: + for t in list_threads: + t.start() + t.join() + logger.debug("Fetching threads finished") + ''' + logger.debug("Finished _run_fetching()") + + def run(self): + try: + logger.info("Fetching text searches & URL hosts of interest") + + # Get text 
searches of interest + list_search_text_of_interest = self._get_search_list() + + # Get URL host of interest + list_url_host = self._get_url_host_list() + # Get text searches for URL hosts + list_search_text_url_host = ["site:{}".format(l) for l in list_url_host] + + MULTITHREADED = False + if MULTITHREADED: + # Run fetching + list_fetching_threads = [] + for search_text in list_search_text_of_interest + list_search_text_url_host: + logger.debug("Fetching news for search: {}".format(search_text)) + # Append thread + list_fetching_threads.append( Thread(target=self._run_fetching, args=(search_text, )) ) + + # Run + for t in list_fetching_threads: + t.start() + # Join + for t in list_fetching_threads: + t.join() + else: + for search_text in list_search_text_of_interest + list_search_text_url_host: + logger.debug("Fetching news for search: {}".format(search_text)) + self._run_fetching(search_text) + + logger.info("Finished fetching text searches & URL hosts of interest") + except Exception as e: + logger.warning("Exception in NewsSearch.run(): {}".format(str(e))) + \ No newline at end of file diff --git a/app_fetcher/src/search_sources.py b/app_fetcher/src/search_sources.py new file mode 100644 index 0000000..5ead308 --- /dev/null +++ b/app_fetcher/src/search_sources.py @@ -0,0 +1,321 @@ +from duckduckgo_search import DDGS +from gnews import GNews +from GoogleNews import GoogleNews + +import requests +from bs4 import BeautifulSoup +import os +import time +import json +import numpy as np +import random +from .user_agents import user_agents_list +from .google_bypass import GoogleByPass +from abc import ABC, abstractmethod +import logging +logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s') +logger = logging.getLogger("news_fetcher") + + + +# Generic fetcher (fetches articles, writes to DB) +class FetcherAbstract(ABC): + @abstractmethod + def _fetch(self): + pass + + def fetch_articles(self, db_writer): + logger.debug("Starting fetch() for {}".format(self.name)) + # Fetch articles + list_news = self._fetch() + logger.info("Found #{} articles for search: {}".format(len(list_news), self.name)) + # Write to DB + db_writer.write_batch(list_news, self.name) + + + + + +class FetcherPreSearch(FetcherAbstract): + def __init__(self, search): + """ + # period -> + - h = hours (eg: 12h) + - d = days (eg: 7d) + - m = months (eg: 6m) + - y = years (eg: 1y) + """ + self.search = search + self.period = "1d" # TODO Fixed for the moment + # self.lang = lang + # self.region = region + search_category = "news" + self.name = "presearch {} {} {}".format(search, search_category, self.period) + + def _fetch(self): + try: + # PreSearch fetching endpoint, parameter search keyword + presearch_fetch_endpoint = "http://selenium_app:80/fetch_presearch/?search_keyword={}".format(self.search) + # Timeout: 15 minutes + r = requests.get(presearch_fetch_endpoint, timeout=900) + # Decode + list_news = json.loads(r.text).get("list_urls", []) + except Exception as e: + logger.warning("Timeout on request: {}. 
{}".format(presearch_fetch_endpoint, str(e))) + list_news = [] + return list_news + + + +class FetcherGNews(FetcherAbstract): + def __init__(self, search, period, lang="en", region="US"): + """ + # period -> + - h = hours (eg: 12h) + - d = days (eg: 7d) + - m = months (eg: 6m) + - y = years (eg: 1y) + """ + self.search = search + self.period = period + self.lang = lang + self.region = region + search_category = "news" + self.name = "gnews {} {} {} {}".format(search, search_category, period, "{}-{}".format(lang, region)) + + def _fetch(self): + try: + list_dict_news = GNews(self.lang, self.region, period=self.period).get_news(self.search) + # Decode + list_news = [] + for l in list_dict_news: + list_news.append(l.get("url")) + except Exception as e: + logger.warning("Exception fetching {}: {}".format(self.name, str(e))) + list_news = [] + + # Bypass Google links + list_news_redirections = GoogleByPass().bypass_google_urls(list_news) + + return list_news_redirections + +class FetcherGoogleNews(FetcherAbstract): + def __init__(self, search, search_category="news", period="1d", lang="en", region="US"): + assert(search_category in ["news", "general"]) + + self.lang = lang + self.region = region + self.period = period + self.search_category = search_category + self.search = search + self.name = "googlenews {} {} {} {}".format(search, search_category, period, "{}-{}".format(lang, region)) + + def _fetch(self): + try: + # Initialize + g = GoogleNews(encode="utf-8", period=self.period, lang=self.lang, region=self.region) + g.enableException(True) + + if (self.search_category == "general"): + set_links = set() + # Search + g.search(self.search) + + # Iterate pages + MAX_ITER_PAGES = 15 + for i in range(MAX_ITER_PAGES): + time.sleep(random.uniform(1, 1.5)) + num_before = len(set_links) + + # Get page + try: + links = g.page_at(i) + except Exception as e: + logger.warning("Exception fetching page in GoogleNews {}: {}".format(self.name, str(e))) + break + # Links + for l in links: + # '/url?esrc=s&q=&rct=j&sa=U&url=https://www.breitbart.com/news/scent-of-luxury-indias-jasmine-infuses-global-perfume/&ved=2ahUKEwjOybGSiN-AAxX1gv0HHfqSBpMQxfQBegQICBAC&usg=AOvVaw06GdoHyzPbIopUaEuUSQPQ' + url = l.get("link").split("url=")[-1] + set_links.add(url) + + num_after = len(set_links) + + # Finished? 
+ if (num_before == num_after): + logger.debug("Iterated {} pages on GoogleNews general search".format(i)) + break + # To list + list_news = list(set_links) + elif (self.search_category == "news"): + # Search + g.get_news(self.search) + # Fetch + list_news = g.get_links() + + except Exception as e: + logger.warning("Exception fetching {}: {}".format(self.name, str(e))) + list_news = [] + + # Bypass Google links + list_news_redirections = GoogleByPass().bypass_google_urls(list_news) + + return list_news_redirections + +class FetcherDuckDuckGo(FetcherAbstract): + def __init__(self, search, search_category, period, lang="wt", region="wt"): + assert(search_category in ["news", "general"]) + assert(period in ["d", "w", "m", "y"]) + self.search = search + self.search_category = search_category + self.period = period + self.lang_region = "{}-{}".format(lang, region) + self.name = "duckduckgo {} {} {} {}".format(search, search_category, "1{}".format(period), region) + + def _fetch(self): + try: + list_news = [] + with DDGS(timeout=10) as ddgs: + if (self.search_category == "general"): + generator_links = ddgs.text(keywords=self.search, timelimit=self.period, region=self.lang_region) + elif (self.search_category == "news"): + generator_links = ddgs.news(keywords=self.search, timelimit=self.period, region=self.lang_region) + + for l in generator_links: + list_news.append( l.get("url", l.get("href")) ) + + except Exception as e: + logger.warning("Exception fetching {}: {}".format(self.name, str(e))) + list_news = [] + return list_news + + +class FetcherSearxNews(FetcherAbstract): + def __init__(self, search="child abuse", searx_instance="https://serx.ml/", lang="en", region="US", search_category="news", period="day"): + assert(search_category in ["news", "general"]) + assert(period in [None, "day", "week", "month", "year"]) + # Random header (minimize prob of web-scrapping detection) + self.headers = { + 'User-agent': str(np.random.choice(user_agents_list)), + 'Accept-Encoding': 'gzip, deflate', + 'Accept': '*/*', + 'Connection': 'keep-alive', + } + """ # Optional header + self.headers = { + 'User-agent': str(np.random.choice(user_agents_list)), + 'Accept-Encoding': 'gzip, deflate, br', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', + 'Connection': 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + 'TE': 'trailers', + 'Sec-Fetch-Site': 'cross-site', + 'Sec-Fetch-Mode': 'navigate', + 'Sec-Fetch-Dest': 'document', + } + """ + self.search = search + self.searx_instance = searx_instance + self.lang_region = "{}-{}".format(lang, region) + self.search_category = search_category + self.period = period + self.t_sleep_lower, self.t_sleep_higher = 0.5, 1.5 + self.request_timeout = 240 + + period_name_mapping = { + None: "no_date_range", + "day": "1d", + "week": "1w", + "month": "1m", + "year": "1y", + } + self.name = "searxng {} {} {} {} {}".format(searx_instance.replace("https://", "").replace("/", ""), search, search_category, period_name_mapping[period], self.lang_region) + logger.info("SearX - Initialized SearX fetcher: {}".format(self.name)) + + def _request_and_decode(self, url_search): + # Initial random time sleep (minimize chance of getting blocked) + time.sleep(random.uniform(self.t_sleep_lower, self.t_sleep_higher)) + # Request + logger.debug("SearX - Searching: {}".format(url_search)) + try: + r = requests.get(url_search, headers=self.headers, timeout=self.request_timeout) + except Exception as e: + logger.warning("SearX - Exception in request: 
{}. {}".format(url_search, str(e)))
+            return []
+
+        if (r.status_code == 200):
+            # Status code Ok
+            pass
+        elif (r.status_code == 429):
+            # TooManyRequests, "Rate limit exceeded"
+            logger.warning("SearX {} - Too many requests while running: {}. Request output: {}".format(self.name, r.url, r.text))
+            return []
+        elif (r.status_code != 200):
+            logger.warning("SearX {} - Status code: {}. Request output: {}".format(self.name, r.status_code, r.text))
+            return []
+        else:
+            logger.debug("SearX - Status code: {}".format(r.status_code))
+
+        # Decode request
+        soup = BeautifulSoup(r.text, 'html.parser')
+        page_url_set = set()
+        # h3 links
+        for elem in soup.find_all('h3'):
+            # Get url
+            url = elem.find('a').get('href')
+            page_url_set.add(url)
+        return page_url_set
+
+    def _get_news_list(self):
+        ############################################################
+        # Domain & search parameter
+        search_domain = os.path.join(self.searx_instance, "search?q=")
+        # Search keywords
+        search_formatted = self.search.replace(" ", "+").replace(":", "%3A")
+        # Period formatted
+        period_formatted = "&time_range={}".format(self.period) if self.period is not None else ""
+        # Search parameters
+        search_parameters = "&category_{}=on&language={}{}".format(self.search_category, self.lang_region, period_formatted)
+        # Combined url search
+        url_search_nopage = "{}{}{}".format(search_domain, search_formatted, search_parameters)
+        ############################################################
+
+        # Request and decode on page=1
+        url_set = self._request_and_decode(url_search_nopage)
+        # No results?
+        if (len(url_set) == 0):
+            logger.warning("SearX {} - Empty results on search: {}".format(self.name, url_search_nopage))
+            return []
+
+        # Iterate pages
+        search_numpage = 2
+        while True:
+            # Combine url search with page number
+            url_search_with_page = "{}&pageno={}".format(url_search_nopage, search_numpage)
+            # Request and decode on page=X
+            url_set_i = self._request_and_decode(url_search_with_page)
+
+            # Length before merging
+            length_current = len(url_set)
+            # Merge
+            url_set = url_set.union(url_set_i)
+            # Length after merging
+            length_merged = len(url_set)
+
+            # No new elements?
+ if (length_current == length_merged): + logger.debug("SearX {} - Finished processing search, #pages: {}".format(self.name, search_numpage)) + break + # Next page + search_numpage += 1 + + return list(url_set) + + def _fetch(self): + try: + # Fetch news + list_news = self._get_news_list() + except Exception as e: + logger.warning("Exception fetching {}: {}".format(self.name, str(e))) + list_news = [] + return list_news diff --git a/app_fetcher/src/url_status.py b/app_fetcher/src/url_status.py new file mode 100644 index 0000000..b5b1839 --- /dev/null +++ b/app_fetcher/src/url_status.py @@ -0,0 +1,63 @@ +from .db_utils import URL_DB_Writer +from .url_utils import process_article +import logging +logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s') +logger = logging.getLogger("news_fetcher") + +class UpdateErrorURLs(): + def __init__(self, db_connect_info, redis_connect_info, num_urls) -> None: + self.num_urls = num_urls + self.db_connect_info = db_connect_info + self.redis_connect_info = redis_connect_info + self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info) + + def update_error_urls_status(self): + try: + logger.info("Starting updating status to URLs with error, limit #URLs: {}".format(self.num_urls)) + # List of URLs with status 'error' + list_ids_and_urls = self.db_writer._get_error_urls(self.num_urls) + # Current status + current_status = "error" + # Dict: status -> IDs to update to new status + dict_status_ids, dict_status_urls = {}, {} + + # Get list of (pattern, priority, status) tuples to override status if required + list_pattern_status_tuple = self.db_writer._get_pattern_status_list() + # Sort pattern tuples by priority + list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True) + + flush_every, flush_current = 20, 0 + # Iterate URLs + for (id, url) in list_ids_and_urls: + # Get status + url_canonical, article_elements, new_status = process_article(url, list_pattern_status_tuple) + # Different? Update + if (current_status != new_status): + # Extend array + dict_status_ids[new_status] = dict_status_ids.get(new_status, []) + [id] + # Debugging dict + dict_status_urls[new_status] = dict_status_urls.get(new_status, []) + [url] + # +1 processed + flush_current += 1 + + # Flush batch? 
+ if (flush_every == flush_current): + logger.info("Updating status to URLs with error: {}".format(dict_status_urls)) + # Update DB + self.db_writer._update_urls_status(dict_status_ids) + # Reset + flush_current = 0 + dict_status_ids, dict_status_urls = {}, {} + + # Flush remaining batch + if (flush_current > 0): + logger.info("Updating status to URLs with error: {}".format(dict_status_urls)) + # Update DB + self.db_writer._update_urls_status(dict_status_ids) + # Reset + flush_current = 0 + dict_status_ids, dict_status_urls = {}, {} + + logger.info("Finished updating status to URLs with error") + except Exception as e: + logger.warning("Exception in UpdateErrorURLs.run(): {}".format(str(e))) diff --git a/app_fetcher/src/url_utils.py b/app_fetcher/src/url_utils.py new file mode 100644 index 0000000..f04983d --- /dev/null +++ b/app_fetcher/src/url_utils.py @@ -0,0 +1,289 @@ +from gnews import GNews +import dateutil.parser +from datetime import datetime, timedelta +from .utils import remove_http_s +import time +import random +import traceback +import requests +import json +import re +from bs4 import BeautifulSoup + +import logging +logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s') +logger = logging.getLogger("news_fetcher") + +def get_published_date(article): + try: + """ + # Already fetched publish date information? + if (publish_date_ is not None): + return publish_date_ + """ + + # List of potential publish dates + potential_dates = [] + # Publish date is the best match + potential_dates.append(article.publish_date) + # Publish date metadata is the following best match + potential_dates.append(article.meta_data.get('article', {}).get("published_time", None)) + # Iterate remaining keys + for key in article.meta_data.keys(): + if ("date" in key): + potential_dates.append(article.meta_data[key]) + + def invalid_date(p_date): + # Today + 2 days, article from the future? + today_plus_two = datetime.utcnow() + timedelta(days=2) + # Article from the future? + return p_date.timestamp() > today_plus_two.timestamp() + + for date_ in potential_dates: + # String date? parse + if (type(date_) == str): + try: + date_ = dateutil.parser.parse(date_) + except Exception as e: + logger.info("Invalid date found while parsing potential date: {} for URL: {}".format(date_, article.url)) + date_ = None + # Valid? + if (date_ is not None) and (not invalid_date(date_)): + return date_ + + logger.debug("Article with no published date: {}".format(article.url)) + return None + except Exception as e: + logger.info("Error while retrieving published date for URL: {}".format(article.url)) + return None + +def get_url_host(article_source_url, url): + # https://www.blabla.com/blabla -> www.blabla.com + if (article_source_url != ""): + # Article source URL already extracted, save path if any + return remove_http_s(article_source_url) # .split("/")[0] + else: + return remove_http_s(url).split("/")[0] + +def get_status_pattern_matching(url, article_status, list_pattern_status_tuple): + # Regex pattern to update status on "valid", "invalid", and "unknown" status only + # Status "raw", "duplicated" and "error" should remain the way they are + # Assumption: List of patterns sorted by importance + if (article_status in ["valid", "invalid", "unknown"]): + # Regular expression pattern matching: https://regexr.com/ + for regex_pattern, regex_priority, status_if_match in list_pattern_status_tuple: + # Matching? 
+            matching = bool(re.match(regex_pattern, url))
+            # Update article status
+            if (matching):
+                if (status_if_match != article_status):
+                    logger.debug("Regex pattern found, updating status from '{}' to '{}' for URL: {}".format(article_status, status_if_match, url))
+                return status_if_match
+    # Pattern matching not required or not found, original article status
+    return article_status
+
+def get_missing_kid_status(url, return_canonical_url=False):
+    # Sleep
+    time.sleep(0.75)
+    try:
+        # Request
+        r = requests.get(url, timeout=300)
+        # Decode
+        status_code = r.status_code
+        # Canonical URL removing parameters
+        url_canonical = r.url
+    except Exception as e:
+        logger.warning("Exception on get URL status request: {}. {}".format(url, str(e)))
+        status_code = None
+        url_canonical = url
+
+    if (status_code == 200):
+        status = "valid"
+    elif (status_code == 404):
+        status = "invalid"
+    else:
+        status = "unknown"
+
+    logger.debug("Missing Kid URL {} status: {}".format(url, status))
+    if (return_canonical_url):
+        return status, url_canonical
+    else:
+        return status
+
+def bypass_google_link(article_url):
+
+    def bypass_google_consent(article_url):
+        # Sample URL: https://consent.google.com/m?continue=https://news.google.com/rss/articles/CBMiMGh0dHBzOi8vd3d3Lm1pc3NpbmdraWRzLm9yZy9wb3N0ZXIvbmNtYy84NjAxMTkvMdIBAA?oc%3D5&gl=NL&m=0&pc=n&cm=2&hl=en-US&src=1
+        article_url_no_consent = article_url.replace("https://consent.google.com/m?continue=", "")
+
+        # https://stackoverflow.com/questions/76063646/how-can-i-have-redirection-link-from-google-news-link-using-requests
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
+        }
+        cookies = {'CONSENT': 'YES+cb.20220419-08-p0.cs+FX+111'}
+
+        try:
+            # Request
+            r = requests.get(article_url_no_consent, headers=headers, cookies=cookies, timeout=300)
+            # Decode
+            soup = BeautifulSoup(r.text, 'html.parser')
+            url_of_interest = soup.a['href']
+        except Exception as e:
+            logger.warning("Exception on request trying to G_bypass with headers: {}. {}".format(article_url_no_consent, str(e)))
+            url_of_interest = None
+
+        # Not able to bypass? (url_of_interest may be None after an exception)
+        if (url_of_interest is None) or (url_of_interest == "") or ("support.google.com" in url_of_interest) or ("news.google.com" in url_of_interest):
+            url_of_interest = None
+        return url_of_interest
+
+    def bypass_google_using_service(article_url):
+        try:
+            # e.g.: url = "https://news.google.com/articles/CBMiX2h0dHBzOi8vd3d3LmZveGJ1c2luZXNzLmNvbS9wb2xpdGljcy9kaXNuZXktc3Vlcy1mbG9yaWRhLWdvdi1yb24tZGVzYW50aXMtbG9zcy1zcGVjaWFsLWRpc3RyaWN00gEA?hl=en-US&gl=US&ceid=US%3Aen"
+            gbypass_endpoint = "http://selenium_app:80/get_redirection"
+            # Timeout: 5 minutes
+            r = requests.post(gbypass_endpoint, json={"url": article_url}, timeout=300)
+            # Decode
+            redirect_url = json.loads(r.text).get("redirect_url", "")
+        except Exception as e:
+            logger.warning("Exception on request: {}. {}".format(gbypass_endpoint, str(e)))
+            redirect_url = ""
+
+        return redirect_url
+
+    logger.debug("Starting bypass_google_link()")
+
+    article_url_bypassed = None
+    # Bypass using request
+    if ("consent.google.com" in article_url):
+        article_url_bypassed = bypass_google_consent(article_url)
+    # Not bypassed yet?
Bypass using service + if (article_url_bypassed is None): + article_url_bypassed = bypass_google_using_service(article_url) + + # if (article_url_bypassed is None) or (article_url_bypassed == "") or ("news.google.com" in article_url_bypassed): + if (article_url_bypassed == "") or (article_url_bypassed is None): + # Empty URL returned by Gbypass + logger.warning("Error while bypassing Gnews for URL: {}".format(article_url)) + return None + else: + logger.debug("Correctly bypassed GNews to URL_redirect, from URL: {} {}".format(article_url_bypassed, article_url)) + return article_url_bypassed + +def process_article(article_url, list_pattern_status_tuple, language="en"): + # TODO: + """ + https://github.com/fhamborg/news-please + https://github.com/fhamborg/Giveme5W1H + https://github.com/santhoshse7en/news-fetch + """ + try: + logger.debug("Starting process_article()") + + if ("news.google.com" in article_url) or ("consent.google.com" in article_url): + # Bypass to get redirection + article_url = bypass_google_link(article_url) + # Error? + if (article_url is None): + return None, {}, "error" + elif ("missingkids.org/poster" in article_url): + # Get status + article_status, url_canonical = get_missing_kid_status(article_url, return_canonical_url=True) + article_elements = { + "url_full": article_url, + "url_canonical": url_canonical + } + return url_canonical, article_elements, article_status + else: + # Avoid Too many requests (feeds, ...) + time.sleep(0.75) + + logger.debug("Processing: {}".format(article_url)) + + # Default status unless something happens + article_status = "valid" + + # Parse article + # TODO: :param proxy: The proxy parameter is a dictionary with a single key-value pair. self._proxy = {'http': proxy, 'https': proxy} if proxy else None + # TODO: Language per config + article = GNews(language).get_full_article(url=article_url) + + # Article parsed? + if (article is None) or (not article.is_parsed): + logger.debug("Article not parsed: {}".format(article_url)) + return article_url, {}, "error" + + # Canonical link as main URL + url_canonical = article.canonical_link + # Empty canonical URL? + if (article.canonical_link is None) or (article.canonical_link == ""): + # URL with parameters? e.g. some zerohedge news fetched from newspaper3k end with #comment-stream -> Remove extra parameter in link + if ("?" in article.url) or (article.url.endswith("#comment-stream")) or (article.url.endswith("#disqus_thread")): + logger.debug("Article URL contains parameters, trying to clean URL: {}".format(article.url)) + try: + # Remove text after parameter call + url = article.url.split("?")[0] + # Remove comment-stream + url = url.replace("#comment-stream", "").replace("#disqus_thread", "") + # Article + article_attempt = GNews(language).get_full_article(url=url) + # Retrieving same title? Update article based on clean URL + if (article_attempt is not None) and (article_attempt.title == article.title): + article = article_attempt + except Exception as e: + logger.info("Article parsing of URL without parameters failed: {}".format(article.url)) + else: # Default behaviour + logger.debug("Article canonical link is empty, assuming URL=URL_CANONICAL: {}".format(article.url)) + + # By default, URL same as canonical + url_canonical = article.url + + elif (article.url != article.canonical_link): + # If different, stick to canonical URL + logger.debug("Article URL and canonical link are different: {} {}".format(article.url, article.canonical_link)) + else: + # If same, continue... 
+            pass
+
+        # Update config to determine if content is valid
+        article.config.MIN_WORD_COUNT = 150
+        article.config.MIN_SENT_COUNT = 6
+
+        # Valid URL?
+        if (not article.is_valid_url()):
+            logger.debug("Not a valid news article: {}".format(url_canonical))
+            article_status = "invalid"
+        # Is the article's body text long enough to meet standard article requirements?
+        if (not article.is_valid_body()):
+            logger.debug("Article body not valid: {}".format(url_canonical))
+            article_status = "unknown"
+
+        if (article.images != article.imgs):
+            logger.debug("Article images and imgs are different: {} {}".format(article.images, article.imgs))
+
+        # article.keywords, article.meta_keywords, article.summary
+        # article.movies
+        # article.top_image
+
+        # Check if article status needs to be updated
+        article_status = get_status_pattern_matching(url_canonical, article_status, list_pattern_status_tuple)
+
+        article_elements = {
+            'url_full': article.url, # https://www.breitbart.com/tech/2022/10/03/report-election-integrity-project-worked-with-feds-to-censor-news-sites-in-2020/
+            'url_host': get_url_host(article.source_url, url_canonical), # www.breitbart.com
+            'title': article.title, # Report: ‘Election Integrity’ Partnership Worked with Feds to Censor News Sites in 2020
+            'description': article.meta_description, # Coalition committed to respond in ‘early 2022’ but failed to do so, while Labor has not issued a full response since taking office
+            'text': article.text, # ${Article content}
+            'published_date': get_published_date(article), # python.datetime format, obtained from "YYYY-MM-DD" or '2022-10-03T20:54:17+00:00'
+            'authors': article.authors, # ['Christopher Knaus']
+            'language': article.meta_lang, # en
+            'tags': list(article.tags), # ['Wide Open Border', '’My Son Hunter’ Movie', ...]
+            'images': list(article.images), # [URL_IMAGE_1, URL_IMAGE_2, ...]
+ 'url_canonical': url_canonical, # Canonical URL (redirection) + # 'html': article.html, # HTML article + } + logger.debug("Processing OK: {}".format(url_canonical)) + return url_canonical, article_elements, article_status + except Exception as e: + logger.warning("Exception processing url: {}\n{}".format(article_url, traceback.format_exc())) + return None, {}, "error" \ No newline at end of file diff --git a/app_fetcher/src/user_agents.py b/app_fetcher/src/user_agents.py new file mode 100644 index 0000000..67870ce --- /dev/null +++ b/app_fetcher/src/user_agents.py @@ -0,0 +1,64 @@ +# https://techblog.willshouse.com/2012/01/03/most-common-user-agents/ + +user_agents_list = [ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/111.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; rv:111.0) Gecko/20100101 Firefox/111.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", + "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0", + "Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.44", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 OPR/96.0.0.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 OPR/97.0.0.0", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 
Firefox/102.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48", + "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.34", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.39", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.2 Safari/605.1.15", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; rv:112.0) Gecko/20100101 Firefox/112.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.51", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/112.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", + "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/110.0", + "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0", + "Mozilla/5.0 (X11; CrOS x86_64 14541.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.0.2246 Yowser/2.5 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.6.1 Safari/605.1.15", + "Mozilla/5.0 (Windows NT 6.1; rv:102.0) Gecko/20100101 Goanna/6.0 Firefox/102.0 PaleMoon/32.0.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.41", + "Mozilla/5.0 (Windows NT 10.0; rv:110.0) Gecko/20100101 Firefox/110.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 YaBrowser/23.1.5.708 Yowser/2.5 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0", + "Mozilla/5.0 (Windows NT 6.1; 
Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" +] \ No newline at end of file diff --git a/app_fetcher/src/utils.py b/app_fetcher/src/utils.py new file mode 100644 index 0000000..76ae07a --- /dev/null +++ b/app_fetcher/src/utils.py @@ -0,0 +1,33 @@ + +def remove_http_s(url): + url = url.replace("https://", "") if url.startswith("https://") else url + url = url.replace("http://", "") if url.startswith("http://") else url + return url + +def is_valid_url(url): + if (url.startswith("https://")): + return True + else: + return False + +def get_searxng_instances(): + # SearxNG instances: https://searx.space/ + searx_instances = set() + searx_instances.add("https://searx.work/") + searx_instances.add("https://search.ononoki.org/") + searx_instances.add("https://searxng.nicfab.eu/") + searx_instances.add("https://searx.be/") + + # searx_instances.add("https://searx.fmac.xyz/") + # searx_instances.add("https://northboot.xyz/") # FIX + + # searx_instances.add("https://serx.ml/") # Offline + # searx_instances.add("https://searx.ru/") + # searx_instances.add("https://searx.sp-codes.de/") + # searx_instances.add("https://searxng.nicfab.eu/") + # searx_instances.add("https://s.frlt.one/") + # searx_instances.add("https://search.sapti.me/") + + # To list + list_searx_instances = list(searx_instances) + return list_searx_instances \ No newline at end of file diff --git a/image_generation_app/Dockerfile b/app_img_gen/Dockerfile similarity index 100% rename from image_generation_app/Dockerfile rename to app_img_gen/Dockerfile diff --git a/image_generation_app/app/__init__.py b/app_img_gen/app/__init__.py similarity index 100% rename from image_generation_app/app/__init__.py rename to app_img_gen/app/__init__.py diff --git a/image_generation_app/app/main.py b/app_img_gen/app/main.py similarity index 100% rename from image_generation_app/app/main.py rename to app_img_gen/app/main.py diff --git a/web_app/manage.py b/app_web/manage.py similarity index 100% rename from web_app/manage.py rename to app_web/manage.py diff --git a/web_app/mysite/__init__.py b/app_web/mysite/__init__.py similarity index 100% rename from web_app/mysite/__init__.py rename to app_web/mysite/__init__.py diff --git a/web_app/mysite/asgi.py b/app_web/mysite/asgi.py similarity index 100% rename from web_app/mysite/asgi.py rename to app_web/mysite/asgi.py diff --git a/web_app/mysite/settings.py b/app_web/mysite/settings.py similarity index 100% rename from web_app/mysite/settings.py rename to app_web/mysite/settings.py diff --git a/web_app/mysite/urls.py b/app_web/mysite/urls.py similarity index 100% rename from web_app/mysite/urls.py rename to app_web/mysite/urls.py diff --git a/web_app/mysite/wsgi.py b/app_web/mysite/wsgi.py similarity index 100% rename from web_app/mysite/wsgi.py rename to app_web/mysite/wsgi.py diff --git a/web_app/news/__init__.py b/app_web/news/__init__.py similarity index 100% rename from web_app/news/__init__.py rename to app_web/news/__init__.py diff --git a/web_app/news/admin.py b/app_web/news/admin.py similarity index 100% rename from web_app/news/admin.py rename to app_web/news/admin.py diff --git a/web_app/news/apps.py b/app_web/news/apps.py similarity index 100% rename from web_app/news/apps.py rename to app_web/news/apps.py diff --git a/web_app/news/migrations/0001_initial.py b/app_web/news/migrations/0001_initial.py similarity index 100% rename from web_app/news/migrations/0001_initial.py rename to app_web/news/migrations/0001_initial.py diff --git 
a/web_app/news/migrations/0002_alter_source_table_alter_url_table_and_more.py b/app_web/news/migrations/0002_alter_source_table_alter_url_table_and_more.py similarity index 100% rename from web_app/news/migrations/0002_alter_source_table_alter_url_table_and_more.py rename to app_web/news/migrations/0002_alter_source_table_alter_url_table_and_more.py diff --git a/web_app/news/migrations/0003_remove_url_pub_date_url_status_url_ts_fetch_and_more.py b/app_web/news/migrations/0003_remove_url_pub_date_url_status_url_ts_fetch_and_more.py similarity index 100% rename from web_app/news/migrations/0003_remove_url_pub_date_url_status_url_ts_fetch_and_more.py rename to app_web/news/migrations/0003_remove_url_pub_date_url_status_url_ts_fetch_and_more.py diff --git a/web_app/news/migrations/0004_alter_url_source_unique_together.py b/app_web/news/migrations/0004_alter_url_source_unique_together.py similarity index 100% rename from web_app/news/migrations/0004_alter_url_source_unique_together.py rename to app_web/news/migrations/0004_alter_url_source_unique_together.py diff --git a/web_app/news/migrations/0005_urls_remove_url_source_url_and_more.py b/app_web/news/migrations/0005_urls_remove_url_source_url_and_more.py similarity index 100% rename from web_app/news/migrations/0005_urls_remove_url_source_url_and_more.py rename to app_web/news/migrations/0005_urls_remove_url_source_url_and_more.py diff --git a/web_app/news/migrations/0006_alter_urls_options.py b/app_web/news/migrations/0006_alter_urls_options.py similarity index 100% rename from web_app/news/migrations/0006_alter_urls_options.py rename to app_web/news/migrations/0006_alter_urls_options.py diff --git a/web_app/news/migrations/__init__.py b/app_web/news/migrations/__init__.py similarity index 100% rename from web_app/news/migrations/__init__.py rename to app_web/news/migrations/__init__.py diff --git a/web_app/news/models.py b/app_web/news/models.py similarity index 67% rename from web_app/news/models.py rename to app_web/news/models.py index 7fa4692..1ac7054 100644 --- a/web_app/news/models.py +++ b/app_web/news/models.py @@ -1,4 +1,5 @@ from django.db import models +from django.contrib.postgres.fields import ArrayField # Create your models here. 
 class Urls(models.Model):
@@ -44,3 +45,17 @@ class UrlsSource(models.Model):
         managed = False
         db_table = 'urls_source'
         unique_together = (('id_url', 'id_source'),)
+
+class UrlContent(models.Model):
+    id_url = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url', primary_key=True)
+    date_published = models.DateTimeField(blank=True, null=True)
+    title = models.TextField(blank=True, null=True)
+    description = models.TextField(blank=True, null=True)
+    content = models.TextField(blank=True, null=True)
+    tags = ArrayField(models.TextField(blank=True, null=True))
+    authors = ArrayField(models.TextField(blank=True, null=True))
+    image_urls = ArrayField(models.TextField(blank=True, null=True))
+
+    class Meta:
+        managed = False
+        db_table = 'url_content'
diff --git a/web_app/news/templates/item_list.html b/app_web/news/templates/item_list.html
similarity index 100%
rename from web_app/news/templates/item_list.html
rename to app_web/news/templates/item_list.html
diff --git a/web_app/news/templates/item_list_partial.html b/app_web/news/templates/item_list_partial.html
similarity index 97%
rename from web_app/news/templates/item_list_partial.html
rename to app_web/news/templates/item_list_partial.html
index da2fe67..d41c3ea 100644
--- a/web_app/news/templates/item_list_partial.html
+++ b/app_web/news/templates/item_list_partial.html
@@ -14,7 +14,7 @@
 {% for item in page_obj %}
-    {{ item.url }}
+    {{ item.url }}
 {{ item.ts_fetch }}
 {% with sources_map|dict_get:item.id as sources %}
diff --git a/web_app/news/templates/url_detail.html b/app_web/news/templates/url_detail.html
similarity index 89%
rename from web_app/news/templates/url_detail.html
rename to app_web/news/templates/url_detail.html
index 8b8a7bc..d920ff2 100644
--- a/web_app/news/templates/url_detail.html
+++ b/app_web/news/templates/url_detail.html
@@ -131,7 +131,7 @@
 [table markup lost in this capture: the URL row changes so its cell renders {{ url_item.url }} as a link instead of plain text]
@@ -145,9 +145,32 @@
 [table markup lost in this capture: after the existing Fetch Date and Status ({{ url_item.status }}) rows, the detail table gains rows for Title ({{ url_content.title }}), Description ({{ url_content.description }}), Content ({{ url_content.content }}), Tags ({{ url_content.tags }}), Authors ({{ url_content.authors }}), and Image URLs ({{ url_content.image_urls }})]
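Note on the UrlContent model introduced above: it is unmanaged (managed = False), so Django reads the url_content table created in 1-DB.ipynb rather than generating migrations for it, and it shares its primary key with Urls through the id_url column. A minimal illustrative sketch (not part of this diff) of how a view can load both records, assuming the news app is importable as news:

# Illustrative sketch only: load a Urls row and its optional UrlContent row.
from news.models import Urls, UrlContent

def load_url_with_content(url_id):
    url_item = Urls.objects.get(pk=url_id)
    try:
        # UrlContent shares the Urls primary key (id_url), so the same pk works here.
        url_content = UrlContent.objects.get(pk=url_id)
    except UrlContent.DoesNotExist:
        # The fetcher may not have stored content for this URL yet.
        url_content = None
    return url_item, url_content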
diff --git a/web_app/news/templatetags/__init__.py b/app_web/news/templatetags/__init__.py similarity index 100% rename from web_app/news/templatetags/__init__.py rename to app_web/news/templatetags/__init__.py diff --git a/web_app/news/templatetags/custom_filters.py b/app_web/news/templatetags/custom_filters.py similarity index 100% rename from web_app/news/templatetags/custom_filters.py rename to app_web/news/templatetags/custom_filters.py diff --git a/web_app/news/tests.py b/app_web/news/tests.py similarity index 100% rename from web_app/news/tests.py rename to app_web/news/tests.py diff --git a/web_app/news/urls.py b/app_web/news/urls.py similarity index 100% rename from web_app/news/urls.py rename to app_web/news/urls.py diff --git a/web_app/news/views.py b/app_web/news/views.py similarity index 89% rename from web_app/news/views.py rename to app_web/news/views.py index fb02b0f..d4a89e3 100644 --- a/web_app/news/views.py +++ b/app_web/news/views.py @@ -7,7 +7,7 @@ import json import time import ollama -from .models import Urls, Source, UrlsSource +from .models import Urls, Source, UrlsSource, UrlContent # Create your views here. def index(request): @@ -60,19 +60,27 @@ def news(request): def url_detail_view(request, id): url_item = get_object_or_404(Urls, id=id) url_sources = list(Source.objects.filter(urlssource__id_url=url_item).values_list('source', flat=True)) + try: + url_content = UrlContent.objects.get(pk=id) + except UrlContent.DoesNotExist: + url_content = {} + + #print(url_content.__dict__) # TODO: https://github.com/ollama/ollama-python?tab=readme-ov-file#async-client # LLM models available client = ollama.Client(host = 'https://ollamamodel.matitos.org') - models = [m.model for m in client.list().models] + models = sorted([m.model for m in client.list().models]) + print(models) context = { 'url_item': url_item, 'sources': url_sources, 'models': models, - "prompt": "Provide in one paragraph the what, why, when, where, who, and how of the content below. Also provide a one paragraph summary of the content:", + 'prompt': "Provide in one paragraph the what, why, when, where, who, and how of the content below. 
Also provide a one paragraph summary of the content:", #"prompt": "Image you are a journalist, TLDR in a paragraph:", #"prompt": "Below you will find the whole content of a news article:\n{}\nProvide a concise summary of one paragraph maximum of the content.".format(content) + 'url_content': url_content, } return render(request, 'url_detail.html', context) diff --git a/web_app/mysite/__pycache__/__init__.cpython-312.pyc b/web_app/mysite/__pycache__/__init__.cpython-312.pyc deleted file mode 100644 index a92596e..0000000 Binary files a/web_app/mysite/__pycache__/__init__.cpython-312.pyc and /dev/null differ diff --git a/web_app/mysite/__pycache__/settings.cpython-312.pyc b/web_app/mysite/__pycache__/settings.cpython-312.pyc deleted file mode 100644 index 664ce9c..0000000 Binary files a/web_app/mysite/__pycache__/settings.cpython-312.pyc and /dev/null differ diff --git a/web_app/mysite/__pycache__/urls.cpython-312.pyc b/web_app/mysite/__pycache__/urls.cpython-312.pyc deleted file mode 100644 index 674804b..0000000 Binary files a/web_app/mysite/__pycache__/urls.cpython-312.pyc and /dev/null differ diff --git a/web_app/mysite/__pycache__/wsgi.cpython-312.pyc b/web_app/mysite/__pycache__/wsgi.cpython-312.pyc deleted file mode 100644 index 4991a01..0000000 Binary files a/web_app/mysite/__pycache__/wsgi.cpython-312.pyc and /dev/null differ diff --git a/web_app/news/__pycache__/__init__.cpython-312.pyc b/web_app/news/__pycache__/__init__.cpython-312.pyc deleted file mode 100644 index 66b2a3d..0000000 Binary files a/web_app/news/__pycache__/__init__.cpython-312.pyc and /dev/null differ diff --git a/web_app/news/__pycache__/admin.cpython-312.pyc b/web_app/news/__pycache__/admin.cpython-312.pyc deleted file mode 100644 index a55392e..0000000 Binary files a/web_app/news/__pycache__/admin.cpython-312.pyc and /dev/null differ diff --git a/web_app/news/__pycache__/apps.cpython-312.pyc b/web_app/news/__pycache__/apps.cpython-312.pyc deleted file mode 100644 index 42dd93a..0000000 Binary files a/web_app/news/__pycache__/apps.cpython-312.pyc and /dev/null differ diff --git a/web_app/news/__pycache__/models.cpython-312.pyc b/web_app/news/__pycache__/models.cpython-312.pyc deleted file mode 100644 index 6a74e6a..0000000 Binary files a/web_app/news/__pycache__/models.cpython-312.pyc and /dev/null differ diff --git a/web_app/news/__pycache__/urls.cpython-312.pyc b/web_app/news/__pycache__/urls.cpython-312.pyc deleted file mode 100644 index 56a1f9b..0000000 Binary files a/web_app/news/__pycache__/urls.cpython-312.pyc and /dev/null differ diff --git a/web_app/news/__pycache__/views.cpython-312.pyc b/web_app/news/__pycache__/views.cpython-312.pyc deleted file mode 100644 index a1cb7a3..0000000 Binary files a/web_app/news/__pycache__/views.cpython-312.pyc and /dev/null differ diff --git a/web_app/news/migrations/__pycache__/0001_initial.cpython-312.pyc b/web_app/news/migrations/__pycache__/0001_initial.cpython-312.pyc deleted file mode 100644 index 922532d..0000000 Binary files a/web_app/news/migrations/__pycache__/0001_initial.cpython-312.pyc and /dev/null differ diff --git a/web_app/news/migrations/__pycache__/0002_alter_source_table_alter_url_table_and_more.cpython-312.pyc b/web_app/news/migrations/__pycache__/0002_alter_source_table_alter_url_table_and_more.cpython-312.pyc deleted file mode 100644 index 9739c13..0000000 Binary files a/web_app/news/migrations/__pycache__/0002_alter_source_table_alter_url_table_and_more.cpython-312.pyc and /dev/null differ diff --git 
a/web_app/news/migrations/__pycache__/0003_remove_url_pub_date_url_status_url_ts_fetch_and_more.cpython-312.pyc b/web_app/news/migrations/__pycache__/0003_remove_url_pub_date_url_status_url_ts_fetch_and_more.cpython-312.pyc deleted file mode 100644 index 4832345..0000000 Binary files a/web_app/news/migrations/__pycache__/0003_remove_url_pub_date_url_status_url_ts_fetch_and_more.cpython-312.pyc and /dev/null differ diff --git a/web_app/news/migrations/__pycache__/0004_alter_url_source_unique_together.cpython-312.pyc b/web_app/news/migrations/__pycache__/0004_alter_url_source_unique_together.cpython-312.pyc deleted file mode 100644 index c942a86..0000000 Binary files a/web_app/news/migrations/__pycache__/0004_alter_url_source_unique_together.cpython-312.pyc and /dev/null differ diff --git a/web_app/news/migrations/__pycache__/0005_urls_remove_url_source_url_and_more.cpython-312.pyc b/web_app/news/migrations/__pycache__/0005_urls_remove_url_source_url_and_more.cpython-312.pyc deleted file mode 100644 index 14e41fc..0000000 Binary files a/web_app/news/migrations/__pycache__/0005_urls_remove_url_source_url_and_more.cpython-312.pyc and /dev/null differ diff --git a/web_app/news/migrations/__pycache__/0006_alter_urls_options.cpython-312.pyc b/web_app/news/migrations/__pycache__/0006_alter_urls_options.cpython-312.pyc deleted file mode 100644 index 5e02235..0000000 Binary files a/web_app/news/migrations/__pycache__/0006_alter_urls_options.cpython-312.pyc and /dev/null differ diff --git a/web_app/news/migrations/__pycache__/__init__.cpython-312.pyc b/web_app/news/migrations/__pycache__/__init__.cpython-312.pyc deleted file mode 100644 index 13a8e98..0000000 Binary files a/web_app/news/migrations/__pycache__/__init__.cpython-312.pyc and /dev/null differ diff --git a/web_app/news/templatetags/__pycache__/__init__.cpython-312.pyc b/web_app/news/templatetags/__pycache__/__init__.cpython-312.pyc deleted file mode 100644 index 6ac09fa..0000000 Binary files a/web_app/news/templatetags/__pycache__/__init__.cpython-312.pyc and /dev/null differ diff --git a/web_app/news/templatetags/__pycache__/custom_filters.cpython-312.pyc b/web_app/news/templatetags/__pycache__/custom_filters.cpython-312.pyc deleted file mode 100644 index 45f8fe2..0000000 Binary files a/web_app/news/templatetags/__pycache__/custom_filters.cpython-312.pyc and /dev/null differ
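For context on the fetcher side of this change: every class in app_fetcher/src/search_sources.py implements FetcherAbstract, whose fetch_articles() calls the subclass's _fetch() to collect a list of URLs and hands the result to the writer's write_batch(list_news, name). A hedged sketch of exercising that contract with a stand-in writer (the real URL_DB_Writer lives in app_fetcher/src/db_utils.py; the package path and an installed duckduckgo_search dependency are assumed):

# Illustrative sketch only: run one fetcher against a stand-in writer.
from app_fetcher.src.search_sources import FetcherDuckDuckGo

class PrintWriter:
    # Minimal stand-in for URL_DB_Writer: only write_batch(list_news, name) is needed.
    def write_batch(self, list_news, name):
        print("{}: found {} URLs".format(name, len(list_news)))

fetcher = FetcherDuckDuckGo(search="child abuse", search_category="news", period="d")
fetcher.fetch_articles(PrintWriter())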