diff --git a/.env b/.env
new file mode 100644
index 0000000..89bdbcc
--- /dev/null
+++ b/.env
@@ -0,0 +1 @@
+# TODO...
\ No newline at end of file
diff --git a/README.md b/README.md
index 8ba13f7..a865a3e 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,29 @@
-# Matitos
\ No newline at end of file
+# Matitos
+
+- Scheduled tasks
+  - Fetcher -> Inserts raw URLs
+    - Fetch parsing URL host
+    - Fetch from RSS feed
+    - Fetch searching (Google search & news, DuckDuckGo, ...)
+  - Process URLs -> Updates raw URLs
+    - Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date
+    - Determines if it is valid article content
+  - Valid URLs
+    - Generate summary
+    - Classification
+      - 5W: Who, What, When, Where, Why of a Story
+      - Related to child abuse?
+      - ...
+
+- Visualization of URLs
+  - Filter URLs
+    - By status, search, source, language
+  - Charts
+
+- Content generation
+  - Select URLs:
+    - Valid content
+    - language=en
+    - published_date during last_week
+  - Use classifications
+  - Merge summaries, ...
\ No newline at end of file
diff --git a/app_selenium/Dev.ipynb b/app_selenium/Dev.ipynb
new file mode 100644
index 0000000..d6c3390
--- /dev/null
+++ b/app_selenium/Dev.ipynb
@@ -0,0 +1,46 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "\n",
+    "endpoint = \"http://localhost:80/get_missing_kids?pages=2\"\n",
+    "r = requests.get(endpoint)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "r.text"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "matitos_urls",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/app_selenium/Dockerfile b/app_selenium/Dockerfile
new file mode 100644
index 0000000..55af744
--- /dev/null
+++ b/app_selenium/Dockerfile
@@ -0,0 +1,16 @@
+FROM python:3.12
+
+RUN apt update && apt install -y --no-install-recommends chromium chromium-driver curl
+RUN apt autoclean && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /opt/app
+RUN pip install --no-cache-dir selenium fastapi "uvicorn[standard]"
+COPY . /opt/app/
+
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "80"]
+
+# docker build -f Dockerfile -t selenium_app .
+# docker run --rm -it --shm-size=512m --name selenium_app selenium_app
+
+# docker exec -it selenium_app bash -c "curl localhost:80/get_missing_kids/?pages=5"
+# docker exec -it selenium_app bash -c "curl localhost:80/get_missing_kids/?pages=-1"
diff --git a/app_selenium/README.md b/app_selenium/README.md
index deac652..a2b894e 100644
--- a/app_selenium/README.md
+++ b/app_selenium/README.md
@@ -1,3 +1,8 @@
+# Selenium app
 * Missing kids posters fetch (num_pages=X)
-* ...
+
+```
+SELENIUM_SLEEP_PER_PAGE=4
+PATH_LOGS_PARAMETERIZATION="logs/log_app_selenium_{}.log"
+```
\ No newline at end of file
diff --git a/app_selenium/app.py b/app_selenium/app.py
new file mode 100644
index 0000000..7aa7323
--- /dev/null
+++ b/app_selenium/app.py
@@ -0,0 +1,15 @@
+from fastapi import FastAPI
+from missing_kids import MissingKidsFetcher
+from logger import get_logger
+logger = get_logger()
+
+app = FastAPI()
+
+@app.get("/get_missing_kids/")
+def get_missing_kids(pages: int = -1):
+    try:
+        res = {"list_urls": list(MissingKidsFetcher().get_missing_kids_urls(first_n_pages=pages))}
+    except Exception as e:
+        logger.warning("Fetching missing kids posters failed: {}".format(str(e)), exc_info=True)
+        res = {}
+    return res
diff --git a/app_selenium/logger.py b/app_selenium/logger.py
new file mode 100644
index 0000000..28a3099
--- /dev/null
+++ b/app_selenium/logger.py
@@ -0,0 +1,35 @@
+import logging
+import logging.handlers
+import os
+
+# Get env var
+path_logs_parameterization = os.getenv("PATH_LOGS_PARAMETERIZATION", "logs/log_app_selenium_{}.log")
+
+# Create the logs directory if needed
+directory = os.path.dirname(path_logs_parameterization)
+os.makedirs(directory, exist_ok=True)
+
+logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
+logger = logging.getLogger("news_fetcher")
+logger.setLevel(logging.DEBUG)
+
+# File log: DEBUG and above
+fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("debug"), mode="a", maxBytes=10000000, backupCount=1)
+fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
+fh.setLevel(logging.DEBUG)
+logger.addHandler(fh)
+
+# File log: INFO and above
+fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("info"), mode="a", maxBytes=10000000, backupCount=1)
+fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
+fh.setLevel(logging.INFO)
+logger.addHandler(fh)
+
+# File log: WARNING and above
+fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("warning"), mode="a", maxBytes=10000000, backupCount=1)
+fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
+fh.setLevel(logging.WARNING)
+logger.addHandler(fh)
+
+def get_logger():
+    return logger
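For context, a minimal usage sketch of the shared logger (the caller module below is hypothetical; file names assume the default `PATH_LOGS_PARAMETERIZATION`):

```
# Hypothetical caller inside app_selenium (not part of the diff):
from logger import get_logger

logger = get_logger()

logger.debug("recorded only by the debug file handler")
logger.info("also appended to logs/log_app_selenium_info.log")
logger.warning("reaches all three rotating log files")
```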
diff --git a/app_selenium/missing_kids.py b/app_selenium/missing_kids.py
new file mode 100644
index 0000000..11b38ba
--- /dev/null
+++ b/app_selenium/missing_kids.py
@@ -0,0 +1,85 @@
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from utils import get_chrome_options
+import time
+import os
+
+from logger import get_logger
+logger = get_logger()
+
+class MissingKidsFetcher():
+    def __init__(self) -> None:
+        pass
+
+    def get_missing_kids_urls(self, first_n_pages=-1):
+        # Poster search URL (first_n_pages=-1 fetches all pages)
+        url = "https://www.missingkids.org/gethelpnow/search/poster-search-results"
+        # Collected poster URLs
+        set_urls = set()
+
+        try:
+            # Initialize headless Chrome
+            driver = webdriver.Chrome(options=get_chrome_options())
+            # Go to URL
+            driver.get(url)
+            # Iterate over result pages
+            i, continue_iterating, num_exceptions = 1, True, 0
+            while (continue_iterating):
+                logger.debug("Processing page: {}...".format(i))
+
+                try:
+                    time.sleep(float(os.getenv("SELENIUM_SLEEP_PER_PAGE", 4)))
+                    # Fetch poster URLs
+                    for element_type in ["a"]:  # ["a", "p", "div"]:
+                        for elem in driver.find_elements(By.TAG_NAME, element_type):
+                            href = elem.get_attribute('href')
+                            if (href is not None) and ("missingkids.org/poster" in href):
+                                set_urls.add(href)
+
+                    logger.debug("#URLS: {}".format(len(set_urls)))
+
+                    # Next page
+                    elem = driver.find_element(By.LINK_TEXT, str(i+1))
+                    logger.debug("Clicking: {}...".format(elem.text))
+                    elem.click()
+                    # Page processed successfully
+                    processed_ok = True
+                except Exception as e:
+                    # Count consecutive failures
+                    num_exceptions += 1
+                    processed_ok = False
+
+                    if (num_exceptions == 3):
+                        continue_iterating = False
+                    else:
+                        logger.info("Exception while clicking page {}, retrying...".format(i+1))
+
+                        # Log the pager links between "<<" and ">>" to help debugging
+                        start_print = False
+                        for link in driver.find_elements(By.PARTIAL_LINK_TEXT, ""):
+                            if (link.text == "<<"):
+                                start_print = True
+                            if (link.text == ">>"):
+                                break
+                            if (start_print):
+                                logger.info(link.text)
+
+                        # driver.refresh()
+                        time.sleep(float(os.getenv("SELENIUM_SLEEP_PER_PAGE", 4)))
+
+                if (i == first_n_pages):
+                    continue_iterating = False
+                if (processed_ok):
+                    i += 1
+                    num_exceptions = 0
+
+        except Exception as e:
+            logger.warning("Exception while fetching posters (page {}): {}".format(i+1, str(e)), exc_info=True)
+        finally:
+            try:
+                # quit() also stops the underlying chromedriver process
+                driver.quit()
+            except Exception as e:
+                pass
+
+        return set_urls
diff --git a/app_selenium/utils.py b/app_selenium/utils.py
new file mode 100644
index 0000000..062c720
--- /dev/null
+++ b/app_selenium/utils.py
@@ -0,0 +1,14 @@
+from selenium.webdriver.chrome.options import Options
+
+def get_chrome_options():
+    """Sets Chrome options for Selenium.
+    Headless mode is enabled and image loading is disabled.
+    """
+    chrome_options = Options()
+    chrome_options.add_argument("--headless")
+    chrome_options.add_argument("--no-sandbox")
+    chrome_options.add_argument("--disable-dev-shm-usage")
+    chrome_prefs = {}
+    chrome_options.experimental_options["prefs"] = chrome_prefs
+    chrome_prefs["profile.default_content_settings"] = {"images": 2}
+    return chrome_options
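A quick smoke test of the fetcher outside FastAPI, as a sketch; it assumes chromium and chromium-driver are installed as in the Dockerfile above:

```
# Minimal local check of the Selenium fetcher (run inside the selenium_app image):
from missing_kids import MissingKidsFetcher

urls = MissingKidsFetcher().get_missing_kids_urls(first_n_pages=2)
print(len(urls), "poster URLs")  # links containing "missingkids.org/poster"
```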
diff --git a/app_urls/1-DB.ipynb b/app_urls/1-DB.ipynb
index dfb5123..46757c3 100644
--- a/app_urls/1-DB.ipynb
+++ b/app_urls/1-DB.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -11,16 +11,42 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "db_postgres\n",
+      "db_redis\n",
+      "\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 2/0\n",
+      " ⠿ Container db_redis  \u001b[39mStarting\u001b[0m  \u001b[34m0.1s \u001b[0m\n",
+      " ⠿ Container db_postgres  \u001b[39mStarting\u001b[0m  \u001b[34m0.1s \u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container dozzle  \u001b[32mRunning\u001b[0m  \u001b[34m0.0s \u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container adminer  \u001b[32mRunning\u001b[0m  \u001b[34m0.0s \u001b[0m\n",
+      "\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 2/4\n",
+      " ⠿ Container db_redis  \u001b[39mStarting\u001b[0m  \u001b[34m0.2s \u001b[0m\n",
+      " ⠿ Container db_postgres  \u001b[39mStarting\u001b[0m  \u001b[34m0.2s \u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container dozzle  \u001b[32mRunning\u001b[0m  \u001b[34m0.0s \u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container adminer  \u001b[32mRunning\u001b[0m  \u001b[34m0.0s \u001b[0m\n",
+      "\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 4/4\u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container db_redis  \u001b[32mStarted\u001b[0m  \u001b[34m0.3s \u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container db_postgres  \u001b[32mStarted\u001b[0m  \u001b[34m0.3s \u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container dozzle  \u001b[32mRunning\u001b[0m  \u001b[34m0.0s \u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container adminer  \u001b[32mRunning\u001b[0m  \u001b[34m0.0s \u001b[0m\n",
+      "\u001b[?25h"
+     ]
+    }
+   ],
    "source": [
-    "!docker rm -f db_postgres db_redis; docker compose -f ../docker/docker-compose.yml up -d ; sleep 5"
+    "!docker rm -f db_postgres db_redis; docker compose -f ../docker/docker-compose.yml up -d ; sleep 5\n",
+    "!rm logs/*"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -37,7 +63,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -163,9 +189,41 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\t urls\n",
+      "[]\n",
+      "\t urls_duplicate\n",
+      "[]\n",
+      "\t urls_source_search\n",
+      "[]\n",
+      "\t source\n",
+      "[]\n",
+      "\t search\n",
+      "[(1,\n",
+      "  'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC',\n",
+      "  'rss_feed'),\n",
+      " (2, 'missingkids.org/poster', 'url_host'),\n",
+      " (3, 'missingkids.org/new-poster', 'url_host'),\n",
+      " (4, 'breitbart.com', 'url_host'),\n",
+      " (5, 'child abuse', 'keyword_search')]\n",
+      "\t status_pattern_matching\n",
+      "[('.*youtube\\\\.com/.*', 50, 'invalid'),\n",
+      " ('.*tiktok\\\\.com/.*', 50, 'invalid'),\n",
+      " ('.*twitter\\\\.com/.*', 50, 'invalid'),\n",
+      " ('.*reddit\\\\.com/.*', 50, 'invalid'),\n",
+      " ('.*libreddit\\\\.de/.*', 50, 'invalid'),\n",
+      " ('.*radio\\\\.foxnews\\\\.com/.*', 50, 'invalid')]\n",
+      "\t url_content\n",
+      "[]\n"
+     ]
+    }
+   ],
    "source": [
     "# Connect to an existing database\n",
     "with psycopg.connect(connection_info) as conn:\n",
@@ -182,9 +240,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[(1,\n",
+      "  'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC',\n",
+      "  'rss_feed'),\n",
+      " (2, 'missingkids.org/poster', 'url_host'),\n",
+      " (3, 'missingkids.org/new-poster', 'url_host'),\n",
+      " (4, 'breitbart.com', 'url_host'),\n",
+      " (5, 'child abuse', 'keyword_search')]\n"
+     ]
+    }
+   ],
    "source": [
     "# Connect to an existing database\n",
     "with psycopg.connect(connection_info) as conn:\n",
@@ -195,9 +267,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[]\n"
+     ]
+    }
+   ],
    "source": [
     "# Connect to an existing database\n",
     "with psycopg.connect(connection_info) as conn:\n",
@@ -209,9 +289,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'\\n!docker rm -f db_redis; docker compose -f ../docker/docker-compose.yml up -d\\n\\n# Connect to an existing database\\nwith psycopg.connect(connection_info) as conn:\\n    # Open a cursor to perform database operations\\n    with conn.cursor() as cur:\\n        pprint( cur.execute(\"TRUNCATE URLS, URL_CONTENT, URLS_SOURCE_SEARCH, URLS_DUPLICATE;\") )\\n        # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES (\\'missingkids.org\\', \\'url_host\\');\" )\\n'"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "'''\n",
     "!docker rm -f db_redis; docker compose -f ../docker/docker-compose.yml up -d\n",
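The `status_pattern_matching` rows seeded above pair a regex with what looks like a priority and a target status. A minimal sketch of how such rules could be applied to incoming URLs (the `classify` helper below is hypothetical, not the project's actual matcher):

```
import re

# Rows shaped like the seeded ('.*youtube\.com/.*', 50, 'invalid') tuples
rules = [
    (r".*youtube\.com/.*", 50, "invalid"),
    (r".*tiktok\.com/.*", 50, "invalid"),
]

def classify(url):
    for pattern, _priority, status in rules:
        if re.match(pattern, url):
            return status
    return None  # no rule matched: leave the URL's status untouched

print(classify("https://www.youtube.com/watch?v=x"))  # invalid
print(classify("https://example.org/article"))        # None
```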
diff --git a/app_urls/README.md b/app_urls/README.md
index a23b61e..c9d6b39 100644
--- a/app_urls/README.md
+++ b/app_urls/README.md
@@ -96,6 +96,9 @@ FETCHER_GNEWS_DECODE_SLEEP=2
 FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=4
 FETCHER_BETWEEN_SEARCHES_SLEEP=5
 FETCHER_URL_HOST_SLEEP=5
+FETCHER_LANGUAGE_DETECTION_MIN_CHAR=100
+
+SELENIUM_ENDPOINT="http://selenium_app:80"
 ```
 
 * Deploy
diff --git a/app_urls/api/src/fetch_missing_kids.py b/app_urls/api/src/fetch_missing_kids.py
new file mode 100644
index 0000000..c77cd98
--- /dev/null
+++ b/app_urls/api/src/fetch_missing_kids.py
@@ -0,0 +1,41 @@
+from .db_utils import DB_Handler
+from ..models import Search, Source
+import os
+import requests
+import traceback
+from .logger import get_logger
+logger = get_logger()
+
+class FetchMissingKids():
+    def __init__(self) -> None:
+        logger.debug("Initializing Fetcher MissingKids")
+
+    def run(self, number_pages=-1):
+        try:
+            logger.debug("Starting MissingKids.run(), processing #{} pages".format(number_pages))
+
+            # Get source object
+            obj_source, _ = Source.objects.get_or_create(source="missingkids.org")
+            # Get search object
+            obj_search, _ = Search.objects.get_or_create(search="missingkids.org/poster", type=Search.TYPE_ENUM.URL_HOST)
+
+            try:
+                # Missing kids fetching endpoint; parameter: number of pages to fetch
+                missingkids_fetch_endpoint = "{}/get_missing_kids/?pages={}".format(os.getenv("SELENIUM_ENDPOINT", "http://localhost:80").rstrip("/"), number_pages)
+                # Generous timeout: fetching many (or all) pages is slow
+                if (number_pages > 15) or (number_pages == -1):
+                    timeout = 60*90  # 1.5h
+                else:
+                    timeout = 60*10  # 10 min
+                # Request
+                r = requests.get(missingkids_fetch_endpoint, timeout=timeout)
+                # Decode
+                urls_fetched = r.json().get("list_urls", [])
+            except Exception as e:
+                logger.warning("Request failed (possibly a timeout): {}. {}".format(missingkids_fetch_endpoint, str(e)))
+                urls_fetched = []
+
+            # Write to DB
+            DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
+        except Exception as e:
+            logger.warning("Exception in MissingKids.run(): {}\n{}".format(e, traceback.format_exc()))
{}".format(missingkids_fetch_endpoint, str(e))) + urls_fetched = [] + + # Write to DB + DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search) + except Exception as e: + logger.warning("Exception in MissingKids.run(): {}\n{}".format(e, traceback.format_exc())) diff --git a/app_urls/api/src/fetch_utils.py b/app_urls/api/src/fetch_utils.py index e5c1346..829ec35 100644 --- a/app_urls/api/src/fetch_utils.py +++ b/app_urls/api/src/fetch_utils.py @@ -1,4 +1,3 @@ -import traceback import os from django.core.cache import cache from .logger import get_logger @@ -30,7 +29,7 @@ def decode_gnews_urls(encoded_urls, interval=int(os.getenv("FETCHER_GNEWS_DECODE # Cache decoded URL cache.set("gnews_decode_{}".format(url), decoded_url, timeout=60*60*12) else: - logger.info("Bad status while decoding news.google.com, URL {}".format(url)) + logger.info("Bad status while decoding news.google.com, URL {}\n{}".format(url, decoded_url_dict.get("message"))) except Exception as e: logger.warning("Error decoding news.google.com, URL: {}".format(url)) return list_decoded_urls \ No newline at end of file diff --git a/app_urls/api/src/url_processor.py b/app_urls/api/src/url_processor.py index d8d67bb..a4424bf 100644 --- a/app_urls/api/src/url_processor.py +++ b/app_urls/api/src/url_processor.py @@ -69,6 +69,16 @@ def process_url(url): except Exception as e: logger.warning("Exception for input URL {}\n{}".format(url, str(e))) return None + + try: + content_merged = "\n".join([article.title, article.meta_description, article.text]) + if (len(content_merged) > int(os.getenv("FETCHER_LANGUAGE_DETECTION_MIN_CHAR", 100))): + language = langdetect.detect(content_merged) + else: + language = None + except Exception as e: + logger.info("Could not detect language: {}\n{}".format(url, str(e))) + language = None dict_data = { "url": url, @@ -76,8 +86,7 @@ def process_url(url): "url_host": article.source_url, "site_name": article.meta_site_name, "publish_date": article.publish_date, - # article.meta_lang -> Not always reliable - "language": langdetect.detect("\n".join([article.title, article.meta_description, article.text]) ), + "language": language, # article.meta_lang -> Not always reliable "title": article.title, "description": article.meta_description, "content": article.text, diff --git a/app_urls/api/tasks.py b/app_urls/api/tasks.py index 88e5a18..09fb107 100644 --- a/app_urls/api/tasks.py +++ b/app_urls/api/tasks.py @@ -3,10 +3,8 @@ from scheduler import job from .src.fetch_feed import FetchFeeds from .src.fetch_parser import FetchParser from .src.fetch_search import FetchSearcher +from .src.fetch_missing_kids import FetchMissingKids from .src.db_utils import DB_Handler -''' -from src.missing_kids_fetch import MissingKidsFetch -''' from .src.logger import get_logger logger = get_logger() @@ -32,7 +30,19 @@ def fetch_search(): FetchSearcher().run() logger.info("Task completed: {}".format(task)) -# TODO: fetch_missing_kids() +@job('default') +def fetch_missing_kids(number_pages=5): + task = "Fetch MissingKids" + logger.info("Task triggered: {}".format(task)) + FetchMissingKids().run(number_pages) + logger.info("Task completed: {}".format(task)) + +@job('default') +def fetch_missing_kids_all(number_pages=-1): + task = "Fetch MissingKids" + logger.info("Task triggered: {}".format(task)) + FetchMissingKids().run(number_pages) + logger.info("Task completed: {}".format(task)) @job('default') def process_raw_urls(batch_size=50): @@ -77,8 +87,15 @@ def background_task(process_type: str): FetchParser().run() elif 
diff --git a/app_urls/api/tasks.py b/app_urls/api/tasks.py
index 88e5a18..09fb107 100644
--- a/app_urls/api/tasks.py
+++ b/app_urls/api/tasks.py
@@ -3,10 +3,8 @@ from scheduler import job
 from .src.fetch_feed import FetchFeeds
 from .src.fetch_parser import FetchParser
 from .src.fetch_search import FetchSearcher
+from .src.fetch_missing_kids import FetchMissingKids
 from .src.db_utils import DB_Handler
-'''
-from src.missing_kids_fetch import MissingKidsFetch
-'''
 from .src.logger import get_logger
 logger = get_logger()
 
@@ -32,7 +30,19 @@ def fetch_search():
     FetchSearcher().run()
     logger.info("Task completed: {}".format(task))
 
-# TODO: fetch_missing_kids()
+@job('default')
+def fetch_missing_kids(number_pages=5):
+    task = "Fetch MissingKids"
+    logger.info("Task triggered: {}".format(task))
+    FetchMissingKids().run(number_pages)
+    logger.info("Task completed: {}".format(task))
+
+@job('default')
+def fetch_missing_kids_all(number_pages=-1):
+    task = "Fetch MissingKids (all pages)"
+    logger.info("Task triggered: {}".format(task))
+    FetchMissingKids().run(number_pages)
+    logger.info("Task completed: {}".format(task))
 
 @job('default')
 def process_raw_urls(batch_size=50):
@@ -77,8 +87,15 @@ def background_task(process_type: str):
             FetchParser().run()
         elif (process_type == "fetch_search"):
             FetchSearcher().run()
-        #elif (process_type == "fetch_missingkids"):
-        #    FetchMissingKids().run()
+        elif (process_type == "fetch_missingkids_all"):
+            FetchMissingKids().run(number_pages=-1)
+        elif ("fetch_missingkids" in process_type):
+            # number_pages encoded in the process_type suffix, e.g. "fetch_missingkids_5"
+            try:
+                number_pages = int(process_type.split("_")[-1])
+            except Exception as e:
+                number_pages = -1
+            FetchMissingKids().run(number_pages=number_pages)
         elif ("process_" in process_type):
             # Batch size encoded in URL
             try:
@@ -95,14 +112,6 @@ def background_task(process_type: str):
         else:
             logger.info("Task unknown!: {}".format(process_type))
 
-        '''
-        # Selenium based
-        elif (process_type == "fetch_missing_kids_reduced"):
-            MissingKidsFetch(db_handler, num_pages=4).run()
-        elif (process_type == "fetch_missing_kids_full"):
-            MissingKidsFetch(db_handler, num_pages=100000).run()
-        '''
-
         logger.info("Task completed: {}".format(process_type))
     except Exception as e:
         logger.error(e)
diff --git a/app_urls/api/templates/filtered_urls.html b/app_urls/api/templates/filtered_urls.html
index 2d983c6..be3e5bd 100644
--- a/app_urls/api/templates/filtered_urls.html
+++ b/app_urls/api/templates/filtered_urls.html
@@ -258,7 +258,7 @@ input[type="checkbox"] {
             OFF
-    -->
+    -->
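For reference, a standalone sketch of how the `process_type` suffix in `background_task` resolves to a page count (the `resolve_pages` helper is illustrative, extracted from the branch above):

```
def resolve_pages(process_type):
    if process_type == "fetch_missingkids_all":
        return -1  # -1 means "all pages"
    try:
        return int(process_type.split("_")[-1])
    except ValueError:
        return -1  # no numeric suffix: fall back to all pages

for pt in ["fetch_missingkids_5", "fetch_missingkids_20", "fetch_missingkids_all", "fetch_missingkids"]:
    print(pt, "->", resolve_pages(pt))
# fetch_missingkids_5 -> 5, fetch_missingkids_20 -> 20, the other two -> -1
```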