From 5addfa5ba9c6e7099f66ee389280a759cee7aae5 Mon Sep 17 00:00:00 2001
From: Luciano Gervasoni
Date: Thu, 3 Apr 2025 09:44:46 +0200
Subject: [PATCH] Valid content filter, language detect on min chars, fetch missingkids.org

---
 .env                                       |   1 +
 README.md                                  |  30 ++++-
 app_selenium/Dev.ipynb                     |  46 +++++++
 app_selenium/Dockerfile                    |  16 +++
 app_selenium/README.md                     |   7 +-
 app_selenium/app.py                        |  15 +++
 app_selenium/logger.py                     |  34 +++++
 app_selenium/missing_kids.py               |  83 ++++++++++++
 app_selenium/utils.py                      |  14 +++
 app_urls/1-DB.ipynb                        | 119 +++++++++++++++---
 app_urls/README.md                         |   3 +
 app_urls/api/src/fetch_missing_kids.py     |  42 +++++++
 app_urls/api/src/fetch_utils.py            |   3 +-
 app_urls/api/src/url_processor.py          |  13 +-
 app_urls/api/tasks.py                      |  37 +++---
 app_urls/api/templates/filtered_urls.html  |  22 ++--
 app_urls/api/views.py                      |  62 ++++++---
 docker/docker-compose.yml => docker-compose.yml |  49 +++++++-
 18 files changed, 530 insertions(+), 66 deletions(-)
 create mode 100644 .env
 create mode 100644 app_selenium/Dev.ipynb
 create mode 100644 app_selenium/Dockerfile
 create mode 100644 app_selenium/app.py
 create mode 100644 app_selenium/logger.py
 create mode 100644 app_selenium/missing_kids.py
 create mode 100644 app_selenium/utils.py
 create mode 100644 app_urls/api/src/fetch_missing_kids.py
 rename docker/docker-compose.yml => docker-compose.yml (54%)

diff --git a/.env b/.env
new file mode 100644
index 0000000..89bdbcc
--- /dev/null
+++ b/.env
@@ -0,0 +1 @@
+# TODO...
\ No newline at end of file
diff --git a/README.md b/README.md
index 8ba13f7..a865a3e 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,29 @@
-# Matitos
\ No newline at end of file
+# Matitos
+
+- Scheduled tasks
+  - Fetcher -> Inserts raw URLs
+    - Fetch parsing URL host
+    - Fetch from RSS feed
+    - Fetch searching (Google search & news, DuckDuckGo, ...)
+  - Process URLs -> Updates raw URLs
+    - Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date
+    - Determines whether it is valid article content
+  - Valid URLs
+    - Generate summary
+    - Classification
+      - 5W: Who, What, When, Where, Why of a story
+      - Related to child abuse?
+      - ...
+
+- Visualization of URLs
+  - Filter URLs
+    - By status, search, source, language
+  - Charts
+
+- Content generation
+  - Select URLs:
+    - Valid content
+    - language=en
+    - published_date during last_week
+  - Use classifications
+  - Merge summaries, ...
\ No newline at end of file diff --git a/app_selenium/Dev.ipynb b/app_selenium/Dev.ipynb new file mode 100644 index 0000000..d6c3390 --- /dev/null +++ b/app_selenium/Dev.ipynb @@ -0,0 +1,46 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "\n", + "endpoint = \"http://localhost:80/get_missing_kids?pages=2\"\n", + "r = requests.get(endpoint)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "r.text" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "matitos_urls", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/app_selenium/Dockerfile b/app_selenium/Dockerfile new file mode 100644 index 0000000..55af744 --- /dev/null +++ b/app_selenium/Dockerfile @@ -0,0 +1,16 @@ +FROM python:3.12 + +RUN apt update && apt install -y --no-install-recommends chromium chromium-driver curl +RUN apt autoclean && rm -rf /var/lib/apt/lists/* + +WORKDIR /opt/app +RUN pip install --no-cache-dir selenium fastapi "uvicorn[standard]" +COPY . /opt/app/ + +CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "80"] + +# docker build -f Dockerfile -t selenium_app . +# docker run --rm -it --shm-size=512m --name selenium_app selenium_app + +# docker exec -it selenium_app bash -c "curl localhost:80/get_missing_kids/?pages=5" +# docker exec -it selenium_app bash -c "curl localhost:80/get_missing_kids/?pages=-1" diff --git a/app_selenium/README.md b/app_selenium/README.md index deac652..a2b894e 100644 --- a/app_selenium/README.md +++ b/app_selenium/README.md @@ -1,3 +1,8 @@ +# Selenium app * Missing kids posters fetch (num_pages=X) -* ... 
+
+```
+SELENIUM_SLEEP_PER_PAGE=4
+PATH_LOGS_PARAMETERIZATION="logs/log_app_selenium_{}.log"
+```
\ No newline at end of file
diff --git a/app_selenium/app.py b/app_selenium/app.py
new file mode 100644
index 0000000..7aa7323
--- /dev/null
+++ b/app_selenium/app.py
@@ -0,0 +1,15 @@
+from fastapi import FastAPI
+from missing_kids import MissingKidsFetcher
+from logger import get_logger
+logger = get_logger()
+
+app = FastAPI()
+
+@app.get("/get_missing_kids/")
+def get_missing_kids(pages: int = -1):
+    try:
+        res = {"list_urls": MissingKidsFetcher().get_missing_kids_urls(first_n_pages=pages)}
+    except Exception as e:
+        logger.warning("get_missing_kids failed: {}".format(e))
+        res = {}
+    return res
diff --git a/app_selenium/logger.py b/app_selenium/logger.py
new file mode 100644
index 0000000..28a3099
--- /dev/null
+++ b/app_selenium/logger.py
@@ -0,0 +1,34 @@
+import logging.handlers
+import os
+
+# Get env var
+path_logs_parameterization = os.getenv("PATH_LOGS_PARAMETERIZATION", "logs/log_app_selenium_{}.log")
+
+# Directory of logs
+directory = '/'.join(path_logs_parameterization.split("/")[:-1])
+os.makedirs(directory, exist_ok=True)
+
+logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
+logger = logging.getLogger("news_fetcher")
+logger.setLevel(logging.DEBUG)
+
+# To file log: DEBUG and above
+fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("debug"), mode="a", maxBytes=10000000, backupCount=1)
+fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
+fh.setLevel(logging.DEBUG)
+logger.addHandler(fh)
+
+# To file log: INFO and above
+fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("info"), mode="a", maxBytes=10000000, backupCount=1)
+fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
+fh.setLevel(logging.INFO)
+logger.addHandler(fh)
+
+# To file log: WARNING and above
+fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("warning"), mode="a", maxBytes=10000000, backupCount=1)
+fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
+fh.setLevel(logging.WARNING)
+logger.addHandler(fh)
+
+def get_logger():
+    return logger
diff --git a/app_selenium/missing_kids.py b/app_selenium/missing_kids.py
new file mode 100644
index 0000000..11b38ba
--- /dev/null
+++ b/app_selenium/missing_kids.py
@@ -0,0 +1,83 @@
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from utils import get_chrome_options
+import time
+import os
+
+from logger import get_logger
+logger = get_logger()
+
+class MissingKidsFetcher():
+    def __init__(self) -> None:
+        pass
+
+    def get_missing_kids_urls(self, first_n_pages=-1):
+        # Poster URL
+        url = "https://www.missingkids.org/gethelpnow/search/poster-search-results"
+        # URLs
+        set_urls = set()
+
+        try:
+            # Initialize
+            driver = webdriver.Chrome(options=get_chrome_options())
+            # Go to URL
+            driver.get(url)
+            # Iterate
+            i, continue_iterating, num_exceptions = 1, True, 0
+            while (continue_iterating):
+                logger.debug("Processing page: {}...".format(i))
+
+                try:
+                    time.sleep(int(os.getenv("SELENIUM_SLEEP_PER_PAGE", 4)))  # env values are strings; driver.implicitly_wait(3)
+                    # Fetch poster URLs
+                    for element_type in ["a"]:  # ["a", "p", "div"]:
+                        for elem in driver.find_elements(By.TAG_NAME, element_type):
+                            href = elem.get_attribute('href')
+                            if (href is not None) and ("missingkids.org/poster" in href):
+                                set_urls.add(href)
+
+                    logger.debug("#URLS: {}".format(len(set_urls)))
+
+                    # Next page
+                    elem = driver.find_element(By.LINK_TEXT, str(i+1))
+                    logger.debug("Clicking: {}...".format(elem.text))
+                    elem.click()
+                    # Ok
+                    processed_ok = True
+                except Exception as e:
+                    # +1 exception
+                    num_exceptions += 1
+                    processed_ok = False
+
+                    if (num_exceptions == 3):
+                        continue_iterating = False
+                    else:
+                        logger.info("Exception while clicking page {}, retrying...".format(i+1))
+
+                        start_print = False
+                        for link in driver.find_elements(By.PARTIAL_LINK_TEXT, ""):
+                            if (link.text == "<<"):
+                                start_print = True
+                            if (link.text == ">>"):
+                                break
+                            if (start_print):
+                                logger.info(link.text)
+
+                        # driver.refresh()
+                        time.sleep(int(os.getenv("SELENIUM_SLEEP_PER_PAGE", 4)))
+
+                if (i == first_n_pages):
+                    continue_iterating = False
+                if (processed_ok):
+                    i += 1
+                    num_exceptions = 0
+
+        except Exception as e:
+            logger.warning("Exception while clicking page {}. {}".format(i+1, str(e)), exc_info=True)
+        finally:
+            try:
+                driver.quit()  # quit() also terminates the chromedriver process
+            except Exception as e:
+                pass
+
+        return set_urls
diff --git a/app_selenium/utils.py b/app_selenium/utils.py
new file mode 100644
index 0000000..062c720
--- /dev/null
+++ b/app_selenium/utils.py
@@ -0,0 +1,14 @@
+from selenium.webdriver.chrome.options import Options
+
+def get_chrome_options():
+    """Sets Chrome options for Selenium.
+    Headless browser mode is enabled.
+    """
+    chrome_options = Options()
+    chrome_options.add_argument("--headless")
+    chrome_options.add_argument("--no-sandbox")
+    chrome_options.add_argument("--disable-dev-shm-usage")
+    chrome_prefs = {}
+    chrome_options.experimental_options["prefs"] = chrome_prefs
+    chrome_prefs["profile.default_content_settings"] = {"images": 2}
+    return chrome_options
diff --git a/app_urls/1-DB.ipynb b/app_urls/1-DB.ipynb
index dfb5123..46757c3 100644
--- a/app_urls/1-DB.ipynb
+++ b/app_urls/1-DB.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -11,16 +11,42 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "db_postgres\n",
+      "db_redis\n",
+      "\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 2/0\n",
+      " ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
+      " ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
+      "\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 2/4\n",
+      " ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
+      " ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
+      "\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 4/4\u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container db_redis \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m 
\u001b[34m0.0s \u001b[0m\n", + "\u001b[?25h" + ] + } + ], "source": [ - "!docker rm -f db_postgres db_redis; docker compose -f ../docker/docker-compose.yml up -d ; sleep 5" + "!docker rm -f db_postgres db_redis; docker compose -f ../docker/docker-compose.yml up -d ; sleep 5\n", + "!rm logs/*" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -37,7 +63,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -163,9 +189,41 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\t urls\n", + "[]\n", + "\t urls_duplicate\n", + "[]\n", + "\t urls_source_search\n", + "[]\n", + "\t source\n", + "[]\n", + "\t search\n", + "[(1,\n", + " 'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC',\n", + " 'rss_feed'),\n", + " (2, 'missingkids.org/poster', 'url_host'),\n", + " (3, 'missingkids.org/new-poster', 'url_host'),\n", + " (4, 'breitbart.com', 'url_host'),\n", + " (5, 'child abuse', 'keyword_search')]\n", + "\t status_pattern_matching\n", + "[('.*youtube\\\\.com/.*', 50, 'invalid'),\n", + " ('.*tiktok\\\\.com/.*', 50, 'invalid'),\n", + " ('.*twitter\\\\.com/.*', 50, 'invalid'),\n", + " ('.*reddit\\\\.com/.*', 50, 'invalid'),\n", + " ('.*libreddit\\\\.de/.*', 50, 'invalid'),\n", + " ('.*radio\\\\.foxnews\\\\.com/.*', 50, 'invalid')]\n", + "\t url_content\n", + "[]\n" + ] + } + ], "source": [ "# Connect to an existing database\n", "with psycopg.connect(connection_info) as conn:\n", @@ -182,9 +240,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(1,\n", + " 'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC',\n", + " 'rss_feed'),\n", + " (2, 'missingkids.org/poster', 'url_host'),\n", + " (3, 'missingkids.org/new-poster', 'url_host'),\n", + " (4, 'breitbart.com', 'url_host'),\n", + " (5, 'child abuse', 'keyword_search')]\n" + ] + } + ], "source": [ "# Connect to an existing database\n", "with psycopg.connect(connection_info) as conn:\n", @@ -195,9 +267,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[]\n" + ] + } + ], "source": [ "# Connect to an existing database\n", "with psycopg.connect(connection_info) as conn:\n", @@ -209,9 +289,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'\\n!docker rm -f db_redis; docker compose -f ../docker/docker-compose.yml up -d\\n\\n# Connect to an existing database\\nwith psycopg.connect(connection_info) as conn:\\n # Open a cursor to perform database operations\\n with conn.cursor() as cur:\\n pprint( cur.execute(\"TRUNCATE URLS, URL_CONTENT, URLS_SOURCE_SEARCH, URLS_DUPLICATE;\") )\\n # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES (\\'missingkids.org\\', \\'url_host\\');\" )\\n'" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "'''\n", "!docker rm -f db_redis; docker compose -f 
../docker/docker-compose.yml up -d\n",
diff --git a/app_urls/README.md b/app_urls/README.md
index a23b61e..c9d6b39 100644
--- a/app_urls/README.md
+++ b/app_urls/README.md
@@ -96,6 +96,9 @@ FETCHER_GNEWS_DECODE_SLEEP=2
 FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=4
 FETCHER_BETWEEN_SEARCHES_SLEEP=5
 FETCHER_URL_HOST_SLEEP=5
+FETCHER_LANGUAGE_DETECTION_MIN_CHAR=100
+
+SELENIUM_ENDPOINT="http://selenium_app:80"
 ```
 
 * Deploy
diff --git a/app_urls/api/src/fetch_missing_kids.py b/app_urls/api/src/fetch_missing_kids.py
new file mode 100644
index 0000000..c77cd98
--- /dev/null
+++ b/app_urls/api/src/fetch_missing_kids.py
@@ -0,0 +1,42 @@
+from .db_utils import DB_Handler
+from ..models import Search, Source
+import os
+import requests
+import json
+import traceback
+from .logger import get_logger
+logger = get_logger()
+
+class FetchMissingKids():
+    def __init__(self) -> None:
+        logger.debug("Initializing Fetcher MissingKids")
+
+    def run(self, number_pages=-1):
+        try:
+            logger.debug("Starting MissingKids.run(), processing #{} pages".format(number_pages))
+
+            # Get source object
+            obj_source, created = Source.objects.get_or_create(source="missingkids.org")
+            # Get search object
+            obj_search, created = Search.objects.get_or_create(search="missingkids.org/poster", type=Search.TYPE_ENUM.URL_HOST)
+
+            try:
+                # Missing kids fetch endpoint; the number of pages to fetch is passed as a query parameter
+                missingkids_fetch_endpoint = "{}/get_missing_kids/?pages={}".format(os.getenv("SELENIUM_ENDPOINT", "http://localhost:80").rstrip("/"), number_pages)
+                # Timeout: fetching all (or many) pages can take a long time
+                if (number_pages > 15) or (number_pages == -1):
+                    timeout = 60*90 # 1.5h
+                else:
+                    timeout = 60*10 # 10 min
+                # Request
+                r = requests.get(missingkids_fetch_endpoint, timeout=timeout)
+                # Decode
+                urls_fetched = json.loads(r.text).get("list_urls", [])
+            except Exception as e:
+                logger.warning("Request failed (possibly timed out): {}. 
{}".format(missingkids_fetch_endpoint, str(e))) + urls_fetched = [] + + # Write to DB + DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search) + except Exception as e: + logger.warning("Exception in MissingKids.run(): {}\n{}".format(e, traceback.format_exc())) diff --git a/app_urls/api/src/fetch_utils.py b/app_urls/api/src/fetch_utils.py index e5c1346..829ec35 100644 --- a/app_urls/api/src/fetch_utils.py +++ b/app_urls/api/src/fetch_utils.py @@ -1,4 +1,3 @@ -import traceback import os from django.core.cache import cache from .logger import get_logger @@ -30,7 +29,7 @@ def decode_gnews_urls(encoded_urls, interval=int(os.getenv("FETCHER_GNEWS_DECODE # Cache decoded URL cache.set("gnews_decode_{}".format(url), decoded_url, timeout=60*60*12) else: - logger.info("Bad status while decoding news.google.com, URL {}".format(url)) + logger.info("Bad status while decoding news.google.com, URL {}\n{}".format(url, decoded_url_dict.get("message"))) except Exception as e: logger.warning("Error decoding news.google.com, URL: {}".format(url)) return list_decoded_urls \ No newline at end of file diff --git a/app_urls/api/src/url_processor.py b/app_urls/api/src/url_processor.py index d8d67bb..a4424bf 100644 --- a/app_urls/api/src/url_processor.py +++ b/app_urls/api/src/url_processor.py @@ -69,6 +69,16 @@ def process_url(url): except Exception as e: logger.warning("Exception for input URL {}\n{}".format(url, str(e))) return None + + try: + content_merged = "\n".join([article.title, article.meta_description, article.text]) + if (len(content_merged) > int(os.getenv("FETCHER_LANGUAGE_DETECTION_MIN_CHAR", 100))): + language = langdetect.detect(content_merged) + else: + language = None + except Exception as e: + logger.info("Could not detect language: {}\n{}".format(url, str(e))) + language = None dict_data = { "url": url, @@ -76,8 +86,7 @@ def process_url(url): "url_host": article.source_url, "site_name": article.meta_site_name, "publish_date": article.publish_date, - # article.meta_lang -> Not always reliable - "language": langdetect.detect("\n".join([article.title, article.meta_description, article.text]) ), + "language": language, # article.meta_lang -> Not always reliable "title": article.title, "description": article.meta_description, "content": article.text, diff --git a/app_urls/api/tasks.py b/app_urls/api/tasks.py index 88e5a18..09fb107 100644 --- a/app_urls/api/tasks.py +++ b/app_urls/api/tasks.py @@ -3,10 +3,8 @@ from scheduler import job from .src.fetch_feed import FetchFeeds from .src.fetch_parser import FetchParser from .src.fetch_search import FetchSearcher +from .src.fetch_missing_kids import FetchMissingKids from .src.db_utils import DB_Handler -''' -from src.missing_kids_fetch import MissingKidsFetch -''' from .src.logger import get_logger logger = get_logger() @@ -32,7 +30,19 @@ def fetch_search(): FetchSearcher().run() logger.info("Task completed: {}".format(task)) -# TODO: fetch_missing_kids() +@job('default') +def fetch_missing_kids(number_pages=5): + task = "Fetch MissingKids" + logger.info("Task triggered: {}".format(task)) + FetchMissingKids().run(number_pages) + logger.info("Task completed: {}".format(task)) + +@job('default') +def fetch_missing_kids_all(number_pages=-1): + task = "Fetch MissingKids" + logger.info("Task triggered: {}".format(task)) + FetchMissingKids().run(number_pages) + logger.info("Task completed: {}".format(task)) @job('default') def process_raw_urls(batch_size=50): @@ -77,8 +87,15 @@ def background_task(process_type: str): FetchParser().run() elif 
(process_type == "fetch_search"): FetchSearcher().run() - #elif (process_type == "fetch_missingkids"): - # FetchMissingKids().run() + elif (process_type == "fetch_missingkids_all"): + FetchMissingKids().run(number_pages=-1) + elif ("fetch_missingkids" in process_type): + # number_pages encoded in URL + try: + number_pages = int(process_type.split("_")[-1]) + except Exception as e: + number_pages = -1 + FetchMissingKids().run(number_pages=number_pages) elif ("process_" in process_type): # Batch size encoded in URL try: @@ -95,14 +112,6 @@ def background_task(process_type: str): else: logger.info("Task unknown!: {}".format(process_type)) - ''' - # Selenium based - elif (process_type == "fetch_missing_kids_reduced"): - MissingKidsFetch(db_handler, num_pages=4).run() - elif (process_type == "fetch_missing_kids_full"): - MissingKidsFetch(db_handler, num_pages=100000).run() - ''' - logger.info("Task completed: {}".format(process_type)) except Exception as e: logger.error(e) diff --git a/app_urls/api/templates/filtered_urls.html b/app_urls/api/templates/filtered_urls.html index 2d983c6..be3e5bd 100644 --- a/app_urls/api/templates/filtered_urls.html +++ b/app_urls/api/templates/filtered_urls.html @@ -258,7 +258,7 @@ input[type="checkbox"] { OFF - --> + -->

Pages Per Page

@@ -297,6 +297,17 @@ input[type="checkbox"] {
{% endfor %} + +

Valid content

+
+ {% for vc in valid_contents %} +
+ {% endfor %} +

Search


@@ -329,7 +340,7 @@ input[type="checkbox"] { {{ lang|truncatechars:50 }}
{% endfor %} - + @@ -526,10 +537,6 @@ input[type="checkbox"] { const checkboxes = document.querySelectorAll(`[name='${section}']`); const allChecked = Array.from(checkboxes).every(checkbox => checkbox.checked); checkboxes.forEach(cb => cb.checked = !allChecked); - /* - // Automatically submit the form when a checkbox is toggled - document.getElementById('filterForm').submit(); - */ updateFormParameter(section); } @@ -545,9 +552,6 @@ input[type="checkbox"] { // Automatically submit the form when any checkbox changes document.querySelectorAll('input[type="checkbox"]').forEach(function(checkbox) { checkbox.addEventListener('change', function() { - /* - document.getElementById('filterForm').submit(); - */ updateFormParameter(this.name); }); }); diff --git a/app_urls/api/views.py b/app_urls/api/views.py index 22bf190..f8c3c86 100644 --- a/app_urls/api/views.py +++ b/app_urls/api/views.py @@ -15,7 +15,7 @@ def trigger_task(request, task): #################################################################################################### def link_list(request): prefix = "http://localhost:8000/task" - links = ["fetch_feeds", "fetch_parser", "fetch_search", "process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"] + links = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all", "process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"] list_links = [ # DB @@ -212,21 +212,26 @@ def filtered_urls(request): # TODO: Cache languages, update once every N languages = list(UrlContent.objects.distinct('language').values_list('language', flat=True)) # Null for visualization - languages = ["Null"] + [l for l in languages if l is not None] + languages = ["Unknown"] + [l for l in languages if l is not None] + valid_contents = ["True", "False", "Unknown"] # Get selected parameters selected_status = request.GET.getlist('status', ["null"]) selected_search = request.GET.getlist('search', ["null"]) selected_source = request.GET.getlist('source', ["null"]) selected_language = request.GET.getlist('language', ["null"]) + selected_valid_contents = request.GET.getlist('valid_content', ["null"]) selected_days = request.GET.get("days", 30) per_page = request.GET.get('per_page', 100) # Default is X URLs per page page_number = request.GET.get('page') # Get the current page number + all_status = [str(status[0]) for status in statuses] all_search = [str(search.id) for search in searches] all_source = [str(source.id) for source in sources] all_languages = languages + all_valid_contents = valid_contents + # Override with default filters? [Case: no params update on URL] -> Only on "Home" click, or "Next page" if (len(request.GET.keys()) == 0) or ((len(request.GET.keys()) == 1) and ("page" in request.GET.keys())): @@ -234,23 +239,22 @@ def filtered_urls(request): selected_search = ["all"] selected_source = ["all"] selected_language = ["all"] - - - # print(set(selected_status), set(all_status)) - """ - # List of TODO remove... 
-    if (set(selected_status) == set(all_status)):
-        selected_status = ["all"]
-    if (set(selected_search) == set(all_search)):
-        selected_search = ["all"]
-    if (set(selected_source) == set(all_source)):
-        selected_source = ["all"]
-    if (set(selected_language) == set(languages)):
-        selected_language = ["all"]"
-    """
+        selected_valid_contents = ["all"]
+    else:
+        # Non-default parameters: if a list contains all elements, replace it with "all" to avoid a heavy query
+        if (set(selected_status) == set(all_status)):
+            selected_status = ["all"]
+        if (set(selected_search) == set(all_search)):
+            selected_search = ["all"]
+        if (set(selected_source) == set(all_source)):
+            selected_source = ["all"]
+        if (set(selected_language) == set(all_languages)):
+            selected_language = ["all"]
+        if (set(selected_valid_contents) == set(all_valid_contents)):
+            selected_valid_contents = ["all"]
 
     # Filter URLs based on selected filters
-    if ('null' in selected_status) or ('null' in selected_search) or ('null' in selected_source) or ('null' in selected_language):
+    if ('null' in selected_status) or ('null' in selected_search) or ('null' in selected_source) or ('null' in selected_language) or ('null' in selected_valid_contents):
         urls = []
     else:
         # Filter by date
@@ -262,18 +266,36 @@ def filtered_urls(request):
             query &= Q(urlssourcesearch__id_source__in=selected_source)
         if ("all" not in selected_search):
             query &= Q(urlssourcesearch__id_search__in=selected_search)
-        if ("all" not in selected_language):
+        if ("all" not in selected_language): # URLs with selected languages
             subquery = Q(urlcontent__language__in=selected_language)
-            if ("Null" in selected_language):
+            if ("Unknown" in selected_language):
                 # URLs with NULL language
                 subquery |= Q(urlcontent__language__isnull=True)
                 # URLs with no UrlContent record at all (similar to URLs with NULL language)
                 subquery |= Q(urlcontent__id_url__isnull=True)
             # Update query
             query &= (subquery)
+        if ("all" not in selected_valid_contents):
+            # Boolean array
+            bool_array = []
+            if ('True' in selected_valid_contents):
+                bool_array.append(True)
+            if ('False' in selected_valid_contents):
+                bool_array.append(False)
+            # URLs with selected valid_contents
+            subquery = Q(urlcontent__valid_content__in=bool_array)
+            if ("Unknown" in selected_valid_contents):
+                # URLs with NULL valid_content
+                subquery |= Q(urlcontent__valid_content__isnull=True)
+                # URLs with no UrlContent record at all (similar to URLs with NULL valid_content)
+                subquery |= Q(urlcontent__id_url__isnull=True)
+            # Update query
+            query &= (subquery)
+
         # Run query
         urls = Urls.objects.filter(query).distinct() # .order_by('-ts_fetch')
+        # print(urls.query)
 
     # Pagination
     paginator = Paginator(urls, per_page) # Paginate the filtered URLs
@@ -300,11 +322,13 @@ def filtered_urls(request):
         'searches': sorted(searches, key=lambda x: (x.type, x.search)),
         'sources': sorted(sources, key=lambda x: x.source),
         'languages': sorted(languages, key=lambda x: (x is None, x)),
+        'valid_contents': valid_contents,
         # Selection
         'selected_status': selected_status,
         'selected_search': selected_search,
         'selected_source': selected_source,
         'selected_language': selected_language,
+        'selected_valid_contents': selected_valid_contents,
         "selected_days": selected_days,
         # Map
         "sources_map": sources_map,
diff --git a/docker/docker-compose.yml b/docker-compose.yml
similarity index 54%
rename from docker/docker-compose.yml
rename to docker-compose.yml
index de4ea11..252c4c7 100644
--- a/docker/docker-compose.yml
+++ b/docker-compose.yml
@@ -2,7 +2,48 @@ version: '3.9'
 
 services:
 
-  matitos_db:
+  fetcher_selenium:
+    build:
+      context: ./app_selenium
+    container_name: selenium_app
+    restart: unless-stopped
+    shm_size: 512mb
+    environment:
+      - SELENIUM_SLEEP_PER_PAGE=4
+      - PATH_LOGS_PARAMETERIZATION=logs/log_app_selenium_{}.log
+    ports:
+      - 80
+
+  fetcher_urls_app:
+    build:
+      context: ./app_urls
+    container_name: urls_app
+    restart: unless-stopped
+    environment:
+      #- name=value
+      # Database
+      - DB_NAME=${DB_NAME:-matitos}
+      - DB_USER=${DB_USER:-supermatitos}
+      - DB_PASSWORD=${DB_PASSWORD:-supermatitos}
+      - DB_HOST=${DB_HOST:-localhost} # db_postgres
+      - DB_PORT=${DB_PORT:-5432}
+      - REDIS_HOST=${REDIS_HOST:-localhost}
+      - REDIS_PORT=${REDIS_PORT:-6379}
+      # Job timeout: 30 min
+      - JOB_DEFAULT_TIMEOUT=${RQ_DEFAULT_TIMEOUT:-1800}
+      # Logs path (unquoted: in list syntax, quotes would be kept as part of the value)
+      - PATH_LOGS_PARAMETERIZATION=logs/log_app_fetcher_{}.log
+      # Fetcher
+      - FETCHER_GNEWS_DECODE_SLEEP=2
+      - FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=4
+      - FETCHER_BETWEEN_SEARCHES_SLEEP=5
+      - FETCHER_URL_HOST_SLEEP=5
+      # Selenium
+      - SELENIUM_ENDPOINT=http://selenium_app:80
+    ports:
+      - 80
+
+  fetcher_db:
     image: postgres:17
     container_name: db_postgres
     restart: unless-stopped
@@ -18,7 +59,7 @@ services:
     ports:
       - 5432:5432
 
-  matitos_redis:
+  fetcher_redis:
     image: redis:alpine
     container_name: db_redis
     restart: unless-stopped
@@ -27,7 +68,7 @@ services:
     #expose:
     #  - 6379
 
-  matitos_adminer:
+  fetcher_adminer:
     # http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public
     image: adminer
     container_name: adminer
@@ -41,7 +82,7 @@ services:
     ports:
       - 8080:8080
 
-  matitos_dozzle:
+  fetcher_dozzle:
     container_name: dozzle
     image: amir20/dozzle:latest
    volumes:
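
Note: in docker-compose's list-style `environment:` entries, surrounding quotes are not stripped — `- KEY="value"` delivers the literal string `"value"` to the container. That is why `PATH_LOGS_PARAMETERIZATION` and `SELENIUM_ENDPOINT` are written without quotes in docker-compose.yml. A defensive normalization on the consuming side could look like this (illustrative sketch, not part of the patch):

```python
import os

raw = os.getenv("PATH_LOGS_PARAMETERIZATION", "logs/log_app_selenium_{}.log")
# Tolerate values that arrive wrapped in literal quotes from a compose file
path_logs_parameterization = raw.strip("'\"")
```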
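Note: `app_selenium/logger.py` fans every record out to three rotating files, filtered by level. A quick usage check (file names assume the default `PATH_LOGS_PARAMETERIZATION` of `logs/log_app_selenium_{}.log`):

```python
from logger import get_logger  # app_selenium/logger.py

logger = get_logger()
logger.debug("lands only in logs/log_app_selenium_debug.log")
logger.info("lands in the debug and info files")
logger.warning("lands in the debug, info and warning files")
```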
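Note: the `url_processor.py` change only trusts `langdetect` once enough text has been extracted, since language detection on very short strings is noisy. The same guard in isolation (a minimal sketch; the function name and the None fallback are illustrative):

```python
import os
import langdetect

def detect_language_or_none(title, description, text):
    # Below FETCHER_LANGUAGE_DETECTION_MIN_CHAR characters, langdetect output
    # is too unreliable, so None (NULL) is stored instead of a guessed language.
    merged = "\n".join([title or "", description or "", text or ""])
    if len(merged) <= int(os.getenv("FETCHER_LANGUAGE_DETECTION_MIN_CHAR", 100)):
        return None
    try:
        return langdetect.detect(merged)
    except Exception:
        # langdetect raises LangDetectException on e.g. digits-only input
        return None
```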
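Note: the `status_pattern_matching` rows seeded in `1-DB.ipynb` map URL regexes (e.g. `.*youtube\.com/.*`) to a status such as `invalid`. A minimal sketch of how such rows can be applied (the helper below is hypothetical; the real matching lives in the Django app):

```python
import re

# (pattern, status) pairs as stored in the status_pattern_matching table
STATUS_PATTERNS = [
    (r".*youtube\.com/.*", "invalid"),
    (r".*tiktok\.com/.*", "invalid"),
]

def match_status(url, default=None):
    # Return the status of the first matching pattern, else the default
    for pattern, status in STATUS_PATTERNS:
        if re.fullmatch(pattern, url):
            return status
    return default

assert match_status("https://www.youtube.com/watch?v=abc") == "invalid"
assert match_status("https://www.missingkids.org/poster/NCMC/12345/1") is None
```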