diff --git a/1-DB.ipynb b/1-DB.ipynb index b314fb8..ebb4d82 100644 --- a/1-DB.ipynb +++ b/1-DB.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 131, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -11,38 +11,16 @@ }, { "cell_type": "code", - "execution_count": 132, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "db_postgres\n", - "db_redis\n", - "\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 1/0\n", - " ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n", - " ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n", - " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n", - "\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 1/3\n", - " ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n", - " ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n", - " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n", - "\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 3/3\u001b[0m\n", - " \u001b[32m✔\u001b[0m Container db_redis \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n", - " \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n", - " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n", - "\u001b[?25h" - ] - } - ], + "outputs": [], "source": [ "!docker rm -f db_postgres db_redis; docker compose -f docker/docker-compose.yml up -d ; sleep 5" ] }, { "cell_type": "code", - "execution_count": 133, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -53,7 +31,7 @@ "connection_info = \"host={} port={} user={} password={} dbname={}\".format(\"localhost\", \"5432\", \"supermatitos\", \"supermatitos\", \"matitos\")\n", "\n", "from datetime import datetime, timezone\n", - "\n", + "import re\n", "\n", "if INSERT_TABLES:\n", " # Connect to an existing database\n", @@ -107,11 +85,6 @@ " );\n", " CREATE INDEX idx_source ON urls_source(id_source);\n", "\n", - " CREATE TABLE WEBSITE_TO_FILTER (\n", - " id SMALLSERIAL PRIMARY KEY,\n", - " url_host TEXT NOT NULL UNIQUE\n", - " );\n", - "\n", " CREATE TABLE STATUS_PATTERN_MATCHING (\n", " pattern TEXT PRIMARY KEY,\n", " priority SMALLINT NOT NULL,\n", @@ -148,22 +121,23 @@ " cur.execute( \"INSERT INTO FEED (rss_feed) VALUES ('https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC');\" )\n", " # Websites of interest\n", " cur.execute( \"INSERT INTO WEBSITE_OF_INTEREST (url_host) VALUES ('www.unicef.org');\" )\n", + " cur.execute( \"INSERT INTO WEBSITE_OF_INTEREST (url_host) VALUES ('www.breitbart.com/');\" )\n", " # Search keywords\n", " cur.execute( \"INSERT INTO SEARCH (keyword_search) VALUES ('child abuse');\" )\n", - " # Domains to filter\n", - " cur.execute( \"INSERT INTO WEBSITE_TO_FILTER (url_host) VALUES ('yewtu.be');\" )\n", - " cur.execute( \"INSERT INTO WEBSITE_TO_FILTER (url_host) VALUES ('twitter.com');\" )\n", - " cur.execute( \"INSERT INTO WEBSITE_TO_FILTER (url_host) VALUES ('libreddit.de');\" )\n", - " cur.execute( \"INSERT INTO WEBSITE_TO_FILTER (url_host) VALUES ('youtube.com');\" )\n", - " cur.execute( \"INSERT INTO WEBSITE_TO_FILTER (url_host) VALUES ('tiktok.com');\" )\n", - " cur.execute( \"INSERT INTO 
WEBSITE_TO_FILTER (url_host) VALUES ('radio.foxnews.com');\" )\n", - " # Status update based on pattern matching (with priority to apply in order)\n", - " cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*missingkids.org/poster/.*', 50, 'valid');\" )" + " \n", + " # Status update based on pattern matching (with priority to apply in order). Regex test https://regex101.com/\n", + " # cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 75, 'valid');\".format(\".*{}.*\".format(re.escape(\"missingkids.org/poster/\"))) )\n", + " cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"youtube.com/\"))) )\n", + " cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"tiktok.com/\"))) )\n", + " cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"twitter.com/\"))) )\n", + " cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"reddit.com/\"))) )\n", + " cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"libreddit.de/\"))) )\n", + " cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"radio.foxnews.com/\"))) )" ] }, { "cell_type": "code", - "execution_count": 134, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -215,115 +189,9 @@ }, { "cell_type": "code", - "execution_count": 135, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\t urls\n", - "[(1,\n", - " 'https://www.foxnews.com/us/husband-ruby-franke-utah-mommy-blogger-convicted-child-abuse-regrets-wifes-fall-fame',\n", - " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", - " 'valid'),\n", - " (2,\n", - " 'https://www.bbc.com/news/articles/ckg843y8y7no',\n", - " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", - " 'valid'),\n", - " (3,\n", - " 'https://www.wilx.com/2025/03/05/lenawee-county-man-arrested-possessing-child-abuse-material/',\n", - " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", - " 'valid'),\n", - " (4,\n", - " 'https://www.dw.com/en/trauma-how-child-abuse-victims-deal-with-parenthood/a-71833895',\n", - " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", - " 'valid'),\n", - " (5,\n", - " 'https://nypost.com/2025/03/06/us-news/colorado-day-care-worker-hit-with-51-charges-of-child-abuse-harassment-for-slapping-toddler/',\n", - " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", - " 'valid'),\n", - " (6,\n", - " 'https://www.fox35orlando.com/news/tavares-police-florida-boys-10-9-abused-sheer-brutality',\n", - " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", - " 'valid'),\n", - " (7,\n", - " 'https://www.google.com',\n", - " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", - " 
'invalid'),\n", - " (8,\n", - " 'www.super_0.org',\n", - " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", - " 'invalid'),\n", - " (9,\n", - " 'www.super_1.org',\n", - " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", - " 'invalid'),\n", - " (10,\n", - " 'www.super_2.org',\n", - " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", - " 'invalid'),\n", - " (11,\n", - " 'www.super_3.org',\n", - " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", - " 'invalid'),\n", - " (12,\n", - " 'www.super_4.org',\n", - " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", - " 'invalid'),\n", - " (13,\n", - " 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf_23dj9sdgj9sdgj8sdf8ds8f.html',\n", - " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", - " 'invalid'),\n", - " (14,\n", - " 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf.html',\n", - " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", - " 'invalid')]\n", - "\t urls_duplicate\n", - "[]\n", - "\t feed\n", - "[(1,\n", - " 'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC')]\n", - "\t website_of_interest\n", - "[(1, 'www.unicef.org')]\n", - "\t search\n", - "[(1, 'child abuse')]\n", - "\t urls_source\n", - "[(1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (1, 2), (2, 2), (3, 2)]\n", - "\t source\n", - "[(1, 'news.google.com'), (2, 'qwant.com')]\n", - "\t website_to_filter\n", - "[(1, 'yewtu.be'),\n", - " (2, 'twitter.com'),\n", - " (3, 'libreddit.de'),\n", - " (4, 'youtube.com'),\n", - " (5, 'tiktok.com'),\n", - " (6, 'radio.foxnews.com')]\n", - "\t status_pattern_matching\n", - "[('.*missingkids.org/poster/.*', 50, 'valid')]\n", - "\t url_content\n", - "[(1,\n", - " datetime.datetime(2025, 3, 13, 17, 19, 5, 639334, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", - " 'Mommy blogger turned child abuser',\n", - " 'Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla '\n", - " 'Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla '\n", - " 'Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla '\n", - " 'Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla '\n", - " 'Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!',\n", - " 'Hello there!',\n", - " None,\n", - " 'en',\n", - " None,\n", - " ['child abuse', 'social media'],\n", - " ['Audrey Conklin'],\n", - " None,\n", - " ['https://a57.foxnews.com/static.foxnews.com/foxnews.com/content/uploads/2023/08/1440/810/image-58.jpg?ve=1&tl=1'],\n", - " None,\n", - " None,\n", - " None)]\n" - ] - } - ], + "outputs": [], "source": [ "from pprint import pprint\n", "\n", @@ -349,72 +217,9 @@ }, { "cell_type": "code", - "execution_count": 136, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[(1,\n", - " 'https://www.foxnews.com/us/husband-ruby-franke-utah-mommy-blogger-convicted-child-abuse-regrets-wifes-fall-fame',\n", - " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", - " 'valid'),\n", - " (2,\n", - " 'https://www.bbc.com/news/articles/ckg843y8y7no',\n", - " datetime.datetime(2025, 
3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", - " 'valid'),\n", - " (3,\n", - " 'https://www.wilx.com/2025/03/05/lenawee-county-man-arrested-possessing-child-abuse-material/',\n", - " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", - " 'valid'),\n", - " (4,\n", - " 'https://www.dw.com/en/trauma-how-child-abuse-victims-deal-with-parenthood/a-71833895',\n", - " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", - " 'valid'),\n", - " (5,\n", - " 'https://nypost.com/2025/03/06/us-news/colorado-day-care-worker-hit-with-51-charges-of-child-abuse-harassment-for-slapping-toddler/',\n", - " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", - " 'valid'),\n", - " (6,\n", - " 'https://www.fox35orlando.com/news/tavares-police-florida-boys-10-9-abused-sheer-brutality',\n", - " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", - " 'valid'),\n", - " (7,\n", - " 'https://www.google.com',\n", - " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", - " 'invalid'),\n", - " (8,\n", - " 'www.super_0.org',\n", - " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", - " 'invalid'),\n", - " (9,\n", - " 'www.super_1.org',\n", - " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", - " 'invalid'),\n", - " (10,\n", - " 'www.super_2.org',\n", - " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", - " 'invalid'),\n", - " (11,\n", - " 'www.super_3.org',\n", - " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", - " 'invalid'),\n", - " (12,\n", - " 'www.super_4.org',\n", - " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", - " 'invalid'),\n", - " (13,\n", - " 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf_23dj9sdgj9sdgj8sdf8ds8f.html',\n", - " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", - " 'invalid'),\n", - " (14,\n", - " 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf.html',\n", - " datetime.datetime(2025, 3, 13, 17, 19, 4, 379696, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n", - " 'invalid')]\n" - ] - } - ], + "outputs": [], "source": [ "from pprint import pprint\n", "\n", @@ -422,7 +227,8 @@ "with psycopg.connect(connection_info) as conn:\n", " # Open a cursor to perform database operations\n", " with conn.cursor() as cur:\n", - " pprint( cur.execute(\"SELECT * FROM URLS LIMIT 150;\").fetchall() )" + " pprint( cur.execute(\"SELECT * FROM URLS LIMIT 150;\").fetchall() )\n", + " #pprint( cur.execute(\"SELECT id_url, title, valid_content FROM URL_CONTENT LIMIT 10;\").fetchall() )" ] } ], diff --git a/app_fetcher/Dev.ipynb b/OBSOLETE_app_fetcher/Dev.ipynb similarity index 100% rename from app_fetcher/Dev.ipynb rename to OBSOLETE_app_fetcher/Dev.ipynb diff --git a/app_fetcher/Dockerfile b/OBSOLETE_app_fetcher/Dockerfile similarity index 100% rename from app_fetcher/Dockerfile rename to OBSOLETE_app_fetcher/Dockerfile diff --git a/app_fetcher/README.md b/OBSOLETE_app_fetcher/README.md similarity index 100% rename from app_fetcher/README.md rename to OBSOLETE_app_fetcher/README.md diff --git a/app_fetcher/app.py b/OBSOLETE_app_fetcher/app.py 
similarity index 100% rename from app_fetcher/app.py rename to OBSOLETE_app_fetcher/app.py diff --git a/app_fetcher/src/db_utils.py b/OBSOLETE_app_fetcher/src/db_utils.py similarity index 100% rename from app_fetcher/src/db_utils.py rename to OBSOLETE_app_fetcher/src/db_utils.py diff --git a/app_fetcher/src/fetch_feed.py b/OBSOLETE_app_fetcher/src/fetch_feed.py similarity index 100% rename from app_fetcher/src/fetch_feed.py rename to OBSOLETE_app_fetcher/src/fetch_feed.py diff --git a/app_fetcher/src/fetch_parser.py b/OBSOLETE_app_fetcher/src/fetch_parser.py similarity index 100% rename from app_fetcher/src/fetch_parser.py rename to OBSOLETE_app_fetcher/src/fetch_parser.py diff --git a/app_fetcher/src/fetch_search.py b/OBSOLETE_app_fetcher/src/fetch_search.py similarity index 100% rename from app_fetcher/src/fetch_search.py rename to OBSOLETE_app_fetcher/src/fetch_search.py diff --git a/app_fetcher/src/fetch_search_sources.py b/OBSOLETE_app_fetcher/src/fetch_search_sources.py similarity index 100% rename from app_fetcher/src/fetch_search_sources.py rename to OBSOLETE_app_fetcher/src/fetch_search_sources.py diff --git a/app_fetcher/src/google_bypass.py b/OBSOLETE_app_fetcher/src/google_bypass.py similarity index 100% rename from app_fetcher/src/google_bypass.py rename to OBSOLETE_app_fetcher/src/google_bypass.py diff --git a/app_fetcher/src/logger.py b/OBSOLETE_app_fetcher/src/logger.py similarity index 100% rename from app_fetcher/src/logger.py rename to OBSOLETE_app_fetcher/src/logger.py diff --git a/app_fetcher/src/missing_kids_fetch.py b/OBSOLETE_app_fetcher/src/missing_kids_fetch.py similarity index 100% rename from app_fetcher/src/missing_kids_fetch.py rename to OBSOLETE_app_fetcher/src/missing_kids_fetch.py diff --git a/app_fetcher/src/missing_kids_status.py b/OBSOLETE_app_fetcher/src/missing_kids_status.py similarity index 100% rename from app_fetcher/src/missing_kids_status.py rename to OBSOLETE_app_fetcher/src/missing_kids_status.py diff --git a/app_fetcher/src/url_status.py b/OBSOLETE_app_fetcher/src/url_status.py similarity index 100% rename from app_fetcher/src/url_status.py rename to OBSOLETE_app_fetcher/src/url_status.py diff --git a/app_fetcher/src/url_utils.py b/OBSOLETE_app_fetcher/src/url_utils.py similarity index 100% rename from app_fetcher/src/url_utils.py rename to OBSOLETE_app_fetcher/src/url_utils.py diff --git a/app_fetcher/src/utils.py b/OBSOLETE_app_fetcher/src/utils.py similarity index 100% rename from app_fetcher/src/utils.py rename to OBSOLETE_app_fetcher/src/utils.py diff --git a/app_urls/README.md b/app_urls/README.md index a7f8a3f..eadabd0 100644 --- a/app_urls/README.md +++ b/app_urls/README.md @@ -72,7 +72,10 @@ python manage.py runserver # Worker python manage.py rqworker default -while true; do python manage.py rqworker default --burst; sleep 5; done +while true; do python manage.py rqworker default --burst -v 0; sleep 5; done + +# Visualize DB +http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id ``` * Utils diff --git a/app_urls/api/models.py b/app_urls/api/models.py index 44d9cee..b51fcf9 100644 --- a/app_urls/api/models.py +++ b/app_urls/api/models.py @@ -106,12 +106,3 @@ class WebsiteOfInterest(models.Model): class Meta: managed = False db_table = 'website_of_interest' - - -class WebsiteToFilter(models.Model): - id = models.SmallAutoField(primary_key=True) - url_host = models.TextField(unique=True) - - class Meta: - managed = False - db_table = 'website_to_filter' \ No newline at end of file 
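
Note on the filtering change above: with the WEBSITE_TO_FILTER table and its WebsiteToFilter model removed, domain filtering now relies on the STATUS_PATTERN_MATCHING regex rows inserted in 1-DB.ipynb and matched in db_utils.py below. The following standalone sketch illustrates that matching logic; the sample rules are only examples modeled on the notebook inserts (a live database may hold different rows and priorities).

```python
import re

# Sample (pattern, priority, status) rows, modeled on the 1-DB.ipynb inserts (illustrative only)
rules = [
    (".*{}.*".format(re.escape("youtube.com/")), 50, "invalid"),
    (".*{}.*".format(re.escape("tiktok.com/")), 50, "invalid"),
    (".*{}.*".format(re.escape("missingkids.org/poster/")), 75, "valid"),
]

def status_for(url):
    # Highest-priority matching pattern wins, as in _get_status_pattern_matching below
    for pattern, priority, status in sorted(rules, key=lambda t: t[1], reverse=True):
        if re.match(pattern, url):
            return status
    # No match: leave the status to the normal processing path
    return None

print(status_for("https://www.youtube.com/watch?v=abc"))  # -> invalid
print(status_for("https://www.bbc.com/news/articles/x"))  # -> None
```
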
diff --git a/app_urls/api/src/db_utils.py b/app_urls/api/src/db_utils.py index 5b9dfdb..3e360db 100644 --- a/app_urls/api/src/db_utils.py +++ b/app_urls/api/src/db_utils.py @@ -1,11 +1,10 @@ -from ..models import Urls, UrlContent, UrlsSource, Source, WebsiteToFilter, StatusPatternMatching +from ..models import Urls, UrlContent, UrlsSource, UrlsDuplicate, Source, StatusPatternMatching from .url_processor import process_url -from django.utils import timezone from django.core.cache import cache from django.db import IntegrityError import hashlib -from datetime import timedelta import re +import time import traceback from .logger import get_logger logger = get_logger() @@ -13,17 +12,29 @@ logger = get_logger() class DB_Handler(): def __init__(self): logger.debug("Initializing URL DB Handler") + # Inserting raw URL, cache time: 1 day + self._cache_timeout_insert_url = 86400 + # Processing error URL, cache time: 2 days + self._cache_timeout_error_url = 86400*2 + # URL host slowdown + self.url_host_slowdown_seconds = 5 def _get_safe_cache_key(self, raw_key): """Generate a safe cache key using an MD5 hash""" return hashlib.md5(raw_key.encode()).hexdigest() - def _cache_key(self, cache_key, cache_timeout=86400): - cache.set(self._get_safe_cache_key(cache_key), True, timeout=cache_timeout) + def _cache_key(self, cache_key, hash_encode, cache_timeout): + if (hash_encode): + cache.set(self._get_safe_cache_key(cache_key), True, timeout=cache_timeout) + else: + cache.set(cache_key, True, timeout=cache_timeout) - def _is_cached_key(self, cache_key): + def _is_cached_key(self, cache_key, hash_encoded): # Returns True if cached - return cache.get(self._get_safe_cache_key(cache_key)) is not None + if (hash_encoded): + return cache.get(self._get_safe_cache_key(cache_key)) is not None + else: + return cache.get(cache_key) is not None def insert_raw_urls(self, urls, source): @@ -53,10 +64,10 @@ class DB_Handler(): for url in urls_clean: ### Already processed URL? - if (self._is_cached_key(url)): + if (self._is_cached_key(url, hash_encoded=True)): logger.debug("Already cached URL: {}".format(url)) - if (self._is_cached_key("{}{}".format(source, url))): + if (self._is_cached_key("{}{}".format(source, url), hash_encoded=True)): logger.debug("Already cached (source, URL): {} {}".format(source, url)) else: ### Insert (URL_id, source_id), since not cached @@ -92,139 +103,189 @@ class DB_Handler(): # Insert or update cache for url in urls_clean: - self._cache_key(url) - self._cache_key("{}{}".format(source, url)) + # Hash encode URLs for special characters + self._cache_key(url, hash_encode=True, cache_timeout=self._cache_timeout_insert_url) + self._cache_key("{}{}".format(source, url), hash_encode=True, cache_timeout=self._cache_timeout_insert_url) logger.info("Inserted #{} raw URLs".format(len(urls_to_insert))) except Exception as e: logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc())) - def _get_status_pattern_matching(self, url, article_status, list_pattern_status_tuple): - # Sort pattern tuples by priority. 
(pattern, priority, status)
-        list_pattern_status_tuple.sort(key=lambda tup: tup[1], reverse=True)
+    def _get_url_host(self, url):
+        # URL no protocol, first substring before '/'
+        url_host = url.replace("https://", "").replace("http://", "").split("/")[0]
+        return url_host
+
+    def _url_host_slowdown(self, url, url_host_slowdown_seconds):
+        ### Avoid (frequent) too many requests to the same URL host
+        # Get URL host
+        url_host = self._get_url_host(url)
+        # Recently processed URL host? -> Slow down required
+        last_cached_timestamp = cache.get("processed_{}".format(url_host), None)
+        if last_cached_timestamp:
+            # Get time since last processed URL host (in seconds)
+            time_since_last_processed = time.time() - last_cached_timestamp
+            # Amount of time required to sleep?
+            slowdown_required = max(0, url_host_slowdown_seconds - time_since_last_processed)
+            logger.debug("Slow down (sleeping {:.2f}) for URL host {}".format(slowdown_required, url_host))
+            # Sleep
+            time.sleep(slowdown_required)
+        # About to process URL host, cache time
+        cache.set("processed_{}".format(url_host), time.time(), timeout=60*5) # Expire after 5 minutes
-        # Regex pattern to update status on "valid", "invalid", and "unknown" status only
-        # Status "raw", "duplicated" and "error" should remain the way they are
-        # Assumption: List of patterns sorted by importance
-        if (article_status in ["valid", "invalid", "unknown"]):
-            # Regular expression pattern matching: https://regexr.com/
-            for regex_pattern, regex_priority, status_if_match in list_pattern_status_tuple:
-                # Matching? Update article status
-                if bool(re.match(regex_pattern, url)):
-                    if (status_if_match != article_status):
-                        logger.debug("Regex pattern found, updating status from '{}' to '{}' for URL: {}".format(article_status, status_if_match, url))
+    def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error):
+        ##### Filter URL? -> Invalid
+        if (status_pattern_match == "invalid"):
+            logger.debug("Domain filter applied to input URL: {}".format(obj_url.url))
+            # Update status
+            obj_url.status = Urls.STATUS_ENUM.INVALID
+            obj_url.save()
+            # updating_urls.append(obj_url)
+            # Next URL
+            return
+
+        ##### Process URL
+        try:
+            # Slow down if required to avoid too many requests error
+            self._url_host_slowdown(obj_url.url, self.url_host_slowdown_seconds)
+            # Get data
+            dict_url_data = process_url(obj_url.url)
+            # Not none or handle as exception
+            assert(dict_url_data is not None)
+        except Exception as e:
+            if (raise_exception_on_error):
+                # Simply raise exception
+                raise Exception("Error processing URL")
+            else:
+                # Set status to error
+                logger.debug("Error processing URL: {}\n{}\n{}".format(obj_url.url, str(e), traceback.format_exc()))
+                # Update status
+                obj_url.status = Urls.STATUS_ENUM.ERROR
+                obj_url.save()
+                # updating_urls.append(obj_url)
+                # Next URL
+                return
+
+        ##### Canonical URL different? -> Duplicate
+        if (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
+            # Update status
+            obj_url.status = Urls.STATUS_ENUM.DUPLICATE
+            obj_url.save()
+            # updating_urls.append(obj_url)
+
+            # Get or create URL with canonical form
+            obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
+            # Get the sources id associated to obj_url.id
+            url_sources = UrlsSource.objects.filter(id_url=obj_url)
+            for url_source_obj in url_sources:
+                # Associate same sources to url_canonical (it might already exist)
+                obj_urls_source, created = UrlsSource.objects.get_or_create(id_source=url_source_obj.id_source, id_url=obj_url_canonical)
+
+            # URLs duplicate association
+            obj_urls_duplicate, created = UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url)
+
+            # Next URL
+            return
+
+        ##### Valid URL
+        # Update status
+        obj_url.status = Urls.STATUS_ENUM.VALID
+        obj_url.save()
+        # updating_urls.append(obj_url)
+
+        # Create or update extracted URL data
+        UrlContent.objects.update_or_create(
+            id_url=obj_url,
+            defaults = {
+                "date_published" : dict_url_data.get("publish_date"),
+                "title" : dict_url_data.get("title"),
+                "description" : dict_url_data.get("description"),
+                "content" : dict_url_data.get("content"),
+                "valid_content" : dict_url_data.get("valid_content"),
+                "language" : dict_url_data.get("language"),
+                "keywords" : dict_url_data.get("keywords"),
+                "tags" : dict_url_data.get("tags"),
+                "authors" : dict_url_data.get("authors"),
+                "image_main_url" : dict_url_data.get("image_main_url"),
+                "images_url" : dict_url_data.get("images_url"),
+                "videos_url" : dict_url_data.get("videos_url"),
+                "url_host" : dict_url_data.get("url_host"),
+                "site_name" : dict_url_data.get("site_name"),
+            }
+        )
+
+    def process_raw_urls(self, batch_size):
+
+        def _get_status_pattern_matching(url, list_pattern_status_tuple):
+            """ Be careful: regex patterns should only update the status of "valid", "invalid", and "unknown" URLs
+            """
+            # Sort pattern tuples by priority.
(pattern, priority, status) + for regex_pattern, regex_priority, status_if_match in sorted(list_pattern_status_tuple, key=lambda tup: tup[1], reverse=True): + # Regular expression pattern matching: https://regexr.com/ + if bool(re.match(regex_pattern, obj_url.url)): + logger.debug("Regex pattern found, status '{}' for URL: {}".format(status_if_match, url)) return status_if_match - # Pattern matching not required or not found, original article status - return article_status + return None - - def process_error_urls(self, batch_size=50): - # Get batch of URLs, status='error' - #error_urls = Urls.objects.SORTBY TS_FETCH....filter(status=Urls.STATUS_ENUM.RAW, ts_fetch__gte=time_delta_ts)[:batch_size] - pass - - def process_raw_urls(self, time_delta=timedelta(days=1), batch_size=50): try: logger.debug("Processing raw URLs") - # Get list of domains to filter - list_domains_to_filter = WebsiteToFilter.objects.values_list('url_host', flat=True) + # Get batch of URLs, status='raw' + raw_urls = Urls.objects.order_by("-ts_fetch").filter(status=Urls.STATUS_ENUM.RAW)[:batch_size] + + if (len(raw_urls) == 0): + logger.debug("No raw URLs to process") + return + # Get list of (pattern, priority, status) tuples to override status if required list_pattern_status_tuple = list(StatusPatternMatching.objects.values_list("pattern", "priority", "status")) - - # Fetched during last 24 hours - time_delta_ts = timezone.now() - time_delta - # Get batch of URLs, status='raw' and fetched X days ago - raw_urls = Urls.objects.filter(status=Urls.STATUS_ENUM.RAW, ts_fetch__gte=time_delta_ts)[:batch_size] # List of objects to bulk update - updating_urls = [] + # updating_urls = [] # Per URL for obj_url in raw_urls: - ##### Any domain to filter included in URL? -> Invalid - if (any([d in obj_url.url for d in list_domains_to_filter])): - logger.debug("Domain filter applied to input URL: {}".format(obj_url.url)) - # Update status - obj_url.status = Urls.STATUS_ENUM.INVALID - obj_url.save() - updating_urls.append(obj_url) - # Next URL - continue - - ##### Process URL - try: - # Get data - dict_url_data = process_url(obj_url.url) - # Not none or handle as exception - assert(dict_url_data is not None) - except Exception as e: - logger.debug("Error processing URL: {}\n{}\n".format(obj_url.url, str(e), traceback.format_exc())) - # Update status - obj_url.status = Urls.STATUS_ENUM.ERROR - obj_url.save() - updating_urls.append(obj_url) - # Next URL - continue + # Override status if pattern matching? + status_pattern_match = _get_status_pattern_matching(obj_url.url, list_pattern_status_tuple) + # Process URL + self._process_single_url(obj_url, status_pattern_match, raise_exception_on_error=False) - ##### Canonical URL different? 
-> Duplicate - if (dict_url_data.get("url") != dict_url_data.get("url_canonical")): - # Update status - obj_url.status = Urls.STATUS_ENUM.DUPLICATE - obj_url.save() - updating_urls.append(obj_url) - - # Get or create URL with canonical form - obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical")) - # Get the sources id associated to obj_url.id - url_sources = UrlsSource.objects.filter(id_url=obj_url) - for url_source_obj in url_sources: - # Associate same sources to url_canonical (it might already exist) - UrlsSource.objects.get_or_create(id_source=url_source_obj.id_source, id_url=obj_url_canonical) - - # Next URL - continue - - ##### Valid URL - # Update status - obj_url.status = Urls.STATUS_ENUM.VALID - obj_url.save() - updating_urls.append(obj_url) - - # Create extracted URL data - UrlContent.objects.create( - id_url=obj_url, - date_published=dict_url_data.get("publish_date"), - title=dict_url_data.get("title"), - description=dict_url_data.get("description"), - content=dict_url_data.get("content"), - valid_content=dict_url_data.get("valid_content"), - language=dict_url_data.get("language"), - keywords=dict_url_data.get("keywords"), - tags=dict_url_data.get("tags"), - authors=dict_url_data.get("authors"), - image_main_url=dict_url_data.get("image_main_url"), - images_url=dict_url_data.get("images_url"), - videos_url=dict_url_data.get("videos_url"), - url_host=dict_url_data.get("url_host"), - site_name=dict_url_data.get("site_name"), - ) - - - ##### Override status if pattern matching? - for obj_url in updating_urls: - # Check if article status needs to be updated with pattern matching - status_pattern_matching = self._get_status_pattern_matching(obj_url.url, obj_url.status, list_pattern_status_tuple) - # Update status? - if (status_pattern_matching != obj_url.status): - logger.debug("Pattern matching, overriding with status {} for URL: {}".format(status_pattern_matching, obj_url.url)) - # Update, no need to append to updating_urls, already included - obj_url.status = status_pattern_matching - obj_url.save() - - # TODO: Fix enum type issue. Bulk update + # TODO: Fix enum type issue. Bulk update instead of .save() for each object # Urls.objects.bulk_update(updating_urls, ['status']) - logger.info("Updated #{} raw URLs".format(len(updating_urls))) + logger.info("Updated #{} raw URLs".format(len(raw_urls))) except Exception as e: logger.warning("Exception processing raw URLs: {}\n{}".format(e, traceback.format_exc())) + + def process_error_urls(self, batch_size): + try: + logger.debug("Processing error URLs") + + # Keep track of processed and skipped "error" URLs + num_urls_skipped, num_urls_processed = 0, 0 + # Get batch of URLs, status='error' + error_urls = Urls.objects.order_by("-ts_fetch").filter(status=Urls.STATUS_ENUM.ERROR)[num_urls_skipped:batch_size+num_urls_skipped] + + while ((len(error_urls) > 0) and (num_urls_processed < batch_size)): + # Per URL + for obj_url in error_urls: + # URL ID cached? 
-> Tried to process recently already, skip
+                    if (self._is_cached_key("error_{}".format(obj_url.id), hash_encoded=False)):
+                        num_urls_skipped += 1
+                        continue
+
+                    try:
+                        # Process URL
+                        self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True)
+                        num_urls_processed += 1
+                    except Exception as e:
+                        # Error, cache to avoid re-processing for X time
+                        self._cache_key("error_{}".format(obj_url.id), hash_encode=False, cache_timeout=self._cache_timeout_error_url)
+                        num_urls_skipped += 1
+
+                # Get following batch of URLs, status='error'
+                error_urls = Urls.objects.order_by("-ts_fetch").filter(status=Urls.STATUS_ENUM.ERROR)[num_urls_skipped:batch_size+num_urls_skipped]
+
+            logger.info("Updated #{}, skipped #{} error URLs".format(num_urls_processed, num_urls_skipped))
+        except Exception as e:
+            logger.warning("Exception processing error URLs: {}\n{}".format(e, traceback.format_exc()))
\ No newline at end of file
diff --git a/app_urls/api/src/fetch_feed.py b/app_urls/api/src/fetch_feed.py
index a0e98a0..54d99df 100644
--- a/app_urls/api/src/fetch_feed.py
+++ b/app_urls/api/src/fetch_feed.py
@@ -8,15 +8,15 @@ logger = get_logger()
 class FetchFeeds():
     def __init__(self) -> None:
-        logger.debug("Initializing News feed")
+        logger.debug("Initializing Fetcher Feeds")
     def run(self):
         try:
-            logger.debug("Starting NewsFeed.run()")
+            logger.debug("Starting FetchFeeds.run()")
             # Get feeds
             list_url_feeds = list(Feed.objects.values_list('rss_feed', flat=True))
-            logger.debug("Fetching news from feeds: {}".format(list_url_feeds))
+            logger.debug("Fetching from feeds: {}".format(list_url_feeds))
             # Process via RSS feeds
             for url_feed in list_url_feeds:
@@ -47,4 +47,4 @@ class FetchFeeds():
             # Write to DB
             DB_Handler().insert_raw_urls(urls_fetched, source)
         except Exception as e:
-            logger.warning("Exception in NewsFeed.run(): {}\n{}".format(e, traceback.format_exc()))
+            logger.warning("Exception in FetchFeeds.run(): {}\n{}".format(e, traceback.format_exc()))
diff --git a/app_urls/api/src/fetch_parser.py b/app_urls/api/src/fetch_parser.py
new file mode 100644
index 0000000..73116e4
--- /dev/null
+++ b/app_urls/api/src/fetch_parser.py
@@ -0,0 +1,39 @@
+from .db_utils import DB_Handler
+from ..models import WebsiteOfInterest
+import newspaper
+import traceback
+from .logger import get_logger
+logger = get_logger()
+
+class FetchParser():
+    def __init__(self) -> None:
+        logger.debug("Initializing Fetcher Parser")
+
+    def run(self):
+        try:
+            logger.debug("Starting FetchParser.run()")
+
+            # Get URL hosts
+            list_url_host = list(WebsiteOfInterest.objects.values_list('url_host', flat=True))
+            logger.debug("Fetching news by parsing URL hosts: {}".format(list_url_host))
+
+            # Process newspaper4k build method
+            for url_host_feed in list_url_host:
+                # Protocol
+                if not (url_host_feed.startswith("http")):
+                    url_host_feed_formatted = "https://" + url_host_feed
+                else:
+                    url_host_feed_formatted = url_host_feed
+
+                logger.debug("Fetching newspaper4k parsing based on URL: {}".format(url_host_feed_formatted))
+                # Source object
+                url_host_built = newspaper.build(url_host_feed_formatted)
+                # Get articles URL list
+                urls_fetched = url_host_built.article_urls()
+
+                # URL fetching source
+                source = "newspaper4k {}".format(url_host_feed)
+                # Write to DB
+                DB_Handler().insert_raw_urls(urls_fetched, source)
+        except Exception as e:
+            logger.warning("Exception in FetchParser.run(): {}\n{}".format(e, traceback.format_exc()))
diff --git a/app_urls/api/src/url_processor.py b/app_urls/api/src/url_processor.py index 
87afc8e..86b4100 100644 --- a/app_urls/api/src/url_processor.py +++ b/app_urls/api/src/url_processor.py @@ -1,3 +1,4 @@ +from django.core.cache import cache from .logger import get_logger logger = get_logger() import newspaper @@ -6,6 +7,7 @@ from urllib.parse import unquote #import langdetect #langdetect.DetectorFactory.seed = 0 + def process_url(url): try: # Process diff --git a/app_urls/api/tasks.py b/app_urls/api/tasks.py index fbd3dbc..d236812 100644 --- a/app_urls/api/tasks.py +++ b/app_urls/api/tasks.py @@ -1,6 +1,7 @@ from django_rq import job from .src.fetch_feed import FetchFeeds +from .src.fetch_parser import FetchParser from .src.db_utils import DB_Handler ''' from src.fetch_parser import FetchParser @@ -8,16 +9,13 @@ from src.fetch_search import FetchSearcher from src.missing_kids_fetch import MissingKidsFetch from src.missing_kids_status import MissingKidsStatus from src.url_status import UpdateErrorURLs -from src.db_utils import DB_Handler -from src.credentials import db_connect_info, redis_connect_info - -# DB Handler -db_handler = DB_Handler(db_connect_info, redis_connect_info) ''' from .src.logger import get_logger logger = get_logger() +# TODO: Queues with priorities, process_raw_urls least priority due to slowdown... + @job def background_task(process_type: str): logger.info("Task triggered: {}".format(process_type)) @@ -25,18 +23,17 @@ def background_task(process_type: str): try: if (process_type == "fetch_feeds"): FetchFeeds().run() + elif (process_type == "fetch_parser"): + FetchParser().run() elif (process_type == "process_raw_urls"): - DB_Handler().process_raw_urls(batch_size=3) + DB_Handler().process_raw_urls(batch_size=50) + elif (process_type == "process_error_urls"): + DB_Handler().process_error_urls(batch_size=50) else: logger.info("Task unknown!: {}".format(process_type)) ''' - if (process_type == "fetch_feeds"): - FetchFeeds(db_handler).run() - - elif (process_type == "fetch_parser"): - FetchParser(db_handler).run() elif (process_type == "search") or (process_type == "search_full"): FetchSearcher(cred.db_connect_info, cred.redis_connect_info, full=True).run() diff --git a/app_urls/api/urls.py b/app_urls/api/urls.py index 38e15b5..78e35f1 100644 --- a/app_urls/api/urls.py +++ b/app_urls/api/urls.py @@ -1,6 +1,7 @@ from django.urls import path -from .views import trigger_task +from .views import trigger_task, link_list urlpatterns = [ + path('links', link_list, name='link_list'), path('', trigger_task, name='trigger_task'), ] diff --git a/app_urls/api/views.py b/app_urls/api/views.py index c952c5d..1af1eab 100644 --- a/app_urls/api/views.py +++ b/app_urls/api/views.py @@ -1,6 +1,7 @@ import django_rq from django.http import JsonResponse from .tasks import background_task +import os from .src.logger import get_logger logger = get_logger() @@ -9,3 +10,8 @@ def trigger_task(request, task): queue = django_rq.get_queue('default') # Get the default queue job = queue.enqueue(background_task, task) return JsonResponse({"message": "Task has been enqueued!", "job_id": job.id}) + +def link_list(request): + prefix = "http://localhost:8000/api" + links = ["fetch_feeds", "fetch_parser", "process_raw_urls", "process_error_urls"] + return JsonResponse({"links": ["http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id"] + [os.path.join(prefix, l) for l in links]})
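
A quick illustration of the per-host throttle added above as DB_Handler._url_host_slowdown: before processing a URL, the handler checks when its host was last processed and sleeps for the remainder of the per-host window. This is a minimal standalone sketch of that idea; the real implementation stores the timestamp in Django's cache with a 5-minute expiry, while a plain dict stands in here so the snippet runs on its own.

```python
import time

# Last-processed timestamp per URL host (stand-in for Django's cache)
_last_processed = {}

def _get_url_host(url):
    # Same convention as the diff: drop the protocol, keep everything before the first '/'
    return url.replace("https://", "").replace("http://", "").split("/")[0]

def url_host_slowdown(url, url_host_slowdown_seconds=5):
    host = _get_url_host(url)
    last = _last_processed.get(host)
    if last is not None:
        elapsed = time.time() - last
        # Sleep only for the remainder of the per-host window
        time.sleep(max(0, url_host_slowdown_seconds - elapsed))
    _last_processed[host] = time.time()

url_host_slowdown("https://www.example.com/a")  # first request to this host: no sleep
url_host_slowdown("https://www.example.com/b")  # immediately after: sleeps ~5 seconds
```
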