diff --git a/README.md b/README.md index 697796e..8ba13f7 100644 --- a/README.md +++ b/README.md @@ -1,38 +1 @@ -# Requirements -``` -conda create -n matitos python=3.12 -conda activate matitos -pip install ipykernel django requests ollama psycopg[binary] # openai -``` - -# Development - -* app_web -``` - -# 1) Change models.py -python manage.py inspectdb - -# 2) -python manage.py makemigrations -# 3) -python manage.py migrate --fake - -# ? -python manage.py migrate --fake sessions zero -python manage.py migrate --fake-initial - - -python manage.py createsuperuser -``` - -* app_img_gen -``` -docker build -t image_generation . -docker run --rm -it -p 12343:80 image_generation -``` - -# Deploy -``` -python app_web/manage.py runserver -``` +# Matitos \ No newline at end of file diff --git a/app_img_gen/README.md b/app_img_gen/README.md new file mode 100644 index 0000000..36f80fd --- /dev/null +++ b/app_img_gen/README.md @@ -0,0 +1,36 @@ +``` +docker build -t image_generation . +docker run --rm -it -p 12343:80 image_generation +``` + +``` +import requests +import cv2 +import base64 +import numpy as np + +endpoint = "http://192.168.2.64:12343/image" + +prompt = "Majestic mountain landscape with snow-capped peaks, autumn foliage in vibrant reds and oranges, a turquoise river winding through a valley, crisp and serene atmosphere, ultra-realistic style." +prompt = "A group of kids happily playing in a joy environment" +#prompt = "A bitcoin behaving like a king, surrounded by small alternative coins. Detailed, geometric style" + +json = { + "prompt": prompt, + "num_inference_steps": 10, + "size": "512x512", + "seed": 123456, +} + +for inf_step in [1, 4, 10, 20, 25, 30, 35, 40, 45, 50, 60, 70, 80, 90, 100]: + json["num_inference_steps"] = inf_step + + %time r = requests.post(endpoint, json=json) + print("Status code", r.status_code) + + # Image + png_as_np = np.frombuffer(base64.b64decode(r.text), dtype=np.uint8) + image_bgr = cv2.imdecode(png_as_np, cv2.IMREAD_COLOR) + + cv2.imwrite("sample_img_{}.png".format(json["num_inference_steps"]), image_bgr) +``` \ No newline at end of file diff --git a/1-DB.ipynb b/app_urls/1-DB.ipynb similarity index 54% rename from 1-DB.ipynb rename to app_urls/1-DB.ipynb index 7d0c839..dfb5123 100644 --- a/1-DB.ipynb +++ b/app_urls/1-DB.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -11,41 +11,16 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "db_postgres\n", - "db_redis\n", - "\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 2/0\n", - " ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n", - " ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n", - " \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n", - " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n", - "\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 2/4\n", - " ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n", - " ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n", - " \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n", - " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s 
\u001b[0m\n", - "\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 4/4\u001b[0m\n", - " \u001b[32m✔\u001b[0m Container db_redis \u001b[32mStarted\u001b[0m \u001b[34m0.2s \u001b[0m\n", - " \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.2s \u001b[0m\n", - " \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n", - " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n", - "\u001b[?25h" - ] - } - ], + "outputs": [], "source": [ - "!docker rm -f db_postgres db_redis; docker compose -f docker/docker-compose.yml up -d ; sleep 5" + "!docker rm -f db_postgres db_redis; docker compose -f ../docker/docker-compose.yml up -d ; sleep 5" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -62,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -99,6 +74,8 @@ " id SMALLSERIAL PRIMARY KEY,\n", " search TEXT NOT NULL UNIQUE,\n", " type SEARCH_TYPE NOT NULL\n", + " -- language_country CHAR(5), -- Language: ISO 639-1 Code. Country: ISO 3166 ALPHA-2. e.g.: en-us. Required for search\n", + " -- UNIQUE(search, language_country)\n", " );\n", " CREATE INDEX idx_search_type ON SEARCH(type);\n", " \n", @@ -106,7 +83,13 @@ " id SMALLSERIAL PRIMARY KEY,\n", " source TEXT NOT NULL UNIQUE\n", " );\n", - " \n", + " \n", + " -- CREATE TABLE SEARCH_LANGUAGE (\n", + " -- language CHAR(2) NOT NULL, -- ISO 639-1 Code, e.g. \"en\"\n", + " -- country CHAR(2) NOT NULL, -- ISO 3166 ALPHA-2, e.g. \"us\"\n", + " -- PRIMARY KEY (language, country)\n", + " -- );\n", + " \n", " CREATE TABLE URLS_SOURCE_SEARCH (\n", " id_url INTEGER REFERENCES URLS(id),\n", " id_source SMALLINT REFERENCES SOURCE(id) ON UPDATE CASCADE ON DELETE RESTRICT,\n", @@ -158,6 +141,8 @@ " cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('breitbart.com', 'url_host');\" )\n", " # Search keywords\n", " cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search');\" )\n", + " # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search', 'en-us');\" )\n", + " # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search', 'en-gb');\" )\n", " \n", " # Status update based on pattern matching (with priority to apply in order). 
Regex test https://regex101.com/\n", " # cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 75, 'valid');\".format(\".*{}.*\".format(re.escape(\"missingkids.org/poster/\"))) )\n", @@ -169,51 +154,6 @@ " cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"radio.foxnews.com/\"))) )" ] }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "if INSERT_SAMPLE_DATA:\n", - " # Connect to an existing database\n", - " with psycopg.connect(connection_info) as conn:\n", - " # Open a cursor to perform database operations\n", - " with conn.cursor() as cur:\n", - " # Autocommit at end of transaction (Atomic insert of URLs and sources)\n", - " with conn.transaction() as tx:\n", - " # Valid\n", - " cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.foxnews.com/us/husband-ruby-franke-utah-mommy-blogger-convicted-child-abuse-regrets-wifes-fall-fame', 'valid')\")\n", - " cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.bbc.com/news/articles/ckg843y8y7no', 'valid')\")\n", - " cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.wilx.com/2025/03/05/lenawee-county-man-arrested-possessing-child-abuse-material/', 'valid')\")\n", - " cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.dw.com/en/trauma-how-child-abuse-victims-deal-with-parenthood/a-71833895', 'valid')\")\n", - " cur.execute(\"INSERT INTO URLS (url, status) values ('https://nypost.com/2025/03/06/us-news/colorado-day-care-worker-hit-with-51-charges-of-child-abuse-harassment-for-slapping-toddler/', 'valid')\")\n", - " cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.fox35orlando.com/news/tavares-police-florida-boys-10-9-abused-sheer-brutality', 'valid')\")\n", - " cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.google.com', 'invalid')\")\n", - "\n", - " cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.missingkids.org/poster/USVA/VA25-0820/1', 'valid')\")\n", - " cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.missingkids.org/poster/NCMC/2045193/1', 'valid')\")\n", - "\n", - " cur.execute(\"INSERT INTO SOURCE (source) values ('news.google.com')\")\n", - " cur.execute(\"INSERT INTO SOURCE (source) values ('qwant.com')\")\n", - "\n", - " cur.execute(\"INSERT INTO URLS_SOURCE (id_url, id_source, id_search) values (1, 1, 1)\")\n", - "\n", - " for j in range(5):\n", - " import time\n", - " time.sleep(0.25)\n", - " cur.execute(\"INSERT INTO URLS (url, status) values ('www.super_{}.org', 'invalid')\".format(j))\n", - " \n", - " # Long URLs \n", - " cur.execute(\"INSERT INTO URLS (url, status) values ('www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf_23dj9sdgj9sdgj8sdf8ds8f.html', 'invalid')\".format(j))\n", - " cur.execute(\"INSERT INTO URLS (url, status) values ('www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf.html', 'invalid')\".format(j))\n", - "\n", - " # URL Content\n", - " language, content = \"en\", \"Bla Bla Bla!!!\"*25\n", - " cur.execute(\"INSERT INTO URL_CONTENT (id_url, date_published, title, description, content, language, tags, authors, images_url) values (%s, %s, 'Mommy blogger turned child abuser', %s, 'Hello there!', %s, %s, %s, %s)\", \n", - " (1, datetime.now(tz=timezone.utc), content, language, [\"child abuse\", \"social media\"], [\"Audrey Conklin\"], 
[\"https://a57.foxnews.com/static.foxnews.com/foxnews.com/content/uploads/2023/08/1440/810/image-58.jpg?ve=1&tl=1\"]))" - ] - }, { "cell_type": "code", "execution_count": null, @@ -223,41 +163,9 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\t urls\n", - "[]\n", - "\t urls_duplicate\n", - "[]\n", - "\t urls_source_search\n", - "[]\n", - "\t source\n", - "[]\n", - "\t search\n", - "[(1,\n", - " 'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC',\n", - " 'rss_feed'),\n", - " (2, 'missingkids.org/poster', 'url_host'),\n", - " (3, 'missingkids.org/new-poster', 'url_host'),\n", - " (4, 'breitbart.com', 'url_host'),\n", - " (5, 'child abuse', 'keyword_search')]\n", - "\t status_pattern_matching\n", - "[('.*youtube\\\\.com/.*', 50, 'invalid'),\n", - " ('.*tiktok\\\\.com/.*', 50, 'invalid'),\n", - " ('.*twitter\\\\.com/.*', 50, 'invalid'),\n", - " ('.*reddit\\\\.com/.*', 50, 'invalid'),\n", - " ('.*libreddit\\\\.de/.*', 50, 'invalid'),\n", - " ('.*radio\\\\.foxnews\\\\.com/.*', 50, 'invalid')]\n", - "\t url_content\n", - "[]\n" - ] - } - ], + "outputs": [], "source": [ "# Connect to an existing database\n", "with psycopg.connect(connection_info) as conn:\n", @@ -274,23 +182,9 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[(1,\n", - " 'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC',\n", - " 'rss_feed'),\n", - " (2, 'missingkids.org/poster', 'url_host'),\n", - " (3, 'missingkids.org/new-poster', 'url_host'),\n", - " (4, 'breitbart.com', 'url_host'),\n", - " (5, 'child abuse', 'keyword_search')]\n" - ] - } - ], + "outputs": [], "source": [ "# Connect to an existing database\n", "with psycopg.connect(connection_info) as conn:\n", @@ -301,23 +195,15 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[]\n" - ] - } - ], + "outputs": [], "source": [ "# Connect to an existing database\n", "with psycopg.connect(connection_info) as conn:\n", " # Open a cursor to perform database operations\n", " with conn.cursor() as cur:\n", - " pprint( cur.execute(\"SELECT * FROM URLS LIMIT 150;\").fetchall() )\n", + " pprint( cur.execute(\"SELECT * FROM URLS LIMIT 50;\").fetchall() )\n", " #pprint( cur.execute(\"SELECT id_url, title, valid_content FROM URL_CONTENT LIMIT 10;\").fetchall() )" ] }, @@ -326,34 +212,9 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'\\n!docker rm -f db_redis; docker compose -f docker/docker-compose.yml up -d\\n\\n# Connect to an existing database\\nwith psycopg.connect(connection_info) as conn:\\n # Open a cursor to perform database operations\\n with conn.cursor() as cur:\\n pprint( cur.execute(\"TRUNCATE URLS, URL_CONTENT, URLS_SOURCE_SEARCH, URLS_DUPLICATE;\") )\\n # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES (\\'missingkids.org\\', \\'url_host\\');\" )\\n'" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": 
"execute_result" - } - ], "source": [ "'''\n", - "!docker rm -f db_redis; docker compose -f docker/docker-compose.yml up -d\n", + "!docker rm -f db_redis; docker compose -f ../docker/docker-compose.yml up -d\n", "\n", "# Connect to an existing database\n", "with psycopg.connect(connection_info) as conn:\n", diff --git a/app_urls/README.md b/app_urls/README.md index deb2042..09e5048 100644 --- a/app_urls/README.md +++ b/app_urls/README.md @@ -10,6 +10,10 @@ pip install feedparser python-dateutil newspaper4k[all] lxml[html_clean] googlen pip install ollama ``` +* Database + * Database initialization -> 1-DB.ipynb + + * From automated inspectdb ``` # 1) Inspect DB, generate models.py @@ -72,23 +76,26 @@ class Meta: * Environment variables ``` +# Database DB_NAME=${DB_NAME:-matitos} DB_USER=${DB_NAME:-supermatitos} DB_PASSWORD=${DB_NAME:-supermatitos} DB_HOST=${DB_NAME:-localhost} DB_PORT=${DB_NAME:-5432} - REDIS_HOST=${REDIS_HOST:-localhost} REDIS_PORT=${REDIS_PORT:-6379} -# Default RQ job timeout -RQ_DEFAULT_TIMEOUT=${REDIS_PORT:-900} -# Default RQ job queue TTL -RQ_DEFAULT_RESULT_TTL=${RQ_DEFAULT_RESULT_TTL:-3600} +# Job timeout: 30 min +JOB_DEFAULT_TIMEOUT=${RQ_DEFAULT_TIMEOUT:-1800} # Logs path -PATH_LOGS_ERROR=logs/log_app_fetcher_error.log -PATH_LOGS=logs/log_app_fetcher.log +PATH_LOGS_PARAMETERIZATION="logs/log_app_fetcher_{}.log" + +# Fetcher +FETCHER_GNEWS_DECODE_SLEEP=2 +FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=4 +FETCHER_BETWEEN_SEARCHES_SLEEP=5 +FETCHER_URL_HOST_SLEEP=5 ``` * Deploy @@ -110,30 +117,14 @@ http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=publ * Scheduled tasks ``` -# 1) Modify the scheduled tasks on the admin panel: - -Names: Fetch Feeds, Fetch Parser, Fetch Search -Callable: api.tasks.fetch_feeds, api.tasks.fetch_parser, api.tasks.fetch_search -Task type: Repetable task (or cron...) -Queue: Default -Interval: 15min, 2h, 30min - -Names: Process raw URLs, Process error URLs, Process MissingKids URLs -Callable: api.tasks.process_raw_urls, api.tasks.process_error_urls, api.tasks.process_missing_kids_urls_50 -Task type: Repetable task (or cron...) -Queue: Low, Low, Default -Interval: 1h, 4h, 2h - -# 2) Export -# python manage.py export > scheduled_tasks.json - - -# Or simply import saved definitions +# Import tasks python manage.py import --filename scheduled_tasks.json + +# Modify using the admin panel, then save +# python manage.py export > scheduled_tasks.json ``` -* Utils +* Utils. TODO: To endpoint... 
``` python manage.py rqstats -python manage.py rqstats --interval=1 # Refreshes every second ``` \ No newline at end of file diff --git a/app_urls/api/migrations/0001_initial.py b/app_urls/api/migrations/0001_initial.py index 9d01563..829da62 100644 --- a/app_urls/api/migrations/0001_initial.py +++ b/app_urls/api/migrations/0001_initial.py @@ -1,5 +1,6 @@ -# Generated by Django 5.1.7 on 2025-03-13 17:01 +# Generated by Django 5.2 on 2025-04-02 16:44 +import django.contrib.postgres.fields import django.db.models.deletion from django.db import migrations, models @@ -12,22 +13,12 @@ class Migration(migrations.Migration): ] operations = [ - migrations.CreateModel( - name='Feed', - fields=[ - ('id', models.SmallAutoField(primary_key=True, serialize=False)), - ('rss_feed', models.TextField(unique=True)), - ], - options={ - 'db_table': 'feed', - 'managed': False, - }, - ), migrations.CreateModel( name='Search', fields=[ ('id', models.SmallAutoField(primary_key=True, serialize=False)), - ('keyword_search', models.TextField(unique=True)), + ('search', models.TextField(unique=True)), + ('type', models.TextField(choices=[('rss_feed', 'RSS_Feed'), ('keyword_search', 'Keyword_Search'), ('url_host', 'URL_Host')])), ], options={ 'db_table': 'search', @@ -67,28 +58,7 @@ class Migration(migrations.Migration): ], options={ 'db_table': 'urls', - 'managed': False, - }, - ), - migrations.CreateModel( - name='WebsiteOfInterest', - fields=[ - ('id', models.SmallAutoField(primary_key=True, serialize=False)), - ('url_host', models.TextField(unique=True)), - ], - options={ - 'db_table': 'website_of_interest', - 'managed': False, - }, - ), - migrations.CreateModel( - name='WebsiteToFilter', - fields=[ - ('id', models.SmallAutoField(primary_key=True, serialize=False)), - ('url_host', models.TextField(unique=True)), - ], - options={ - 'db_table': 'website_to_filter', + 'ordering': ['-ts_fetch'], 'managed': False, }, ), @@ -102,12 +72,12 @@ class Migration(migrations.Migration): ('content', models.TextField(blank=True, null=True)), ('valid_content', models.BooleanField(blank=True, null=True)), ('language', models.CharField(blank=True, max_length=2, null=True)), - ('keywords', models.TextField(blank=True, null=True)), - ('tags', models.TextField(blank=True, null=True)), - ('authors', models.TextField(blank=True, null=True)), - ('image_main', models.TextField(blank=True, null=True)), - ('images_url', models.TextField(blank=True, null=True)), - ('videos_url', models.TextField(blank=True, null=True)), + ('keywords', django.contrib.postgres.fields.ArrayField(base_field=models.TextField(blank=True, null=True), size=None)), + ('tags', django.contrib.postgres.fields.ArrayField(base_field=models.TextField(blank=True, null=True), size=None)), + ('authors', django.contrib.postgres.fields.ArrayField(base_field=models.TextField(blank=True, null=True), size=None)), + ('image_main_url', models.TextField(blank=True, null=True)), + ('images_url', django.contrib.postgres.fields.ArrayField(base_field=models.TextField(blank=True, null=True), size=None)), + ('videos_url', django.contrib.postgres.fields.ArrayField(base_field=models.TextField(blank=True, null=True), size=None)), ('url_host', models.TextField(blank=True, null=True)), ('site_name', models.TextField(blank=True, null=True)), ], @@ -127,12 +97,12 @@ class Migration(migrations.Migration): }, ), migrations.CreateModel( - name='UrlsSource', + name='UrlsSourceSearch', fields=[ ('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, 
primary_key=True, serialize=False, to='api.urls')), ], options={ - 'db_table': 'urls_source', + 'db_table': 'urls_source_search', 'managed': False, }, ), diff --git a/app_urls/api/migrations/0002_delete_feed_delete_websiteofinterest_and_more.py b/app_urls/api/migrations/0002_delete_feed_delete_websiteofinterest_and_more.py deleted file mode 100644 index 1361219..0000000 --- a/app_urls/api/migrations/0002_delete_feed_delete_websiteofinterest_and_more.py +++ /dev/null @@ -1,26 +0,0 @@ -# Generated by Django 5.1.7 on 2025-03-19 09:06 - -from django.db import migrations - - -class Migration(migrations.Migration): - - dependencies = [ - ('api', '0001_initial'), - ] - - operations = [ - migrations.DeleteModel( - name='Feed', - ), - migrations.DeleteModel( - name='WebsiteOfInterest', - ), - migrations.DeleteModel( - name='WebsiteToFilter', - ), - migrations.AlterModelOptions( - name='urls', - options={'managed': False, 'ordering': ['-ts_fetch']}, - ), - ] diff --git a/app_urls/api/migrations/0003_urlssourcesearch_delete_urlssource.py b/app_urls/api/migrations/0003_urlssourcesearch_delete_urlssource.py deleted file mode 100644 index 94c990a..0000000 --- a/app_urls/api/migrations/0003_urlssourcesearch_delete_urlssource.py +++ /dev/null @@ -1,27 +0,0 @@ -# Generated by Django 4.2.20 on 2025-03-20 16:12 - -from django.db import migrations, models -import django.db.models.deletion - - -class Migration(migrations.Migration): - - dependencies = [ - ('api', '0002_delete_feed_delete_websiteofinterest_and_more'), - ] - - operations = [ - migrations.CreateModel( - name='UrlsSourceSearch', - fields=[ - ('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='api.urls')), - ], - options={ - 'db_table': 'urls_source_search', - 'managed': False, - }, - ), - migrations.DeleteModel( - name='UrlsSource', - ), - ] diff --git a/app_urls/api/models.py b/app_urls/api/models.py index 3013403..72c2811 100644 --- a/app_urls/api/models.py +++ b/app_urls/api/models.py @@ -109,3 +109,32 @@ class UrlsSourceSearch(models.Model): def __str__(self): return "{} {} {}".format(self.id_source, self.id_search, self.id_url) + +""" # TODO: Migrate to django 5.2 +class UrlsDuplicate(models.Model): + pk = models.CompositePrimaryKey('id_url_canonical', 'id_url_duplicated') + id_url_canonical = models.ForeignKey(Urls, models.DO_NOTHING, db_column='id_url_canonical') + id_url_duplicated = models.ForeignKey(Urls, models.DO_NOTHING, db_column='id_url_duplicated', related_name='urlsduplicate_id_url_duplicated_set') + + class Meta: + managed = False + db_table = 'urls_duplicate' + unique_together = (('id_url_canonical', 'id_url_duplicated'),) + + def __str__(self): + return "{} {} ".format(self.id_url_duplicated, self.id_url_canonical) + +class UrlsSourceSearch(models.Model): + pk = models.CompositePrimaryKey('id_url', 'id_source', 'id_search') + id_url = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url') + id_source = models.ForeignKey(Source, models.DO_NOTHING, db_column='id_source') + id_search = models.ForeignKey(Search, models.DO_NOTHING, db_column='id_search') + + class Meta: + managed = False + db_table = 'urls_source_search' + unique_together = (('id_url', 'id_source', 'id_search'),) + + def __str__(self): + return "{} {} {}".format(self.id_source, self.id_search, self.id_url) +""" \ No newline at end of file diff --git a/app_urls/api/src/fetch_search.py b/app_urls/api/src/fetch_search.py index 7c032b1..9da003f 100644 --- 
a/app_urls/api/src/fetch_search.py +++ b/app_urls/api/src/fetch_search.py @@ -1,59 +1,17 @@ from .db_utils import DB_Handler -from ..models import Search, Source +from ..models import Search from django.db.models import Q import traceback import time -from .fetch_search_utils import search_gnews, search_ddg, search_googlenews_general, search_googlenews_news, search_googlenews_rss +import os +from .fetch_search_instances import ListSearchInstances from .logger import get_logger logger = get_logger() -''' -from abc import ABC, abstractmethod - -# Generic fetcher (fetches articles, writes to DB) -class FetcherAbstract(ABC): - @abstractmethod - def _fetch_raw_urls_list(self): - pass - - def fetch_articles(self, db_writer): - logger.debug("Starting fetch() for {}".format(self.name)) - # Fetch articles - list_news = self._fetch() - logger.info("Found #{} articles for search: {}".format(len(list_news), self.name)) - # Write to DB - db_writer.write_batch(list_news, self.name) - - - self._fetch_raw_urls_list() - raw_urls, source = search_googlenews_rss(keyword_search, language="en", country="US") - raw_urls = self._post_process_urls(raw_urls, obj_search) - # Write to DB - DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search) -''' - - class FetchSearcher(): def __init__(self) -> None: logger.debug("Initializing Fetcher Searcher") - def _get_source_object(self, source): - # TODO: Cache - # self.cached_sources = {} - # Get source object - obj_source, created = Source.objects.get_or_create(source=source) - return obj_source - - def _post_process_urls(self, raw_urls, obj_search): - # Searching URL Host based? Make sure results belong to that site - if (obj_search.type == Search.TYPE_ENUM.URL_HOST): - # Get clean URL host - url_host_clean = obj_search.search.replace("www.", "").replace("http://", "").replace("https://", "") - # Ensure URL host in URL - raw_urls = [u for u in raw_urls if url_host_clean in u] - - return raw_urls - def run(self): try: logger.debug("Starting FetchSearcher.run()") @@ -65,58 +23,36 @@ class FetchSearcher(): # Search for obj_search in list_search_obj: # TODO: language & country customization - # TODO: allintitle: "child abuse" - # TODO: intitle: "child abuse" # Search keyword_search = "{}{}".format("site:" if obj_search.type == Search.TYPE_ENUM.URL_HOST else "", obj_search.search) + if (obj_search.type == Search.TYPE_ENUM.KEYWORD_SEARCH): + # Add search with intitle keyword + # TODO: allintitle: "child abuse" + # TODO: intitle: "child abuse" + pass + # language, country = obj_search.language_country.split("-") + logger.debug("Starting keyword search: {}".format(keyword_search)) logger.debug("Search type: {}".format(obj_search.type)) - # news.google.com/rss - time.sleep(5) - raw_urls, source = search_googlenews_rss(keyword_search, language="en", country="US") - raw_urls = self._post_process_urls(raw_urls, obj_search) - # Write to DB - DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search) - - - # DDG News - time.sleep(5) - raw_urls, source = search_ddg(keyword_search, category="news", timelimit="d", max_results=None, region = "en-US") - raw_urls = self._post_process_urls(raw_urls, obj_search) - # Write to DB - DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search) + # DB writer + db_writer = DB_Handler() - # GNews - time.sleep(5) - raw_urls, source = search_gnews(keyword_search, language="en", country="US") - raw_urls = self._post_process_urls(raw_urls, obj_search) - # Write to DB - 
DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search) + # Keyword arguments + args = { + "language": "en", + "country": "US", + "period": "7d", + "max_results": 100, + "max_pages": 1, + } - # DDG Text (week, 20 results) - time.sleep(5) - raw_urls, source = search_ddg(keyword_search, category="text", timelimit="d", max_results=20, region = "en-US") - raw_urls = self._post_process_urls(raw_urls, obj_search) - # Write to DB - DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search) - - # GoogleNews news - time.sleep(5) - raw_urls, source = search_googlenews_news(keyword_search, period="1d", language="en", country="US") - raw_urls = self._post_process_urls(raw_urls, obj_search) - # Write to DB - DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search) - - # GoogleNews general - time.sleep(5) - raw_urls, source = search_googlenews_general(keyword_search, period="1d", language="en", country="US", max_pages=2) - raw_urls = self._post_process_urls(raw_urls, obj_search) - # Write to DB - DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search) - + for SearchInstance in ListSearchInstances: + # Sleep between requests, avoid too many requests... + time.sleep(int(os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", 5))) + SearchInstance(args).fetch_articles(db_writer, obj_search) # TODO: https://github.com/tasos-py/Search-Engines-Scraper/tree/master except Exception as e: diff --git a/app_urls/api/src/fetch_search_instances.py b/app_urls/api/src/fetch_search_instances.py new file mode 100644 index 0000000..10397f4 --- /dev/null +++ b/app_urls/api/src/fetch_search_instances.py @@ -0,0 +1,259 @@ +import time +import feedparser +import os +from ..models import Search, Source +from .fetch_utils import decode_gnews_urls +from .logger import get_logger +logger = get_logger() + +from gnews import GNews +from duckduckgo_search import DDGS +from GoogleNews import GoogleNews + +########################################################################### +########################################################################### +from abc import ABC, abstractmethod + +# Generic fetcher (fetches articles, writes to DB) +class FetcherAbstract(ABC): + @abstractmethod + def _fetch_raw_urls(self): + pass + + @abstractmethod + def _get_name(self): + pass + + def _get_source_object(self, source): + # TODO: Cache + # self.cached_sources = {} + # Get source object + obj_source, created = Source.objects.get_or_create(source=source) + return obj_source + + def _post_process_urls(self, raw_urls, obj_search): + # Searching URL Host based? 
Make sure results belong to that site + if (obj_search.type == Search.TYPE_ENUM.URL_HOST): + # Get clean URL host + url_host_clean = obj_search.search.replace("www.", "").replace("http://", "").replace("https://", "") + # Ensure URL host in URL + raw_urls = [u for u in raw_urls if url_host_clean in u] + + return raw_urls + + def fetch_articles(self, db_writer, obj_search): + # Search + keyword_search = "{}{}".format("site:" if obj_search.type == Search.TYPE_ENUM.URL_HOST else "", obj_search.search) + # Source name + source_name = self._get_name() + + logger.debug("Starting search: {} - {}".format(keyword_search, source_name)) + # Fetch + raw_urls = self._fetch_raw_urls(keyword_search) + # Post-process + raw_urls = self._post_process_urls(raw_urls, obj_search) + + # Write to DB + db_writer.insert_raw_urls(raw_urls, self._get_source_object(source_name), obj_search) + +########################################################################### + +class SearchGNews(FetcherAbstract): + def __init__(self, args={"language":"en", "country":"US", "period":"7d", "max_results":100}): + super().__init__() + # Parameters + self.language = args.get("language") + self.country = args.get("country") + self.period = args.get("period") + self.max_results = args.get("max_results") + + def _get_name(self): + # [source] [period] [language-country] [max_results] + return "gnews {} {}-{} results={}".format("news", self.period, self.language, self.country, self.max_results).replace("results=None", "").strip() + + def _fetch_raw_urls(self, keyword_search): + try: + # Get news + results_gnews = GNews(language=self.language, country=self.country, period=self.period, max_results=self.max_results).get_news(keyword_search) + # Get list of encoded urls + encoded_urls = [e.get("url") for e in results_gnews] + # Decode + urls = decode_gnews_urls(encoded_urls) + except Exception as e: + logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e))) + urls = [] + return urls + +class SearchDuckDuckGoGeneral(FetcherAbstract): + def __init__(self, args={"language":"wt", "country":"wt", "max_results":100}): + super().__init__() + # Parameters + self.language = args.get("language") + self.country = args.get("country") + self.max_results = args.get("max_results") + self.region = "{}-{}".format(self.language, self.country).lower() + self.period = None + + def _get_name(self): + # [source] [language-country] [max_results] + return "ddg-general {} results={}".format(self.region, self.max_results).replace("results=None", "").strip() + + def _fetch_raw_urls(self, keyword_search): + try: + news = DDGS().text(keyword_search, region=self.region, timelimit=self.period, max_results=self.max_results) + urls = [e.get("href") for e in news] + except Exception as e: + logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e))) + urls = [] + return urls + +class SearchDuckDuckGoNews(FetcherAbstract): + def __init__(self, args={"language":"wt", "country":"wt", "max_results":100}): + super().__init__() + # Parameters + self.language = args.get("language") + self.country = args.get("country") + self.max_results = args.get("max_results") + self.region = "{}-{}".format(self.language, self.country).lower() + self.period = None + + def _get_name(self): + # [source] [language-country] [max_results] + return "ddg-news {} results={}".format(self.region, self.max_results).replace("results=None", "").strip() + + def _fetch_raw_urls(self, keyword_search): + try: + news = DDGS().news(keyword_search, region=self.region, 
timelimit=self.period, max_results=self.max_results) + urls = [e.get("url") for e in news] + except Exception as e: + logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e))) + urls = [] + return urls + +class SearchGoogleNews(FetcherAbstract): + def __init__(self, args={"language":"en", "country":"US", "period":"7d"}): + super().__init__() + # Parameters + self.language = args.get("language") + self.country = args.get("country") + self.period = args.get("period") + + def _get_name(self): + # [source] [period] [language-country] + return "googlenews {} {}-{}".format(self.period, self.language, self.country) + + def _fetch_raw_urls(self, keyword_search): + try: + # Initialize + googlenews = GoogleNews(period=self.period, lang=self.language, region=self.country) + googlenews.enableException(True) + # Search + googlenews.get_news(keyword_search) + # Fetch + encoded_urls = googlenews.get_links() + # Decode + urls = decode_gnews_urls(encoded_urls) + except Exception as e: + logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e))) + urls = [] + return urls + +class SearchGoogleGeneral(FetcherAbstract): + def __init__(self, args={"language":"en", "country":"US", "period":"7d", "max_pages":1}): + super().__init__() + # Parameters + self.language = args.get("language") + self.country = args.get("country") + self.period = args.get("period") + self.max_pages = args.get("max_pages") + + def _get_name(self): + # [source] [period] [language-country] [pages] + return "google-general {} {}-{} pages={}".format(self.period, self.language, self.country, self.max_pages).replace("pages=None", "").strip() + + def _fetch_raw_urls(self, keyword_search): + try: + # Initialize + googlenews = GoogleNews(period=self.period, lang=self.language, region=self.country) + googlenews.enableException(True) + # Search + googlenews.search(keyword_search) + + set_links = set() + # Iterate pages + for i in range(self.max_pages): + # Sleep between pages fetch + time.sleep(int(os.getenv("FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP", 4))) + # Number of URLs fetched so far + num_before = len(set_links) + # Get page + try: + links = googlenews.page_at(i+1) + except Exception as e: + logger.warning("Exception fetching page - {}: {}".format(self._get_name(), str(e))) + break + # Links + for l in links: + # 'link': 'https://uk.news.yahoo.com/leaving-neverland-2-michael-jackson-lawyer-channel-4-102017088.html&ved=2ahUKEwjl38eJm5aMAxVvqJUCHXgnGzwQxfQBegQICRAC&usg=AOvVaw1osa6b3o_xXfcNinMDpLoK' + set_links.add( l.get("link").split("&ved=")[0] ) + # Finished? 
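+                    # (no new links were added by this page, so pagination is exhausted)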
+ if (num_before == len(set_links)): + break + # To list + urls = list(set_links) + except Exception as e: + logger.warning("Exception fetching {}: {}\n{}".format(self._get_name(), str(e))) + urls = [] + return urls + +class SearchGoogleNewsRSS(FetcherAbstract): + def __init__(self, args={"language":"en", "country":"US"}): + super().__init__() + # Parameters + self.language = args.get("language") + self.country = args.get("country") + + def _get_name(self): + # [source] [language-country] + return "googlenews-rss {}-{}".format(self.language, self.country).strip() + + def _fetch_raw_urls(self, keyword_search): + try: + # Search URL with parameters filled: https://news.google.com/rss/search?q={}&hl=en-US&gl=US&ceid=US:en + search_url = "https://news.google.com/rss/search?q={}&hl={}&gl={}&ceid={}:{}".format(keyword_search, "{}-{}".format(self.language, self.country.upper()), self.country.upper(), self.country.upper(), self.language) + # Control characters + search_url = search_url.replace(" ", "+") # urllib.parse.quote(search_url) # Issue: https%3A//news.google.com/rss/search%3Fq%3Dbreitbart.com%26hl%3Den-US%26gl%3DUS%26ceid%3DUS%3Aen + # Initialize + encoded_urls = [] + # Fetch feeds + feeds = feedparser.parse(search_url) + # Parse + for f in feeds.get("entries", []): + # Encoded URL + encoded_url = f.get("link", None) + ''' + # Available publish date? + publish_date_parsed = f.get("published_parsed") + if (publish_date_parsed is None): + publish_date = f.get("published", None) + if (publish_date is not None): + publish_date_parsed = dateutil.parser.parse(publish_date) + + # Published date + urls_publish_date.append(publish_date_parsed)' + ''' + # Append + encoded_urls.append(encoded_url) + + # Decode + urls = decode_gnews_urls(encoded_urls) + + except Exception as e: + logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e))) + urls = [] + + return urls +########################################################################### + +# List of instances +ListSearchInstances = [SearchGNews, SearchDuckDuckGoNews, SearchGoogleNews, SearchDuckDuckGoGeneral, SearchGoogleGeneral, SearchGoogleNewsRSS] diff --git a/app_urls/api/src/fetch_search_utils.py b/app_urls/api/src/fetch_search_utils.py deleted file mode 100644 index b5acce5..0000000 --- a/app_urls/api/src/fetch_search_utils.py +++ /dev/null @@ -1,197 +0,0 @@ -from django.core.cache import cache -import traceback -import random -import time -import feedparser -import urllib -import dateutil -from .logger import get_logger -logger = get_logger() - -from googlenewsdecoder import gnewsdecoder -from gnews import GNews -from duckduckgo_search import DDGS -from GoogleNews import GoogleNews - -########################################################################### -def decode_gnews_urls(encoded_urls, interval=2): - # DecodeURLs - list_decoded_urls = [] - for url in encoded_urls: - # Already cached? - decoded_url = cache.get("gnews_decode_{}".format(url)) - if (decoded_url is not None): - logger.debug("Already cached decoded URL: {} -> {}".format(url, decoded_url)) - # Append decoded URL - list_decoded_urls.append(decoded_url) - else: - try: - # Decode URL, with interval time to avoid block - decoded_url_dict = gnewsdecoder(url, interval=interval) - # Ok? 
- if decoded_url_dict.get("status"): - # Append decoded URL - decoded_url = decoded_url_dict["decoded_url"] - list_decoded_urls.append(decoded_url) - # Cache decoded URL - cache.set("gnews_decode_{}".format(url), decoded_url, timeout=60*60*12) - else: - logger.warning("Error decoding news.google.com, URL {}\nMessage: {}".format(url, str(decoded_url))) - except Exception as e: - logger.warning("Error decoding news.google.com, URL: {}\n{}".format(url, traceback.format_exc())) - return list_decoded_urls - -########################################################################### - -def search_gnews(keyword_search, period="1d", language="en", country="US", max_results=100): - # [source] [category] [period] [language-country] [max_results] - source = "gnews {} {} {}-{} max_results={}".format("news", period, language, country, max_results).replace("None", "").strip() - logger.debug("Searching: {} --- Source:{}".format(keyword_search, source)) - - try: - # Get news - results_gnews = GNews(language=language, country=country).get_news(keyword_search) - # Get list of encoded urls - encoded_urls = [e.get("url") for e in results_gnews] - # Decode - logger.debug("Decoding gnews URLs") - urls = decode_gnews_urls(encoded_urls) - except Exception as e: - logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc())) - urls = [] - return urls, source - -########################################################################### - -def search_ddg(keyword_search, category="news", timelimit="d", max_results=None, region="wt-wt"): - # [source] [category] [period] [language-country] [max_results] - source = "ddg {} {} {} max_results={}".format(category, timelimit, region, max_results).replace("max_results=None", "").strip() - logger.debug("Searching: {} --- Source:{}".format(keyword_search, source)) - - # region="{}-{}".format(langauge, country.lower()) - # timelimit= # Options: d, w, m - # max_results # max number of results. If None, returns results only from the first response. 
Defaults to None - - try: - if (category == "news"): - news = DDGS().news(keyword_search, region=region, timelimit=timelimit, max_results=max_results) - urls = [e.get("url") for e in news] - if (category == "text"): - news = DDGS().text(keyword_search, region=region, timelimit=timelimit, max_results=max_results) - urls = [e.get("href") for e in news] - except Exception as e: - logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc())) - urls = [] - - return urls, source -########################################################################### - -def search_googlenews_news(keyword_search, period="1d", language="en", country="US"): - category = "news" - # [source] [category] [period] [language-country] - source = "googlenews {} {} {}-{}".format(category, period, language, country).replace("None", "").strip() - logger.debug("Searching: {} --- Source:{}".format(keyword_search, source)) - - # Initialize - googlenews = GoogleNews(period=period, lang=language, region=country) - googlenews.enableException(True) - - try: - # Search - googlenews.get_news(keyword_search) - # Fetch - encoded_urls = googlenews.get_links() - # Decode - logger.debug("Decoding gnews URLs") - urls = decode_gnews_urls(encoded_urls) - except Exception as e: - logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc())) - urls = [] - - return urls, source - -def search_googlenews_general(keyword_search, period="1d", language="en", country="US", max_pages=5): - category="general" - # [source] [category] [period] [language-country] [max_results] - source = "googlenews {} {} {}-{} max_pages={}".format(category, period, language, country, max_pages).replace("None", "").strip() - logger.debug("Searching: {} --- Source:{}".format(keyword_search, source)) - - # Initialize - googlenews = GoogleNews(period=period, lang=language, region=country) - googlenews.enableException(True) - - try: - set_links = set() - # Search - googlenews.search(keyword_search) - - # Iterate pages - for i in range(max_pages): - time.sleep(random.uniform(2, 4.5)) - num_before = len(set_links) - - # Get page - try: - links = googlenews.page_at(i+1) - except Exception as e: - logger.warning("Exception fetching page in GoogleNews {}: {}".format(source, str(e))) - break - # Links - for l in links: - # 'link': 'https://uk.news.yahoo.com/leaving-neverland-2-michael-jackson-lawyer-channel-4-102017088.html&ved=2ahUKEwjl38eJm5aMAxVvqJUCHXgnGzwQxfQBegQICRAC&usg=AOvVaw1osa6b3o_xXfcNinMDpLoK' - set_links.add( l.get("link").split("&ved=")[0] ) - # Finished? 
- if (num_before == len(set_links)): - break - # To list - urls = list(set_links) - except Exception as e: - logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc())) - urls = [] - - return urls, source - -########################################################################### - -def search_googlenews_rss(keyword_search, language="en", country="US"): - # [source] [category] [period] [language-country] [max_results] - source = "googlenews-rss {}-{}".format(language, country).replace("None", "").strip() - logger.debug("Searching: {} --- Source:{}".format(keyword_search, source)) - - # https://news.google.com/rss/search?q={}&hl=en-US&gl=US&ceid=US:en - - try: - # Search URL with parameters filled - search_url = "https://news.google.com/rss/search?q={}&hl={}&gl={}&ceid={}:{}".format(keyword_search, "{}-{}".format(language, country.upper()), country.upper(), country.upper(), language) - # Control characters - search_url = search_url.replace(" ", "+") # urllib.parse.quote(search_url) # Issue: https%3A//news.google.com/rss/search%3Fq%3Dbreitbart.com%26hl%3Den-US%26gl%3DUS%26ceid%3DUS%3Aen - # Initialize - encoded_urls = [] - # Fetch feeds - feeds = feedparser.parse(search_url) - # Parse - for f in feeds.get("entries", []): - # Encoded URL - encoded_url = f.get("link", None) - ''' - # Available publish date? - publish_date_parsed = f.get("published_parsed") - if (publish_date_parsed is None): - publish_date = f.get("published", None) - if (publish_date is not None): - publish_date_parsed = dateutil.parser.parse(publish_date) - - # Published date - urls_publish_date.append(publish_date_parsed)' - ''' - # Append - encoded_urls.append(encoded_url) - - # Decode - urls = decode_gnews_urls(encoded_urls) - - except Exception as e: - logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc())) - urls = [] - - return urls, source diff --git a/app_urls/api/src/fetch_utils.py b/app_urls/api/src/fetch_utils.py new file mode 100644 index 0000000..29621e1 --- /dev/null +++ b/app_urls/api/src/fetch_utils.py @@ -0,0 +1,35 @@ +import traceback +import os +from django.core.cache import cache +from .logger import get_logger +logger = get_logger() +from googlenewsdecoder import gnewsdecoder + + +def decode_gnews_urls(encoded_urls, interval=int(os.getenv("FETCHER_GNEWS_DECODE_SLEEP", 2))): + logger.debug("Decoding gnews URLs") + # DecodeURLs + list_decoded_urls = [] + for url in encoded_urls: + # Already cached? + decoded_url = cache.get("gnews_decode_{}".format(url)) + if (decoded_url is not None): + logger.debug("Already cached decoded URL: {} -> {}".format(url, decoded_url)) + # Append decoded URL + list_decoded_urls.append(decoded_url) + else: + try: + # Decode URL, with interval time to avoid block + decoded_url_dict = gnewsdecoder(url, interval=interval) + # Ok? 
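+                # gnewsdecoder signals success via the "status" key of the returned dict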
+ if decoded_url_dict.get("status"): + # Append decoded URL + decoded_url = decoded_url_dict["decoded_url"] + list_decoded_urls.append(decoded_url) + # Cache decoded URL + cache.set("gnews_decode_{}".format(url), decoded_url, timeout=60*60*12) + else: + logger.warning("Error decoding news.google.com, URL {}".format(url)) + except Exception as e: + logger.warning("Error decoding news.google.com, URL: {}\n{}".format(url, traceback.format_exc())) + return list_decoded_urls \ No newline at end of file diff --git a/app_urls/api/src/logger.py b/app_urls/api/src/logger.py index 5eb736c..fbc4405 100644 --- a/app_urls/api/src/logger.py +++ b/app_urls/api/src/logger.py @@ -1,34 +1,34 @@ import logging import os -''' TODO: PATH LOGS -PATH_LOGS_ERROR=logs/log_app_fetcher_error.log -PATH_LOGS_INFO=logs/log_app_fetcher_info.log -PATH_LOGS_DEBUG=logs/log_app_fetcher_debug.log -# PATH_LOGS=logs/log_app_fetcher.log -''' -os.makedirs("logs", exist_ok=True) +# Get env var +path_logs_parameterization = os.getenv("PATH_LOGS_PARAMETERIZATION", "logs/log_app_fetcher_{}.log") + +# Directory of logs +directory = '/'.join(path_logs_parameterization.split("/")[:-1]) +os.makedirs(directory, exist_ok=True) logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s') logger = logging.getLogger("news_fetcher") logger.setLevel(logging.DEBUG) # To file log: INFO / WARNING / ERROR / CRITICAL -fh = logging.handlers.RotatingFileHandler(filename="logs/log_app_fetcher_debug.log", mode="a", maxBytes=10000000, backupCount=4) +fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("debug"), mode="a", maxBytes=10000000, backupCount=4) fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s')) +fh.setLevel(logging.DEBUG) logger.addHandler(fh) # To file log: INFO / WARNING / ERROR -fh_ = logging.handlers.RotatingFileHandler(filename="logs/log_app_fetcher_info.log", mode="a", maxBytes=10000000, backupCount=2) -fh_.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s')) -fh_.setLevel(logging.INFO) -logger.addHandler(fh_) +fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("info"), mode="a", maxBytes=10000000, backupCount=2) +fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s')) +fh.setLevel(logging.INFO) +logger.addHandler(fh) # To file log: WARNING / ERROR / CRITICAL -fh_ = logging.handlers.RotatingFileHandler(filename="logs/log_app_fetcher_error.log", mode="a", maxBytes=10000000, backupCount=1) -fh_.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s')) -fh_.setLevel(logging.WARNING) -logger.addHandler(fh_) +fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("warning"), mode="a", maxBytes=10000000, backupCount=1) +fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s')) +fh.setLevel(logging.WARNING) +logger.addHandler(fh) def get_logger(): return logger diff --git a/app_urls/api/src/url_processor.py b/app_urls/api/src/url_processor.py index b42015c..d8d67bb 100644 --- a/app_urls/api/src/url_processor.py +++ b/app_urls/api/src/url_processor.py @@ -3,6 +3,7 @@ from .logger import get_logger logger = get_logger() import newspaper import time +import os from urllib.parse import unquote import langdetect langdetect.DetectorFactory.seed = 0 @@ -40,11 +41,11 @@ def url_host_slowdown(url, url_host_slowdown_seconds): def process_url(url): try: # Slow down if required to avoid too many requests error - 
url_host_slowdown(url, url_host_slowdown_seconds=5) + url_host_slowdown(url, url_host_slowdown_seconds=int(os.getenv("FETCHER_URL_HOST_SLEEP", 5))) # Process article = newspaper.article(url) except newspaper.ArticleBinaryDataException: - logger.warning("ArticleException for input URL {}\n{}".format(url, str(e))) + logger.warning("ArticleException for input URL {}".format(url)) return {"override_status": "invalid"} except newspaper.ArticleException as e: diff --git a/app_urls/api/tasks.py b/app_urls/api/tasks.py index 4193bab..88e5a18 100644 --- a/app_urls/api/tasks.py +++ b/app_urls/api/tasks.py @@ -1,4 +1,3 @@ -# from django_rq import job from scheduler import job from .src.fetch_feed import FetchFeeds diff --git a/app_urls/api/templates/OBSOLETE_urls.html b/app_urls/api/templates/OBSOLETE_urls.html deleted file mode 100644 index 3e00c7a..0000000 --- a/app_urls/api/templates/OBSOLETE_urls.html +++ /dev/null @@ -1,607 +0,0 @@ - - -
-[607 deleted lines: obsolete HTML template rendering the URLs table with columns URL, Fetch date, Sources, Search, Status; each row links {{ item.url }}, shows {{ item.ts_fetch }}, resolves sources_map/searches_map via the dict_get filter, and renders item.status as a color-coded badge (raw, error, valid, unknown, invalid, duplicate); the empty state shows "No items available."]
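
For context on the search-backend abstraction added in `app_urls/api/src/fetch_search_instances.py`, here is a minimal sketch of how an additional backend could be plugged in. The class name `SearchCustomRSS` and the `rss_url_template` parameter are hypothetical illustrations; only the `_get_name()` / `_fetch_raw_urls()` contract, the `logger` usage, and the registration in `ListSearchInstances` follow the pattern of the backends in this diff.

```python
# Hypothetical sketch: an extra backend for FetchSearcher, assuming the
# FetcherAbstract contract from fetch_search_instances.py. The RSS endpoint
# below is a placeholder, not a real service used by the project.
import feedparser

from .fetch_search_instances import FetcherAbstract, ListSearchInstances
from .logger import get_logger

logger = get_logger()


class SearchCustomRSS(FetcherAbstract):
    def __init__(self, args=None):
        super().__init__()
        args = args or {}
        # Hypothetical parameter: an RSS search endpoint with a "{}" query placeholder
        self.rss_url_template = args.get("rss_url_template", "https://example.org/rss?q={}")

    def _get_name(self):
        # Label stored in the SOURCE table via FetcherAbstract._get_source_object()
        return "custom-rss"

    def _fetch_raw_urls(self, keyword_search):
        try:
            # Build the query URL and fetch the feed
            feed = feedparser.parse(self.rss_url_template.format(keyword_search).replace(" ", "+"))
            # Keep the link of every entry that has one
            urls = [e.get("link") for e in feed.get("entries", []) if e.get("link")]
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []
        return urls


# Appending the class lets FetchSearcher.run() call it like the built-in backends
ListSearchInstances.append(SearchCustomRSS)
```

As in the classes above, the base class's `fetch_articles()` handles the `site:` prefix for URL-host searches, the URL-host post-filter, and the `db_writer.insert_raw_urls()` call, so a new backend only needs to provide these two methods.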