Refactor searches, env var fetcher config, URLs webpage update

This commit is contained in:
Luciano Gervasoni
2025-04-02 18:45:43 +02:00
parent 077219fcb6
commit 84da104dc8
22 changed files with 676 additions and 1521 deletions

View File

@@ -1,38 +1 @@
# Requirements
```
conda create -n matitos python=3.12
conda activate matitos
pip install ipykernel django requests ollama psycopg[binary] # openai
```
# Development
* app_web
```
# 1) Change models.py
python manage.py inspectdb
# 2)
python manage.py makemigrations
# 3)
python manage.py migrate --fake
# ?
python manage.py migrate --fake sessions zero
python manage.py migrate --fake-initial
python manage.py createsuperuser
```
* app_img_gen
```
docker build -t image_generation .
docker run --rm -it -p 12343:80 image_generation
```
# Deploy
```
python app_web/manage.py runserver
```
# Matitos

app_img_gen/README.md Normal file
View File

@@ -0,0 +1,36 @@
```
docker build -t image_generation .
docker run --rm -it -p 12343:80 image_generation
```
```
import requests
import cv2
import base64
import numpy as np
endpoint = "http://192.168.2.64:12343/image"
prompt = "Majestic mountain landscape with snow-capped peaks, autumn foliage in vibrant reds and oranges, a turquoise river winding through a valley, crisp and serene atmosphere, ultra-realistic style."
prompt = "A group of kids happily playing in a joy environment"
#prompt = "A bitcoin behaving like a king, surrounded by small alternative coins. Detailed, geometric style"
json = {
    "prompt": prompt,
    "num_inference_steps": 10,
    "size": "512x512",
    "seed": 123456,
}
for inf_step in [1, 4, 10, 20, 25, 30, 35, 40, 45, 50, 60, 70, 80, 90, 100]:
    json["num_inference_steps"] = inf_step
    %time r = requests.post(endpoint, json=json)
    print("Status code", r.status_code)
    # Image
    png_as_np = np.frombuffer(base64.b64decode(r.text), dtype=np.uint8)
    image_bgr = cv2.imdecode(png_as_np, cv2.IMREAD_COLOR)
    cv2.imwrite("sample_img_{}.png".format(json["num_inference_steps"]), image_bgr)
```
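The snippet above uses IPython's `%time` magic, so it only runs as-is in a notebook. A minimal plain-script variant, assuming the same endpoint and response format (a base64-encoded PNG returned as the response body), could look like this:
```
import base64
import time

import cv2
import numpy as np
import requests

endpoint = "http://192.168.2.64:12343/image"
payload = {
    "prompt": "A group of kids happily playing in a joyful environment",
    "num_inference_steps": 10,
    "size": "512x512",
    "seed": 123456,
}

start = time.perf_counter()
r = requests.post(endpoint, json=payload)
print("Status code", r.status_code, "- elapsed", time.perf_counter() - start, "s")

# Response body is a base64-encoded PNG; decode and save it
png_as_np = np.frombuffer(base64.b64decode(r.text), dtype=np.uint8)
image_bgr = cv2.imdecode(png_as_np, cv2.IMREAD_COLOR)
cv2.imwrite("sample_img.png", image_bgr)
```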

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 23,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -11,41 +11,16 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"db_postgres\n",
"db_redis\n",
"\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 2/0\n",
" ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 2/4\n",
" ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 4/4\u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_redis \u001b[32mStarted\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h"
]
}
],
"outputs": [],
"source": [
"!docker rm -f db_postgres db_redis; docker compose -f docker/docker-compose.yml up -d ; sleep 5"
"!docker rm -f db_postgres db_redis; docker compose -f ../docker/docker-compose.yml up -d ; sleep 5"
]
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -62,7 +37,7 @@
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -99,6 +74,8 @@
" id SMALLSERIAL PRIMARY KEY,\n",
" search TEXT NOT NULL UNIQUE,\n",
" type SEARCH_TYPE NOT NULL\n",
" -- language_country CHAR(5), -- Language: ISO 639-1 Code. Country: ISO 3166 ALPHA-2. e.g.: en-us. Required for search\n",
" -- UNIQUE(search, language_country)\n",
" );\n",
" CREATE INDEX idx_search_type ON SEARCH(type);\n",
" \n",
@@ -106,7 +83,13 @@
" id SMALLSERIAL PRIMARY KEY,\n",
" source TEXT NOT NULL UNIQUE\n",
" );\n",
" \n",
" \n",
" -- CREATE TABLE SEARCH_LANGUAGE (\n",
" -- language CHAR(2) NOT NULL, -- ISO 639-1 Code, e.g. \"en\"\n",
" -- country CHAR(2) NOT NULL, -- ISO 3166 ALPHA-2, e.g. \"us\"\n",
" -- PRIMARY KEY (language, country)\n",
" -- );\n",
" \n",
" CREATE TABLE URLS_SOURCE_SEARCH (\n",
" id_url INTEGER REFERENCES URLS(id),\n",
" id_source SMALLINT REFERENCES SOURCE(id) ON UPDATE CASCADE ON DELETE RESTRICT,\n",
@@ -158,6 +141,8 @@
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('breitbart.com', 'url_host');\" )\n",
" # Search keywords\n",
" cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search');\" )\n",
" # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search', 'en-us');\" )\n",
" # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search', 'en-gb');\" )\n",
" \n",
" # Status update based on pattern matching (with priority to apply in order). Regex test https://regex101.com/\n",
" # cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 75, 'valid');\".format(\".*{}.*\".format(re.escape(\"missingkids.org/poster/\"))) )\n",
@@ -169,51 +154,6 @@
" cur.execute( \"INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');\".format(\".*{}.*\".format(re.escape(\"radio.foxnews.com/\"))) )"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"if INSERT_SAMPLE_DATA:\n",
" # Connect to an existing database\n",
" with psycopg.connect(connection_info) as conn:\n",
" # Open a cursor to perform database operations\n",
" with conn.cursor() as cur:\n",
" # Autocommit at end of transaction (Atomic insert of URLs and sources)\n",
" with conn.transaction() as tx:\n",
" # Valid\n",
" cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.foxnews.com/us/husband-ruby-franke-utah-mommy-blogger-convicted-child-abuse-regrets-wifes-fall-fame', 'valid')\")\n",
" cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.bbc.com/news/articles/ckg843y8y7no', 'valid')\")\n",
" cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.wilx.com/2025/03/05/lenawee-county-man-arrested-possessing-child-abuse-material/', 'valid')\")\n",
" cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.dw.com/en/trauma-how-child-abuse-victims-deal-with-parenthood/a-71833895', 'valid')\")\n",
" cur.execute(\"INSERT INTO URLS (url, status) values ('https://nypost.com/2025/03/06/us-news/colorado-day-care-worker-hit-with-51-charges-of-child-abuse-harassment-for-slapping-toddler/', 'valid')\")\n",
" cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.fox35orlando.com/news/tavares-police-florida-boys-10-9-abused-sheer-brutality', 'valid')\")\n",
" cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.google.com', 'invalid')\")\n",
"\n",
" cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.missingkids.org/poster/USVA/VA25-0820/1', 'valid')\")\n",
" cur.execute(\"INSERT INTO URLS (url, status) values ('https://www.missingkids.org/poster/NCMC/2045193/1', 'valid')\")\n",
"\n",
" cur.execute(\"INSERT INTO SOURCE (source) values ('news.google.com')\")\n",
" cur.execute(\"INSERT INTO SOURCE (source) values ('qwant.com')\")\n",
"\n",
" cur.execute(\"INSERT INTO URLS_SOURCE (id_url, id_source, id_search) values (1, 1, 1)\")\n",
"\n",
" for j in range(5):\n",
" import time\n",
" time.sleep(0.25)\n",
" cur.execute(\"INSERT INTO URLS (url, status) values ('www.super_{}.org', 'invalid')\".format(j))\n",
" \n",
" # Long URLs \n",
" cur.execute(\"INSERT INTO URLS (url, status) values ('www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf_23dj9sdgj9sdgj8sdf8ds8f.html', 'invalid')\".format(j))\n",
" cur.execute(\"INSERT INTO URLS (url, status) values ('www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf.html', 'invalid')\".format(j))\n",
"\n",
" # URL Content\n",
" language, content = \"en\", \"Bla Bla Bla!!!\"*25\n",
" cur.execute(\"INSERT INTO URL_CONTENT (id_url, date_published, title, description, content, language, tags, authors, images_url) values (%s, %s, 'Mommy blogger turned child abuser', %s, 'Hello there!', %s, %s, %s, %s)\", \n",
" (1, datetime.now(tz=timezone.utc), content, language, [\"child abuse\", \"social media\"], [\"Audrey Conklin\"], [\"https://a57.foxnews.com/static.foxnews.com/foxnews.com/content/uploads/2023/08/1440/810/image-58.jpg?ve=1&tl=1\"]))"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -223,41 +163,9 @@
},
{
"cell_type": "code",
"execution_count": 28,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\t urls\n",
"[]\n",
"\t urls_duplicate\n",
"[]\n",
"\t urls_source_search\n",
"[]\n",
"\t source\n",
"[]\n",
"\t search\n",
"[(1,\n",
" 'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC',\n",
" 'rss_feed'),\n",
" (2, 'missingkids.org/poster', 'url_host'),\n",
" (3, 'missingkids.org/new-poster', 'url_host'),\n",
" (4, 'breitbart.com', 'url_host'),\n",
" (5, 'child abuse', 'keyword_search')]\n",
"\t status_pattern_matching\n",
"[('.*youtube\\\\.com/.*', 50, 'invalid'),\n",
" ('.*tiktok\\\\.com/.*', 50, 'invalid'),\n",
" ('.*twitter\\\\.com/.*', 50, 'invalid'),\n",
" ('.*reddit\\\\.com/.*', 50, 'invalid'),\n",
" ('.*libreddit\\\\.de/.*', 50, 'invalid'),\n",
" ('.*radio\\\\.foxnews\\\\.com/.*', 50, 'invalid')]\n",
"\t url_content\n",
"[]\n"
]
}
],
"outputs": [],
"source": [
"# Connect to an existing database\n",
"with psycopg.connect(connection_info) as conn:\n",
@@ -274,23 +182,9 @@
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(1,\n",
" 'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC',\n",
" 'rss_feed'),\n",
" (2, 'missingkids.org/poster', 'url_host'),\n",
" (3, 'missingkids.org/new-poster', 'url_host'),\n",
" (4, 'breitbart.com', 'url_host'),\n",
" (5, 'child abuse', 'keyword_search')]\n"
]
}
],
"outputs": [],
"source": [
"# Connect to an existing database\n",
"with psycopg.connect(connection_info) as conn:\n",
@@ -301,23 +195,15 @@
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[]\n"
]
}
],
"outputs": [],
"source": [
"# Connect to an existing database\n",
"with psycopg.connect(connection_info) as conn:\n",
" # Open a cursor to perform database operations\n",
" with conn.cursor() as cur:\n",
" pprint( cur.execute(\"SELECT * FROM URLS LIMIT 150;\").fetchall() )\n",
" pprint( cur.execute(\"SELECT * FROM URLS LIMIT 50;\").fetchall() )\n",
" #pprint( cur.execute(\"SELECT id_url, title, valid_content FROM URL_CONTENT LIMIT 10;\").fetchall() )"
]
},
@@ -326,34 +212,9 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'\\n!docker rm -f db_redis; docker compose -f docker/docker-compose.yml up -d\\n\\n# Connect to an existing database\\nwith psycopg.connect(connection_info) as conn:\\n # Open a cursor to perform database operations\\n with conn.cursor() as cur:\\n pprint( cur.execute(\"TRUNCATE URLS, URL_CONTENT, URLS_SOURCE_SEARCH, URLS_DUPLICATE;\") )\\n # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES (\\'missingkids.org\\', \\'url_host\\');\" )\\n'"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'''\n",
"!docker rm -f db_redis; docker compose -f docker/docker-compose.yml up -d\n",
"!docker rm -f db_redis; docker compose -f ../docker/docker-compose.yml up -d\n",
"\n",
"# Connect to an existing database\n",
"with psycopg.connect(connection_info) as conn:\n",

View File

@@ -10,6 +10,10 @@ pip install feedparser python-dateutil newspaper4k[all] lxml[html_clean] googlen
pip install ollama
```
* Database
* Database initialization -> 1-DB.ipynb
* From automated inspectdb
```
# 1) Inspect DB, generate models.py
@@ -72,23 +76,26 @@ class Meta:
* Environment variables
```
# Database
DB_NAME=${DB_NAME:-matitos}
DB_USER=${DB_USER:-supermatitos}
DB_PASSWORD=${DB_PASSWORD:-supermatitos}
DB_HOST=${DB_HOST:-localhost}
DB_PORT=${DB_PORT:-5432}
REDIS_HOST=${REDIS_HOST:-localhost}
REDIS_PORT=${REDIS_PORT:-6379}
# Default RQ job timeout
RQ_DEFAULT_TIMEOUT=${RQ_DEFAULT_TIMEOUT:-900}
# Default RQ job queue TTL
RQ_DEFAULT_RESULT_TTL=${RQ_DEFAULT_RESULT_TTL:-3600}
# Job timeout: 30 min
JOB_DEFAULT_TIMEOUT=${JOB_DEFAULT_TIMEOUT:-1800}
# Logs path
PATH_LOGS_ERROR=logs/log_app_fetcher_error.log
PATH_LOGS=logs/log_app_fetcher.log
PATH_LOGS_PARAMETERIZATION="logs/log_app_fetcher_{}.log"
# Fetcher
FETCHER_GNEWS_DECODE_SLEEP=2
FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=4
FETCHER_BETWEEN_SEARCHES_SLEEP=5
FETCHER_URL_HOST_SLEEP=5
```
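The fetcher reads these values at runtime via `os.getenv()`, falling back to the defaults listed above when a variable is unset. A minimal sketch of that pattern, mirroring the calls in the fetcher code:
```
import os
import time

# Seconds to sleep between search backends (defaults to 5 when unset)
between_searches_sleep = int(os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", 5))
# Interval passed to the news.google.com URL decoder (defaults to 2)
gnews_decode_sleep = int(os.getenv("FETCHER_GNEWS_DECODE_SLEEP", 2))

time.sleep(between_searches_sleep)
```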
* Deploy
@@ -110,30 +117,14 @@ http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=publ
* Scheduled tasks
```
# 1) Configure the scheduled tasks in the admin panel:
Names: Fetch Feeds, Fetch Parser, Fetch Search
Callable: api.tasks.fetch_feeds, api.tasks.fetch_parser, api.tasks.fetch_search
Task type: Repeatable task (or cron...)
Queue: Default
Interval: 15min, 2h, 30min
Names: Process raw URLs, Process error URLs, Process MissingKids URLs
Callable: api.tasks.process_raw_urls, api.tasks.process_error_urls, api.tasks.process_missing_kids_urls_50
Task type: Repeatable task (or cron...)
Queue: Low, Low, Default
Interval: 1h, 4h, 2h
# 2) Export the task definitions
# python manage.py export > scheduled_tasks.json
# Or simply import previously saved definitions
python manage.py import --filename scheduled_tasks.json
# After modifying tasks in the admin panel, export again to save them
# python manage.py export > scheduled_tasks.json
```
* Utils
* Utils (TODO: expose these via an endpoint)
```
python manage.py rqstats
python manage.py rqstats --interval=1 # Refreshes every second
```

View File

@@ -1,5 +1,6 @@
# Generated by Django 5.1.7 on 2025-03-13 17:01
# Generated by Django 5.2 on 2025-04-02 16:44
import django.contrib.postgres.fields
import django.db.models.deletion
from django.db import migrations, models
@@ -12,22 +13,12 @@ class Migration(migrations.Migration):
]
operations = [
migrations.CreateModel(
name='Feed',
fields=[
('id', models.SmallAutoField(primary_key=True, serialize=False)),
('rss_feed', models.TextField(unique=True)),
],
options={
'db_table': 'feed',
'managed': False,
},
),
migrations.CreateModel(
name='Search',
fields=[
('id', models.SmallAutoField(primary_key=True, serialize=False)),
('keyword_search', models.TextField(unique=True)),
('search', models.TextField(unique=True)),
('type', models.TextField(choices=[('rss_feed', 'RSS_Feed'), ('keyword_search', 'Keyword_Search'), ('url_host', 'URL_Host')])),
],
options={
'db_table': 'search',
@@ -67,28 +58,7 @@ class Migration(migrations.Migration):
],
options={
'db_table': 'urls',
'managed': False,
},
),
migrations.CreateModel(
name='WebsiteOfInterest',
fields=[
('id', models.SmallAutoField(primary_key=True, serialize=False)),
('url_host', models.TextField(unique=True)),
],
options={
'db_table': 'website_of_interest',
'managed': False,
},
),
migrations.CreateModel(
name='WebsiteToFilter',
fields=[
('id', models.SmallAutoField(primary_key=True, serialize=False)),
('url_host', models.TextField(unique=True)),
],
options={
'db_table': 'website_to_filter',
'ordering': ['-ts_fetch'],
'managed': False,
},
),
@@ -102,12 +72,12 @@ class Migration(migrations.Migration):
('content', models.TextField(blank=True, null=True)),
('valid_content', models.BooleanField(blank=True, null=True)),
('language', models.CharField(blank=True, max_length=2, null=True)),
('keywords', models.TextField(blank=True, null=True)),
('tags', models.TextField(blank=True, null=True)),
('authors', models.TextField(blank=True, null=True)),
('image_main', models.TextField(blank=True, null=True)),
('images_url', models.TextField(blank=True, null=True)),
('videos_url', models.TextField(blank=True, null=True)),
('keywords', django.contrib.postgres.fields.ArrayField(base_field=models.TextField(blank=True, null=True), size=None)),
('tags', django.contrib.postgres.fields.ArrayField(base_field=models.TextField(blank=True, null=True), size=None)),
('authors', django.contrib.postgres.fields.ArrayField(base_field=models.TextField(blank=True, null=True), size=None)),
('image_main_url', models.TextField(blank=True, null=True)),
('images_url', django.contrib.postgres.fields.ArrayField(base_field=models.TextField(blank=True, null=True), size=None)),
('videos_url', django.contrib.postgres.fields.ArrayField(base_field=models.TextField(blank=True, null=True), size=None)),
('url_host', models.TextField(blank=True, null=True)),
('site_name', models.TextField(blank=True, null=True)),
],
@@ -127,12 +97,12 @@ class Migration(migrations.Migration):
},
),
migrations.CreateModel(
name='UrlsSource',
name='UrlsSourceSearch',
fields=[
('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='api.urls')),
],
options={
'db_table': 'urls_source',
'db_table': 'urls_source_search',
'managed': False,
},
),

View File

@@ -1,26 +0,0 @@
# Generated by Django 5.1.7 on 2025-03-19 09:06
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('api', '0001_initial'),
]
operations = [
migrations.DeleteModel(
name='Feed',
),
migrations.DeleteModel(
name='WebsiteOfInterest',
),
migrations.DeleteModel(
name='WebsiteToFilter',
),
migrations.AlterModelOptions(
name='urls',
options={'managed': False, 'ordering': ['-ts_fetch']},
),
]

View File

@@ -1,27 +0,0 @@
# Generated by Django 4.2.20 on 2025-03-20 16:12
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
dependencies = [
('api', '0002_delete_feed_delete_websiteofinterest_and_more'),
]
operations = [
migrations.CreateModel(
name='UrlsSourceSearch',
fields=[
('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='api.urls')),
],
options={
'db_table': 'urls_source_search',
'managed': False,
},
),
migrations.DeleteModel(
name='UrlsSource',
),
]

View File

@@ -109,3 +109,32 @@ class UrlsSourceSearch(models.Model):
def __str__(self):
return "{} {} {}".format(self.id_source, self.id_search, self.id_url)
""" # TODO: Migrate to django 5.2
class UrlsDuplicate(models.Model):
pk = models.CompositePrimaryKey('id_url_canonical', 'id_url_duplicated')
id_url_canonical = models.ForeignKey(Urls, models.DO_NOTHING, db_column='id_url_canonical')
id_url_duplicated = models.ForeignKey(Urls, models.DO_NOTHING, db_column='id_url_duplicated', related_name='urlsduplicate_id_url_duplicated_set')
class Meta:
managed = False
db_table = 'urls_duplicate'
unique_together = (('id_url_canonical', 'id_url_duplicated'),)
def __str__(self):
return "{} {} ".format(self.id_url_duplicated, self.id_url_canonical)
class UrlsSourceSearch(models.Model):
pk = models.CompositePrimaryKey('id_url', 'id_source', 'id_search')
id_url = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url')
id_source = models.ForeignKey(Source, models.DO_NOTHING, db_column='id_source')
id_search = models.ForeignKey(Search, models.DO_NOTHING, db_column='id_search')
class Meta:
managed = False
db_table = 'urls_source_search'
unique_together = (('id_url', 'id_source', 'id_search'),)
def __str__(self):
return "{} {} {}".format(self.id_source, self.id_search, self.id_url)
"""

View File

@@ -1,59 +1,17 @@
from .db_utils import DB_Handler
from ..models import Search, Source
from ..models import Search
from django.db.models import Q
import traceback
import time
from .fetch_search_utils import search_gnews, search_ddg, search_googlenews_general, search_googlenews_news, search_googlenews_rss
import os
from .fetch_search_instances import ListSearchInstances
from .logger import get_logger
logger = get_logger()
'''
from abc import ABC, abstractmethod
# Generic fetcher (fetches articles, writes to DB)
class FetcherAbstract(ABC):
@abstractmethod
def _fetch_raw_urls_list(self):
pass
def fetch_articles(self, db_writer):
logger.debug("Starting fetch() for {}".format(self.name))
# Fetch articles
list_news = self._fetch()
logger.info("Found #{} articles for search: {}".format(len(list_news), self.name))
# Write to DB
db_writer.write_batch(list_news, self.name)
self._fetch_raw_urls_list()
raw_urls, source = search_googlenews_rss(keyword_search, language="en", country="US")
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
'''
class FetchSearcher():
def __init__(self) -> None:
logger.debug("Initializing Fetcher Searcher")
def _get_source_object(self, source):
# TODO: Cache
# self.cached_sources = {}
# Get source object
obj_source, created = Source.objects.get_or_create(source=source)
return obj_source
def _post_process_urls(self, raw_urls, obj_search):
# Searching URL Host based? Make sure results belong to that site
if (obj_search.type == Search.TYPE_ENUM.URL_HOST):
# Get clean URL host
url_host_clean = obj_search.search.replace("www.", "").replace("http://", "").replace("https://", "")
# Ensure URL host in URL
raw_urls = [u for u in raw_urls if url_host_clean in u]
return raw_urls
def run(self):
try:
logger.debug("Starting FetchSearcher.run()")
@@ -65,58 +23,36 @@ class FetchSearcher():
# Search
for obj_search in list_search_obj:
# TODO: language & country customization
# TODO: allintitle: "child abuse"
# TODO: intitle: "child abuse"
# Search
keyword_search = "{}{}".format("site:" if obj_search.type == Search.TYPE_ENUM.URL_HOST else "", obj_search.search)
if (obj_search.type == Search.TYPE_ENUM.KEYWORD_SEARCH):
# Add search with intitle keyword
# TODO: allintitle: "child abuse"
# TODO: intitle: "child abuse"
pass
# language, country = obj_search.language_country.split("-")
logger.debug("Starting keyword search: {}".format(keyword_search))
logger.debug("Search type: {}".format(obj_search.type))
# news.google.com/rss
time.sleep(5)
raw_urls, source = search_googlenews_rss(keyword_search, language="en", country="US")
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
# DDG News
time.sleep(5)
raw_urls, source = search_ddg(keyword_search, category="news", timelimit="d", max_results=None, region = "en-US")
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
# DB writer
db_writer = DB_Handler()
# GNews
time.sleep(5)
raw_urls, source = search_gnews(keyword_search, language="en", country="US")
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
# Keyword arguments
args = {
"language": "en",
"country": "US",
"period": "7d",
"max_results": 100,
"max_pages": 1,
}
# DDG Text (week, 20 results)
time.sleep(5)
raw_urls, source = search_ddg(keyword_search, category="text", timelimit="d", max_results=20, region = "en-US")
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
# GoogleNews news
time.sleep(5)
raw_urls, source = search_googlenews_news(keyword_search, period="1d", language="en", country="US")
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
# GoogleNews general
time.sleep(5)
raw_urls, source = search_googlenews_general(keyword_search, period="1d", language="en", country="US", max_pages=2)
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
for SearchInstance in ListSearchInstances:
# Sleep between requests, avoid too many requests...
time.sleep(int(os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", 5)))
SearchInstance(args).fetch_articles(db_writer, obj_search)
# TODO: https://github.com/tasos-py/Search-Engines-Scraper/tree/master
except Exception as e:

View File

@@ -0,0 +1,259 @@
import time
import feedparser
import os
from ..models import Search, Source
from .fetch_utils import decode_gnews_urls
from .logger import get_logger
logger = get_logger()
from gnews import GNews
from duckduckgo_search import DDGS
from GoogleNews import GoogleNews
###########################################################################
###########################################################################
from abc import ABC, abstractmethod
# Generic fetcher (fetches articles, writes to DB)
class FetcherAbstract(ABC):
@abstractmethod
def _fetch_raw_urls(self):
pass
@abstractmethod
def _get_name(self):
pass
def _get_source_object(self, source):
# TODO: Cache
# self.cached_sources = {}
# Get source object
obj_source, created = Source.objects.get_or_create(source=source)
return obj_source
def _post_process_urls(self, raw_urls, obj_search):
# Searching URL Host based? Make sure results belong to that site
if (obj_search.type == Search.TYPE_ENUM.URL_HOST):
# Get clean URL host
url_host_clean = obj_search.search.replace("www.", "").replace("http://", "").replace("https://", "")
# Ensure URL host in URL
raw_urls = [u for u in raw_urls if url_host_clean in u]
return raw_urls
def fetch_articles(self, db_writer, obj_search):
# Search
keyword_search = "{}{}".format("site:" if obj_search.type == Search.TYPE_ENUM.URL_HOST else "", obj_search.search)
# Source name
source_name = self._get_name()
logger.debug("Starting search: {} - {}".format(keyword_search, source_name))
# Fetch
raw_urls = self._fetch_raw_urls(keyword_search)
# Post-process
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
db_writer.insert_raw_urls(raw_urls, self._get_source_object(source_name), obj_search)
###########################################################################
class SearchGNews(FetcherAbstract):
def __init__(self, args={"language":"en", "country":"US", "period":"7d", "max_results":100}):
super().__init__()
# Parameters
self.language = args.get("language")
self.country = args.get("country")
self.period = args.get("period")
self.max_results = args.get("max_results")
def _get_name(self):
# [source] [period] [language-country] [max_results]
return "gnews {} {}-{} results={}".format("news", self.period, self.language, self.country, self.max_results).replace("results=None", "").strip()
def _fetch_raw_urls(self, keyword_search):
try:
# Get news
results_gnews = GNews(language=self.language, country=self.country, period=self.period, max_results=self.max_results).get_news(keyword_search)
# Get list of encoded urls
encoded_urls = [e.get("url") for e in results_gnews]
# Decode
urls = decode_gnews_urls(encoded_urls)
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchDuckDuckGoGeneral(FetcherAbstract):
def __init__(self, args={"language":"wt", "country":"wt", "max_results":100}):
super().__init__()
# Parameters
self.language = args.get("language")
self.country = args.get("country")
self.max_results = args.get("max_results")
self.region = "{}-{}".format(self.language, self.country).lower()
self.period = None
def _get_name(self):
# [source] [language-country] [max_results]
return "ddg-general {} results={}".format(self.region, self.max_results).replace("results=None", "").strip()
def _fetch_raw_urls(self, keyword_search):
try:
news = DDGS().text(keyword_search, region=self.region, timelimit=self.period, max_results=self.max_results)
urls = [e.get("href") for e in news]
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchDuckDuckGoNews(FetcherAbstract):
def __init__(self, args={"language":"wt", "country":"wt", "max_results":100}):
super().__init__()
# Parameters
self.language = args.get("language")
self.country = args.get("country")
self.max_results = args.get("max_results")
self.region = "{}-{}".format(self.language, self.country).lower()
self.period = None
def _get_name(self):
# [source] [language-country] [max_results]
return "ddg-news {} results={}".format(self.region, self.max_results).replace("results=None", "").strip()
def _fetch_raw_urls(self, keyword_search):
try:
news = DDGS().news(keyword_search, region=self.region, timelimit=self.period, max_results=self.max_results)
urls = [e.get("url") for e in news]
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchGoogleNews(FetcherAbstract):
def __init__(self, args={"language":"en", "country":"US", "period":"7d"}):
super().__init__()
# Parameters
self.language = args.get("language")
self.country = args.get("country")
self.period = args.get("period")
def _get_name(self):
# [source] [period] [language-country]
return "googlenews {} {}-{}".format(self.period, self.language, self.country)
def _fetch_raw_urls(self, keyword_search):
try:
# Initialize
googlenews = GoogleNews(period=self.period, lang=self.language, region=self.country)
googlenews.enableException(True)
# Search
googlenews.get_news(keyword_search)
# Fetch
encoded_urls = googlenews.get_links()
# Decode
urls = decode_gnews_urls(encoded_urls)
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchGoogleGeneral(FetcherAbstract):
def __init__(self, args={"language":"en", "country":"US", "period":"7d", "max_pages":1}):
super().__init__()
# Parameters
self.language = args.get("language")
self.country = args.get("country")
self.period = args.get("period")
self.max_pages = args.get("max_pages")
def _get_name(self):
# [source] [period] [language-country] [pages]
return "google-general {} {}-{} pages={}".format(self.period, self.language, self.country, self.max_pages).replace("pages=None", "").strip()
def _fetch_raw_urls(self, keyword_search):
try:
# Initialize
googlenews = GoogleNews(period=self.period, lang=self.language, region=self.country)
googlenews.enableException(True)
# Search
googlenews.search(keyword_search)
set_links = set()
# Iterate pages
for i in range(self.max_pages):
# Sleep between pages fetch
time.sleep(int(os.getenv("FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP", 4)))
# Number of URLs fetched so far
num_before = len(set_links)
# Get page
try:
links = googlenews.page_at(i+1)
except Exception as e:
logger.warning("Exception fetching page - {}: {}".format(self._get_name(), str(e)))
break
# Links
for l in links:
# 'link': 'https://uk.news.yahoo.com/leaving-neverland-2-michael-jackson-lawyer-channel-4-102017088.html&ved=2ahUKEwjl38eJm5aMAxVvqJUCHXgnGzwQxfQBegQICRAC&usg=AOvVaw1osa6b3o_xXfcNinMDpLoK'
set_links.add( l.get("link").split("&ved=")[0] )
# Finished?
if (num_before == len(set_links)):
break
# To list
urls = list(set_links)
except Exception as e:
logger.warning("Exception fetching {}: {}\n{}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchGoogleNewsRSS(FetcherAbstract):
def __init__(self, args={"language":"en", "country":"US"}):
super().__init__()
# Parameters
self.language = args.get("language")
self.country = args.get("country")
def _get_name(self):
# [source] [language-country]
return "googlenews-rss {}-{}".format(self.language, self.country).strip()
def _fetch_raw_urls(self, keyword_search):
try:
# Search URL with parameters filled: https://news.google.com/rss/search?q={}&hl=en-US&gl=US&ceid=US:en
search_url = "https://news.google.com/rss/search?q={}&hl={}&gl={}&ceid={}:{}".format(keyword_search, "{}-{}".format(self.language, self.country.upper()), self.country.upper(), self.country.upper(), self.language)
# Control characters
search_url = search_url.replace(" ", "+") # urllib.parse.quote(search_url) # Issue: https%3A//news.google.com/rss/search%3Fq%3Dbreitbart.com%26hl%3Den-US%26gl%3DUS%26ceid%3DUS%3Aen
# Initialize
encoded_urls = []
# Fetch feeds
feeds = feedparser.parse(search_url)
# Parse
for f in feeds.get("entries", []):
# Encoded URL
encoded_url = f.get("link", None)
'''
# Available publish date?
publish_date_parsed = f.get("published_parsed")
if (publish_date_parsed is None):
publish_date = f.get("published", None)
if (publish_date is not None):
publish_date_parsed = dateutil.parser.parse(publish_date)
# Published date
urls_publish_date.append(publish_date_parsed)
'''
# Append
encoded_urls.append(encoded_url)
# Decode
urls = decode_gnews_urls(encoded_urls)
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
###########################################################################
# List of instances
ListSearchInstances = [SearchGNews, SearchDuckDuckGoNews, SearchGoogleNews, SearchDuckDuckGoGeneral, SearchGoogleGeneral, SearchGoogleNewsRSS]
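Adding another search backend under this refactor means subclassing `FetcherAbstract`, implementing `_get_name()` and `_fetch_raw_urls()`, and registering the class in `ListSearchInstances` so `FetchSearcher.run()` picks it up with the others. A minimal, hypothetical sketch (the `SearchBing` name and its stubbed fetch are illustrative only, not part of this commit):
```
class SearchBing(FetcherAbstract):
    # Hypothetical backend, shown only to illustrate the extension point
    def __init__(self, args={"language": "en", "country": "US", "max_results": 50}):
        super().__init__()
        self.language = args.get("language")
        self.country = args.get("country")
        self.max_results = args.get("max_results")

    def _get_name(self):
        # [source] [language-country] [max_results]
        return "bing {}-{} results={}".format(self.language, self.country, self.max_results)

    def _fetch_raw_urls(self, keyword_search):
        try:
            # Call whichever search client is chosen and return a plain list of URLs
            urls = []
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []
        return urls

# Register the backend so FetchSearcher.run() instantiates it like the others
ListSearchInstances.append(SearchBing)
```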

View File

@@ -1,197 +0,0 @@
from django.core.cache import cache
import traceback
import random
import time
import feedparser
import urllib
import dateutil
from .logger import get_logger
logger = get_logger()
from googlenewsdecoder import gnewsdecoder
from gnews import GNews
from duckduckgo_search import DDGS
from GoogleNews import GoogleNews
###########################################################################
def decode_gnews_urls(encoded_urls, interval=2):
# DecodeURLs
list_decoded_urls = []
for url in encoded_urls:
# Already cached?
decoded_url = cache.get("gnews_decode_{}".format(url))
if (decoded_url is not None):
logger.debug("Already cached decoded URL: {} -> {}".format(url, decoded_url))
# Append decoded URL
list_decoded_urls.append(decoded_url)
else:
try:
# Decode URL, with interval time to avoid block
decoded_url_dict = gnewsdecoder(url, interval=interval)
# Ok?
if decoded_url_dict.get("status"):
# Append decoded URL
decoded_url = decoded_url_dict["decoded_url"]
list_decoded_urls.append(decoded_url)
# Cache decoded URL
cache.set("gnews_decode_{}".format(url), decoded_url, timeout=60*60*12)
else:
logger.warning("Error decoding news.google.com, URL {}\nMessage: {}".format(url, str(decoded_url)))
except Exception as e:
logger.warning("Error decoding news.google.com, URL: {}\n{}".format(url, traceback.format_exc()))
return list_decoded_urls
###########################################################################
def search_gnews(keyword_search, period="1d", language="en", country="US", max_results=100):
# [source] [category] [period] [language-country] [max_results]
source = "gnews {} {} {}-{} max_results={}".format("news", period, language, country, max_results).replace("None", "").strip()
logger.debug("Searching: {} --- Source:{}".format(keyword_search, source))
try:
# Get news
results_gnews = GNews(language=language, country=country).get_news(keyword_search)
# Get list of encoded urls
encoded_urls = [e.get("url") for e in results_gnews]
# Decode
logger.debug("Decoding gnews URLs")
urls = decode_gnews_urls(encoded_urls)
except Exception as e:
logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc()))
urls = []
return urls, source
###########################################################################
def search_ddg(keyword_search, category="news", timelimit="d", max_results=None, region="wt-wt"):
# [source] [category] [period] [language-country] [max_results]
source = "ddg {} {} {} max_results={}".format(category, timelimit, region, max_results).replace("max_results=None", "").strip()
logger.debug("Searching: {} --- Source:{}".format(keyword_search, source))
# region="{}-{}".format(langauge, country.lower())
# timelimit= # Options: d, w, m
# max_results # max number of results. If None, returns results only from the first response. Defaults to None
try:
if (category == "news"):
news = DDGS().news(keyword_search, region=region, timelimit=timelimit, max_results=max_results)
urls = [e.get("url") for e in news]
if (category == "text"):
news = DDGS().text(keyword_search, region=region, timelimit=timelimit, max_results=max_results)
urls = [e.get("href") for e in news]
except Exception as e:
logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc()))
urls = []
return urls, source
###########################################################################
def search_googlenews_news(keyword_search, period="1d", language="en", country="US"):
category = "news"
# [source] [category] [period] [language-country]
source = "googlenews {} {} {}-{}".format(category, period, language, country).replace("None", "").strip()
logger.debug("Searching: {} --- Source:{}".format(keyword_search, source))
# Initialize
googlenews = GoogleNews(period=period, lang=language, region=country)
googlenews.enableException(True)
try:
# Search
googlenews.get_news(keyword_search)
# Fetch
encoded_urls = googlenews.get_links()
# Decode
logger.debug("Decoding gnews URLs")
urls = decode_gnews_urls(encoded_urls)
except Exception as e:
logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc()))
urls = []
return urls, source
def search_googlenews_general(keyword_search, period="1d", language="en", country="US", max_pages=5):
category="general"
# [source] [category] [period] [language-country] [max_results]
source = "googlenews {} {} {}-{} max_pages={}".format(category, period, language, country, max_pages).replace("None", "").strip()
logger.debug("Searching: {} --- Source:{}".format(keyword_search, source))
# Initialize
googlenews = GoogleNews(period=period, lang=language, region=country)
googlenews.enableException(True)
try:
set_links = set()
# Search
googlenews.search(keyword_search)
# Iterate pages
for i in range(max_pages):
time.sleep(random.uniform(2, 4.5))
num_before = len(set_links)
# Get page
try:
links = googlenews.page_at(i+1)
except Exception as e:
logger.warning("Exception fetching page in GoogleNews {}: {}".format(source, str(e)))
break
# Links
for l in links:
# 'link': 'https://uk.news.yahoo.com/leaving-neverland-2-michael-jackson-lawyer-channel-4-102017088.html&ved=2ahUKEwjl38eJm5aMAxVvqJUCHXgnGzwQxfQBegQICRAC&usg=AOvVaw1osa6b3o_xXfcNinMDpLoK'
set_links.add( l.get("link").split("&ved=")[0] )
# Finished?
if (num_before == len(set_links)):
break
# To list
urls = list(set_links)
except Exception as e:
logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc()))
urls = []
return urls, source
###########################################################################
def search_googlenews_rss(keyword_search, language="en", country="US"):
# [source] [category] [period] [language-country] [max_results]
source = "googlenews-rss {}-{}".format(language, country).replace("None", "").strip()
logger.debug("Searching: {} --- Source:{}".format(keyword_search, source))
# https://news.google.com/rss/search?q={}&hl=en-US&gl=US&ceid=US:en
try:
# Search URL with parameters filled
search_url = "https://news.google.com/rss/search?q={}&hl={}&gl={}&ceid={}:{}".format(keyword_search, "{}-{}".format(language, country.upper()), country.upper(), country.upper(), language)
# Control characters
search_url = search_url.replace(" ", "+") # urllib.parse.quote(search_url) # Issue: https%3A//news.google.com/rss/search%3Fq%3Dbreitbart.com%26hl%3Den-US%26gl%3DUS%26ceid%3DUS%3Aen
# Initialize
encoded_urls = []
# Fetch feeds
feeds = feedparser.parse(search_url)
# Parse
for f in feeds.get("entries", []):
# Encoded URL
encoded_url = f.get("link", None)
'''
# Available publish date?
publish_date_parsed = f.get("published_parsed")
if (publish_date_parsed is None):
publish_date = f.get("published", None)
if (publish_date is not None):
publish_date_parsed = dateutil.parser.parse(publish_date)
# Published date
urls_publish_date.append(publish_date_parsed)'
'''
# Append
encoded_urls.append(encoded_url)
# Decode
urls = decode_gnews_urls(encoded_urls)
except Exception as e:
logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc()))
urls = []
return urls, source

View File

@@ -0,0 +1,35 @@
import traceback
import os
from django.core.cache import cache
from .logger import get_logger
logger = get_logger()
from googlenewsdecoder import gnewsdecoder
def decode_gnews_urls(encoded_urls, interval=int(os.getenv("FETCHER_GNEWS_DECODE_SLEEP", 2))):
logger.debug("Decoding gnews URLs")
# DecodeURLs
list_decoded_urls = []
for url in encoded_urls:
# Already cached?
decoded_url = cache.get("gnews_decode_{}".format(url))
if (decoded_url is not None):
logger.debug("Already cached decoded URL: {} -> {}".format(url, decoded_url))
# Append decoded URL
list_decoded_urls.append(decoded_url)
else:
try:
# Decode URL, with interval time to avoid block
decoded_url_dict = gnewsdecoder(url, interval=interval)
# Ok?
if decoded_url_dict.get("status"):
# Append decoded URL
decoded_url = decoded_url_dict["decoded_url"]
list_decoded_urls.append(decoded_url)
# Cache decoded URL
cache.set("gnews_decode_{}".format(url), decoded_url, timeout=60*60*12)
else:
logger.warning("Error decoding news.google.com, URL {}".format(url))
except Exception as e:
logger.warning("Error decoding news.google.com, URL: {}\n{}".format(url, traceback.format_exc()))
return list_decoded_urls
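A quick usage sketch of the decoder, run from inside the Django app since it relies on the configured cache (the import path and the encoded URL below are placeholders/assumptions): the first call goes through `gnewsdecoder` and the result is cached for 12 hours, so decoding the same URL again is served from the Django cache.
```
from api.src.fetch_utils import decode_gnews_urls  # import path assumed from the app layout

encoded_urls = [
    "https://news.google.com/rss/articles/CBMi...",  # placeholder encoded link
]
# Decoded URLs come back as a plain list, in input order for the URLs that decoded successfully
print(decode_gnews_urls(encoded_urls))
```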

View File

@@ -1,34 +1,34 @@
import logging
import os
''' TODO: PATH LOGS
PATH_LOGS_ERROR=logs/log_app_fetcher_error.log
PATH_LOGS_INFO=logs/log_app_fetcher_info.log
PATH_LOGS_DEBUG=logs/log_app_fetcher_debug.log
# PATH_LOGS=logs/log_app_fetcher.log
'''
os.makedirs("logs", exist_ok=True)
# Get env var
path_logs_parameterization = os.getenv("PATH_LOGS_PARAMETERIZATION", "logs/log_app_fetcher_{}.log")
# Directory of logs
directory = '/'.join(path_logs_parameterization.split("/")[:-1])
os.makedirs(directory, exist_ok=True)
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
logger.setLevel(logging.DEBUG)
# To file log: INFO / WARNING / ERROR / CRITICAL
fh = logging.handlers.RotatingFileHandler(filename="logs/log_app_fetcher_debug.log", mode="a", maxBytes=10000000, backupCount=4)
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("debug"), mode="a", maxBytes=10000000, backupCount=4)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)
# To file log: INFO / WARNING / ERROR
fh_ = logging.handlers.RotatingFileHandler(filename="logs/log_app_fetcher_info.log", mode="a", maxBytes=10000000, backupCount=2)
fh_.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh_.setLevel(logging.INFO)
logger.addHandler(fh_)
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("info"), mode="a", maxBytes=10000000, backupCount=2)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.INFO)
logger.addHandler(fh)
# To file log: WARNING / ERROR / CRITICAL
fh_ = logging.handlers.RotatingFileHandler(filename="logs/log_app_fetcher_error.log", mode="a", maxBytes=10000000, backupCount=1)
fh_.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh_.setLevel(logging.WARNING)
logger.addHandler(fh_)
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("warning"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.WARNING)
logger.addHandler(fh)
def get_logger():
return logger

View File

@@ -3,6 +3,7 @@ from .logger import get_logger
logger = get_logger()
import newspaper
import time
import os
from urllib.parse import unquote
import langdetect
langdetect.DetectorFactory.seed = 0
@@ -40,11 +41,11 @@ def url_host_slowdown(url, url_host_slowdown_seconds):
def process_url(url):
try:
# Slow down if required to avoid too many requests error
url_host_slowdown(url, url_host_slowdown_seconds=5)
url_host_slowdown(url, url_host_slowdown_seconds=int(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
# Process
article = newspaper.article(url)
except newspaper.ArticleBinaryDataException:
logger.warning("ArticleException for input URL {}\n{}".format(url, str(e)))
logger.warning("ArticleException for input URL {}".format(url))
return {"override_status": "invalid"}
except newspaper.ArticleException as e:

View File

@@ -1,4 +1,3 @@
# from django_rq import job
from scheduler import job
from .src.fetch_feed import FetchFeeds

View File

@@ -1,607 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>News</title>
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
<script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
<script>
function getQueryString(pageNumber, itemsNumber, sources, searches, statuses){
// Query parameters. If input is null, get most recent value
let queryParams = new URLSearchParams(window.location.search);
// page
if (pageNumber == null) pageNumber = queryParams.get("page") ?? 1;
queryParams.set("page", pageNumber);
// items
if (itemsNumber == null) itemsNumber = queryParams.get("items") ?? 15;
queryParams.set("items", itemsNumber);
// sources
if (sources == null) sources = queryParams.get("sources") ?? "all";
queryParams.set("sources", sources);
// searches
if (searches == null) searches = queryParams.get("searches") ?? "all";
queryParams.set("searches", searches);
// status
if (statuses == null) statuses = queryParams.get("status") ?? "all";
queryParams.set("status", statuses);
// Encoding fix: %2C -> ,
let queryParamsString = queryParams.toString();
while (queryParamsString.includes("%2C")) {
queryParamsString = queryParamsString.replace("%2C", ",");
}
return queryParamsString;
}
function loadPage(pageNumber, itemsNumber, sources, searches, statuses) {
$("#item-list").fadeTo(100, 0.5); // Smooth fade effect
$("#loading").show();
queryParamsString = getQueryString(pageNumber, itemsNumber, sources, searches, statuses);
$.ajax({
url: "?" + queryParamsString,
type: "GET",
headers: { "X-Requested-With": "XMLHttpRequest" },
success: function (data) {
$("#item-list").fadeTo(0, 1).html(data.items_html); // Restore opacity smoothly
$("#loading").hide();
// Update URL without reloading
window.history.pushState({}, "", "?" + queryParamsString);
}
});
}
////////////////////////////////////////////////////////////////////////////
// Pagination
////////////////////////////////////////////////////////////////////////////
$(document).on("click", ".pagination a", function (event) {
event.preventDefault();
let page = $(this).attr("data-page");
loadPage(pageNumber=page, itemsNumber=null, sources=null, searches=null, statuses=null);
});
$(document).ready(function () {
////////////////////////////////////////////////////////////////////////////
// Filter updates
////////////////////////////////////////////////////////////////////////////
const sourcesToggleAll = $("#toggle-all-sources");
const sourcesCheckboxes = $(".source-checkbox");
const searchesToggleAll = $("#toggle-all-searches");
const searchesCheckboxes = $(".search-checkbox");
const statusesToggleAll = $("#toggle-all-status");
const statusCheckboxes = $(".status-checkbox");
function updateFilters() {
// Get selected sources
if (sourcesToggleAll.prop("checked")) {
selectedSources = "all";
}
else {
if (sourcesCheckboxes.filter(":checked").length > 0 ){
selectedSources = sourcesCheckboxes.filter(":checked").map(function () {
return $(this).val();
}).get().join(",");
}
else {
selectedSources = "none";
}
}
// Get selected searches
if (searchesToggleAll.prop("checked")) {
selectedSearches = "all";
}
else {
if (searchesCheckboxes.filter(":checked").length > 0 ){
selectedSearches = searchesCheckboxes.filter(":checked").map(function () {
return $(this).val();
}).get().join(",");
}
else {
selectedSearches = "none";
}
}
// Get selected URL statuses
if (statusesToggleAll.prop("checked")) {
selectedStatuses = "all";
}
else {
if (statusCheckboxes.filter(":checked").length > 0 ){
selectedStatuses = statusCheckboxes.filter(":checked").map(function () {
return $(this).val();
}).get().join(",");
}
else {
selectedStatuses = "none";
}
}
// Get selected items per page
let selectedItems = $("input[name='items']:checked").val();
// Update pagination and reload data
loadPage(1, selectedItems, selectedSources, selectedSearches, selectedStatuses);
}
////////////////////////////////////////////////////////////////////////////
// Change triggers
////////////////////////////////////////////////////////////////////////////
// Sources
sourcesToggleAll.on("change", function () {
sourcesCheckboxes.prop("checked", sourcesToggleAll.prop("checked"));
updateFilters();
});
sourcesCheckboxes.on("change", function () {
sourcesToggleAll.prop("checked", sourcesCheckboxes.length === sourcesCheckboxes.filter(":checked").length);
updateFilters();
});
// Searches
searchesToggleAll.on("change", function () {
searchesCheckboxes.prop("checked", searchesToggleAll.prop("checked"));
updateFilters();
});
searchesCheckboxes.on("change", function () {
searchesToggleAll.prop("checked", searchesCheckboxes.length === searchesCheckboxes.filter(":checked").length);
updateFilters();
});
// Status
statusesToggleAll.on("change", function () {
statusCheckboxes.prop("checked", statusesToggleAll.prop("checked"));
updateFilters();
});
statusCheckboxes.on("change", function () {
// If all checkboxes are checked, mark "Toggle All" as checked
statusesToggleAll.prop("checked", statusCheckboxes.length === statusCheckboxes.filter(":checked").length);
updateFilters();
});
// Items change trigger update
$(".items").on("change", updateFilters);
////////////////////////////////////////////////////////////////////////////
// Default values
////////////////////////////////////////////////////////////////////////////
// Sources
sourcesCheckboxes.each(function () { $(this).prop("checked", true); });
sourcesToggleAll.prop("checked", true);
// Searches
searchesCheckboxes.each(function () { $(this).prop("checked", true); });
searchesToggleAll.prop("checked", true);
// Statuses
statusCheckboxes.each(function () { $(this).prop("checked", true); });
statusesToggleAll.prop("checked", true);
// Items
// $("input[name='items'][value='" + 15 + "']").prop("checked", true);
// loadPage(pageNumber=page, itemsNumber=null, sources=null, searches=null, statuses=null);
});
////////////////////////////////////////////////////////////////////////////
// Theme logic
////////////////////////////////////////////////////////////////////////////
function setTheme(mode) {
document.documentElement.setAttribute("data-theme", mode);
document.documentElement.setAttribute("data-bs-theme", mode);
localStorage.setItem("theme", mode);
document.getElementById("theme-icon").innerHTML = mode === "dark" ? "🌞" : "🌙";
document.body.classList.toggle("dark-mode", mode === "dark");
}
function toggleTheme() {
let currentTheme = document.documentElement.getAttribute("data-theme");
setTheme(currentTheme === "dark" ? "light" : "dark");
}
document.addEventListener("DOMContentLoaded", function () {
let savedTheme = localStorage.getItem("theme") ||
(window.matchMedia("(prefers-color-scheme: dark)").matches ? "dark" : "light");
setTheme(savedTheme);
// Local browser timestamp aware for ts_fetch print
document.querySelectorAll(".timestamp").forEach(function (el) {
const ts = el.getAttribute("data-ts");
if (ts) {
const options = {
day: "2-digit",
month: "2-digit",
year: "numeric",
hour: "2-digit",
minute: "2-digit",
second: "2-digit",
hour12: false // Use 24-hour format
}; // "en-GB" for DD-MM-YYYY
const localDate = new Date(ts).toLocaleString("en-GB", options); // Adjust to browser's timezone
el.innerHTML = `${localDate}`;
}
});
});
////////////////////////////////////////////////////////////////////////////
</script>
<style>
/* Content Area */
#content {
margin-left: 170px; /* Match sidebar width */
min-width: calc(100vw - 170px); /* Ensure it doesn't shrink into the sidebar */
width: calc(100vw - 170px); /* Expands based on screen size */
padding: 20px;
overflow-x: auto; /* Prevent content from being squeezed */
transition: margin-left 0.3s ease;
}
/* Sidebar Styles */
#sidebar {
height: 100vh;
position: fixed;
top: 0;
left: 0;
width: 170px; /* Default width */
background-color: var(--bg-color);
box-shadow: 2px 0 5px rgba(0, 0, 0, 0.1);
padding: 15px;
transition: width 0.3s ease;
/* Enable scrolling */
overflow-y: auto;
max-height: 100vh;
}
#sidebar .nav-link {
color: var(--text-color);
}
#sidebar .nav-link:hover {
background-color: var(--pagination-hover-bg);
}
/* ============================= */
/* Responsive Enhancements */
/* ============================= */
@media (min-width: 1200px) {
.table {
width: 95%; /* Allows table to take more space */
margin: 0 auto; /* Centers the table */
}
}
@media (max-width: 768px) {
#sidebar {
width: 70px; /* Collapse sidebar to smaller width */
/*padding: 10px;*/
}
#content {
margin-left: 70px; /* Adjust margin to match collapsed sidebar */
min-width: calc(100vw - 70px); /* Prevent overlap */
/*padding: 10px;*/
}
/* Adjust table for small screens */
.table-responsive {
overflow-x: auto;
}
.table th,
.table td {
white-space: nowrap; /* Prevent text wrapping in cells */
}
.table a {
word-break: break-word; /* Ensure long URLs break properly */
}
}
/* ============================= */
/* Global Styles */
/* ============================= */
body {
background-color: var(--bg-color);
color: var(--text-color);
transition: background-color 0.3s, color 0.3s;
}
/* ============================= */
/* Light & Dark Mode Variables */
/* ============================= */
:root {
--bg-color: #ffffff;
--text-color: #212529;
--table-bg: #ffffff;
--table-text: #000000;
--table-border: #dee2e6;
--link-color: #007bff;
--pagination-bg: #ffffff;
--pagination-border: #dee2e6;
--pagination-hover-bg: #f8f9fa;
--pagination-active-bg: #007bff;
--pagination-active-text: #ffffff;
--button-bg: #f8f9fa;
--button-border: #ced4da;
--button-text: #212529;
}
[data-theme="dark"] {
--bg-color: #121212;
--text-color: #e0e0e0;
--table-bg: #1e1e1e;
--table-text: #ffffff;
--table-border: #2c2c2c;
--link-color: #9ec5fe;
--pagination-bg: #1e1e1e;
--pagination-border: #444;
--pagination-hover-bg: #333;
--pagination-active-bg: #007bff;
--pagination-active-text: #ffffff;
--button-bg: #1e1e1e;
--button-border: #444;
--button-text: #e0e0e0;
}
/* ============================= */
/* Table Styling */
/* ============================= */
.table-responsive {
width: 100%; /* Ensure it spans the full width of its container */
max-width: 100%;
overflow-x: auto;
}
.table {
background-color: var(--table-bg);
color: var(--table-text);
border: 1px solid var(--table-border);
transition: background-color 0.3s, color 0.3s;
width: 100%; /* Ensures it takes full width of its container */
table-layout: auto; /* Allows columns to adjust dynamically */
/*white-space: nowrap;*/ /* Prevents text wrapping in cells */
}
.table th,
.table td {
border-color: var(--table-border);
}
.table thead {
background-color: var(--pagination-active-bg);
color: var(--pagination-active-text);
}
[data-theme="dark"] .table {
background-color: var(--table-bg);
color: var(--table-text);
}
[data-theme="dark"] .table th,
[data-theme="dark"] .table td {
border-color: var(--table-border);
}
[data-theme="dark"] .table thead {
background-color: #333;
color: #fff;
}
th:nth-child(1), td:nth-child(1) { width: 50%; } /* URL column */
th:nth-child(2), td:nth-child(2) { width: 27.5%; } /* Fetch Date */
th:nth-child(3), td:nth-child(3) { width: 10%; } /* Sources */
th:nth-child(4), td:nth-child(4) { width: 10%; } /* Searches */
th:nth-child(5), td:nth-child(5) { width: 2.5%; } /* Status */
/* ============================= */
/* Pagination Styling */
/* ============================= */
.pagination {
display: flex;
justify-content: center;
padding: 10px 0;
}
.pagination .page-link {
background-color: var(--pagination-bg);
border-color: var(--pagination-border);
color: var(--text-color);
padding: 10px 14px;
margin: 0 5px;
border-radius: 8px;
transition: background-color 0.3s, color 0.3s, transform 0.2s;
}
.pagination .page-link:hover {
background-color: var(--pagination-hover-bg);
transform: scale(1.05);
}
.pagination .active .page-link {
background-color: var(--pagination-active-bg);
color: var(--pagination-active-text);
border-color: var(--pagination-active-bg);
}
/* ============================= */
/* Theme Toggle Button */
/* ============================= */
.theme-toggle-btn {
background-color: var(--button-bg);
border: 1px solid var(--button-border);
color: var(--button-text);
border-radius: 50%;
width: 40px;
height: 40px;
font-size: 20px;
display: flex;
align-items: center;
justify-content: center;
transition: background-color 0.3s, color 0.3s, transform 0.2s;
cursor: pointer;
}
.theme-toggle-btn:hover {
background-color: var(--pagination-hover-bg);
transform: rotate(20deg);
}
.theme-toggle-btn:active {
transform: scale(0.95);
}
/* ============================= */
/* Loading Spinner Styling */
/* ============================= */
#loading {
position: fixed;
left: 50%;
top: 50%;
transform: translate(-50%, -50%);
z-index: 1050;
display: none;
}
.spinner-border {
width: 4rem;
height: 4rem;
}
</style>
</head>
<body>
<!-- Left Sidebar -->
<div id="sidebar" class="d-flex flex-column">
<ul class="nav flex-column">
<!-- Theme Toggle Button -->
<div class="nav-item">
<button onclick="toggleTheme()" class="theme-toggle-btn">
<span id="theme-icon">🌙</span>
</button>
</div>
<!-- URLs per page -->
<div class="nav-item mt-3">
<strong>URLs per page</strong>
<div class="card-body">
<!-- Individual Status Checkboxes -->
{% for url_per_page in list_urls_per_page %}
<div class="items-form-check">
<input class="form-check-input items" type="radio" name="items" id="value-{{ url_per_page }}" value="{{ url_per_page }}">
<label class="form-check-label" for="value-{{ url_per_page }}">{{ url_per_page }}</label>
</div>
{% empty %}
<div class="text-muted">No options available.</div>
{% endfor %}
</div>
</div>
<!-- Status -->
<div class="nav-item mt-3">
<strong>Select status</strong>
<form id="status-filter-form">
<!-- Toggle All Checkbox -->
<div class="status-form-check">
<input class="form-check-input" type="checkbox" id="toggle-all-status">
<label class="form-check-label fw-bold" for="toggle-all-status">
Toggle all
</label>
</div>
<!-- Individual Status Checkboxes -->
{% for status in list_status %}
<div class="status-form-check">
<input class="form-check-input status-checkbox" type="checkbox" value="{{ status }}" id="status-{{ status }}">
<label class="form-check-label" for="status-{{ status }}">
{{ status }}
</label>
</div>
{% empty %}
<div class="text-muted">No statuses available.</div>
{% endfor %}
</form>
</div>
<!-- Sources -->
<div class="nav-item mt-3">
<strong>Select sources</strong>
<form id="source-filter-form">
<!-- Toggle All Checkbox -->
<div class="form-check">
<input class="form-check-input" type="checkbox" id="toggle-all-sources">
<label class="form-check-label fw-bold" for="toggle-all-sources">
Toggle all
</label>
</div>
<!-- Individual Source Checkboxes -->
{% for source in sources %}
<div class="form-check">
<input class="form-check-input source-checkbox" type="checkbox" value="{{ source.id }}" id="source-{{ source.id }}">
<label class="form-check-label" for="source-{{ source.id }}">
{{ source.source }}
</label>
</div>
{% empty %}
<div class="text-muted">No sources available.</div>
{% endfor %}
</form>
</div>
<!-- Searches -->
<div class="nav-item mt-3">
<strong>Select searches</strong>
<form id="search-filter-form">
<!-- Toggle All Checkbox -->
<div class="form-check">
<input class="form-check-input" type="checkbox" id="toggle-all-searches">
<label class="form-check-label fw-bold" for="toggle-all-searches">
Toggle all
</label>
</div>
<!-- Individual Search Checkboxes -->
{% for search in searches %}
<div class="form-check">
<input class="form-check-input search-checkbox" type="checkbox" value="{{ search.id }}" id="search-{{ search.id }}">
<label class="form-check-label" for="search-{{ search.id }}">
[{{ search.type }}] {{ search.search }}
</label>
</div>
{% empty %}
<div class="text-muted">No searches available.</div>
{% endfor %}
</form>
</div>
</ul>
</div>
<!-- Main Content Area -->
<div id="content" class="main-content">
<div class="container mt-4">
<!-- Table -->
<div id="item-list">
{% include 'urls_partial.html' %}
</div>
<!-- Loading... -->
<div id="loading" class="text-center mt-3" style="display:none;">
<div class="spinner-border text-primary" role="status">
<span class="visually-hidden">Loading...</span>
</div>
</div>
</div>
</div>
</body>
</html>

View File

@@ -1,97 +0,0 @@
{% load custom_filters %}
<div class="table-responsive">
<table class="table table-hover">
<thead>
<tr>
<th scope="col"><strong>URL</strong></th>
<th scope="col"><strong>Fetch date</strong></th>
<th scope="col"><strong>Sources</strong></th>
<th scope="col"><strong>Search</strong></th>
<th scope="col"><strong>Status</strong></th>
</tr>
</thead>
<tbody>
{% for item in page_obj %}
<tr>
<td>
<a href="./{{ item.id }}" class="btn btn-primary btn-sm" target="_blank"></a>
<a href="{{ item.url }}/" target="_blank">{{ item.url }}</a>
</td>
<td class="timestamp" data-ts="{{ item.ts_fetch|date:'c' }}">{{ item.ts_fetch }}</td>
<td>
{% with sources_map|dict_get:item.id as sources %}
{% if sources %}
{% for source in sources %}
<span class="badge bg-secondary">{{ source }}</span>
{% endfor %}
{% else %}
<span class="text-muted">No sources</span>
{% endif %}
{% endwith %}
</td>
<td>
{% with searches_map|dict_get:item.id as searches %}
{% if searches %}
{% for search in searches %}
<span class="badge bg-secondary">{{ search }}</span>
{% endfor %}
{% else %}
<span class="text-muted">No searches</span>
{% endif %}
{% endwith %}
</td>
<td>
{% if item.status == 'raw' %}
<span class="badge bg-secondary">{{ item.status|capfirst }}</span>
{% elif item.status == 'error' %}
<span class="badge bg-danger">{{ item.status|capfirst }}</span>
{% elif item.status == 'valid' %}
<span class="badge bg-success">{{ item.status|capfirst }}</span>
{% elif item.status == 'unknown' %}
<span class="badge bg-warning">{{ item.status|capfirst }}</span>
{% elif item.status == 'invalid' %}
<span class="badge bg-danger">{{ item.status|capfirst }}</span>
{% elif item.status == 'duplicate' %}
<span class="badge bg-info">{{ item.status|capfirst }}</span>
{% else %}
<span class="badge bg-light">Unknown</span>
{% endif %}
</td>
</tr>
{% empty %}
<tr>
<td colspan="4" class="text-center">No items available.</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
<div class="d-flex justify-content-center mt-3">
<nav>
<ul class="pagination">
{% if page_obj.has_previous %}
<li class="page-item">
<a class="page-link" href="#" data-page="1">First</a>
</li>
<li class="page-item">
<a class="page-link" href="#" data-page="{{ page_obj.previous_page_number }}">Previous</a>
</li>
{% endif %}
<li class="page-item active">
<span class="page-link">Page {{ page_obj.number }} of {{ page_obj.paginator.num_pages }}</span>
</li>
{% if page_obj.has_next %}
<li class="page-item">
<a class="page-link" href="#" data-page="{{ page_obj.next_page_number }}">Next</a>
</li>
<li class="page-item">
<a class="page-link" href="#" data-page="{{ page_obj.paginator.num_pages }}">Last</a>
</li>
{% endif %}
</ul>
</nav>
</div>

View File

@@ -258,10 +258,7 @@ input[type="checkbox"] {
<span id="offText" class="off-text">OFF</span>
</span>
</div>
-->
-->
<!-- Pages Per Page Dropdown -->
<h3>Pages Per Page</h3>
@@ -291,28 +288,14 @@ input[type="checkbox"] {
<!-- Filter by Status -->
<h3>Status</h3>
<!--
<label for="toggle-all-checkbox">
<input type="checkbox" id="toggle-all-checkbox" class="toggle-all-checkbox"> Toggle All
</label><br>
{% for status in statuses %}
<label>
<input type="checkbox" name="status" value="{{ status.0 }}"
{% if status.0 in selected_status %}checked{% endif %}
class="status-checkbox">
{{ status.1 }}
</label><br>
{% endfor %}
-->
<button type="button" class="toggle-all-btn" data-toggle="status">Toggle All</button><br>
{% for status in statuses %}
<label>
<input type="checkbox" name="status" value="{{ status.0 }}"
{% if status.0 in selected_status %}checked{% endif %}>
{% if status.0 in selected_status or 'all' in selected_status %}checked{% endif %}>
{{ status.1 }}
</label><br>
{% endfor %}
<!-- Filter by Search -->
<h3>Search</h3>
@@ -320,11 +303,10 @@ input[type="checkbox"] {
{% for search in searches %}
<label>
<input type="checkbox" name="search" value="{{ search.id }}"
{% if search.id|stringformat:"s" in selected_search %}checked{% endif %}>
{% if search.id|stringformat:"s" in selected_search or 'all' in selected_search %}checked{% endif %}>
[{{ search.type }}] {{ search.search|truncatechars:50 }}
</label><br>
{% endfor %}
<!-- Filter by Source -->
<h3>Source</h3>
@@ -332,7 +314,7 @@ input[type="checkbox"] {
{% for source in sources %}
<label>
<input type="checkbox" name="source" value="{{ source.id }}"
{% if source.id|stringformat:"s" in selected_source %}checked{% endif %}>
{% if source.id|stringformat:"s" in selected_source or 'all' in selected_source %}checked{% endif %}>
{{ source.source|truncatechars:50 }}
</label><br>
{% endfor %}
@@ -343,7 +325,7 @@ input[type="checkbox"] {
{% for lang in languages %}
<label>
<input type="checkbox" name="language" value="{{ lang }}"
{% if lang|stringformat:"s" in selected_lang %}checked{% endif %}>
{% if lang|stringformat:"s" in selected_language or 'all' in selected_language %}checked{% endif %}>
{{ lang|truncatechars:50 }}
</label><br>
{% endfor %}
@@ -456,6 +438,7 @@ input[type="checkbox"] {
</div>
<script>
//////////////////////////////////////////////////////////////////////
document.addEventListener("DOMContentLoaded", function () {
//////////////////////////////////////////////
@@ -514,16 +497,40 @@ input[type="checkbox"] {
});
});
//////////////////////////////////////////////////////////////////////
// Function to update the form parameter before submitting
function updateFormParameter(section) {
const checkboxes = document.querySelectorAll(`[name='${section}']`);
const allChecked = Array.from(checkboxes).every(checkbox => checkbox.checked);
// If all are checked, replace them with a hidden input with value "all"
if (allChecked) {
checkboxes.forEach(checkbox => checkbox.removeAttribute("name"));
let hiddenInput = document.createElement("input");
hiddenInput.type = "hidden";
hiddenInput.name = section;
hiddenInput.value = "all";
document.getElementById("filterForm").appendChild(hiddenInput);
} else {
checkboxes.forEach(checkbox => checkbox.setAttribute("name", section));
document.querySelectorAll(`input[name="${section}"][type="hidden"]`).forEach(hiddenInput => hiddenInput.remove());
}
// Submit form after changes
document.getElementById("filterForm").submit();
}
//////////////////////////////////////////////////////////////////////
// Function to toggle all checkboxes in a section
function toggleCheckboxes(section) {
const checkboxes = document.querySelectorAll(`[name='${section}']`);
const allChecked = Array.from(checkboxes).every(checkbox => checkbox.checked);
checkboxes.forEach(checkbox => {
checkbox.checked = !allChecked;
});
checkboxes.forEach(cb => cb.checked = !allChecked);
/*
// Automatically submit the form when a checkbox is toggled
document.getElementById('filterForm').submit();
*/
updateFormParameter(section);
}
// Attach event listeners to "Toggle All" buttons
@@ -533,13 +540,15 @@ input[type="checkbox"] {
toggleCheckboxes(section);
});
});
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
// Automatically submit the form when any checkbox changes
document.querySelectorAll('input[type="checkbox"]').forEach(function(checkbox) {
checkbox.addEventListener('change', function() {
/*
document.getElementById('filterForm').submit();
*/
updateFormParameter(this.name);
});
});
document.getElementById('perPageSelect').addEventListener('change', function() {
@@ -548,20 +557,6 @@ input[type="checkbox"] {
document.getElementById('timeFilterSelect').addEventListener('change', function() {
document.getElementById('filterForm').submit();
});
/*
document.getElementById('tableRadio').addEventListener('change', function() {
document.getElementById('tableViewContent').classList.remove('d-none');
document.getElementById('chartViewContent').classList.add('d-none');
document.getElementById('filterForm').submit();
});
document.getElementById('chartRadio').addEventListener('change', function() {
document.getElementById('chartViewContent').classList.remove('d-none');
document.getElementById('tableViewContent').classList.add('d-none');
document.getElementById('filterForm').submit();
});
*/
</script>

View File

@@ -4,9 +4,9 @@ from . import views
urlpatterns = [
path('', views.link_list, name='link_list'),
#
path('logs_debug', views.logs_debug, name='logs_debug'),
path('logs_info', views.logs_info, name='logs_info'),
path('logs_error', views.logs_error, name='logs_error'),
path('logs/<str:log_type>', views.logs, name='logs'),
#
path('task/<str:task>', views.trigger_task, name='trigger_task'),
#
path('charts/', views.charts, name='charts'),
path('urls-by-fetch-date/', views.urls_by_fetch_date, name='urls_by_fetch_date'),
@@ -17,10 +17,4 @@ urlpatterns = [
path('urls/', views.filtered_urls, name='filtered_urls'),
path('urls/<int:id>/', views.url_detail_view, name='url_detail'),
path('urls/<int:id>/fetch/', views.fetch_details, name='fetch_details'),
#
#path('url/', views.urls, name='url_detail'),
#path('url/<int:id>/', views.url_detail_view, name='url_detail'),
#path('url/<int:id>/fetch/', views.fetch_details, name='fetch_details'),
#
path('task/<str:task>', views.trigger_task, name='trigger_task'),
]
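With the three per-level log views collapsed into the single `logs/<str:log_type>` route above, the log level now travels as a URL segment. A minimal sketch of exercising the endpoint with `requests`, assuming the dev server is running locally on port 8000 (host and port are assumptions):
```
import requests

# Hypothetical local dev server; the route returns the raw log file as text/plain.
for log_type in ["debug", "info", "error"]:
    r = requests.get(f"http://localhost:8000/logs/{log_type}")
    print(log_type, r.status_code)
    print(r.text[-500:])  # tail of the log
```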

View File

@@ -23,9 +23,9 @@ def link_list(request):
# Admin panel
"http://localhost:8000/admin",
# Logs
"http://localhost:8000/logs_debug",
"http://localhost:8000/logs_info",
"http://localhost:8000/logs_error",
"http://localhost:8000/logs/debug",
"http://localhost:8000/logs/info",
"http://localhost:8000/logs/error",
# URLs
"http://localhost:8000/urls",
# Charts
@@ -36,17 +36,13 @@ def link_list(request):
return JsonResponse({"links": list_links })
####################################################################################################
def logs_error(request):
with open(os.getenv("PATH_LOGS_ERROR", "logs/log_app_fetcher_error.log"), "r") as f:
file_content = f.read()
return HttpResponse(file_content, content_type="text/plain")
def logs_info(request):
with open(os.getenv("PATH_LOGS_INFO", "logs/log_app_fetcher_info.log"), "r") as f:
file_content = f.read()
return HttpResponse(file_content, content_type="text/plain")
def logs_debug(request):
with open(os.getenv("PATH_LOGS_DEBUG", "logs/log_app_fetcher_debug.log"), "r") as f:
file_content = f.read()
def logs(request, log_type):
# Capture output: python manage.py rqstats
try:
with open(os.getenv("PATH_LOGS_{}".format(log_type.upper()), "logs/log_app_fetcher_{}.log".format(log_type)), "r") as f:
file_content = f.read()
except Exception as e:
file_content = "Error reading logs for log type :{}".format(log_type)
return HttpResponse(file_content, content_type="text/plain")
####################################################################################################
@@ -208,48 +204,77 @@ from .models import Urls, Search, Source
from django.db.models import Q
from django.utils.timezone import now, timedelta
def filtered_urls(request):
statuses = Urls.STATUS_ENUM.choices
searches = Search.objects.all()
sources = Source.objects.all()
# TODO: Cache languages, update once every N
languages = UrlContent.objects.distinct('language').values_list('language', flat=True)
# languages = [l for l in languages if l is not None]
languages = list(UrlContent.objects.distinct('language').values_list('language', flat=True))
# Null for visualization
languages = ["Null"] + [l for l in languages if l is not None]
# Get selected parameters
selected_status = request.GET.getlist('status')
selected_search = request.GET.getlist('search')
selected_source = request.GET.getlist('source')
selected_language = request.GET.getlist('language')
selected_status = request.GET.getlist('status', ["null"])
selected_search = request.GET.getlist('search', ["null"])
selected_source = request.GET.getlist('source', ["null"])
selected_language = request.GET.getlist('language', ["null"])
selected_days = request.GET.get("days", 30)
per_page = request.GET.get('per_page', 100) # Default: 100 URLs per page
page_number = request.GET.get('page') # Get the current page number
all_status = [str(status[0]) for status in statuses]
all_search = [str(search.id) for search in searches]
all_source = [str(source.id) for source in sources]
all_languages = languages
# Fall back to the default filters when no filter params are in the URL (e.g. a "Home" click or a plain "Next page" navigation)
if (len(request.GET.keys()) == 0) or ((len(request.GET.keys()) == 1) and ("page" in request.GET.keys())):
selected_status = [str(status[0]) for status in statuses]
selected_search = [str(search.id) for search in searches]
selected_source = [str(source.id) for source in sources]
selected_language = languages
selected_status = ["all"]
selected_search = ["all"]
selected_source = ["all"]
selected_language = ["all"]
# print(set(selected_status), set(all_status))
"""
# TODO: remove this commented-out block (the "all" sentinel is now set client-side)
if (set(selected_status) == set(all_status)):
selected_status = ["all"]
if (set(selected_search) == set(all_search)):
selected_search = ["all"]
if (set(selected_source) == set(all_source)):
selected_source = ["all"]
if (set(selected_language) == set(languages)):
selected_language = ["all"]
"""
# Filter URLs based on selected filters
if ('' in selected_status) or ('' in selected_search) or ('' in selected_source):
if ('null' in selected_status) or ('null' in selected_search) or ('null' in selected_source) or ('null' in selected_language):
urls = []
else:
query = Q(urlssourcesearch__id_source__in=selected_source) & \
Q(urlssourcesearch__id_search__in=selected_search) & \
Q(status__in=selected_status) & \
Q(ts_fetch__gte=now() - timedelta(days=float(selected_days)))
if selected_language:
query &= Q(urlcontent__language__in=selected_language)
# Filter by date
query = Q(ts_fetch__gte=now() - timedelta(days=float(selected_days)))
# Additional filters
if ("all" not in selected_status):
query &= Q(status__in=selected_status)
if ("all" not in selected_source):
query &= Q(urlssourcesearch__id_source__in=selected_source)
if ("all" not in selected_search):
query &= Q(urlssourcesearch__id_search__in=selected_search)
if ("all" not in selected_language):
# URLs with selected languages
subquery = Q(urlcontent__language__in=selected_language)
if ("Null" in selected_language):
# URLs with NULL language
subquery |= Q(urlcontent__language__isnull=True)
# URLs with no UrlContent record at all (similar to URLs with NULL language)
subquery |= Q(urlcontent__id_url__isnull=True)
# Update query
query &= (subquery)
urls = Urls.objects.filter(query).distinct() # .order_by('-ts_fetch')
# Custom replace search type
for s in searches:
s.type = s.type.replace("rss_feed", "rss").replace("url_host", "url").replace("keyword_search", "keyword")
# Pagination
paginator = Paginator(urls, per_page) # Paginate the filtered URLs
page_obj = paginator.get_page(page_number) # Get the current page object
@@ -264,6 +289,9 @@ def filtered_urls(request):
url_content_map = {
url.id: UrlContent.objects.filter(pk=url).first() for url in page_obj.object_list
}
# Custom replace search type text
for s in searches:
s.type = s.type.replace("rss_feed", "rss").replace("url_host", "url").replace("keyword_search", "keyword")
context = {
'urls': page_obj, # Pass the paginated URLs
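The filter refactor above switches from enumerating every checked value to an `all` sentinel: when a whole section is selected the form submits a single `status=all` (likewise `search`, `source`, `language`), and `filtered_urls` only adds a `Q(...)` clause for a section whose list does not contain `all`. A minimal sketch of the two query-string shapes the view now distinguishes (the concrete values are made up):
```
from urllib.parse import parse_qs

# Every box checked -> the client collapses the section to the "all" sentinel.
q_all = parse_qs("status=all&search=all&source=all&language=all&days=30")

# Partial selection -> one parameter per checked box.
q_some = parse_qs("status=valid&status=error&source=3&language=Null&days=7")

print(q_all["status"])     # ['all']            -> no status clause is added
print(q_some["status"])    # ['valid', 'error'] -> Q(status__in=['valid', 'error'])
print(q_some["language"])  # ['Null']           -> also matches URLs with a NULL or missing UrlContent language
```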

View File

@@ -18,7 +18,6 @@ BASE_DIR = Path(__file__).resolve().parent.parent
# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/5.1/howto/deployment/checklist/
# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = 'django-insecure-54mqLbW5NlO8OlVDsT3fcbg3Vf6C8Fgcoj8H0hXv3Pr8bpgqvOuiaeqvGn34sGwt'
@@ -74,7 +73,6 @@ WSGI_APPLICATION = 'core.wsgi.application'
# Database
# https://docs.djangoproject.com/en/5.1/ref/settings/#databases
DATABASES = {
'default': {
@@ -110,30 +108,26 @@ SCHEDULER_QUEUES = {
'HOST': os.environ.get("REDIS_HOST", "localhost"),
'PORT': os.environ.get("REDIS_PORT", 6379),
'DB': os.environ.get("REDIS_DB", 0),
'DEFAULT_TIMEOUT': os.environ.get("RQ_DEFAULT_TIMEOUT", 60*15),
},
'high': {
'HOST': os.environ.get("REDIS_HOST", "localhost"),
'PORT': os.environ.get("REDIS_PORT", 6379),
'DB': os.environ.get("REDIS_DB", 0),
'DEFAULT_TIMEOUT': os.environ.get("RQ_DEFAULT_TIMEOUT", 60*15),
},
'low': {
'HOST': os.environ.get("REDIS_HOST", "localhost"),
'PORT': os.environ.get("REDIS_PORT", 6379),
'DB': os.environ.get("REDIS_DB", 0),
'DEFAULT_TIMEOUT': os.environ.get("RQ_DEFAULT_TIMEOUT", 60*15),
}
}
SCHEDULER_CONFIG = {
'EXECUTIONS_IN_PAGE': 20,
'DEFAULT_TIMEOUT': os.environ.get("JOB_DEFAULT_TIMEOUT", 60*30), # 15 minutes
'DEFAULT_RESULT_TTL': 60*60*12, # 12 hours
'DEFAULT_TIMEOUT': os.environ.get("RQ_DEFAULT_TIMEOUT", 60*15), # 15 minutes
'EXECUTIONS_IN_PAGE': 20,
'SCHEDULER_INTERVAL': 10, # 10 seconds
}
# Password validation
# https://docs.djangoproject.com/en/5.1/ref/settings/#auth-password-validators
AUTH_PASSWORD_VALIDATORS = [
{
@@ -152,7 +146,6 @@ AUTH_PASSWORD_VALIDATORS = [
# Internationalization
# https://docs.djangoproject.com/en/5.1/topics/i18n/
LANGUAGE_CODE = 'en-us'
@@ -164,11 +157,9 @@ USE_TZ = True
# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/5.1/howto/static-files/
STATIC_URL = 'static/'
# Default primary key field type
# https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
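The queue configuration above now comes from the environment, falling back to the Redis defaults shown in the diff. Worth noting: environment values are always strings, so an exported `REDIS_PORT` arrives as `'6379'` rather than `6379`. A minimal sketch of priming those variables before starting the app (the values simply mirror the defaults above):
```
import os

# Mirror the defaults used by SCHEDULER_QUEUES / SCHEDULER_CONFIG.
# os.environ only stores strings, so numeric values must be set as text.
os.environ.setdefault("REDIS_HOST", "localhost")
os.environ.setdefault("REDIS_PORT", "6379")
os.environ.setdefault("REDIS_DB", "0")
os.environ.setdefault("RQ_DEFAULT_TIMEOUT", str(60 * 15))  # 15 minutes
```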

View File

@@ -1,46 +1,4 @@
[
{
"model": "RepeatableTaskType",
"name": "Fetch Feeds",
"callable": "api.tasks.fetch_feeds",
"callable_args": [],
"callable_kwargs": [],
"enabled": true,
"queue": "default",
"repeat": null,
"at_front": false,
"timeout": null,
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-03-27T14:33:56+00:00",
"interval": 15,
"interval_unit": "minutes",
"successful_runs": 215,
"failed_runs": 0,
"last_successful_run": "2025-03-27 14:18:58.028684+00:00",
"last_failed_run": null
},
{
"model": "RepeatableTaskType",
"name": "Process raw URLs",
"callable": "api.tasks.process_raw_urls",
"callable_args": [],
"callable_kwargs": [],
"enabled": true,
"queue": "low",
"repeat": null,
"at_front": false,
"timeout": null,
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-03-27T14:35:08+00:00",
"interval": 1,
"interval_unit": "hours",
"successful_runs": 41,
"failed_runs": 0,
"last_successful_run": "2025-03-27 13:35:48.534489+00:00",
"last_failed_run": null
},
{
"model": "RepeatableTaskType",
"name": "Process error URLs",
@@ -54,54 +12,12 @@
"timeout": null,
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-03-27T16:36:21+00:00",
"scheduled_time": "2025-04-01T12:36:21+00:00",
"interval": 4,
"interval_unit": "hours",
"successful_runs": 10,
"successful_runs": 15,
"failed_runs": 0,
"last_successful_run": "2025-03-27 12:37:28.301866+00:00",
"last_failed_run": null
},
{
"model": "RepeatableTaskType",
"name": "Fetch Parser",
"callable": "api.tasks.fetch_parser",
"callable_args": [],
"callable_kwargs": [],
"enabled": true,
"queue": "default",
"repeat": null,
"at_front": false,
"timeout": null,
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-03-27T14:25:42+00:00",
"interval": 1,
"interval_unit": "hours",
"successful_runs": 44,
"failed_runs": 0,
"last_successful_run": "2025-03-27 13:25:46.205433+00:00",
"last_failed_run": null
},
{
"model": "RepeatableTaskType",
"name": "Fetch Search",
"callable": "api.tasks.fetch_search",
"callable_args": [],
"callable_kwargs": [],
"enabled": true,
"queue": "default",
"repeat": null,
"at_front": false,
"timeout": null,
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-03-27T14:29:33+00:00",
"interval": 1,
"interval_unit": "hours",
"successful_runs": 46,
"failed_runs": 0,
"last_successful_run": "2025-03-27 13:33:00.628827+00:00",
"last_successful_run": "2025-04-01 08:37:06.722770+00:00",
"last_failed_run": null
},
{
@@ -117,12 +33,117 @@
"timeout": null,
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-03-27T14:37:50+00:00",
"scheduled_time": "2025-04-01T10:37:50+00:00",
"interval": 2,
"interval_unit": "hours",
"successful_runs": 20,
"successful_runs": 29,
"failed_runs": 0,
"last_successful_run": "2025-03-27 12:38:42.545373+00:00",
"last_successful_run": "2025-04-01 08:42:05.864064+00:00",
"last_failed_run": null
},
{
"model": "RepeatableTaskType",
"name": "Process MissingKids URLs ALL",
"callable": "api.tasks.process_missing_kids_urls_all",
"callable_args": [],
"callable_kwargs": [],
"enabled": true,
"queue": "default",
"repeat": null,
"at_front": false,
"timeout": null,
"result_ttl": null,
"cron_string": null,
"scheduled_time": "2025-04-07T15:59:49+00:00",
"interval": 1,
"interval_unit": "weeks",
"successful_runs": 0,
"failed_runs": 0,
"last_successful_run": null,
"last_failed_run": null
},
{
"model": "RepeatableTaskType",
"name": "Fetch Feeds",
"callable": "api.tasks.fetch_feeds",
"callable_args": [],
"callable_kwargs": [],
"enabled": true,
"queue": "default",
"repeat": null,
"at_front": false,
"timeout": null,
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-04-01T10:18:56+00:00",
"interval": 15,
"interval_unit": "minutes",
"successful_runs": 288,
"failed_runs": 0,
"last_successful_run": "2025-04-01 10:03:58.363856+00:00",
"last_failed_run": null
},
{
"model": "RepeatableTaskType",
"name": "Process raw URLs",
"callable": "api.tasks.process_raw_urls",
"callable_args": [],
"callable_kwargs": [],
"enabled": true,
"queue": "low",
"repeat": null,
"at_front": false,
"timeout": null,
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-04-01T10:20:08+00:00",
"interval": 15,
"interval_unit": "minutes",
"successful_runs": 78,
"failed_runs": 0,
"last_successful_run": "2025-04-01 10:05:08.394472+00:00",
"last_failed_run": null
},
{
"model": "RepeatableTaskType",
"name": "Fetch Parser",
"callable": "api.tasks.fetch_parser",
"callable_args": [],
"callable_kwargs": [],
"enabled": true,
"queue": "default",
"repeat": null,
"at_front": false,
"timeout": null,
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-04-01T10:25:42+00:00",
"interval": 1,
"interval_unit": "hours",
"successful_runs": 62,
"failed_runs": 0,
"last_successful_run": "2025-04-01 09:25:57.977051+00:00",
"last_failed_run": null
},
{
"model": "RepeatableTaskType",
"name": "Fetch Search",
"callable": "api.tasks.fetch_search",
"callable_args": [],
"callable_kwargs": [],
"enabled": true,
"queue": "default",
"repeat": null,
"at_front": false,
"timeout": null,
"result_ttl": 86400,
"cron_string": null,
"scheduled_time": "2025-04-01T10:29:33+00:00",
"interval": 1,
"interval_unit": "hours",
"successful_runs": 63,
"failed_runs": 0,
"last_successful_run": "2025-04-01 09:37:20.671072+00:00",
"last_failed_run": null
}
]
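A quick way to sanity-check the task fixture after editing it by hand is to parse it and list the schedules; a minimal sketch using only the standard library (the file name `tasks.json` is an assumption, not taken from this commit):
```
import json

# Hypothetical path to the fixture shown above.
with open("tasks.json") as f:
    tasks = json.load(f)

for t in tasks:
    print(f'{t["name"]:<30} every {t["interval"]} {t["interval_unit"]} on queue "{t["queue"]}"')
```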