Towards django RQ

This commit is contained in:
Luciano Gervasoni
2025-03-10 12:17:31 +01:00
parent e024b200bb
commit e124dbc21a
20 changed files with 722 additions and 4643 deletions

View File

@@ -19,10 +19,12 @@
"output_type": "stream",
"text": [
"db_postgres\n",
"\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 0/0\n",
"\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 1/0\n",
" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 1/1\u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_redis \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 2/2\u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_redis \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h"
]
}
@@ -116,12 +118,14 @@
" title TEXT,\n",
" description TEXT,\n",
" content TEXT,\n",
" language CHAR(2), -- ISO 639-1 Code\n",
" tags TEXT[],\n",
" authors TEXT[],\n",
" image_urls TEXT[]\n",
" );\n",
" CREATE INDEX idx_tags ON URL_CONTENT USING GIN(tags);\n",
" CREATE INDEX idx_authors ON URL_CONTENT USING GIN(authors);\n",
" CREATE INDEX idx_language ON URL_CONTENT (language);\n",
" \"\"\")\n",
"\n",
" # Feeds\n",
@@ -188,8 +192,9 @@
" cur.execute(\"INSERT INTO URLS (url, status) values ('www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf.html', 'invalid')\".format(j))\n",
"\n",
" # URL Content\n",
" content = \"Bla Bla Bla!!!\"*25\n",
" cur.execute(\"INSERT INTO URL_CONTENT (id_url, date_published, title, description, content, tags, authors, image_urls) values (%s, %s, 'Mommy blogger turned child abuser', %s, 'Hello there!', %s, %s, %s)\", (1, datetime.now(tz=timezone.utc), content, [\"child abuse\", \"social media\"], [\"Audrey Conklin\"], [\"https://a57.foxnews.com/static.foxnews.com/foxnews.com/content/uploads/2023/08/1440/810/image-58.jpg?ve=1&tl=1\"]))"
" language, content = \"en\", \"Bla Bla Bla!!!\"*25\n",
" cur.execute(\"INSERT INTO URL_CONTENT (id_url, date_published, title, description, content, language, tags, authors, image_urls) values (%s, %s, 'Mommy blogger turned child abuser', %s, 'Hello there!', %s, %s, %s, %s)\", \n",
" (1, datetime.now(tz=timezone.utc), content, language, [\"child abuse\", \"social media\"], [\"Audrey Conklin\"], [\"https://a57.foxnews.com/static.foxnews.com/foxnews.com/content/uploads/2023/08/1440/810/image-58.jpg?ve=1&tl=1\"]))"
]
},
{
@@ -204,99 +209,99 @@
"\t urls\n",
"[(1,\n",
" 'https://www.foxnews.com/us/husband-ruby-franke-utah-mommy-blogger-convicted-child-abuse-regrets-wifes-fall-fame',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (2,\n",
" 'https://www.bbc.com/news/articles/ckg843y8y7no',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (3,\n",
" 'https://www.wilx.com/2025/03/05/lenawee-county-man-arrested-possessing-child-abuse-material/',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (4,\n",
" 'https://www.dw.com/en/trauma-how-child-abuse-victims-deal-with-parenthood/a-71833895',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (5,\n",
" 'https://nypost.com/2025/03/06/us-news/colorado-day-care-worker-hit-with-51-charges-of-child-abuse-harassment-for-slapping-toddler/',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (6,\n",
" 'https://www.fox35orlando.com/news/tavares-police-florida-boys-10-9-abused-sheer-brutality',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'valid'),\n",
" (7,\n",
" 'https://www.google.com',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (8,\n",
" 'www.super_0.org',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (9,\n",
" 'www.super_1.org',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (10,\n",
" 'www.super_2.org',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (11,\n",
" 'www.super_3.org',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (12,\n",
" 'www.super_4.org',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (13,\n",
" 'www.super_5.org',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (14,\n",
" 'www.super_6.org',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (15,\n",
" 'www.super_7.org',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (16,\n",
" 'www.super_8.org',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (17,\n",
" 'www.super_9.org',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (18,\n",
" 'www.super_10.org',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (19,\n",
" 'www.super_11.org',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (20,\n",
" 'www.super_12.org',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (21,\n",
" 'www.super_13.org',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (22,\n",
" 'www.super_14.org',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (23,\n",
" 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf_23dj9sdgj9sdgj8sdf8ds8f.html',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid'),\n",
" (24,\n",
" 'www.super_url.org/superextrakmsdimsdf/349mvlsdfsdfwr/akivsdmimnsdifmisdf.html',\n",
" datetime.datetime(2025, 3, 6, 23, 4, 22, 630547, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 7, 16, 57, 23, 32211, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'invalid')]\n",
"\t urls_duplicate\n",
"[]\n",
@@ -322,7 +327,7 @@
"[('.*missingkids.org/poster/.*', 50, 'valid')]\n",
"\t url_content\n",
"[(1,\n",
" datetime.datetime(2025, 3, 6, 23, 4, 37, 654130, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" datetime.datetime(2025, 3, 7, 16, 57, 38, 54447, tzinfo=zoneinfo.ZoneInfo(key='Etc/UTC')),\n",
" 'Mommy blogger turned child abuser',\n",
" 'Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla '\n",
" 'Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla Bla!!!Bla Bla '\n",

View File

@@ -103,13 +103,26 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 54,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"('https://foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert',\n",
" 'foxnews.com')"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# !pip install trafilatura trafilatura[all] cchardet\n",
"import courlan\n",
"url = \"https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n",
"url = \"https://foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n",
"courlan.check_url(url)"
]
},
@@ -125,13 +138,65 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"import newspaper\n",
"\n",
"article = newspaper.article(url)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"datetime.datetime(2025, 3, 4, 4, 0, 31, tzinfo=tzoffset(None, -18000))"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"article.publish_date"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# !pip install trafilatura\n",
"import trafilatura\n",
"from pprint import pprint\n",
"\n",
"url = \"https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert\"\n",
"url = \"https://www.missingkids.org/poster/USVA/VA25-0820/1\"\n",
"# url = \"https://www.missingkids.org/poster/USVA/VA25-0820/1\"\n",
"\n",
"# Fetch\n",
"doc = trafilatura.fetch_url(url)\n",
@@ -142,9 +207,40 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'author': 'Audrey Conklin',\n",
" 'body': <Element body at 0x7e22813ce400>,\n",
" 'categories': [],\n",
" 'comments': None,\n",
" 'commentsbody': <Element body at 0x7e22813ce180>,\n",
" 'date': '2025-03-03',\n",
" 'description': \"Disgraced parenting blogger and mom of six Ruby Franke's \"\n",
" '\"power\" and public image\" allowed her crimes against her '\n",
" 'children to go \"unchecked,\" according to a defense attorney.',\n",
" 'filedate': '2025-03-08',\n",
" 'fingerprint': None,\n",
" 'hostname': 'foxnews.com',\n",
" 'id': None,\n",
" 'image': 'https://static.foxnews.com/foxnews.com/content/uploads/2024/03/967e1c1b-Franke.jpg',\n",
" 'language': None,\n",
" 'license': None,\n",
" 'pagetype': 'article',\n",
" 'raw_text': None,\n",
" 'sitename': 'Fox News',\n",
" 'tags': [],\n",
" 'text': None,\n",
" 'title': \"Utah mommy blogger Ruby Franke's power, public image allowed child \"\n",
" \"abuse to go 'unchecked': expert\",\n",
" 'url': 'https://www.foxnews.com/us/utah-mommy-blogger-ruby-franke-power-public-image-allowed-child-abuse-go-unchecked-expert'}\n"
]
}
],
"source": [
"pprint(metadata.as_dict())"
]
@@ -165,6 +261,64 @@
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 18.6 ms, sys: 40 μs, total: 18.7 ms\n",
"Wall time: 18 ms\n"
]
},
{
"data": {
"text/plain": [
"'en'"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'''\n",
"!pip install lingua-language-detector\n",
"import lingua\n",
"ld = lingua.LanguageDetectorBuilder.from_all_languages().build()\n",
"l = ld.detect_language_of(content)\n",
"'''\n",
"# !pip install langdetect \n",
"import langdetect\n",
"langdetect.DetectorFactory.seed = 0\n",
"langdetect.detect(content)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,

File diff suppressed because it is too large Load Diff

34
app_urls/README.md Normal file
View File

@@ -0,0 +1,34 @@
* Dependencies
```
conda create -n matitos_urls python=3.12
conda activate matitos_urls
pip install django psycopg[binary] django-rq
```
* Environment variables
```
DB_NAME=${DB_NAME:-matitos}
DB_USER=${DB_USER:-supermatitos}
DB_PASSWORD=${DB_PASSWORD:-supermatitos}
DB_HOST=${DB_HOST:-localhost}
DB_PORT=${DB_PORT:-5432}
REDIS_HOST=${REDIS_HOST:-localhost}
REDIS_PORT=${REDIS_PORT:-6379}
```
* Django DB
```
# Generate content for models.py
python manage.py inspectdb
python manage.py makemigrations
python manage.py migrate --fake
```
```
# Server
python manage.py runserver
# Worker
python manage.py rqworker default
```

0
app_urls/api/__init__.py Normal file
View File

3
app_urls/api/admin.py Normal file
View File

@@ -0,0 +1,3 @@
from django.contrib import admin
# Register your models here.

6
app_urls/api/apps.py Normal file
View File

@@ -0,0 +1,6 @@
from django.apps import AppConfig
class ApiConfig(AppConfig):
    """Django application configuration for the ``api`` app."""

    # Dotted/label name Django uses to locate this application.
    name = "api"
    # Primary-key field type used for models that do not declare one.
    default_auto_field = "django.db.models.BigAutoField"

View File

@@ -0,0 +1,132 @@
# Generated by Django 5.1.7 on 2025-03-07 16:56
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
    """Initial migration for the ``api`` app (generated by Django 5.1.7).

    Every model below is declared with ``managed: False``, so this migration
    only records model state; Django does not create or alter the underlying
    tables for unmanaged models.
    """

    initial = True

    # No earlier migrations are required.
    dependencies = [
    ]

    operations = [
        migrations.CreateModel(
            name='Feed',
            fields=[
                ('id', models.SmallAutoField(primary_key=True, serialize=False)),
                ('rss_feed', models.TextField(unique=True)),
            ],
            options={
                'db_table': 'feed',
                'managed': False,
            },
        ),
        migrations.CreateModel(
            name='Search',
            fields=[
                ('id', models.SmallAutoField(primary_key=True, serialize=False)),
                ('keyword_search', models.TextField(unique=True)),
            ],
            options={
                'db_table': 'search',
                'managed': False,
            },
        ),
        migrations.CreateModel(
            name='Source',
            fields=[
                ('id', models.SmallAutoField(primary_key=True, serialize=False)),
                ('source', models.TextField(unique=True)),
            ],
            options={
                'db_table': 'source',
                'managed': False,
            },
        ),
        migrations.CreateModel(
            name='StatusPatternMatching',
            fields=[
                ('pattern', models.TextField(primary_key=True, serialize=False)),
                ('priority', models.SmallIntegerField()),
                ('status', models.TextField()),
            ],
            options={
                'db_table': 'status_pattern_matching',
                'managed': False,
            },
        ),
        migrations.CreateModel(
            name='Urls',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('url', models.TextField(unique=True)),
                ('ts_fetch', models.DateTimeField()),
                ('status', models.TextField()),
            ],
            options={
                'db_table': 'urls',
                'managed': False,
            },
        ),
        migrations.CreateModel(
            name='WebsiteOfInterest',
            fields=[
                ('id', models.SmallAutoField(primary_key=True, serialize=False)),
                ('url_host', models.TextField(unique=True)),
            ],
            options={
                'db_table': 'website_of_interest',
                'managed': False,
            },
        ),
        migrations.CreateModel(
            name='WebsiteToFilter',
            fields=[
                ('id', models.SmallAutoField(primary_key=True, serialize=False)),
                ('url_host', models.TextField(unique=True)),
            ],
            options={
                'db_table': 'website_to_filter',
                'managed': False,
            },
        ),
        # The models below reference ``api.urls`` and therefore come after it.
        migrations.CreateModel(
            name='UrlContent',
            fields=[
                ('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='api.urls')),
                ('date_published', models.DateTimeField(blank=True, null=True)),
                ('title', models.TextField(blank=True, null=True)),
                ('description', models.TextField(blank=True, null=True)),
                ('content', models.TextField(blank=True, null=True)),
                ('tags', models.TextField(blank=True, null=True)),
                ('authors', models.TextField(blank=True, null=True)),
                ('image_urls', models.TextField(blank=True, null=True)),
            ],
            options={
                'db_table': 'url_content',
                'managed': False,
            },
        ),
        migrations.CreateModel(
            name='UrlsDuplicate',
            fields=[
                ('id_url_canonical', models.OneToOneField(db_column='id_url_canonical', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='api.urls')),
            ],
            options={
                'db_table': 'urls_duplicate',
                'managed': False,
            },
        ),
        migrations.CreateModel(
            name='UrlsSource',
            fields=[
                ('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='api.urls')),
            ],
            options={
                'db_table': 'urls_source',
                'managed': False,
            },
        ),
    ]

View File

101
app_urls/api/models.py Normal file
View File

@@ -0,0 +1,101 @@
from django.contrib.postgres.fields import ArrayField
from django.db import models
# Create your models here.
class Feed(models.Model):
    """Row of the unmanaged ``feed`` table; ``rss_feed`` values are unique."""

    id = models.SmallAutoField(primary_key=True)
    rss_feed = models.TextField(unique=True)

    class Meta:
        db_table = "feed"
        managed = False
class Search(models.Model):
    """Row of the unmanaged ``search`` table; ``keyword_search`` values are unique."""

    id = models.SmallAutoField(primary_key=True)
    keyword_search = models.TextField(unique=True)

    class Meta:
        db_table = "search"
        managed = False
class Source(models.Model):
    """Row of the unmanaged ``source`` table; ``source`` values are unique."""

    id = models.SmallAutoField(primary_key=True)
    source = models.TextField(unique=True)

    class Meta:
        db_table = "source"
        managed = False
class StatusPatternMatching(models.Model):
    """Row of the unmanaged ``status_pattern_matching`` table, keyed by ``pattern``."""

    pattern = models.TextField(primary_key=True)
    priority = models.SmallIntegerField()
    # inspectdb could not determine the column type; TextField is its guess.
    status = models.TextField()

    class Meta:
        db_table = "status_pattern_matching"
        managed = False
class UrlContent(models.Model):
    """Extracted content for one URL, stored in the unmanaged ``url_content`` table.

    The primary key is ``id_url``, a one-to-one link to ``Urls``.
    """

    id_url = models.OneToOneField('Urls', models.DO_NOTHING, db_column='id_url', primary_key=True)
    date_published = models.DateTimeField(blank=True, null=True)
    title = models.TextField(blank=True, null=True)
    description = models.TextField(blank=True, null=True)
    content = models.TextField(blank=True, null=True)
    # The table definition declares ``language CHAR(2)`` (ISO 639-1 code);
    # without this field the column is invisible to the ORM.
    language = models.CharField(max_length=2, blank=True, null=True)
    # These columns are declared as TEXT[] in the table definition. A plain
    # TextField would coerce a Python list to its string repr on write, so they
    # are mapped to ArrayField instead of inspectdb's TextField guess.
    tags = ArrayField(models.TextField(), blank=True, null=True)
    authors = ArrayField(models.TextField(), blank=True, null=True)
    image_urls = ArrayField(models.TextField(), blank=True, null=True)

    class Meta:
        managed = False
        db_table = 'url_content'
class Urls(models.Model):
    """Row of the unmanaged ``urls`` table; ``url`` values are unique."""

    url = models.TextField(unique=True)
    ts_fetch = models.DateTimeField()
    # inspectdb could not determine the column type; TextField is its guess.
    status = models.TextField()

    class Meta:
        db_table = "urls"
        managed = False
class UrlsDuplicate(models.Model):
    """Pair of ``Urls`` rows (canonical, duplicated) from the unmanaged ``urls_duplicate`` table."""

    # inspectdb found the composite primary key (id_url_canonical,
    # id_url_duplicated), which is not supported, so it selected the first
    # column as the primary key.
    id_url_canonical = models.OneToOneField(
        Urls,
        on_delete=models.DO_NOTHING,
        db_column="id_url_canonical",
        primary_key=True,
    )
    id_url_duplicated = models.ForeignKey(
        Urls,
        on_delete=models.DO_NOTHING,
        db_column="id_url_duplicated",
        related_name="urlsduplicate_id_url_duplicated_set",
    )

    class Meta:
        db_table = "urls_duplicate"
        managed = False
        unique_together = (("id_url_canonical", "id_url_duplicated"),)
class UrlsSource(models.Model):
    """Link between a ``Urls`` row and a ``Source`` row in the unmanaged ``urls_source`` table."""

    # inspectdb found the composite primary key (id_url, id_source), which is
    # not supported, so it selected the first column as the primary key.
    id_url = models.OneToOneField(
        Urls,
        on_delete=models.DO_NOTHING,
        db_column="id_url",
        primary_key=True,
    )
    id_source = models.ForeignKey(
        Source,
        on_delete=models.DO_NOTHING,
        db_column="id_source",
    )

    class Meta:
        db_table = "urls_source"
        managed = False
        unique_together = (("id_url", "id_source"),)
class WebsiteOfInterest(models.Model):
    """Row of the unmanaged ``website_of_interest`` table; ``url_host`` values are unique."""

    id = models.SmallAutoField(primary_key=True)
    url_host = models.TextField(unique=True)

    class Meta:
        db_table = "website_of_interest"
        managed = False
class WebsiteToFilter(models.Model):
    """Row of the unmanaged ``website_to_filter`` table; ``url_host`` values are unique."""

    id = models.SmallAutoField(primary_key=True)
    url_host = models.TextField(unique=True)

    class Meta:
        db_table = "website_to_filter"
        managed = False

13
app_urls/api/tasks.py Normal file
View File

@@ -0,0 +1,13 @@
from django_rq import job
import time
import logging
logger = logging.getLogger(__name__)
@job
def task_1(message):
    """Demo RQ job: log ``message``, sleep five seconds, then print a completion line.

    Args:
        message: Value that is logged on entry and echoed in the completion line.

    Any exception raised while the job body runs is logged with its traceback
    and not re-raised, matching the original best-effort behaviour.
    """
    # Lazy %-style arguments: the message is only formatted if INFO is enabled.
    logger.info("Message: %s", message)
    try:
        time.sleep(5)  # Simulate a long-running task
        print(f"Task completed: {message}")
    except Exception:
        # logger.exception records the full traceback; logger.error(e) kept
        # only the exception's string form.
        logger.exception("task_1 failed for message: %s", message)

3
app_urls/api/tests.py Normal file
View File

@@ -0,0 +1,3 @@
from django.test import TestCase
# Create your tests here.

6
app_urls/api/urls.py Normal file
View File

@@ -0,0 +1,6 @@
from django.urls import path
from .views import trigger_task
# Routes served by the api app.
urlpatterns = [
    path("trigger_task/", trigger_task, name="trigger_task"),
]

10
app_urls/api/views.py Normal file
View File

@@ -0,0 +1,10 @@
import django_rq
from django.http import JsonResponse
from .tasks import task_1
def trigger_task(request):
    """Enqueue ``task_1`` on the 'default' RQ queue and return the job id as JSON."""
    default_queue = django_rq.get_queue('default')
    enqueued = default_queue.enqueue(task_1, "Hello from Django RQ!")
    payload = {"message": "Task has been enqueued!", "job_id": enqueued.id}
    return JsonResponse(payload)

View File

16
app_urls/core/asgi.py Normal file
View File

@@ -0,0 +1,16 @@
"""
ASGI config for core project.
It exposes the ASGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/5.1/howto/deployment/asgi/
"""
import os
from django.core.asgi import get_asgi_application
# Select core.settings unless DJANGO_SETTINGS_MODULE is already set; this must
# happen before the application object is built.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
# Module-level ASGI callable.
application = get_asgi_application()

142
app_urls/core/settings.py Normal file
View File

@@ -0,0 +1,142 @@
"""
Django settings for core project.
Generated by 'django-admin startproject' using Django 5.1.7.
For more information on this file, see
https://docs.djangoproject.com/en/5.1/topics/settings/
For the full list of settings and their values, see
https://docs.djangoproject.com/en/5.1/ref/settings/
"""
from pathlib import Path
import os
# Project root: two directory levels above this file.
# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parents[1]
# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/5.1/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
# The committed value is only a development fallback; set DJANGO_SECRET_KEY in
# any real deployment so the effective key is not the one in version control.
SECRET_KEY = os.environ.get(
    "DJANGO_SECRET_KEY",
    'django-insecure-kc0jj#_=7i$_79p(n5)p3taxvhnq=w*ori-%%iu_a6wye@$(*n',
)

# SECURITY WARNING: don't run with debug turned on in production!
# Defaults to on (unchanged); any DJANGO_DEBUG value other than
# 1/true/yes/on (case-insensitive) turns it off.
DEBUG = os.environ.get("DJANGO_DEBUG", "true").strip().lower() in ("1", "true", "yes", "on")

# Comma-separated host names from DJANGO_ALLOWED_HOSTS; empty list when unset.
ALLOWED_HOSTS = [
    host.strip()
    for host in os.environ.get("DJANGO_ALLOWED_HOSTS", "").split(",")
    if host.strip()
]
# Application definition

# Django contrib apps first, then third-party (django_rq), then the local app.
INSTALLED_APPS = [
    "django.contrib.admin",
    "django.contrib.auth",
    "django.contrib.contenttypes",
    "django.contrib.sessions",
    "django.contrib.messages",
    "django.contrib.staticfiles",
    # "rest_framework",
    "django_rq",
    "api",
]

MIDDLEWARE = [
    "django.middleware.security.SecurityMiddleware",
    "django.contrib.sessions.middleware.SessionMiddleware",
    "django.middleware.common.CommonMiddleware",
    "django.middleware.csrf.CsrfViewMiddleware",
    "django.contrib.auth.middleware.AuthenticationMiddleware",
    "django.contrib.messages.middleware.MessageMiddleware",
    "django.middleware.clickjacking.XFrameOptionsMiddleware",
]

ROOT_URLCONF = "core.urls"

# Single Django template engine; templates are discovered inside app directories.
TEMPLATES = [
    {
        "BACKEND": "django.template.backends.django.DjangoTemplates",
        "DIRS": [],
        "APP_DIRS": True,
        "OPTIONS": {
            "context_processors": [
                "django.template.context_processors.debug",
                "django.template.context_processors.request",
                "django.contrib.auth.context_processors.auth",
                "django.contrib.messages.context_processors.messages",
            ],
        },
    },
]

WSGI_APPLICATION = "core.wsgi.application"
# Database
# https://docs.djangoproject.com/en/5.1/ref/settings/#databases

# PostgreSQL connection; each of NAME/USER/PASSWORD/HOST/PORT is read from the
# matching DB_<KEY> environment variable, falling back to the value listed here.
DATABASES = {
    "default": {
        "ENGINE": "django.db.backends.postgresql",
        **{
            setting: os.environ.get(f"DB_{setting}", fallback)
            for setting, fallback in (
                ("NAME", "matitos"),
                ("USER", "supermatitos"),
                ("PASSWORD", "supermatitos"),
                ("HOST", "localhost"),
                ("PORT", "5432"),
            )
        },
        # "OPTIONS": {
        #     "options": "-c default_transaction_read_only=on"
        # }
    }
}
# Redis connection settings for the django-rq 'default' queue.
# Values taken from the environment are strings while the fallbacks are ints,
# so PORT and DB are cast to int to give one consistent type (and to fail at
# start-up on a non-numeric value).
RQ_QUEUES = {
    'default': {
        'HOST': os.environ.get("REDIS_HOST", "localhost"),
        'PORT': int(os.environ.get("REDIS_PORT", 6379)),
        'DB': int(os.environ.get("REDIS_DB", 0)),
        # NOTE(review): left uncast on purpose; RQ appears to accept string
        # timeouts (including suffixed forms such as "1h") - confirm before
        # forcing int here.
        'DEFAULT_TIMEOUT': os.environ.get("REDIS_DEFAULT_TIMEOUT", 360),
    }
}
# Password validation
# https://docs.djangoproject.com/en/5.1/ref/settings/#auth-password-validators

# All four validators live in django.contrib.auth.password_validation, so the
# list is built from their class names.
AUTH_PASSWORD_VALIDATORS = [
    {"NAME": f"django.contrib.auth.password_validation.{validator}"}
    for validator in (
        "UserAttributeSimilarityValidator",
        "MinimumLengthValidator",
        "CommonPasswordValidator",
        "NumericPasswordValidator",
    )
]

# Internationalization
# https://docs.djangoproject.com/en/5.1/topics/i18n/

LANGUAGE_CODE = "en-us"
TIME_ZONE = "UTC"
USE_I18N = True
USE_TZ = True

# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/5.1/howto/static-files/

STATIC_URL = "static/"

# Default primary key field type
# https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field

DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"

23
app_urls/core/urls.py Normal file
View File

@@ -0,0 +1,23 @@
"""
URL configuration for core project.
The `urlpatterns` list routes URLs to views. For more information please see:
https://docs.djangoproject.com/en/5.1/topics/http/urls/
Examples:
Function views
1. Add an import: from my_app import views
2. Add a URL to urlpatterns: path('', views.home, name='home')
Class-based views
1. Add an import: from other_app.views import Home
2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
Including another URLconf
1. Import the include() function: from django.urls import include, path
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
"""
from django.contrib import admin
from django.urls import path, include
# Project-level routes: the Django admin plus everything declared in api.urls.
urlpatterns = [
    path("admin/", admin.site.urls),
    path("api/", include("api.urls")),
]

16
app_urls/core/wsgi.py Normal file
View File

@@ -0,0 +1,16 @@
"""
WSGI config for core project.
It exposes the WSGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/5.1/howto/deployment/wsgi/
"""
import os
from django.core.wsgi import get_wsgi_application
# Select core.settings unless DJANGO_SETTINGS_MODULE is already set; this must
# happen before the application object is built.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
# Module-level WSGI callable.
application = get_wsgi_application()

22
app_urls/manage.py Executable file
View File

@@ -0,0 +1,22 @@
#!/usr/bin/env python
"""Django's command-line utility for administrative tasks."""
import os
import sys
def main():
    """Default the settings module to ``core.settings`` and dispatch ``sys.argv`` to Django."""
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
    try:
        from django.core.management import execute_from_command_line
    except ImportError as exc:
        # Re-raise with a hint, keeping the original error as the cause.
        hint = (
            "Couldn't import Django. Are you sure it's installed and "
            "available on your PYTHONPATH environment variable? Did you "
            "forget to activate a virtual environment?"
        )
        raise ImportError(hint) from exc
    execute_from_command_line(sys.argv)


if __name__ == '__main__':
    main()