133 lines
4.0 KiB
Markdown
133 lines
4.0 KiB
Markdown
* Dependencies
|
|
```
|
|
conda create -n matitos_urls python=3.12
|
|
conda activate matitos_urls
|
|
# Core
|
|
pip install django==5.1 psycopg[binary] django-redis django-tasks-scheduler
|
|
# Fetcher
|
|
pip install feedparser python-dateutil newspaper4k[all] lxml[html_clean] googlenewsdecoder gnews duckduckgo_search GoogleNews langdetect
|
|
# News visualization
|
|
pip install ollama
|
|
```
|
|
|
|
* Database
|
|
* Database initialization -> 1-DB.ipynb
|
|
|
|
|
|
* From automated inspectdb
|
|
```
|
|
# 1) Inspect DB, generate models.py
|
|
python manage.py inspectdb
|
|
|
|
# 2) Modify models.py
|
|
|
|
# URLS:
|
|
class Urls(models.Model):
|
|
class STATUS_ENUM(models.TextChoices):
|
|
RAW = "raw", "Raw"
|
|
ERROR = "error", "Error"
|
|
VALID = "valid", "Valid"
|
|
UNKNOWN = "unknown", "Unknown"
|
|
INVALID = "invalid", "Invalid"
|
|
DUPLICATE = "duplicate", "Duplicate"
|
|
|
|
url = models.TextField(unique=True)
|
|
ts_fetch = models.DateTimeField(auto_now_add=True)
|
|
status = models.TextField(choices=STATUS_ENUM.choices, default=STATUS_ENUM.RAW) # This field type is a guess.
|
|
|
|
class Meta:
|
|
managed = False
|
|
db_table = 'urls'
|
|
ordering = ["-ts_fetch"]
|
|
|
|
# SEARCH:
|
|
class Search(models.Model):
|
|
class TYPE_ENUM(models.TextChoices):
|
|
RSS_FEED = "rss_feed", "RSS_Feed"
|
|
KEYWORD_SEARCH = "keyword_search", "Keyword_Search"
|
|
URL_HOST = "url_host", "URL_Host"
|
|
|
|
id = models.SmallAutoField(primary_key=True)
|
|
search = models.TextField(unique=True)
|
|
type = models.TextField(choices=TYPE_ENUM.choices) # This field type is a guess.
|
|
|
|
# URL_CONTENT:
|
|
class UrlContent(models.Model):
|
|
id_url = models.OneToOneField('Urls', models.DO_NOTHING, db_column='id_url', primary_key=True)
|
|
date_published = models.DateTimeField(blank=True, null=True)
|
|
title = models.TextField(blank=True, null=True)
|
|
description = models.TextField(blank=True, null=True)
|
|
content = models.TextField(blank=True, null=True)
|
|
valid_content = models.BooleanField(blank=True, null=True)
|
|
language = models.CharField(max_length=2, blank=True, null=True)
|
|
keywords = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
|
|
tags = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
|
|
authors = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
|
|
image_main_url = models.TextField(blank=True, null=True)
|
|
images_url = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
|
|
videos_url = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
|
|
url_host = models.TextField(blank=True, null=True)
|
|
site_name = models.TextField(blank=True, null=True)
|
|
|
|
# TODO: Associate db_table name with a prefix on project_name
|
|
class Meta:
|
|
db_table = 'urls' # db_table = '{}_urls'.format(project_name)
|
|
```
|
|
|
|
* Environment variables
|
|
```
|
|
# Database
|
|
DB_NAME=${DB_NAME:-matitos}
|
|
DB_USER=${DB_USER:-supermatitos}
|
|
DB_PASSWORD=${DB_PASSWORD:-supermatitos}
|
|
DB_HOST=${DB_HOST:-localhost}
|
|
DB_PORT=${DB_PORT:-5432}
|
|
REDIS_HOST=${REDIS_HOST:-localhost}
|
|
REDIS_PORT=${REDIS_PORT:-6379}
|
|
|
|
# Job timeout: 30 min
|
|
JOB_DEFAULT_TIMEOUT=${JOB_DEFAULT_TIMEOUT:-1800}
|
|
|
|
# Logs path
|
|
PATH_LOGS_PARAMETERIZATION="logs/log_app_fetcher_{}.log"
|
|
|
|
# Fetcher
|
|
FETCHER_GNEWS_DECODE_SLEEP=2
|
|
FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=4
|
|
FETCHER_BETWEEN_SEARCHES_SLEEP=5
|
|
FETCHER_URL_HOST_SLEEP=5
|
|
FETCHER_LANGUAGE_DETECTION_MIN_CHAR=100
|
|
|
|
SELENIUM_ENDPOINT="http://selenium_app:80"
|
|
```
|
|
|
|
* Deploy
|
|
```
|
|
# Migrations
|
|
python manage.py makemigrations api && python manage.py migrate --fake-initial
|
|
# Create user
|
|
python manage.py createsuperuser
|
|
|
|
# 1) Server
|
|
python manage.py runserver
|
|
|
|
# 2) Workers
|
|
python manage.py rqworker high default low
|
|
|
|
# Visualize DB
|
|
http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id
|
|
```
|
|
|
|
* Scheduled tasks
|
|
```
|
|
# Import tasks
|
|
python manage.py import --filename scheduled_tasks.json
|
|
|
|
# Modify using the admin panel, then save
|
|
# python manage.py export > scheduled_tasks.json
|
|
```
|
|
|
|
* Utils. TODO: Move to an endpoint...
|
|
```
|
|
python manage.py rqstats
|
|
``` |