Files
matitos_news/app_urls/README.md
2025-03-18 14:49:12 +01:00

2.5 KiB

  • Dependencies
conda create -n matitos_urls python=3.12
conda activate matitos_urls
pip install django 'psycopg[binary]' django-rq
pip install feedparser python-dateutil newspaper4k 'lxml[html_clean]'
  • From automated inspectdb
# 1) Inspect DB, generate models.py
python manage.py inspectdb

# 2) models.py, within class Urls, add:

    class STATUS_ENUM(models.TextChoices):
        RAW = "raw"
        ERROR = "error"
        VALID = "valid"
        UNKNOWN = "unknown"
        INVALID = "invalid"
        DUPLICATE = "duplicate"

# Update status
    status = models.TextField(choices=STATUS_ENUM, default=STATUS_ENUM.RAW)  # This field type is a guess.

# To class Meta, add default ordering
    class Meta:
        managed = False
        db_table = 'urls' # db_table = '{}_urls'.format(project_name)
        ordering = ["-ts_fetch"]

# Fields default:
    ts_fetch = models.DateTimeField(auto_now_add=True)
    status = models.TextField(default='raw')  # This field type is a guess.

# URLContent:
from django.contrib.postgres.fields import ArrayField

    keywords = ArrayField(models.TextField(blank=True, null=True))  # This field type is a guess.
    tags = ArrayField(models.TextField(blank=True, null=True))  # This field type is a guess.
    authors = ArrayField(models.TextField(blank=True, null=True))  # This field type is a guess.
    image_main_url = models.TextField(blank=True, null=True)
    images_url = ArrayField(models.TextField(blank=True, null=True))  # This field type is a guess.
    videos_url = ArrayField(models.TextField(blank=True, null=True))  # This field type is a guess.
  • Environment variables
DB_NAME=${DB_NAME:-matitos}
DB_USER=${DB_USER:-supermatitos}
DB_PASSWORD=${DB_PASSWORD:-supermatitos}
DB_HOST=${DB_HOST:-localhost}
DB_PORT=${DB_PORT:-5432}

REDIS_HOST=${REDIS_HOST:-localhost}
REDIS_PORT=${REDIS_PORT:-6379}
  • Django DB
# Generate content for models.py
python manage.py inspectdb
# Migrations
python manage.py makemigrations api && python manage.py migrate --fake-initial
  • Deploy
# Server
python manage.py runserver

# Worker
python manage.py rqworker default
while true; do python manage.py rqworker default --burst -v 0; sleep 5; done

# Visualize DB
http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id
  • Utils
python manage.py rqstats
python manage.py rqstats --interval=1  # Refreshes every second
python manage.py rqstats --json  # Output as JSON
python manage.py rqstats --yaml  # Output as YAML