Files
2025-10-16 10:38:32 +02:00
..
2025-09-04 08:46:04 +02:00
2025-10-16 10:38:32 +02:00
2025-07-22 00:51:09 +02:00
2025-03-10 12:17:31 +01:00
2025-07-22 22:53:53 +02:00
2025-10-16 10:12:17 +02:00
2025-09-08 12:34:47 +02:00

  • Dependencies
conda create -n matitos_urls python=3.12
conda activate matitos_urls
pip install -r requirements.txt
  • From automated inspectdb
# 1) Inspect DB, generate models.py
python manage.py inspectdb

# 2) Modify models.py

# URLS:
class Urls(models.Model):
    """Model mapped to the existing ``urls`` table.

    Each row stores one unique URL, a fetch timestamp and a status chosen
    from ``STATUS_ENUM``. Rows are ordered newest fetch first by default.
    """

    # Allowed values for ``status``: (stored value, human-readable label).
    class STATUS_ENUM(models.TextChoices):
        RAW = "raw", "Raw"
        ERROR = "error", "Error"
        VALID = "valid", "Valid"
        UNKNOWN = "unknown", "Unknown"
        INVALID = "invalid", "Invalid"
        DUPLICATE = "duplicate", "Duplicate"

    # The URL itself; uniqueness is declared on the field.
    url = models.TextField(unique=True)
    # auto_now_add: populated once, when the row is first created.
    ts_fetch = models.DateTimeField(auto_now_add=True)
    # New rows default to RAW. The trailing note is the marker inspectdb leaves
    # when it cannot map a column type — presumably a DB enum; TODO confirm.
    status = models.TextField(choices=STATUS_ENUM.choices, default=STATUS_ENUM.RAW)  # This field type is a guess.

    class Meta:
        # managed = False: Django migrations will not create or alter this table.
        managed = False
        db_table = 'urls'
        # Leading "-" means descending order on ts_fetch.
        ordering = ["-ts_fetch"]

# SEARCH:
class Search(models.Model):
    """A unique search entry together with its type from ``TYPE_ENUM``.

    NOTE(review): unlike ``Urls``, no ``Meta`` is shown here, so as written
    Django would use its default table name and manage the table — confirm
    whether ``managed = False`` / ``db_table`` were omitted from this snippet.
    """

    # Allowed values for ``type``: (stored value, human-readable label).
    class TYPE_ENUM(models.TextChoices):
        RSS_FEED = "rss_feed", "RSS_Feed"
        KEYWORD_SEARCH = "keyword_search", "Keyword_Search"
        URL_HOST = "url_host", "URL_Host"

    # Explicit small-integer auto-incrementing primary key.
    id = models.SmallAutoField(primary_key=True)
    # The search text; uniqueness is declared on the field.
    search = models.TextField(unique=True)
    # The trailing note is the marker inspectdb leaves when it cannot map a
    # column type — presumably a DB enum; TODO confirm.
    type = models.TextField(choices=TYPE_ENUM.choices)  # This field type is a guess.

# URL_CONTENT:
class UrlContent(models.Model):
    """Content fields attached one-to-one to a ``Urls`` row.

    The ``id_url`` relation doubles as this model's primary key. All scalar
    fields are optional (``blank=True, null=True``).

    NOTE(review): no ``Meta`` is shown for this model — confirm whether
    ``managed`` / ``db_table`` were omitted from this snippet.
    """

    # One-to-one link to Urls stored in column "id_url"; DO_NOTHING means Django
    # takes no action on this row when the referenced Urls row is deleted.
    id_url = models.OneToOneField('Urls', models.DO_NOTHING, db_column='id_url', primary_key=True)
    date_published = models.DateTimeField(blank=True, null=True)
    title = models.TextField(blank=True, null=True)
    description = models.TextField(blank=True, null=True)
    content = models.TextField(blank=True, null=True)
    valid_content = models.BooleanField(blank=True, null=True)
    # At most two characters — presumably an ISO 639-1 code; TODO confirm.
    language = models.CharField(max_length=2, blank=True, null=True)
    # Array columns: the *elements* are nullable text, but the array fields
    # themselves do not set null/blank. ArrayField is presumably
    # django.contrib.postgres.fields.ArrayField — import not visible here.
    keywords = ArrayField(models.TextField(blank=True, null=True))  # This field type is a guess.
    tags = ArrayField(models.TextField(blank=True, null=True))  # This field type is a guess.
    authors = ArrayField(models.TextField(blank=True, null=True))  # This field type is a guess.
    image_main_url = models.TextField(blank=True, null=True)
    images_url = ArrayField(models.TextField(blank=True, null=True))  # This field type is a guess.
    videos_url = ArrayField(models.TextField(blank=True, null=True))  # This field type is a guess.
    url_host = models.TextField(blank=True, null=True)
    site_name = models.TextField(blank=True, null=True)

# TODO: Prefix each db_table name with the project name.
class Meta:
    # Table name is hard-coded for now; the trailing comment shows the
    # intended project-prefixed form.
    db_table = 'urls' # db_table = '{}_urls'.format(project_name)
  • Database & initialization

    • Check initialize.sh on Dockerfile
    • init_data.json: insert the URL hosts of interest, RSS feeds, keyword searches, and (escaped) regex status patterns used to mark URLs as "invalid" or "valid"
  • Environment variables

    • In docker-compose.yml
  • Tasks (export the django_celery_beat schedules to scheduled_tasks.json)

python manage.py dumpdata \
  django_celery_beat.PeriodicTask \
  django_celery_beat.IntervalSchedule \
  django_celery_beat.CrontabSchedule \
  django_celery_beat.SolarSchedule \
  django_celery_beat.ClockedSchedule \
  --indent 2 > scheduled_tasks.json
  • Deploy
# Check the environment variables in the .env file

# Remove previous instances (note: -v also removes the volumes, deleting any data stored in them)
docker compose down -v

# Build & up
docker compose up -d --build