* Dependencies
```
conda create -n matitos_urls python=3.12
conda activate matitos_urls
# Core
pip install django psycopg[binary] django-redis django-tasks-scheduler
# Fetcher
pip install feedparser python-dateutil newspaper4k[all] lxml[html_clean] googlenewsdecoder gnews duckduckgo_search GoogleNews langdetect
# News visualization
pip install ollama
```

* Database
    * Database initialization -> 1-DB.ipynb
    * From automated inspectdb
```
# 1) Inspect DB, generate models.py
python manage.py inspectdb

# 2) Modify models.py
# URLS:
class Urls(models.Model):
    class STATUS_ENUM(models.TextChoices):
        RAW = "raw", "Raw"
        ERROR = "error", "Error"
        VALID = "valid", "Valid"
        UNKNOWN = "unknown", "Unknown"
        INVALID = "invalid", "Invalid"
        DUPLICATE = "duplicate", "Duplicate"

    url = models.TextField(unique=True)
    ts_fetch = models.DateTimeField(auto_now_add=True)
    status = models.TextField(choices=STATUS_ENUM.choices, default=STATUS_ENUM.RAW) # This field type is a guess.

    class Meta:
        managed = False
        db_table = 'urls'
        ordering = ["-ts_fetch"]

# SEARCH:
class Search(models.Model):
    class TYPE_ENUM(models.TextChoices):
        RSS_FEED = "rss_feed", "RSS_Feed"
        KEYWORD_SEARCH = "keyword_search", "Keyword_Search"
        URL_HOST = "url_host", "URL_Host"

    id = models.SmallAutoField(primary_key=True)
    search = models.TextField(unique=True)
    type = models.TextField(choices=TYPE_ENUM.choices) # This field type is a guess.

# URL_CONTENT:
class UrlContent(models.Model):
    id_url = models.OneToOneField('Urls', models.DO_NOTHING, db_column='id_url', primary_key=True)
    date_published = models.DateTimeField(blank=True, null=True)
    title = models.TextField(blank=True, null=True)
    description = models.TextField(blank=True, null=True)
    content = models.TextField(blank=True, null=True)
    valid_content = models.BooleanField(blank=True, null=True)
    language = models.CharField(max_length=2, blank=True, null=True)
    keywords = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
    tags = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
    authors = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
    image_main_url = models.TextField(blank=True, null=True)
    images_url = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
    videos_url = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
    url_host = models.TextField(blank=True, null=True)
    site_name = models.TextField(blank=True, null=True)

# TODO: Associate db_table name with a prefix on project_name
class Meta:
    db_table = 'urls' # db_table = '{}_urls'.format(project_name)
```

* Environment variables
```
# Database
DB_NAME=${DB_NAME:-matitos}
DB_USER=${DB_USER:-supermatitos}
DB_PASSWORD=${DB_PASSWORD:-supermatitos}
DB_HOST=${DB_HOST:-localhost}
DB_PORT=${DB_PORT:-5432}
REDIS_HOST=${REDIS_HOST:-localhost}
REDIS_PORT=${REDIS_PORT:-6379}

# Job timeout: 30 min
JOB_DEFAULT_TIMEOUT=${RQ_DEFAULT_TIMEOUT:-1800}

# Logs path
PATH_LOGS_PARAMETERIZATION="logs/log_app_fetcher_{}.log"

# Fetcher
FETCHER_GNEWS_DECODE_SLEEP=2
FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=4
FETCHER_BETWEEN_SEARCHES_SLEEP=5
FETCHER_URL_HOST_SLEEP=5
```

* Deploy
```
# Migrations
python manage.py makemigrations api; python manage.py migrate --fake-initial
# Create user
python manage.py createsuperuser
# 1) Server
python manage.py runserver
# 2) Workers
python manage.py rqworker high default low

# Visualize DB
http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id
```

* Scheduled tasks
```
# Import tasks
python manage.py import --filename scheduled_tasks.json

# Modify using the admin panel, then save
# python manage.py export > scheduled_tasks.json
```

* Utils. TODO: To endpoint...
```
python manage.py rqstats
```