Working fetch search, refactoring DB towards source search
This commit is contained in:
@@ -3,7 +3,7 @@
|
||||
conda create -n matitos_urls python=3.12
|
||||
conda activate matitos_urls
|
||||
pip install django psycopg[binary] django-rq
|
||||
pip install feedparser python-dateutil newspaper4k lxml[html_clean]
|
||||
pip install feedparser python-dateutil newspaper4k lxml[html_clean] googlenewsdecoder gnews duckduckgo_search GoogleNews
|
||||
```
|
||||
|
||||
* From automated inspectdb
|
||||
@@ -11,38 +11,59 @@ pip install feedparser python-dateutil newspaper4k lxml[html_clean]
|
||||
# 1) Inspect DB, generate models.py
|
||||
python manage.py inspectdb
|
||||
|
||||
# 2) models.py, within class Urls, add:
|
||||
# 2) Modify models.py
|
||||
|
||||
# URLS:
|
||||
class Urls(models.Model):
|
||||
class STATUS_ENUM(models.TextChoices):
|
||||
RAW = "raw"
|
||||
ERROR = "error"
|
||||
VALID = "valid"
|
||||
UNKNOWN = "unknown"
|
||||
INVALID = "invalid"
|
||||
DUPLICATE = "duplicate"
|
||||
RAW = "raw", "Raw"
|
||||
ERROR = "error", "Error"
|
||||
VALID = "valid", "Valid"
|
||||
UNKNOWN = "unknown", "Unknown"
|
||||
INVALID = "invalid", "Invalid"
|
||||
DUPLICATE = "duplicate", "Duplicate"
|
||||
|
||||
# Update status
|
||||
status = models.TextField(choices=STATUS_ENUM, default=STATUS_ENUM.RAW) # This field type is a guess.
|
||||
url = models.TextField(unique=True)
|
||||
ts_fetch = models.DateTimeField(auto_now_add=True)
|
||||
status = models.TextField(choices=STATUS_ENUM.choices, default=STATUS_ENUM.RAW) # This field type is a guess.
|
||||
|
||||
# To class Meta, add default ordering
|
||||
class Meta:
|
||||
managed = False
|
||||
db_table = 'urls' # db_table = '{}_urls'.format(project_name)
|
||||
db_table = 'urls'
|
||||
ordering = ["-ts_fetch"]
|
||||
|
||||
# Fields default:
|
||||
ts_fetch = models.DateTimeField(auto_now_add=True)
|
||||
status = models.TextField(default='raw') # This field type is a guess.
|
||||
# SEARCH:
|
||||
class Search(models.Model):
|
||||
class TYPE_ENUM(models.TextChoices):
|
||||
RSS_FEED = "rss_feed", "RSS_Feed"
|
||||
KEYWORD_SEARCH = "keyword_search", "Keyword_Search"
|
||||
URL_HOST = "url_host", "URL_Host"
|
||||
|
||||
# URLContent:
|
||||
from django.contrib.postgres.fields import ArrayField
|
||||
id = models.SmallAutoField(primary_key=True)
|
||||
search = models.TextField(unique=True)
|
||||
type = models.TextField(choices=TYPE_ENUM.choices) # This field type is a guess.
|
||||
|
||||
# URL_CONTENT:
|
||||
class UrlContent(models.Model):
|
||||
id_url = models.OneToOneField('Urls', models.DO_NOTHING, db_column='id_url', primary_key=True)
|
||||
date_published = models.DateTimeField(blank=True, null=True)
|
||||
title = models.TextField(blank=True, null=True)
|
||||
description = models.TextField(blank=True, null=True)
|
||||
content = models.TextField(blank=True, null=True)
|
||||
valid_content = models.BooleanField(blank=True, null=True)
|
||||
language = models.CharField(max_length=2, blank=True, null=True)
|
||||
keywords = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
|
||||
tags = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
|
||||
authors = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
|
||||
image_main_url = models.TextField(blank=True, null=True)
|
||||
images_url = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
|
||||
videos_url = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
|
||||
url_host = models.TextField(blank=True, null=True)
|
||||
site_name = models.TextField(blank=True, null=True)
|
||||
|
||||
# TODO: Associate db_table name with a prefix on project_name
|
||||
class Meta:
|
||||
db_table = 'urls' # db_table = '{}_urls'.format(project_name)
|
||||
```
|
||||
|
||||
* Environment variables
|
||||
@@ -55,6 +76,9 @@ DB_PORT=${DB_NAME:-5432}
|
||||
|
||||
REDIS_HOST=${REDIS_HOST:-localhost}
|
||||
REDIS_PORT=${REDIS_PORT:-6379}
|
||||
|
||||
# Default RQ queue timeout
|
||||
RQ_DEFAULT_TIMEOUT=${RQ_DEFAULT_TIMEOUT:-900}
|
||||
```
|
||||
|
||||
* Django DB
|
||||
|
||||
Reference in New Issue
Block a user