Dockerization, WhiteNoise serving static files, refactor
0  app_urls/fetcher/__init__.py  Normal file
12  app_urls/fetcher/admin.py  Normal file
@@ -0,0 +1,12 @@
from django.contrib import admin

# Register your models here.
from .models import Search, Source, StatusPatternMatching, UrlContent, Urls, UrlsDuplicate, UrlsSourceSearch

admin.site.register(Search)
admin.site.register(Source)
admin.site.register(StatusPatternMatching)
admin.site.register(UrlContent)
admin.site.register(Urls)
admin.site.register(UrlsDuplicate)
admin.site.register(UrlsSourceSearch)
6  app_urls/fetcher/apps.py  Normal file
@@ -0,0 +1,6 @@
from django.apps import AppConfig


class FetcherConfig(AppConfig):
    default_auto_field = 'django.db.models.BigAutoField'
    name = 'fetcher'
109  app_urls/fetcher/migrations/0001_initial.py  Normal file
@@ -0,0 +1,109 @@
# Generated by Django 5.2 on 2025-04-02 16:44

import django.contrib.postgres.fields
import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):

    initial = True

    dependencies = [
    ]

    operations = [
        migrations.CreateModel(
            name='Search',
            fields=[
                ('id', models.SmallAutoField(primary_key=True, serialize=False)),
                ('search', models.TextField(unique=True)),
                ('type', models.TextField(choices=[('rss_feed', 'RSS_Feed'), ('keyword_search', 'Keyword_Search'), ('url_host', 'URL_Host')])),
            ],
            options={
                'db_table': 'search',
                'managed': False,
            },
        ),
        migrations.CreateModel(
            name='Source',
            fields=[
                ('id', models.SmallAutoField(primary_key=True, serialize=False)),
                ('source', models.TextField(unique=True)),
            ],
            options={
                'db_table': 'source',
                'managed': False,
            },
        ),
        migrations.CreateModel(
            name='StatusPatternMatching',
            fields=[
                ('pattern', models.TextField(primary_key=True, serialize=False)),
                ('priority', models.SmallIntegerField()),
                ('status', models.TextField()),
            ],
            options={
                'db_table': 'status_pattern_matching',
                'managed': False,
            },
        ),
        migrations.CreateModel(
            name='Urls',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('url', models.TextField(unique=True)),
                ('ts_fetch', models.DateTimeField(auto_now_add=True)),
                ('status', models.TextField(choices=[('raw', 'Raw'), ('error', 'Error'), ('valid', 'Valid'), ('unknown', 'Unknown'), ('invalid', 'Invalid'), ('duplicate', 'Duplicate')], default='raw')),
            ],
            options={
                'db_table': 'urls',
                'ordering': ['-ts_fetch'],
                'managed': False,
            },
        ),
        migrations.CreateModel(
            name='UrlContent',
            fields=[
                ('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='fetcher.urls')),
                ('date_published', models.DateTimeField(blank=True, null=True)),
                ('title', models.TextField(blank=True, null=True)),
                ('description', models.TextField(blank=True, null=True)),
                ('content', models.TextField(blank=True, null=True)),
                ('valid_content', models.BooleanField(blank=True, null=True)),
                ('language', models.CharField(blank=True, max_length=2, null=True)),
                ('keywords', django.contrib.postgres.fields.ArrayField(base_field=models.TextField(blank=True, null=True), size=None)),
                ('tags', django.contrib.postgres.fields.ArrayField(base_field=models.TextField(blank=True, null=True), size=None)),
                ('authors', django.contrib.postgres.fields.ArrayField(base_field=models.TextField(blank=True, null=True), size=None)),
                ('image_main_url', models.TextField(blank=True, null=True)),
                ('images_url', django.contrib.postgres.fields.ArrayField(base_field=models.TextField(blank=True, null=True), size=None)),
                ('videos_url', django.contrib.postgres.fields.ArrayField(base_field=models.TextField(blank=True, null=True), size=None)),
                ('url_host', models.TextField(blank=True, null=True)),
                ('site_name', models.TextField(blank=True, null=True)),
            ],
            options={
                'db_table': 'url_content',
                'managed': False,
            },
        ),
        migrations.CreateModel(
            name='UrlsDuplicate',
            fields=[
                ('id_url_canonical', models.OneToOneField(db_column='id_url_canonical', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='fetcher.urls')),
            ],
            options={
                'db_table': 'urls_duplicate',
                'managed': False,
            },
        ),
        migrations.CreateModel(
            name='UrlsSourceSearch',
            fields=[
                ('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='fetcher.urls')),
            ],
            options={
                'db_table': 'urls_source_search',
                'managed': False,
            },
        ),
    ]
0  app_urls/fetcher/migrations/__init__.py  Normal file
140  app_urls/fetcher/models.py  Normal file
@@ -0,0 +1,140 @@
from django.db import models
from django.contrib.postgres.fields import ArrayField


# Create your models here.
class Search(models.Model):
    class TYPE_ENUM(models.TextChoices):
        RSS_FEED = "rss_feed", "RSS_Feed"
        KEYWORD_SEARCH = "keyword_search", "Keyword_Search"
        URL_HOST = "url_host", "URL_Host"

    id = models.SmallAutoField(primary_key=True)
    search = models.TextField(unique=True)
    type = models.TextField(choices=TYPE_ENUM.choices)  # This field type is a guess.

    class Meta:
        managed = False
        db_table = 'search'

    def __str__(self):
        return "[{}: {}]".format(self.type, self.search)


class Source(models.Model):
    id = models.SmallAutoField(primary_key=True)
    source = models.TextField(unique=True)

    class Meta:
        managed = False
        db_table = 'source'

    def __str__(self):
        return "[{}]".format(self.source)


class StatusPatternMatching(models.Model):
    pattern = models.TextField(primary_key=True)
    priority = models.SmallIntegerField()
    status = models.TextField()  # This field type is a guess.

    class Meta:
        managed = False
        db_table = 'status_pattern_matching'

    def __str__(self):
        return "{} -> {} [Priority: {}]".format(self.pattern, self.status, self.priority)


class UrlContent(models.Model):
    id_url = models.OneToOneField('Urls', models.DO_NOTHING, db_column='id_url', primary_key=True)
    date_published = models.DateTimeField(blank=True, null=True)
    title = models.TextField(blank=True, null=True)
    description = models.TextField(blank=True, null=True)
    content = models.TextField(blank=True, null=True)
    valid_content = models.BooleanField(blank=True, null=True)
    language = models.CharField(max_length=2, blank=True, null=True)
    keywords = ArrayField(models.TextField(blank=True, null=True))  # This field type is a guess.
    tags = ArrayField(models.TextField(blank=True, null=True))  # This field type is a guess.
    authors = ArrayField(models.TextField(blank=True, null=True))  # This field type is a guess.
    image_main_url = models.TextField(blank=True, null=True)
    images_url = ArrayField(models.TextField(blank=True, null=True))  # This field type is a guess.
    videos_url = ArrayField(models.TextField(blank=True, null=True))  # This field type is a guess.
    url_host = models.TextField(blank=True, null=True)
    site_name = models.TextField(blank=True, null=True)

    class Meta:
        managed = False
        db_table = 'url_content'


class Urls(models.Model):
    class STATUS_ENUM(models.TextChoices):
        RAW = "raw", "Raw"
        ERROR = "error", "Error"
        VALID = "valid", "Valid"
        UNKNOWN = "unknown", "Unknown"
        INVALID = "invalid", "Invalid"
        DUPLICATE = "duplicate", "Duplicate"

    url = models.TextField(unique=True)
    ts_fetch = models.DateTimeField(auto_now_add=True)
    status = models.TextField(choices=STATUS_ENUM.choices, default=STATUS_ENUM.RAW)  # This field type is a guess.

    class Meta:
        managed = False
        db_table = 'urls'
        ordering = ["-ts_fetch"]

    def __str__(self):
        return "URL: {} Fetch:{} Status:{}".format(self.url, self.ts_fetch, self.status)


class UrlsDuplicate(models.Model):
    id_url_canonical = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url_canonical', primary_key=True)  # The composite primary key (id_url_canonical, id_url_duplicated) found, that is not supported. The first column is selected.
    id_url_duplicated = models.ForeignKey(Urls, models.DO_NOTHING, db_column='id_url_duplicated', related_name='urlsduplicate_id_url_duplicated_set')

    class Meta:
        managed = False
        db_table = 'urls_duplicate'
        unique_together = (('id_url_canonical', 'id_url_duplicated'),)

    def __str__(self):
        return "{} {}".format(self.id_url_duplicated, self.id_url_canonical)


class UrlsSourceSearch(models.Model):
    id_url = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url', primary_key=True)  # The composite primary key (id_url, id_source, id_search) found, that is not supported. The first column is selected.
    id_source = models.ForeignKey(Source, models.DO_NOTHING, db_column='id_source')
    id_search = models.ForeignKey(Search, models.DO_NOTHING, db_column='id_search')

    class Meta:
        managed = False
        db_table = 'urls_source_search'
        unique_together = (('id_url', 'id_source', 'id_search'),)

    def __str__(self):
        return "{} {} {}".format(self.id_source, self.id_search, self.id_url)


""" # TODO: Migrate to django 5.2
class UrlsDuplicate(models.Model):
    pk = models.CompositePrimaryKey('id_url_canonical', 'id_url_duplicated')
    id_url_canonical = models.ForeignKey(Urls, models.DO_NOTHING, db_column='id_url_canonical')
    id_url_duplicated = models.ForeignKey(Urls, models.DO_NOTHING, db_column='id_url_duplicated', related_name='urlsduplicate_id_url_duplicated_set')

    class Meta:
        managed = False
        db_table = 'urls_duplicate'
        unique_together = (('id_url_canonical', 'id_url_duplicated'),)

    def __str__(self):
        return "{} {}".format(self.id_url_duplicated, self.id_url_canonical)


class UrlsSourceSearch(models.Model):
    pk = models.CompositePrimaryKey('id_url', 'id_source', 'id_search')
    id_url = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url')
    id_source = models.ForeignKey(Source, models.DO_NOTHING, db_column='id_source')
    id_search = models.ForeignKey(Search, models.DO_NOTHING, db_column='id_search')

    class Meta:
        managed = False
        db_table = 'urls_source_search'
        unique_together = (('id_url', 'id_source', 'id_search'),)

    def __str__(self):
        return "{} {} {}".format(self.id_source, self.id_search, self.id_url)
"""
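Note: every model above sets managed = False, so Django treats the pre-existing Postgres tables as the source of truth and generates no DDL for them. A minimal query sketch (hypothetical, assuming the tables are already populated):

from fetcher.models import Urls

# Latest unprocessed URLs; Meta.ordering = ["-ts_fetch"] applies by default
latest_raw = Urls.objects.filter(status=Urls.STATUS_ENUM.RAW)[:10]
for obj_url in latest_raw:
    print(obj_url.url, obj_url.ts_fetch, obj_url.status)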
273  app_urls/fetcher/src/db_utils.py  Normal file
@@ -0,0 +1,273 @@
from ..models import Urls, UrlContent, UrlsSourceSearch, UrlsDuplicate, StatusPatternMatching, Source, Search
from django.db.models import Q
from django.core.cache import cache
from django.db import IntegrityError
from .url_processor import process_url, get_with_protocol
import re
import traceback
from .logger import get_logger
logger = get_logger()


class DB_Handler():
    def __init__(self):
        # Inserting raw URL, cache time: 1 day
        self._cache_timeout_insert_url = 86400
        # Processing error URL, cache time: 2 days
        self._cache_timeout_error_url = 86400*2

    def insert_raw_urls(self, urls, obj_source, obj_search):
        try:
            logger.debug("Inserting raw URLs")
            # Empty?
            if (len(urls) == 0):
                logger.debug("Empty batch of URLs (not writing to DB) for source-search: {} - {}".format(obj_source.source, obj_search.search))
                return
            # Default protocol https://
            urls_clean = [get_with_protocol(url) for url in urls]

            urls_to_insert = []
            # Per URL
            for url in urls_clean:

                ### Already processed URL?
                if (cache.get("insert_{}".format(url)) is not None):
                    logger.debug("Already cached URL: {}".format(url))

                    if (cache.get("insert_{}{}{}".format(url, obj_source.source, obj_search.search)) is not None):
                        logger.debug("Already cached (URL, source, search): {} {} {}".format(url, obj_source.source, obj_search.search))
                    else:
                        ### Insert (URL_id, source_id, search_id), since not cached
                        # Get URL ID (should already be created)
                        obj_url, created = Urls.objects.get_or_create(url=url)
                        # Create (id_source, id_url) (shouldn't exist)
                        UrlsSourceSearch.objects.get_or_create(id_url=obj_url, id_source=obj_source, id_search=obj_search)
                else:
                    # Add object to insert
                    # url_object_to_insert.append(Urls(url=url))
                    urls_to_insert.append(url)

            ### Insert URLs & (URL_id, source_id)
            try:
                ### Bulk insert, fails if duplicated URL (IDs are not returned when using ignore_conflicts=True)
                # URLs (ignore_conflicts=False to return IDs)
                bulk_created_urls = Urls.objects.bulk_create([Urls(url=url) for url in urls_to_insert], ignore_conflicts=False)
                # (URL_id, source_id)
                UrlsSourceSearch.objects.bulk_create([UrlsSourceSearch(id_url=obj_url, id_source=obj_source, id_search=obj_search) for obj_url in bulk_created_urls], ignore_conflicts=True)
            except IntegrityError as e:
                ### Fall back to one-by-one insert
                logger.debug("bulk_create exception while inserting raw URLs (fails if duplicated URL), falling back to non-bulk method")
                # One by one
                for url in urls_to_insert:
                    # URL
                    obj_url, created = Urls.objects.get_or_create(url=url)
                    if (created):
                        logger.debug("Inserted: {}".format(obj_url.url))
                    else:
                        logger.debug("Not inserted: {}".format(obj_url.url))
                    # (URL, source, search)
                    UrlsSourceSearch.objects.get_or_create(id_url=obj_url, id_source=obj_source, id_search=obj_search)
            except Exception as e:
                logger.warning("bulk_create unknown exception while inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
                # Avoid caching due to error on insertion
                urls_clean = []

            # Insert or update cache
            for url in urls_clean:
                cache.set("insert_{}".format(url), True, timeout=self._cache_timeout_insert_url)
                cache.set("insert_{}{}{}".format(url, obj_source.source, obj_search.search), True, timeout=self._cache_timeout_insert_url)

            logger.info("Inserted #{} raw URLs, Source-Search {} - {}".format(len(urls_to_insert), obj_source.source, obj_search.search))

        except Exception as e:
            logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))

    def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error):

        def set_status(obj_url, status):
            # Update status if setting a new value
            if (obj_url.status != status):
                obj_url.status = status
                obj_url.save()

        ##### Filter URL? -> Invalid
        if (status_pattern_match == "invalid"):
            logger.debug("Domain filter applied to input URL: {}".format(obj_url.url))
            # Update status
            set_status(obj_url, Urls.STATUS_ENUM.INVALID)
            # Next URL
            return

        ##### Process URL
        try:
            # Get data
            dict_url_data = process_url(obj_url.url)
        except Exception as e:
            if (raise_exception_on_error):
                # Simply raise the exception; the caller handles it in a different way
                raise Exception("Error processing URL, raising exception as expected")
            else:
                logger.debug("Error processing URL: {}\n{}\n{}".format(obj_url.url, str(e), traceback.format_exc()))
                # Set status to error
                dict_url_data = None

        # (dict_url_data is None) or (exception while processing URL)? -> Error status
        if (dict_url_data is None):
            # Update status
            set_status(obj_url, Urls.STATUS_ENUM.ERROR)
            # Next URL
            return

        # Invalid? e.g. binary data
        if (dict_url_data.get("override_status") == "invalid"):
            # Update status
            set_status(obj_url, Urls.STATUS_ENUM.INVALID)
            # Next URL
            return

        ##### Canonical URL different? -> Duplicate
        if (dict_url_data.get("url_canonical") is not None) and (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
            # Update status
            set_status(obj_url, Urls.STATUS_ENUM.DUPLICATE)

            # Get or create URL with canonical form
            obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
            # Get the source-search IDs associated to obj_url.id
            list_url_source_search = UrlsSourceSearch.objects.filter(id_url=obj_url)
            for obj_url_source_search in list_url_source_search:
                # Associate the same sources to url_canonical (it might already exist)
                UrlsSourceSearch.objects.get_or_create(id_url=obj_url_canonical, id_source=obj_url_source_search.id_source, id_search=obj_url_source_search.id_search)

            # URLs duplicate association
            UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url)

            # TODO: return obj_url_canonical so as to directly process the recently inserted URL
            # Wherever this function is called, add:
            # self._process_single_url(obj_url_canonical, status_pattern_match, raise_exception_on_error)

            # Next URL
            return

        ##### Valid URL
        # Update status
        set_status(obj_url, Urls.STATUS_ENUM.VALID)

        # Create or update extracted URL data
        UrlContent.objects.update_or_create(
            id_url=obj_url,
            defaults = {
                "date_published" : dict_url_data.get("publish_date"),
                "title" : dict_url_data.get("title"),
                "description" : dict_url_data.get("description"),
                "content" : dict_url_data.get("content"),
                "valid_content" : dict_url_data.get("valid_content"),
                "language" : dict_url_data.get("language"),
                "keywords" : dict_url_data.get("keywords"),
                "tags" : dict_url_data.get("tags"),
                "authors" : dict_url_data.get("authors"),
                "image_main_url" : dict_url_data.get("image_main_url"),
                "images_url" : dict_url_data.get("images_url"),
                "videos_url" : dict_url_data.get("videos_url"),
                "url_host" : dict_url_data.get("url_host"),
                "site_name" : dict_url_data.get("site_name"),
            }
        )

    def process_raw_urls(self, batch_size):

        def _get_status_pattern_matching(url, list_pattern_status_tuple):
            """ Be careful: regex patterns should only set the "valid", "invalid", or "unknown" status.
            """
            # Sort pattern tuples by priority. (pattern, priority, status)
            for regex_pattern, regex_priority, status_if_match in sorted(list_pattern_status_tuple, key=lambda tup: tup[1], reverse=True):
                # Regular expression pattern matching: https://regexr.com/
                if bool(re.match(regex_pattern, url)):
                    logger.debug("Regex pattern found, status '{}' for URL: {}".format(status_if_match, url))
                    return status_if_match
            return None

        try:
            logger.debug("Processing raw URLs")

            # Get batch of URLs, status='raw'
            raw_urls = Urls.objects.order_by("-ts_fetch").filter(status=Urls.STATUS_ENUM.RAW)[:batch_size]

            if (len(raw_urls) == 0):
                logger.debug("No raw URLs to process")
                return

            # Get list of (pattern, priority, status) tuples to override status if required
            list_pattern_status_tuple = list(StatusPatternMatching.objects.values_list("pattern", "priority", "status"))

            # Per URL
            for obj_url in raw_urls:
                # Override status if pattern matching?
                status_pattern_match = _get_status_pattern_matching(obj_url.url, list_pattern_status_tuple)
                # Process URL
                self._process_single_url(obj_url, status_pattern_match, raise_exception_on_error=False)

            logger.info("Updated #{} raw URLs".format(len(raw_urls)))
        except Exception as e:
            logger.warning("Exception processing raw URLs: {}\n{}".format(e, traceback.format_exc()))

    def process_error_urls(self, batch_size):
        try:
            logger.debug("Processing error URLs")

            # Keep track of processed and skipped "error" URLs
            num_urls_skipped, num_urls_processed = 0, 0
            # Get batch of URLs, status='error'
            error_urls = Urls.objects.order_by("-ts_fetch").filter(status=Urls.STATUS_ENUM.ERROR)[num_urls_skipped:batch_size+num_urls_skipped]

            while ((len(error_urls) > 0) and (num_urls_processed < batch_size)):
                # Per URL
                for obj_url in error_urls:
                    # URL ID cached? -> Tried to process recently already, skip
                    if (cache.get("error_{}".format(obj_url.id)) is not None):
                        logger.debug("Already cached URL ID: {}".format(obj_url.id))
                        num_urls_skipped += 1
                        continue

                    try:
                        # Process URL
                        self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True)
                        num_urls_processed += 1
                    except Exception as e:
                        # Error, cache to avoid re-processing for some time
                        cache.set("error_{}".format(obj_url.id), True, timeout=self._cache_timeout_error_url)
                        num_urls_skipped += 1

                # Get following batch of URLs, status='error'
                error_urls = Urls.objects.order_by("-ts_fetch").filter(status=Urls.STATUS_ENUM.ERROR)[num_urls_skipped:batch_size+num_urls_skipped]

            logger.info("Updated #{}, skipped #{} error URLs".format(num_urls_processed, num_urls_skipped))
        except Exception as e:
            logger.warning("Exception processing error URLs: {}\n{}".format(e, traceback.format_exc()))

    def process_missing_kids_urls(self, batch_size=None):
        try:
            logger.debug("Processing MissingKids URLs - batch_size={}".format(batch_size))
            # Get batch of URLs, %missingkids.org/poster% AND (status='valid' OR status='invalid' OR status='error')
            missingkids_urls = Urls.objects.order_by("-ts_fetch").filter(
                (Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster"))
                &
                (Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.ERROR))
            )

            # Get batch size
            if (batch_size is not None):
                missingkids_urls = missingkids_urls[:batch_size]

            # Per URL
            for obj_url in missingkids_urls:
                try:
                    # Process URL. If no exception -> Valid
                    self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True)
                except Exception as e:
                    # Raised exception -> Invalid (404 error)
                    obj_url.status = Urls.STATUS_ENUM.INVALID
                    obj_url.save()

            logger.info("Verified status of #{} missingkids.org/poster URLs".format(len(missingkids_urls)))
        except Exception as e:
            logger.warning("Exception processing MissingKids URLs: {}\n{}".format(e, traceback.format_exc()))
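A minimal sketch of how the fetchers drive DB_Handler, mirroring fetch_feed.py below (the example.com values are placeholders; insert_raw_urls() normalizes protocol-less URLs via get_with_protocol()):

from fetcher.models import Search, Source
from fetcher.src.db_utils import DB_Handler

obj_source, _ = Source.objects.get_or_create(source="feeds")
obj_search, _ = Search.objects.get_or_create(search="https://example.com/rss", type=Search.TYPE_ENUM.RSS_FEED)
# "example.com/article-2" is stored as "https://example.com/article-2"
DB_Handler().insert_raw_urls(["https://example.com/article-1", "example.com/article-2"], obj_source, obj_search)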
51  app_urls/fetcher/src/fetch_feed.py  Normal file
@@ -0,0 +1,51 @@
from .db_utils import DB_Handler
from ..models import Search, Source
import feedparser
import dateutil.parser
import traceback
from .logger import get_logger
logger = get_logger()


class FetchFeeds():
    def __init__(self) -> None:
        logger.debug("Initializing Fetcher Feeds")

    def run(self):
        try:
            logger.debug("Starting FetchFeeds.run()")

            # Get source object
            obj_source, created = Source.objects.get_or_create(source="feeds")

            # Get feeds objects
            list_obj_search_feeds = Search.objects.filter(type=Search.TYPE_ENUM.RSS_FEED)
            logger.debug("Fetching from feeds: {}".format([e.search for e in list_obj_search_feeds]))

            # Process via RSS feeds
            for obj_search in list_obj_search_feeds:
                # Initialize
                urls_fetched, urls_publish_date = [], []
                # Fetch feeds
                feeds = feedparser.parse(obj_search.search)
                # Parse
                for f in feeds.get("entries", []):
                    # Get URL
                    url = f.get("link", None)
                    # Process?
                    if (url is not None):
                        # Available publish date?
                        publish_date_parsed = f.get("published_parsed")
                        if (publish_date_parsed is None):
                            publish_date = f.get("published", None)
                            if (publish_date is not None):
                                publish_date_parsed = dateutil.parser.parse(publish_date)

                        # Published date
                        urls_publish_date.append(publish_date_parsed)
                        # URL
                        urls_fetched.append(url)

                # Write to DB
                DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
        except Exception as e:
            logger.warning("Exception in FetchFeeds.run(): {}\n{}".format(e, traceback.format_exc()))
42  app_urls/fetcher/src/fetch_missing_kids.py  Normal file
@@ -0,0 +1,42 @@
from .db_utils import DB_Handler
from ..models import Search, Source
import os
import requests
import json
import traceback
from .logger import get_logger
logger = get_logger()


class FetchMissingKids():
    def __init__(self) -> None:
        logger.debug("Initializing Fetcher MissingKids")

    def run(self, number_pages=-1):
        try:
            logger.debug("Starting MissingKids.run(), processing #{} pages".format(number_pages))

            # Get source object
            obj_source, created = Source.objects.get_or_create(source="missingkids.org")
            # Get search object
            obj_search, created = Search.objects.get_or_create(search="missingkids.org/poster", type=Search.TYPE_ENUM.URL_HOST)

            try:
                # MissingKids fetching endpoint, parameterized by the number of pages to fetch
                missingkids_fetch_endpoint = os.path.join(os.getenv("SELENIUM_ENDPOINT", "http://localhost:80"), "get_missing_kids/?pages={}".format(number_pages))
                # Timeout
                if (number_pages > 15) or (number_pages == -1):
                    timeout = 60*90  # 1.5 h
                else:
                    timeout = 60*10  # 10 min
                # Request
                r = requests.get(missingkids_fetch_endpoint, timeout=timeout)
                # Decode
                urls_fetched = json.loads(r.text).get("list_urls", [])
            except Exception as e:
                logger.warning("Exception on request: {}. {}".format(missingkids_fetch_endpoint, str(e)))
                urls_fetched = []

            # Write to DB
            DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
        except Exception as e:
            logger.warning("Exception in MissingKids.run(): {}\n{}".format(e, traceback.format_exc()))
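For reference, the response shape run() expects from the external Selenium service, inferred from the parsing code above (the endpoint itself is not part of this commit):

# GET {SELENIUM_ENDPOINT}/get_missing_kids/?pages=5
# -> {"list_urls": ["https://www.missingkids.org/poster/...", "..."]}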
46  app_urls/fetcher/src/fetch_parser.py  Normal file
@@ -0,0 +1,46 @@
from .db_utils import DB_Handler
from ..models import Search, Source
from .url_processor import get_with_protocol, url_host_slowdown
import newspaper
import traceback
from .logger import get_logger
logger = get_logger()


class FetchParser():
    def __init__(self) -> None:
        logger.debug("Initializing Fetcher Parser")

    def run(self):
        try:
            logger.debug("Starting FetchParser.run()")

            # Get source object
            obj_source, created = Source.objects.get_or_create(source="newspaper4k")
            # Get URL hosts
            list_url_host = Search.objects.filter(type=Search.TYPE_ENUM.URL_HOST)
            logger.debug("Fetching news by parsing URL hosts: {}".format([e.search for e in list_url_host]))

            # Process newspaper4k build method
            for obj_search in list_url_host:
                # Protocol
                url_host_protocol = get_with_protocol(obj_search.search)
                logger.debug("Fetching newspaper4k parsing based on URL: {}".format(url_host_protocol))

                # Make sure no requests were made to this host for the last X seconds
                url_host_slowdown(url_host_protocol, url_host_slowdown_seconds=5)
                try:
                    # Source object
                    url_host_built = newspaper.build(url_host_protocol)
                    # Get articles URL list
                    urls_fetched = url_host_built.article_urls()
                except newspaper.exceptions.ArticleException as e:
                    logger.debug("ArticleException while parsing input URL {}\n{}".format(url_host_protocol, str(e.args)))
                    urls_fetched = []
                except Exception as e:
                    logger.warning("Exception while parsing input URL {}\n{}".format(url_host_protocol, str(e)))
                    urls_fetched = []

                # Write to DB
                DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
        except Exception as e:
            logger.warning("Exception in FetchParser.run(): {}\n{}".format(e, traceback.format_exc()))
57  app_urls/fetcher/src/fetch_search.py  Normal file
@@ -0,0 +1,57 @@
from .db_utils import DB_Handler
from ..models import Search
from django.db.models import Q
import traceback
import time
import os
from .fetch_search_instances import ListSearchInstances
from .logger import get_logger
logger = get_logger()


class FetchSearcher():
    def __init__(self) -> None:
        logger.debug("Initializing Fetcher Searcher")

    def run(self):
        try:
            logger.debug("Starting FetchSearcher.run()")

            # Get search objects of interest
            list_search_obj = Search.objects.filter(Q(type=Search.TYPE_ENUM.URL_HOST) | Q(type=Search.TYPE_ENUM.KEYWORD_SEARCH))
            logger.debug("Fetching from search: {}".format(["{} ({})".format(e.search, e.type) for e in list_search_obj]))

            # Search
            for obj_search in list_search_obj:
                # TODO: language & country customization

                # Search
                keyword_search = "{}{}".format("site:" if obj_search.type == Search.TYPE_ENUM.URL_HOST else "", obj_search.search)

                if (obj_search.type == Search.TYPE_ENUM.KEYWORD_SEARCH):
                    # Add search with intitle keyword
                    # TODO: allintitle: "child abuse"
                    # TODO: intitle: "child abuse"
                    pass
                # language, country = obj_search.language_country.split("-")

                logger.debug("Starting keyword search: {}".format(keyword_search))
                logger.debug("Search type: {}".format(obj_search.type))

                # DB writer
                db_writer = DB_Handler()

                # Keyword arguments
                args = {
                    "language": "en",
                    "country": "US",
                    # "period": ["7d", "1d"],  # TODO: List of periods to iterate
                }

                for SearchInstance in ListSearchInstances:
                    # Sleep between requests to avoid rate limiting
                    time.sleep(int(os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", 5)))
                    SearchInstance(args).fetch_articles(db_writer, obj_search)

            # TODO: https://github.com/tasos-py/Search-Engines-Scraper/tree/master
        except Exception as e:
            logger.warning("Exception in FetchSearcher.run(): {}\n{}".format(e, traceback.format_exc()))
308  app_urls/fetcher/src/fetch_search_instances.py  Normal file
@@ -0,0 +1,308 @@
import time
import feedparser
import os
from django.utils import timezone
from datetime import timedelta
from ..models import Search, Source
from .fetch_utils import decode_gnews_urls
from .logger import get_logger
logger = get_logger()

from gnews import GNews
from duckduckgo_search import DDGS
from GoogleNews import GoogleNews
from search_engines import Yahoo, Aol

###########################################################################
###########################################################################
from abc import ABC, abstractmethod

# Generic fetcher (fetches articles, writes to DB)
class FetcherAbstract(ABC):
    @abstractmethod
    def _fetch_raw_urls(self, keyword_search):
        pass

    @abstractmethod
    def _get_name(self):
        pass

    def _get_source_object(self, source):
        # TODO: Cache
        # self.cached_sources = {}
        # Get source object
        obj_source, created = Source.objects.get_or_create(source=source)
        return obj_source

    def _post_process_urls(self, raw_urls, obj_search):
        # URL-host-based search? Make sure results belong to that site
        if (obj_search.type == Search.TYPE_ENUM.URL_HOST):
            # Get clean URL host
            url_host_clean = obj_search.search.replace("www.", "").replace("http://", "").replace("https://", "")
            # Ensure URL host in URL
            raw_urls = [u for u in raw_urls if url_host_clean in u]

        return raw_urls

    def fetch_articles(self, db_writer, obj_search):
        # Source name
        source_name = self._get_name()

        # Search
        keyword_search = obj_search.search
        # URL Host search? -> site:${URL_HOST}
        if (obj_search.type == Search.TYPE_ENUM.URL_HOST):
            keyword_search = "{}{}".format("site:", keyword_search)
        # Keyword search & using a general search engine? -> ${SEARCH} news after:${LAST_WEEK}
        if ("general" in source_name) and (obj_search.type == Search.TYPE_ENUM.KEYWORD_SEARCH):
            start_date = timezone.now() - timedelta(days=7)
            keyword_search = "{} {}".format(keyword_search, "news after:{}-{}-{}".format(start_date.month, start_date.day, start_date.year))

        logger.debug("Starting search: {} - {}".format(keyword_search, source_name))
        # Fetch
        raw_urls = self._fetch_raw_urls(keyword_search)
        # Post-process
        raw_urls = self._post_process_urls(raw_urls, obj_search)

        # Write to DB
        db_writer.insert_raw_urls(raw_urls, self._get_source_object(source_name), obj_search)


###########################################################################

class SearchGNews(FetcherAbstract):
    def __init__(self, args={}):
        super().__init__()
        # Parameters
        self.language = args.get("language", "en")
        self.country = args.get("country", "US")
        self.period = args.get("period", "7d")
        self.max_results = args.get("max_results", 100)

    def _get_name(self):
        # [source] [period] [language-country] [max_results]
        return "gnews {} {}-{} results={}".format(self.period, self.language, self.country, self.max_results).replace("results=None", "").strip()

    def _fetch_raw_urls(self, keyword_search):
        try:
            # Get news
            results_gnews = GNews(language=self.language, country=self.country, period=self.period, max_results=self.max_results).get_news(keyword_search)
            # Get list of encoded URLs
            encoded_urls = [e.get("url") for e in results_gnews]
            # Decode
            urls = decode_gnews_urls(encoded_urls)
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []
        return urls


class SearchDuckDuckGoGeneral(FetcherAbstract):
    def __init__(self, args={}):
        super().__init__()
        # Parameters
        self.language = args.get("language", "wt")
        self.country = args.get("country", "wt")
        self.max_results = args.get("max_results", 20)
        self.region = "{}-{}".format(self.language, self.country).lower()
        self.period = None

    def _get_name(self):
        # [source] [language-country] [max_results]
        return "ddg-general {} results={}".format(self.region, self.max_results).replace("results=None", "").strip()

    def _fetch_raw_urls(self, keyword_search):
        try:
            news = DDGS().text(keyword_search, region=self.region, timelimit=self.period, max_results=self.max_results)
            urls = [e.get("href") for e in news]
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []
        return urls


class SearchDuckDuckGoNews(FetcherAbstract):
    def __init__(self, args={}):
        super().__init__()
        # Parameters
        self.language = args.get("language", "wt")
        self.country = args.get("country", "wt")
        self.max_results = args.get("max_results", 100)
        self.region = "{}-{}".format(self.language, self.country).lower()
        self.period = None

    def _get_name(self):
        # [source] [language-country] [max_results]
        return "ddg-news {} results={}".format(self.region, self.max_results).replace("results=None", "").strip()

    def _fetch_raw_urls(self, keyword_search):
        try:
            news = DDGS().news(keyword_search, region=self.region, timelimit=self.period, max_results=self.max_results)
            urls = [e.get("url") for e in news]
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []
        return urls


class SearchGoogleNews(FetcherAbstract):
    def __init__(self, args={}):
        super().__init__()
        # Parameters
        self.language = args.get("language", "en")
        self.country = args.get("country", "US")
        self.period = args.get("period", "7d")

    def _get_name(self):
        # [source] [period] [language-country]
        return "googlenews {} {}-{}".format(self.period, self.language, self.country)

    def _fetch_raw_urls(self, keyword_search):
        try:
            # Initialize
            googlenews = GoogleNews(period=self.period, lang=self.language, region=self.country)
            googlenews.enableException(True)
            # Search
            googlenews.get_news(keyword_search)
            # Fetch
            encoded_urls = googlenews.get_links()
            # Decode
            urls = decode_gnews_urls(encoded_urls)
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []
        return urls


class SearchGoogleGeneral(FetcherAbstract):
    def __init__(self, args={}):
        super().__init__()
        # Parameters
        self.language = args.get("language", "en")
        self.country = args.get("country", "US")
        self.period = args.get("period", "7d")
        self.pages = args.get("pages", 1)

    def _get_name(self):
        # [source] [period] [language-country] [pages]
        return "google-general {} {}-{} pages={}".format(self.period, self.language, self.country, self.pages).replace("pages=None", "").strip()

    def _fetch_raw_urls(self, keyword_search):
        try:
            # Initialize
            googlenews = GoogleNews(period=self.period, lang=self.language, region=self.country)
            googlenews.enableException(True)
            # Search
            googlenews.search(keyword_search)

            set_links = set()
            # Iterate pages
            for i in range(self.pages):
                # Sleep between page fetches
                time.sleep(int(os.getenv("FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP", 4)))
                # Number of URLs fetched so far
                num_before = len(set_links)
                # Get page
                try:
                    links = googlenews.page_at(i+1)
                except Exception as e:
                    logger.warning("Exception fetching page - {}: {}".format(self._get_name(), str(e)))
                    break
                # Links carry tracking residue, e.g.:
                # 'link': 'https://uk.news.yahoo.com/leaving-neverland-2-michael-jackson-lawyer-channel-4-102017088.html&ved=2ahUKEwjl38eJm5aMAxVvqJUCHXgnGzwQxfQBegQICRAC&usg=AOvVaw1osa6b3o_xXfcNinMDpLoK'
                for l in links:
                    set_links.add(l.get("link").split("&ved=")[0])
                # Finished?
                if (num_before == len(set_links)):
                    break
            # To list
            urls = list(set_links)
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []
        return urls


class SearchGoogleNewsRSS(FetcherAbstract):
    def __init__(self, args={}):
        super().__init__()
        # Parameters
        self.language = args.get("language", "en")
        self.country = args.get("country", "US")

    def _get_name(self):
        # [source] [language-country]
        return "googlenews-rss {}-{}".format(self.language, self.country).strip()

    def _fetch_raw_urls(self, keyword_search):
        try:
            # Search URL with parameters filled: https://news.google.com/rss/search?q={}&hl=en-US&gl=US&ceid=US:en
            search_url = "https://news.google.com/rss/search?q={}&hl={}&gl={}&ceid={}:{}".format(keyword_search, "{}-{}".format(self.language, self.country.upper()), self.country.upper(), self.country.upper(), self.language)
            # Control characters
            search_url = search_url.replace(" ", "+")  # urllib.parse.quote(search_url) # Issue: https%3A//news.google.com/rss/search%3Fq%3Dbreitbart.com%26hl%3Den-US%26gl%3DUS%26ceid%3DUS%3Aen
            # Initialize
            encoded_urls = []
            # Fetch feeds
            feeds = feedparser.parse(search_url)
            # Parse
            for f in feeds.get("entries", []):
                # Encoded URL
                encoded_url = f.get("link", None)
                '''
                # Available publish date?
                publish_date_parsed = f.get("published_parsed")
                if (publish_date_parsed is None):
                    publish_date = f.get("published", None)
                    if (publish_date is not None):
                        publish_date_parsed = dateutil.parser.parse(publish_date)

                # Published date
                urls_publish_date.append(publish_date_parsed)
                '''
                # Append
                encoded_urls.append(encoded_url)

            # Decode
            urls = decode_gnews_urls(encoded_urls)

        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []

        return urls


class SearchYahooGeneral(FetcherAbstract):
    def __init__(self, args={}):
        super().__init__()
        # Parameters
        self.pages = args.get("pages", 2)

    def _get_name(self):
        # [source] [pages]
        return "yahoo-general pages={}".format(self.pages).replace("pages=None", "").strip()

    def _fetch_raw_urls(self, keyword_search):
        try:
            results = Yahoo().search(keyword_search, pages=self.pages)
            urls = results.links()
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []
        return urls


class SearchAOLGeneral(FetcherAbstract):
    def __init__(self, args={}):
        super().__init__()
        # Parameters
        self.pages = args.get("pages", 2)

    def _get_name(self):
        # [source] [pages]
        return "aol-general pages={}".format(self.pages).replace("pages=None", "").strip()

    def _fetch_raw_urls(self, keyword_search):
        try:
            results = Aol().search(keyword_search, pages=self.pages)
            urls = results.links()
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []
        return urls

###########################################################################

# List of instances
ListSearchInstances = [SearchGNews, SearchDuckDuckGoNews, SearchGoogleNews, SearchAOLGeneral, SearchYahooGeneral, SearchDuckDuckGoGeneral, SearchGoogleGeneral, SearchGoogleNewsRSS]
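Adding another engine only requires implementing the two abstract hooks and registering the class; a hypothetical sketch (note that putting "general" in the name opts the engine into the "news after:" date suffix that fetch_articles() appends to keyword searches):

class SearchExampleEngine(FetcherAbstract):
    def __init__(self, args={}):
        super().__init__()
        self.max_results = args.get("max_results", 50)

    def _get_name(self):
        # [source] [max_results]
        return "example-general results={}".format(self.max_results)

    def _fetch_raw_urls(self, keyword_search):
        # Query the engine's client here and return a plain list of URLs;
        # fetch_articles() handles site: prefixes, post-filtering and the DB write.
        return []

# ListSearchInstances.append(SearchExampleEngine)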
35  app_urls/fetcher/src/fetch_utils.py  Normal file
@@ -0,0 +1,35 @@
import os
from django.core.cache import cache
from .logger import get_logger
logger = get_logger()
from googlenewsdecoder import gnewsdecoder


def decode_gnews_urls(encoded_urls, interval=int(os.getenv("FETCHER_GNEWS_DECODE_SLEEP", 2))):
    logger.debug("Decoding gnews URLs")
    # Decode URLs
    list_decoded_urls = []
    for url in encoded_urls:
        # Already cached?
        decoded_url = cache.get("gnews_decode_{}".format(url))

        if (decoded_url is not None):
            logger.debug("Already cached decoded URL: {} -> {}".format(url, decoded_url))
            # Append decoded URL
            list_decoded_urls.append(decoded_url)
        else:
            try:
                # Decode URL, with an interval time to avoid blocks
                decoded_url_dict = gnewsdecoder(url, interval=interval)
                # Ok?
                if decoded_url_dict.get("status"):
                    # Append decoded URL
                    decoded_url = decoded_url_dict["decoded_url"]
                    list_decoded_urls.append(decoded_url)
                    # Cache decoded URL
                    cache.set("gnews_decode_{}".format(url), decoded_url, timeout=60*60*12)
                else:
                    logger.info("Bad status while decoding news.google.com URL {}\n{}".format(url, decoded_url_dict.get("message")))
            except Exception as e:
                logger.warning("Error decoding news.google.com URL: {}".format(url))
    return list_decoded_urls
33  app_urls/fetcher/src/logger.py  Normal file
@@ -0,0 +1,33 @@
import logging
import logging.handlers
import os

# Get env var
logs_directory = os.getenv("PATH_LOGS_DIRECTORY", "logs")

# Directory of logs
os.makedirs(logs_directory, exist_ok=True)

logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("fetcher")
logger.setLevel(logging.DEBUG)

# To file log: DEBUG and above
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "debug.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)

# To file log: INFO and above
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "info.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.INFO)
logger.addHandler(fh)

# To file log: WARNING and above
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "warning.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.WARNING)
logger.addHandler(fh)


def get_logger():
    return logger
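The rest of the package shares this logger via the pattern already used throughout src/ (sketch):

from .logger import get_logger
logger = get_logger()

logger.debug("written to debug.log only")
logger.info("written to debug.log and info.log")
logger.warning("written to all three files")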
127  app_urls/fetcher/src/url_processor.py  Normal file
@@ -0,0 +1,127 @@
from django.core.cache import cache
from .logger import get_logger
logger = get_logger()
import newspaper
import time
import os
from urllib.parse import unquote
import langdetect
langdetect.DetectorFactory.seed = 0


def get_with_protocol(url):
    # http:// -> https://
    url = url.replace("http://", "https://")
    # "" -> https://
    if not (url.startswith("https://")):
        url = "https://" + url
    return url


def get_url_host(url):
    # URL without protocol, first substring before '/'
    url_host = url.replace("https://", "").replace("http://", "").split("/")[0]
    return url_host


def url_host_slowdown(url, url_host_slowdown_seconds):
    ### Avoid (frequent) too many requests to the same URL host
    # Get URL host
    url_host = get_url_host(url)
    # Recently processed URL host? -> Slow down required
    last_cached_timestamp = cache.get("process_{}".format(url_host).encode("utf-8"), None)
    if last_cached_timestamp:
        # Get time since last processed URL host (in seconds)
        time_since_last_processed = time.time() - last_cached_timestamp
        # Amount of time required to sleep?
        slowdown_required = max(0, url_host_slowdown_seconds - time_since_last_processed)
        logger.debug("Slow down (sleeping {:.2f}) for URL host {}".format(slowdown_required, url_host))
        # Sleep
        time.sleep(slowdown_required)
    # About to process URL host, cache time
    cache.set("process_{}".format(url_host).encode("utf-8"), time.time(), timeout=60*5)  # Expire after 5 minutes


def process_url(url):
    try:
        # Slow down if required, to avoid too-many-requests errors
        url_host_slowdown(url, url_host_slowdown_seconds=int(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
        # Process
        article = newspaper.article(url)
    except newspaper.ArticleBinaryDataException:
        logger.warning("ArticleBinaryDataException for input URL {}".format(url))
        return {"override_status": "invalid"}
    except newspaper.ArticleException as e:

        # Too many requests? Cool down...
        if ("Status code 429" in str(e.args)):
            # TODO: cool down and retry once?, proxy/VPN, ...
            logger.debug("TODO: process_url Implement code 429")
        # Unavailable for legal reasons
        if ("Status code 451" in str(e.args)):
            # TODO: Bypass with VPN
            logger.debug("TODO: process_url Implement code 451")
        # Cloudflare protection?
        if ("Website protected with Cloudflare" in str(e.args)):
            logger.debug("TODO: process_url Implement bypass Cloudflare")
        # PerimeterX protection?
        if ("Website protected with PerimeterX" in str(e.args)):
            logger.debug("TODO: process_url Implement bypass PerimeterX")

        logger.debug("ArticleException for input URL {}\n{}".format(url, str(e.args)))
        return None
    except Exception as e:
        logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
        return None

    try:
        content_merged = "\n".join([article.title, article.meta_description, article.text])
        if (len(content_merged) > int(os.getenv("FETCHER_LANGUAGE_DETECTION_MIN_CHAR", 100))):
            language = langdetect.detect(content_merged)
        else:
            language = None
    except Exception as e:
        logger.info("Could not detect language: {}\n{}".format(url, str(e)))
        language = None

    dict_data = {
        "url": url,
        "url_canonical": article.canonical_link,
        "url_host": article.source_url,
        "site_name": article.meta_site_name,
        "publish_date": article.publish_date,
        "language": language,  # article.meta_lang -> Not always reliable
        "title": article.title,
        "description": article.meta_description,
        "content": article.text,
        "valid_content": article.is_valid_body(),
        "keywords": [k for k in set(article.keywords + article.meta_keywords) if k != ""],
        "tags": article.tags,
        "authors": article.authors,
        "image_main_url": article.top_image,  # article.meta_img
        "images_url": article.images,
        "videos_url": article.movies,
    }

    '''
    # TODO: If exists, add tags article.meta_data.get("classification-tags", "").split(",")
    if (dict_data["tags"] is None):
        dict_data["tags"] = []
    for k in article.meta_data.keys():
        if ("tags" in k):
            dict_data["tags"] += article.meta_data[k].split(",")
    '''

    # Sanity check
    for k in dict_data.keys():
        if (type(dict_data[k]) is list):
            # Remove empty strings, unquote special characters, e.g. "%20" -> " "
            dict_data[k] = [unquote(e) for e in dict_data[k] if e != ""]
            # NULL instead of empty list
            if (len(dict_data[k]) == 0):
                dict_data[k] = None
        elif (type(dict_data[k]) is str):
            # Unquote special characters
            if (dict_data[k] is not None):
                dict_data[k] = unquote(dict_data[k])
            # NULL instead of empty string
            if (dict_data[k] == ""):
                dict_data[k] = None

    return dict_data
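The normalization helpers above behave as follows (illustrative assertions):

assert get_with_protocol("example.com/a") == "https://example.com/a"
assert get_with_protocol("http://example.com/a") == "https://example.com/a"
assert get_url_host("https://example.com/a/b") == "example.com"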
114  app_urls/fetcher/tasks.py  Normal file
@@ -0,0 +1,114 @@
from scheduler import job

from .src.fetch_feed import FetchFeeds
from .src.fetch_parser import FetchParser
from .src.fetch_search import FetchSearcher
from .src.fetch_missing_kids import FetchMissingKids
from .src.db_utils import DB_Handler

from .src.logger import get_logger
logger = get_logger()

@job('default')
def fetch_feeds():
    task = "Fetch Feeds"
    logger.info("Task triggered: {}".format(task))
    FetchFeeds().run()
    logger.info("Task completed: {}".format(task))

@job('default')
def fetch_parser():
    task = "Fetch Parser"
    logger.info("Task triggered: {}".format(task))
    FetchParser().run()
    logger.info("Task completed: {}".format(task))

@job('default')
def fetch_search():
    task = "Fetch Search"
    logger.info("Task triggered: {}".format(task))
    FetchSearcher().run()
    logger.info("Task completed: {}".format(task))

@job('default')
def fetch_missing_kids(number_pages=5):
    task = "Fetch MissingKids"
    logger.info("Task triggered: {}".format(task))
    FetchMissingKids().run(number_pages)
    logger.info("Task completed: {}".format(task))

@job('default')
def fetch_missing_kids_all(number_pages=-1):
    task = "Fetch MissingKids ALL"
    logger.info("Task triggered: {}".format(task))
    FetchMissingKids().run(number_pages)
    logger.info("Task completed: {}".format(task))

@job('default')
def process_raw_urls(batch_size=50):
    task = "Process raw URLs"
    logger.info("Task triggered: {}".format(task))
    DB_Handler().process_raw_urls(batch_size=batch_size)
    logger.info("Task completed: {}".format(task))

@job('default')
def process_error_urls(batch_size=50):
    task = "Process error URLs"
    logger.info("Task triggered: {}".format(task))
    DB_Handler().process_error_urls(batch_size=batch_size)
    logger.info("Task completed: {}".format(task))

@job('default')
def process_missing_kids_urls(batch_size=50):
    task = "Process Missing Kids URLs"
    logger.info("Task triggered: {}".format(task))
    DB_Handler().process_missing_kids_urls(batch_size=batch_size)
    logger.info("Task completed: {}".format(task))

@job('default')
def process_missing_kids_urls_all(batch_size=None):
    task = "Process Missing Kids URLs ALL"
    logger.info("Task triggered: {}".format(task))
    DB_Handler().process_missing_kids_urls(batch_size=batch_size)
    logger.info("Task completed: {}".format(task))


@job('default')
def background_task(process_type: str):
    logger.info("Task triggered: {}".format(process_type))

    try:
        if (process_type == "fetch_feeds"):
            FetchFeeds().run()
        elif (process_type == "fetch_parser"):
            FetchParser().run()
        elif (process_type == "fetch_search"):
            FetchSearcher().run()
        elif (process_type == "fetch_missingkids_all"):
            FetchMissingKids().run(number_pages=-1)
        elif ("fetch_missingkids" in process_type):
            # number_pages encoded in URL
            try:
                number_pages = int(process_type.split("_")[-1])
            except Exception:
                number_pages = -1
            FetchMissingKids().run(number_pages=number_pages)
        elif ("process_" in process_type):
            # Batch size encoded in URL
            try:
                batch_size = int(process_type.split("_")[-1])
            except Exception:
                batch_size = None
            # Task type
            if ("process_raw_urls" in process_type):
                DB_Handler().process_raw_urls(batch_size=batch_size)
            elif ("process_error_urls" in process_type):
                DB_Handler().process_error_urls(batch_size=batch_size)
            elif ("process_missing_kids_urls" in process_type):
                DB_Handler().process_missing_kids_urls(batch_size=batch_size)
        else:
            logger.info("Task unknown!: {}".format(process_type))

        logger.info("Task completed: {}".format(process_type))
    except Exception as e:
        logger.error(e)
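These jobs are consumed by a worker bound to the 'default' queue; elsewhere in this commit (trigger_task in views.py, below) they are enqueued through the decorator's delay() helper. A minimal sketch of enqueueing them by hand, assuming a running worker and that the scheduler decorator exposes delay() as used in views.py:

    from fetcher.tasks import fetch_feeds, background_task

    fetch_feeds.delay()                            # enqueue the RSS fetch in the background
    background_task.delay("process_raw_urls_50")   # batch size is parsed from the suffix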
179
app_urls/fetcher/templates/charts.html
Normal file
@@ -0,0 +1,179 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Charts</title>
    <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
    <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
    <style>
        body {
            background-color: #333;
            color: #fff;
            font-family: Arial, sans-serif;
        }

        h2 {
            color: #fff;
            text-align: center;
            margin-bottom: 40px;
        }

        .chart-container {
            width: 45%;
            display: inline-block;
            margin: 20px;
            background-color: #444;
            border-radius: 10px;
            padding: 5px;
        }

        canvas {
            background-color: #2c2c2c;
            border-radius: 5px;
        }

        .container {
            display: flex;
            justify-content: center;
            flex-wrap: wrap;
        }

        .filter-container {
            text-align: center;
            margin-bottom: 20px;
        }

        select {
            padding: 8px;
            background-color: #555;
            color: white;
            border: 1px solid #444;
            border-radius: 5px;
        }
    </style>
</head>
<body>
    <h2>Data Visualizations</h2>

    <!-- Filter for Number of Days -->
    <div class="filter-container">
        <label for="daysFilter">Select Number of Days:</label>
        <select id="daysFilter">
            <option value="0.0625">Last 90 Minutes</option>
            <option value="0.25">Last 6 Hours</option>
            <option value="1">Last 24 Hours</option>
            <option value="7" selected>Last 7 Days</option>
            <option value="30">Last 30 Days</option>
            <option value="90">Last 90 Days</option>
            <option value="365">Last 365 Days</option>
        </select>
    </div>

    <div class="container">
        <div class="chart-container">
            <canvas id="urlFetchDateChart"></canvas>
        </div>

        <div class="chart-container">
            <canvas id="urlStatusChart"></canvas>
        </div>

        <div class="chart-container">
            <canvas id="urlsPerSourceChart"></canvas>
        </div>

        <div class="chart-container">
            <canvas id="urlsPerSearchChart"></canvas>
        </div>
    </div>

    <script>
        $(document).ready(function () {
            let chartInstances = {}; // Store chart instances

            // Fetch initial data (default 7 days)
            const defaultDays = 7;
            fetchDataAndRenderCharts(defaultDays);

            // Apply the filter automatically when the user changes the selection
            $('#daysFilter').on('change', function () {
                const selectedDays = $(this).val();
                fetchDataAndRenderCharts(selectedDays);
            });

            function fetchDataAndRenderCharts(days) {
                fetchAndRenderChart(`/urls-by-fetch-date/?days=${days}`, 'urlFetchDateChart', 'URLs by Fetch Date', 'bar');
                fetchAndRenderChart(`/urls-per-status/?days=${days}`, 'urlStatusChart', 'URLs by Status', 'bar');
                fetchAndRenderChart(`/urls-per-source/?days=${days}`, 'urlsPerSourceChart', 'URLs by Source', 'bar');
                fetchAndRenderChart(`/urls-per-search/?days=${days}`, 'urlsPerSearchChart', 'URLs by Search', 'bar');
            }

            const categoryColors = {
                'URLs by Fetch Date': '#4BC0C0', // Color for this category
                'URLs by Status': '#36A2EB',     // Color for this category
                'URLs by Source': '#4BC0C0',     // Color for this category
                'URLs by Search': '#36A2EB'      // Color for this category
            };
            const maxLabelLength = 35; // Limit X-axis labels to 35 characters

            function fetchAndRenderChart(url, canvasId, chartTitle, chartType) {
                $.getJSON(url, function (data) {
                    if (chartInstances[canvasId]) {
                        chartInstances[canvasId].destroy(); // Destroy previous chart
                    }

                    const ctx = document.getElementById(canvasId).getContext('2d');
                    chartInstances[canvasId] = new Chart(ctx, {
                        type: chartType,
                        data: {
                            labels: data.labels, // Ensure labels are passed as strings
                            datasets: [{
                                label: chartTitle,
                                data: data.values,
                                backgroundColor: categoryColors[chartTitle], // Assign the same color based on category
                            }]
                        },
                        options: {
                            responsive: true,
                            plugins: {
                                legend: {
                                    labels: { color: '#fff' }
                                }
                            },
                            scales: {
                                x: {
                                    ticks: {
                                        color: "#fff", // Set the color of x-axis ticks
                                        callback: function (value) {
                                            let label = data.labels[value];
                                            if (label.length > maxLabelLength) { return label.slice(0, maxLabelLength) + '...'; }
                                            return label;
                                        }
                                    },
                                    grid: {
                                        color: "#444" // Set the grid lines color
                                    }
                                },
                                y: {
                                    ticks: {
                                        color: "#fff" // Set the color of y-axis ticks
                                    },
                                    grid: {
                                        color: "#444" // Set the grid lines color
                                    }
                                }
                            }
                        }
                    });
                });
            }
        });
    </script>
</body>
</html>
580
app_urls/fetcher/templates/filtered_urls.html
Normal file
@@ -0,0 +1,580 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>URLs</title>

    <!--
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
    -->
    <style>
        /* General Styling */
        body {
            font-family: Arial, sans-serif;
            margin: 0;
            padding: 0;
            background-color: #fff;
            color: #333;
            /*transition: background 0.3s ease, color 0.3s ease;*/
        }

        /* Dark Mode Styles */
        .dark-mode {
            background-color: #121212;
            color: #e0e0e0;
        }

        /* Default Link Style */
        a {
            color: #0066cc; /* Default color for links */
            text-decoration: none;
        }
        a:hover {
            text-decoration: underline;
        }
        /* Dark Mode Links */
        .dark-mode a {
            color: #52a8ff; /* Adjust this color to make the link more visible in dark mode */
        }
        .dark-mode a:hover {
            color: #66ccff; /* Change the hover color to something lighter or a contrasting color */
        }

        /* Layout */
        .container {
            display: flex;
        }

        /* Sidebar */
        .sidebar {
            min-width: 110px; /* Minimum width */
            max-width: 200px; /* Maximum width */
            width: 100%; /* Make it take full width within the defined min and max */
            padding: 5px;
            box-sizing: border-box; /* Ensure padding doesn't increase the overall width */
            transition: width 0.3s ease-in-out; /* Smooth transition for resizing */
            background-color: #f4f4f4;
            word-wrap: break-word; /* Allow wrapping of long words */
            overflow-wrap: break-word; /* Ensures wrapping across browsers */
            white-space: normal; /* Ensure normal word wrapping */
        }

        .dark-mode .sidebar {
            background-color: #1e1e1e;
        }

        /* Sidebar Headers */
        .sidebar h3 {
            margin-top: 15px;
            margin-bottom: 2px;
            font-size: 16px;
        }

        /* Table Container */
        .table-container {
            flex-grow: 1;
        }

        /* Table */
        table {
            width: 97.5%;
            border-collapse: collapse;
            margin-top: 20px;
        }

        table, th, td {
            border: 1px solid #ddd;
        }

        th, td {
            padding: 10px;
            text-align: left;
        }

        /* Dark Mode Table */
        .dark-mode table {
            border-color: #444;
        }

        .dark-mode th, .dark-mode td {
            border-color: #555;
        }

        /* Dark Mode Checkbox Labels */
        .dark-mode label {
            color: #e0e0e0;
        }

        /* Checkbox Styling */
        input[type="checkbox"] {
            cursor: pointer;
        }

        /* Themed Toggle Button */
        .theme-button, .home-button, .chart-button {
            background-color: var(--sidebar);
            border: 1px solid var(--sidebar);
            border-radius: 50%;
            width: 30px;
            height: 45px;
            font-size: 25px;
            display: flex;
            align-items: center;
            justify-content: center;
            transition: background-color 0.1s, color 0.1s, transform 0.1s;
            cursor: pointer;
        }

        .theme-button:hover, .home-button:hover, .chart-button:hover {
            transform: rotate(20deg);
        }
        .theme-button:active, .home-button:active, .chart-button:active {
            transform: scale(0.95);
        }

        .button-container {
            display: flex;
            align-items: center;
            gap: 10px; /* Space between buttons */
        }

        /* PAGINATION */
        .pagination-container {
            display: flex;
            justify-content: center;
            align-items: center;
            gap: 10px;
            font-family: Arial, sans-serif;
        }
        .pagination-link {
            padding: 8px 15px;
            background-color: #007bff;
            color: white;
            text-decoration: none;
            border-radius: 25px;
            font-size: 14px;
            display: inline-block;
            transition: background-color 0.3s ease, transform 0.2s ease;
        }
        .pagination-link:hover {
            background-color: #0056b3;
            transform: scale(1.1);
        }
        .pagination-link:active {
            background-color: #003366;
            transform: scale(0.95);
        }
        .first-page, .last-page {
            font-weight: bold;
        }
        .prev-page, .next-page {
            font-weight: normal;
        }

        /* ROUNDED SWITCH */
        /* Hide the default checkbox */
        .checkbox-slider {
            display: none;
        }
        /* Container for the toggle switch */
        .slider-container {
            display: inline-block;
            width: 60px;
            height: 30px;
            position: relative;
        }
        /* Label for the slider */
        .slider-container label {
            display: block;
            position: absolute;
            top: 0;
            left: 0;
            width: 100%;
            height: 100%;
            background-color: #ccc;
            border-radius: 30px;
            cursor: pointer;
            transition: background-color 0.3s ease;
        }
        /* The toggle circle */
        .slider-container label::before {
            content: '';
            position: absolute;
            top: 3px;
            left: 3px;
            width: 24px;
            height: 24px;
            background-color: white;
            border-radius: 50%;
            transition: transform 0.3s ease;
        }
        /* When the checkbox is checked */
        .checkbox-slider:checked + .slider-container label {
            background-color: #0940b8;
        }
        /* When the checkbox is checked, move the circle */
        .checkbox-slider:checked + .slider-container label::before {
            transform: translateX(30px);
        }

    </style>
</head>
<body>
{% load custom_filters %}

<div class="container">
    <div class="sidebar">
        <div class="button-container">
            <button id="homeButton" class="home-button">🏠</button>
            <button id="themeToggle" class="theme-button">🌙</button>
            <button id="chartButton" class="chart-button">📊</button>
        </div>

        <form method="GET" action="" id="filterForm">
            <!-- Switch: Table / Charts
            <form>
                <label>
                    <input type="radio" name="view" value="table" checked id="tableRadio"> Table
                </label>
                <label>
                    <input type="radio" name="view" value="chart" id="chartRadio"> Charts
                </label>
            </form>
            -->

            <!-- Rounded switch
            <input type="checkbox" id="toggle" class="checkbox-slider">
            <div class="slider-container">
                <label for="toggle"></label>
                <span class="slider-text">
                    <span id="onText" class="on-text">ON</span>
                    <span id="offText" class="off-text">OFF</span>
                </span>
            </div>
            -->

            <!-- URLs Per Page Dropdown -->
            <h3>URLs Per Page</h3>
            <select id="perPageSelect" name="per_page">
                <option value="25" {% if per_page|stringformat:"s" == '25' %}selected{% endif %}>25</option>
                <option value="100" {% if per_page|stringformat:"s" == '100' %}selected{% endif %}>100</option>
                <option value="500" {% if per_page|stringformat:"s" == '500' %}selected{% endif %}>500</option>
            </select>
            <br>

            <!-- Filter by Time Range -->
            <h3>Fetch Date</h3>
            <select id="timeFilterSelect" name="days">
                <!--
                {% for form_days in form_days_list %}
                <option value=form_days.1|stringformat:"s" {% if selected_days|stringformat:"s" == form_days.1|stringformat:"s" %}selected{% endif %}>form_days.2</option>
                {% endfor %}
                -->
                <option value="0.25" {% if selected_days|stringformat:"s" == '0.25' %}selected{% endif %}>Last 6 hours</option>
                <option value="1" {% if selected_days|stringformat:"s" == '1' %}selected{% endif %}>Last 24 hours</option>
                <option value="7" {% if selected_days|stringformat:"s" == '7' %}selected{% endif %}>Last 7 days</option>
                <option value="30" {% if selected_days|stringformat:"s" == '30' %}selected{% endif %}>Last 30 days</option>
                <option value="90" {% if selected_days|stringformat:"s" == '90' %}selected{% endif %}>Last 90 days</option>
                <option value="365" {% if selected_days|stringformat:"s" == '365' %}selected{% endif %}>Last 365 days</option>
            </select>
            <br>

            <!-- Filter by Status -->
            <h3>Status</h3>
            <button type="button" class="toggle-all-btn" data-toggle="status">Toggle All</button><br>
            {% for status in statuses %}
            <label>
                <input type="checkbox" name="status" value="{{ status.0 }}"
                    {% if status.0 in selected_status or 'all' in selected_status %}checked{% endif %}>
                {{ status.1 }}
            </label><br>
            {% endfor %}

            <!-- Filter by valid content -->
            <h3>Valid content</h3>
            <button type="button" class="toggle-all-btn" data-toggle="valid_content">Toggle All</button><br>
            {% for vc in valid_contents %}
            <label>
                <input type="checkbox" name="valid_content" value="{{ vc }}"
                    {% if vc|stringformat:"s" in selected_valid_contents or 'all' in selected_valid_contents %}checked{% endif %}>
                {{ vc|truncatechars:50 }}
            </label><br>
            {% endfor %}

            <!-- Filter by Search -->
            <h3>Search</h3>
            <button type="button" class="toggle-all-btn" data-toggle="search">Toggle All</button><br>
            {% for search in searches %}
            <label>
                <input type="checkbox" name="search" value="{{ search.id }}"
                    {% if search.id|stringformat:"s" in selected_search or 'all' in selected_search %}checked{% endif %}>
                [{{ search.type }}] {{ search.search|truncatechars:50 }}
            </label><br>
            {% endfor %}

            <!-- Filter by Source -->
            <h3>Source</h3>
            <button type="button" class="toggle-all-btn" data-toggle="source">Toggle All</button><br>
            {% for source in sources %}
            <label>
                <input type="checkbox" name="source" value="{{ source.id }}"
                    {% if source.id|stringformat:"s" in selected_source or 'all' in selected_source %}checked{% endif %}>
                {{ source.source|truncatechars:50 }}
            </label><br>
            {% endfor %}

            <!-- Filter by language -->
            <h3>Language</h3>
            <button type="button" class="toggle-all-btn" data-toggle="language">Toggle All</button><br>
            {% for lang in languages %}
            <label>
                <input type="checkbox" name="language" value="{{ lang }}"
                    {% if lang|stringformat:"s" in selected_language or 'all' in selected_language %}checked{% endif %}>
                {{ lang|truncatechars:50 }}
            </label><br>
            {% endfor %}

        </form>
    </div>

    <!-- Table URLs data -->
    <div class="table-container">
        <table>
            <thead>
                <tr>
                    <th>ID</th>
                    <th>URL</th>
                    <th>Status</th>
                    <th>Fetch Date</th>
                    <th>Source</th>
                    <th>Search</th>
                    <th>Valid content?</th>
                    <th>Language</th>
                </tr>
            </thead>
            <tbody>
                {% for url in urls %}
                <tr>
                    <td><a href="./{{ url.id }}" class="btn btn-primary btn-sm" target="_blank">{{ url.id }}</a></td>
                    <td><a href="{{ url.url }}/" target="_blank">{{ url.url }}</a></td>
                    <td>
                        {% if url.status == 'raw' %}
                        <span class="badge bg-secondary">{{ url.status|capfirst }}</span>
                        {% elif url.status == 'error' %}
                        <span class="badge bg-danger">{{ url.status|capfirst }}</span>
                        {% elif url.status == 'valid' %}
                        <span class="badge bg-success">{{ url.status|capfirst }}</span>
                        {% elif url.status == 'unknown' %}
                        <span class="badge bg-warning">{{ url.status|capfirst }}</span>
                        {% elif url.status == 'invalid' %}
                        <span class="badge bg-danger">{{ url.status|capfirst }}</span>
                        {% elif url.status == 'duplicate' %}
                        <span class="badge bg-info">{{ url.status|capfirst }}</span>
                        {% else %}
                        <span class="badge bg-light">Unknown</span>
                        {% endif %}
                    </td>
                    <td>
                        <span class="ts-fetch" data-ts="{{ url.ts_fetch|date:'c' }}"></span>
                    </td>
                    <td>
                        {% with sources_map|dict_get:url.id as sources %}
                        {% if sources %}
                        {% for source in sources %}
                        <span class="badge bg-secondary">{{ source }}</span>
                        {% endfor %}
                        {% else %}
                        <span class="text-muted">No sources</span>
                        {% endif %}
                        {% endwith %}
                    </td>
                    <td>
                        {% with searches_map|dict_get:url.id as searches %}
                        {% if searches %}
                        {% for search in searches %}
                        <span class="badge bg-secondary">{{ search }}</span>
                        {% endfor %}
                        {% else %}
                        <span class="text-muted">No searches</span>
                        {% endif %}
                        {% endwith %}
                    </td>
                    <td>
                        {% with url_content_map|dict_get:url.id as content %}
                        {{ content.valid_content }}
                        {% endwith %}
                    </td>
                    <td>
                        {% with url_content_map|dict_get:url.id as content %}
                        {{ content.language }}
                        {% endwith %}
                    </td>
                </tr>

                {% empty %}
                <tr>
                    <td colspan="8">No URLs found for the selected filters.</td>
                </tr>
                {% endfor %}
            </tbody>
        </table>

        <!-- Pagination Controls -->
        <div class="pagination">
        <!-- <div class="pagination-controls"> -->
            <div class="pagination-container" style="margin-top: 20px; margin-bottom: 20px;">
                {% if urls.has_previous %}
                <a href="#" class="pagination-link" data-page="1">« First</a>
                <a href="#" class="pagination-link" data-page="{{ urls.previous_page_number }}">Previous</a>
                {% endif %}

                <span>Page {{ urls.number }} of {{ urls.paginator.num_pages }}</span>

                {% if urls.has_next %}
                <a href="#" class="pagination-link" data-page="{{ urls.next_page_number }}">Next</a>
                <a href="#" class="pagination-link" data-page="{{ urls.paginator.num_pages }}">Last »</a>
                {% endif %}
            </div>
        </div>

    </div>

</div>

<script>

    //////////////////////////////////////////////////////////////////////
    document.addEventListener("DOMContentLoaded", function () {
        //////////////////////////////////////////////
        // Theme & Home
        const themeToggle = document.getElementById("themeToggle");
        const body = document.body;
        // Load theme from localStorage
        if (localStorage.getItem("theme") === "dark") {
            body.classList.add("dark-mode");
            themeToggle.textContent = "🌞";
        }
        // Toggle theme on button click
        themeToggle.addEventListener("click", function () {
            if (body.classList.contains("dark-mode")) {
                body.classList.remove("dark-mode");
                localStorage.setItem("theme", "light");
                themeToggle.textContent = "🌙";
            } else {
                body.classList.add("dark-mode");
                localStorage.setItem("theme", "dark");
                themeToggle.textContent = "🌞";
            }
        });
        // Home
        document.getElementById("homeButton").addEventListener("click", function () {
            window.location.href = "./"; // Change this to your homepage URL if different
        });
        // Charts
        document.getElementById("chartButton").addEventListener("click", function () {
            window.location.href = "./charts"; // Change this to your charts URL if different
        });

        //////////////////////////////////////////////
        // Timestamp to local timezone
        document.querySelectorAll(".ts-fetch").forEach(element => {
            let utcDate = element.getAttribute("data-ts"); // Get timestamp from data attribute
            let options = { year: 'numeric', month: 'numeric', day: 'numeric', hour: '2-digit', minute: '2-digit', second: '2-digit', hour12: false };
            if (utcDate) {
                let localDate = new Date(utcDate).toLocaleString("en-GB", options); // Convert to local timezone
                element.textContent = localDate; // Update the text content
            }
        });
        //////////////////////////////////////////////
    });

    //////////////////////////////////////////////////////////////////////
    // Function to update pagination links
    function updatePaginationLinks(pageNumber) {
        // Get the current URL and set its page parameter
        const currentUrl = new URL(window.location.href);
        currentUrl.searchParams.set('page', pageNumber); // Update page parameter
        window.location.href = currentUrl.toString(); // Redirect to the updated URL
    }
    // Attach event listeners to pagination links
    document.querySelectorAll('.pagination-link').forEach(link => {
        link.addEventListener('click', function(e) {
            e.preventDefault();
            const pageNumber = this.getAttribute('data-page');
            updatePaginationLinks(pageNumber); // Update the page number in the URL
        });
    });

    // Function to update the form parameters for all sections before submitting
    function updateFormParameters() {
        // Get all distinct sections by selecting all checkboxes and extracting their "name" attributes
        const sections = new Set([...document.querySelectorAll("input[type='checkbox']")].map(cb => cb.name));

        sections.forEach(section => {
            if (!section) return; // Skip any checkboxes without a name

            const checkboxes = document.querySelectorAll(`[name='${section}']`);
            const allChecked = Array.from(checkboxes).every(checkbox => checkbox.checked);

            // If all checkboxes in a section are checked, remove them and add a hidden input
            if (allChecked) {
                checkboxes.forEach(checkbox => checkbox.removeAttribute("name"));
                let hiddenInput = document.createElement("input");
                hiddenInput.type = "hidden";
                hiddenInput.name = section;
                hiddenInput.value = "all";
                document.getElementById("filterForm").appendChild(hiddenInput);
            } else {
                checkboxes.forEach(checkbox => checkbox.setAttribute("name", section));
                document.querySelectorAll(`input[name="${section}"][type="hidden"]`).forEach(hiddenInput => hiddenInput.remove());
            }
        });

        // Submit the form after updating all sections
        document.getElementById("filterForm").submit();
    }

    //////////////////////////////////////////////////////////////////////
    // Function to toggle all checkboxes in a section
    function toggleCheckboxes(section) {
        const checkboxes = document.querySelectorAll(`[name='${section}']`);
        const allChecked = Array.from(checkboxes).every(checkbox => checkbox.checked);
        checkboxes.forEach(cb => cb.checked = !allChecked);
        updateFormParameters();
    }

    // Attach event listeners to "Toggle All" buttons
    document.querySelectorAll('.toggle-all-btn').forEach(button => {
        button.addEventListener('click', function() {
            const section = this.getAttribute('data-toggle');
            toggleCheckboxes(section);
        });
    });

    //////////////////////////////////////////////////////////////////////
    // Automatically submit the form when any checkbox changes
    document.querySelectorAll('input[type="checkbox"]').forEach(function(checkbox) {
        checkbox.addEventListener('change', function() {
            updateFormParameters();
        });
    });
    document.getElementById('perPageSelect').addEventListener('change', function() {
        updateFormParameters();
    });
    document.getElementById('timeFilterSelect').addEventListener('change', function() {
        updateFormParameters();
    });

</script>

</body>
</html>
297
app_urls/fetcher/templates/url_detail.html
Normal file
@@ -0,0 +1,297 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{% block title %}News{% endblock %}</title>

    <!-- Bootstrap CSS -->
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
    <!-- Add jQuery from CDN (before other scripts) -->
    <script src="https://code.jquery.com/jquery-3.6.4.min.js"></script>
    <!-- Markdown -->
    <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
    <!-- Bootstrap JS -->
    <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>

    <!-- Custom Styles -->
    <style>
        body {
            background-color: #f4f4f4;
        }
        .navbar-dark .navbar-nav .nav-link {
            color: rgba(255,255,255,0.75);
        }
        .chat-box {
            background-color: #fff;
            border: 1px solid #ddd;
            padding: 15px;
            border-radius: 8px;
            overflow-y: auto; /* Enable vertical scrolling */
            max-width: 100%;
            min-height: 150px;
            max-height: 450px;
            white-space: normal;
            word-wrap: break-word;
            word-break: break-word;
        }

        .table {
            table-layout: auto;
            width: 100%;
        }
        th {
            white-space: nowrap;
        }
        td {
            word-wrap: break-word;
            overflow-wrap: break-word;
        }

        /* Sidebar */
        .sidebar {
            min-width: 110px; /* Minimum width */
            max-width: 200px; /* Maximum width */
            width: 100%; /* Make it take full width within the defined min and max */
            padding: 5px;
            box-sizing: border-box; /* Ensure padding doesn't increase the overall width */
            transition: width 0.3s ease-in-out; /* Smooth transition for resizing */
            background-color: #f4f4f4;
            word-wrap: break-word; /* Allow wrapping of long words */
            overflow-wrap: break-word; /* Ensures wrapping across browsers */
            white-space: normal; /* Ensure normal word wrapping */
        }

        .dark-mode .sidebar {
            background-color: #1e1e1e;
        }

    </style>

    <script>
    //////////////////////////////////////////////////////////////////////
    document.addEventListener("DOMContentLoaded", function () {
        //////////////////////////////////////////////
        // Timestamp to local timezone
        document.querySelectorAll(".ts-fetch").forEach(element => {
            let utcDate = element.getAttribute("data-ts"); // Get timestamp from data attribute
            let options = { year: 'numeric', month: 'numeric', day: 'numeric', hour: '2-digit', minute: '2-digit', second: '2-digit', hour12: false };
            if (utcDate) {
                let localDate = new Date(utcDate).toLocaleString("en-GB", options); // Convert to local timezone
                element.textContent = localDate; // Update the text content
            }
        });
        document.querySelectorAll(".ts-publish").forEach(element => {
            let utcDate = element.getAttribute("data-ts"); // Get timestamp from data attribute
            let options = { year: 'numeric', month: 'numeric', day: 'numeric', hour: '2-digit', minute: '2-digit', second: '2-digit', hour12: false };
            if (utcDate) {
                let localDate = new Date(utcDate).toLocaleString("en-GB", options); // Convert to local timezone
                element.textContent = localDate; // Update the text content
            }
        });
    });

    function fetchDetails(urlId, url) {
        // Show the loading spinner
        document.getElementById("loading-spinner").style.display = "block";

        // Get the input value
        let inputText = document.getElementById(`custom-input-${urlId}`).value;
        // Get the input model
        let selectedModel = document.getElementById(`options-${urlId}`).value;
        // Check if a model is selected
        if (!selectedModel) {
            alert("Please select a model before fetching details.");
            return;
        }

        // Fetch URL
        let fetchUrl = `/urls/${urlId}/fetch/?url=${encodeURIComponent(url)}&model=${encodeURIComponent(selectedModel)}&text=${encodeURIComponent(inputText)}`;

        let resultContainer = $("#chat-output");
        resultContainer.html(""); // Clear previous content before fetching
        let fetchButton = $("button[onclick^='fetchDetails']"); // Select the button
        fetchButton.prop("disabled", true); // Disable button

        fetch(fetchUrl/*, {
            method: "POST",
            body: JSON.stringify({
                text: inputText
            }),
            headers: {
                "Content-type": "application/json; charset=UTF-8"
            }
        }*/).then(response => {
            if (!response.ok) {
                throw new Error("Error on network response");
            }
            const reader = response.body.getReader();
            const decoder = new TextDecoder();

            let accumulatedText = ""; // Store streamed text before rendering Markdown
            let messageContainer = $('<div class="chat-message"></div>'); // Create a temporary container for streaming response
            resultContainer.append(messageContainer);

            function read() {
                return reader.read().then(({ done, value }) => {
                    if (done) {
                        messageContainer.html(marked.parse(accumulatedText));
                        fetchButton.prop("disabled", false); // Re-enable button when done
                        return;
                    }
                    // Decode the streamed chunk
                    let chunk = decoder.decode(value);
                    // Append to the accumulated text
                    accumulatedText += chunk;
                    // Render Markdown progressively (but safely)
                    messageContainer.html(marked.parse(accumulatedText));
                    // Auto-scroll to bottom
                    resultContainer.scrollTop(resultContainer[0].scrollHeight);
                    return read();
                });
            }
            return read();
        })
        .catch(error => {
            resultContainer.html(`<p class="text-danger">Error fetching details: ${error.message}</p>`);
            fetchButton.prop("disabled", false); // Re-enable button on error
        })
        .finally(() => {
            // Hide the loading spinner after request is complete
            document.getElementById("loading-spinner").style.display = "none";
        });
    }
    </script>
</head>
<body>

<!--
<div class="sidebar">
    <div class="button-container">
        <button id="homeButton" class="home-button">🏠</button>
        <button id="themeToggle" class="theme-button">🌙</button>
    </div>
</div>
-->

<!-- Main Content -->
<div class="container mt-4">
    <!-- <h2>URL Details</h2> -->
    <table class="table table-bordered">
        <tr>
            <th>URL</th>
            <td><a href="{{ url_item.url|safe }}" target="_blank">{{ url_item.url }}</a></td>
        </tr>
        <tr>
            <th>Fetch Date</th>
            <td> <span class="ts-fetch" data-ts="{{ url_item.ts_fetch|date:'c' }}"></span> </td>
        </tr>
        <tr>
            <th>Source</th>
            <td>{{ sources|join:", " }}</td>
        </tr>
        <tr>
            <th>Search</th>
            <td>{{ searches|join:", " }}</td>
        </tr>
        <tr>
            <th>Status</th>
            <td>{{ url_item.status }}</td>
        </tr>
        <tr>
            <th>URL host</th>
            <td> <a href="{{ url_content.url_host|safe }}" target="_blank">{{ url_content.url_host }}</a> </td>
        </tr>
        <tr>
            <th>Site name</th>
            <td>{{ url_content.site_name|default:"" }}</td>
        </tr>
        <tr>
            <th>Published Date</th>
            <td> <span class="ts-publish" data-ts="{{ url_content.date_published|date:'c' }}"></span> </td>
        </tr>
        <tr>
            <th>Valid news content?</th>
            <td>{{ url_content.valid_content }}</td>
        </tr>
        <tr>
            <th>Tags</th>
            <td>{{ url_content.tags|default:"" }}</td>
        </tr>
        <tr>
            <th>Authors</th>
            <td>{{ url_content.authors|default:"" }}</td>
        </tr>
        <tr>
            <th>Keywords</th>
            <td>{{ url_content.keywords|default:"" }}</td>
        </tr>
        <tr>
            <th>Language</th>
            <td>{{ url_content.language|default:"" }}</td>
        </tr>
        <tr>
            <th>Main image</th>
            <td><a href="{{ url_content.image_main_url|safe }}" target="_blank">{{ url_content.image_main_url|default:"" }}</a></td>
        </tr>
        <tr>
            <th>Image URLs</th>
            <td>{{ url_content.image_urls|default:"" }}</td>
        </tr>
        <tr>
            <th>Video URLs</th>
            <td>{{ url_content.videos_url|default:"" }}</td>
        </tr>
        <tr>
            <th>Title</th>
            <td>{{ url_content.title|default:"" }}</td>
        </tr>
        <tr>
            <th>Description</th>
            <td>{{ url_content.description|default:"" }}</td>
        </tr>
        <tr>
            <th>Content</th>
            <td>{{ url_content.content|default:"" }}</td>
        </tr>
    </table>

    <!-- Independent form for optional values -->
    <form onsubmit="fetchDetailsWithSelection(event, {{ url_item.id }}, '{{ url_item.url }}')">
        <label for="options-{{ url_item.id }}">Model:</label>
        <select id="options-{{ url_item.id }}" class="form-control mb-2">
            {% for model in models %}
            <option value="{{ model }}">{{ model }}</option>
            {% endfor %}
        </select>
    </form>

    <!-- Input field with a default value -->
    <label for="custom-input-{{ url_item.id }}">Prompt:</label>
    <textarea id="custom-input-{{ url_item.id }}" class="form-control mb-2" rows="5">{{ prompt }}
{{ url_item.url }}</textarea>

    <div class="d-flex align-items-center">
        <!-- Fetch details button -->
        <button class="btn btn-primary" onclick="fetchDetails({{ url_item.id }}, '{{ url_item.url }}')">
            Fetch Details
        </button>

        <!-- Loading Spinner (Hidden by Default) -->
        <div id="loading-spinner" class="spinner-border text-primary ms-2" role="status" style="display: none;">
            <span class="visually-hidden">Loading...</span>
        </div>
    </div>

    <!-- Chatbot-style response box -->
    <div class="chat-box mt-3 p-3 border rounded">
        <div id="chat-output"></div>
    </div>

</div>

{% block extra_js %}{% endblock %}
</body>
</html>
0
app_urls/fetcher/templatetags/__init__.py
Normal file
8
app_urls/fetcher/templatetags/custom_filters.py
Normal file
@@ -0,0 +1,8 @@
from django import template

register = template.Library()

@register.filter
def dict_get(dictionary, key):
    """Custom filter to get a value from a dictionary in Django templates."""
    return dictionary.get(key, [])
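For illustration, dict_get simply wraps dict.get with an empty-list default; the templates above rely on this when a URL id has no entry in sources_map or searches_map. A quick sketch with hypothetical values:

    sources_map = {1: ["source_a"], 2: ["source_b"]}
    dict_get(sources_map, 1)   # -> ["source_a"]
    dict_get(sources_map, 99)  # -> [] (missing keys fall back to an empty list)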
3
app_urls/fetcher/tests.py
Normal file
@@ -0,0 +1,3 @@
from django.test import TestCase

# Create your tests here.
20
app_urls/fetcher/urls.py
Normal file
@@ -0,0 +1,20 @@
from django.urls import path
from . import views

urlpatterns = [
    path('', views.link_list, name='link_list'),
    #
    path('logs/<str:log_type>', views.logs, name='logs'),
    #
    path('task/<str:task>', views.trigger_task, name='trigger_task'),
    #
    path('urls/charts/', views.charts, name='charts'),
    path('urls-by-fetch-date/', views.urls_by_fetch_date, name='urls_by_fetch_date'),
    path('urls-per-status/', views.urls_per_status, name='urls_per_status'),
    path('urls-per-source/', views.urls_per_source, name='urls_per_source'),
    path('urls-per-search/', views.urls_per_search, name='urls_per_search'),
    #
    path('urls/', views.filtered_urls, name='filtered_urls'),
    path('urls/<int:id>/', views.url_detail_view, name='url_detail'),
    path('urls/<int:id>/fetch/', views.fetch_details, name='fetch_details'),
]
356
app_urls/fetcher/views.py
Normal file
@@ -0,0 +1,356 @@
from .tasks import background_task
from django.core.paginator import Paginator
from django.shortcuts import render, get_object_or_404
from django.http import StreamingHttpResponse, JsonResponse, HttpResponse
from django.contrib.auth.decorators import login_required
import ollama
from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch
import os

####################################################################################################
def trigger_task(request, task):
    # Enqueue function in "default" queue
    background_task.delay(task)
    return JsonResponse({"message": "Task has been enqueued!", "task": task})

####################################################################################################
def link_list(request):
    prefix = "http://localhost:8000/task"
    links = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all", "process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"]

    list_links = [
        # DB
        "http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id&limit=500",
        # Admin panel
        "http://localhost:8000/admin",
        # Logs
        "http://localhost:8000/logs/debug",
        "http://localhost:8000/logs/info",
        "http://localhost:8000/logs/error",
        # URLs
        "http://localhost:8000/urls",
        # Charts
        "http://localhost:8000/urls/charts",
        # Fetcher tasks
    ] + [os.path.join(prefix, l) for l in links]
    # Json
    return JsonResponse({"links": list_links})

####################################################################################################
# @login_required(login_url='/admin')
def logs(request, log_type):
    # Capture output: python manage.py rqstats
    try:
        with open(os.path.join(os.getenv("PATH_LOGS_DIRECTORY", "logs"), "{}.log".format(log_type)), "r") as f:
            file_content = f.read()
    except Exception:
        file_content = "Error reading logs for log type: {}".format(log_type)
    return HttpResponse(file_content, content_type="text/plain")

####################################################################################################
class OllamaClient():
    def __init__(self):
        self.client = ollama.Client(host=os.getenv("ENDPOINT_OLLAMA", "https://ollamamodel.matitos.org"))

    def _get_default_model(self):
        return "llama3.2:3b"

    def get_models(self):
        models = sorted([m.model for m in self.client.list().models])
        if (self._get_default_model() in models):
            return [self._get_default_model()] + [m for m in models if m != self._get_default_model()]
        else:
            return models

    def get_prompt(self):
        return "Rewrite the text below into a clear and concise summary, presenting the key points as if they are newly written insights. Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states'. Instead, write in a natural, standalone format that feels like an original explanation. Keep it brief, engaging, informative, in the style of a news article, and no longer than a paragraph:"
        #return "Provide a summary of the content below, avoid mentioning the source of information, and only answer with the summary. The summary needs to be brief and compact, consisting of one paragraph."
        #return "Explain in a single and compact paragraph the what, why, when, where, who, and how of the content below. Also provide a single paragraph summary of the content:"
        #return "Provide in one paragraph the what, why, when, where, who, and how of the content below. Also provide a one paragraph summary of the content:"
        #return "Provide two summaries of the content below, and avoid mentioning the source of information. First, provide a very brief and compact paragraph summary. Second, provide a larger and more detailed summary, which describe the what, why, when, where, who, and how of the content:"
        # return "Imagine you are a journalist, TLDR in a paragraph. Only answer with the summary:"
        #return "Below you will find the whole content of a news article:\n{}\nProvide a concise summary of one paragraph maximum of the content.".format(content)

# TODO: move to ollamajs...
def fetch_details(request, id):
    url_item = get_object_or_404(Urls, id=id)
    url_param = request.GET.get("url", "")  # Get URL
    model = request.GET.get("model", "")  # Get LLM model
    text = request.GET.get("text", "")  # Get LLM prompt

    # print(request)
    # print(text)

    # LLM
    ollama = OllamaClient()

    def stream_response():
        msg_content = {
            "role": "user",
            "content": text,
        }
        response = ollama.client.chat(model=model, messages=[msg_content], stream=True)
        for chunk in response:
            yield chunk["message"]["content"]  # Stream each chunk of text

    return StreamingHttpResponse(stream_response(), content_type="text/plain")


def url_detail_view(request, id):
    url_item = get_object_or_404(Urls, id=id)
    url_sources = list(Source.objects.filter(urlssourcesearch__id_url=url_item).distinct())
    url_searches = list(Search.objects.filter(urlssourcesearch__id_url=url_item).distinct())
    # url_source_search = UrlsSourceSearch.objects.filter(id_url=url_item)

    try:
        url_content = UrlContent.objects.get(pk=id)
    except UrlContent.DoesNotExist:
        url_content = {}

    # TODO: https://github.com/ollama/ollama-python?tab=readme-ov-file#async-client
    ollama = OllamaClient()

    context = {
        'url_item': url_item,
        'sources': url_sources,
        'searches': url_searches,
        'models': ollama.get_models(),
        'prompt': ollama.get_prompt(),
        'url_content': url_content,
    }
    return render(request, 'url_detail.html', context)

####################################################################################################
from django.shortcuts import render
from django.http import JsonResponse
from django.db.models import Count
from datetime import timedelta
from django.utils import timezone
from .models import Urls, UrlsSourceSearch

def charts(request):
    return render(request, 'charts.html')

def urls_by_fetch_date(request):
    # Get the filtering date parameter
    days = float(request.GET.get('days', 30))  # Default is 30 days
    start_date = timezone.now() - timedelta(days=days)

    # Count the number of URLs grouped by fetch date
    urls_data = Urls.objects.filter(ts_fetch__gte=start_date) \
        .values('ts_fetch__date') \
        .annotate(count=Count('id')) \
        .order_by('ts_fetch__date')

    # Format data to return as JSON
    data = {
        'labels': [item['ts_fetch__date'] for item in urls_data],
        'values': [item['count'] for item in urls_data],
    }

    return JsonResponse(data)
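
# Illustrative aside (not part of the module): the chart endpoints in this file return
# {"labels": [...], "values": [...]}, which charts.html feeds directly to Chart.js.
# A quick sketch, assuming the dev server from link_list() above and the requests package:
#
#   import requests
#   resp = requests.get("http://localhost:8000/urls-by-fetch-date/", params={"days": 7})
#   resp.json()  # e.g. {"labels": ["2025-04-01", ...], "values": [12, ...]}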

def urls_per_status(request):
    # Get the filtering date parameter
    days = float(request.GET.get('days', 30))  # Default is 30 days
    start_date = timezone.now() - timedelta(days=days)

    # Count the number of URLs grouped by status within the date range
    urls_data = Urls.objects.filter(ts_fetch__gte=start_date) \
        .values('status') \
        .annotate(count=Count('id')) \
        .order_by('status')

    # Format data for JSON
    data = {
        'labels': [item['status'] for item in urls_data],
        'values': [item['count'] for item in urls_data],
    }

    return JsonResponse(data)

def urls_per_source(request):
    # Get the filtering date parameter
    days = float(request.GET.get('days', 30))  # Default is 30 days
    start_date = timezone.now() - timedelta(days=days)

    # Count the number of URLs grouped by source
    urls_data = UrlsSourceSearch.objects \
        .filter(id_url__ts_fetch__gte=start_date) \
        .values('id_source__source') \
        .annotate(count=Count('id_url')) \
        .order_by('id_source__source')

    # Format data for JSON
    data = {
        'labels': [item['id_source__source'] for item in urls_data],
        'values': [item['count'] for item in urls_data],
    }

    return JsonResponse(data)

def urls_per_search(request):
    # Get the filtering date parameter
    days = float(request.GET.get('days', 30))  # Default is 30 days
    start_date = timezone.now() - timedelta(days=days)

    # Count the number of URLs grouped by search
    urls_data = UrlsSourceSearch.objects \
        .filter(id_url__ts_fetch__gte=start_date) \
        .values('id_search__search') \
        .annotate(count=Count('id_url')) \
        .order_by('id_search__search')

    # Format data for JSON
    data = {
        'labels': [item['id_search__search'] for item in urls_data],
        'values': [item['count'] for item in urls_data],
    }

    return JsonResponse(data)


####################################################################################################
from django.shortcuts import render
from .models import Urls, Search, Source
from django.db.models import Q
from django.utils.timezone import now, timedelta


def filtered_urls(request):
    statuses = Urls.STATUS_ENUM.choices
    searches = Search.objects.all()
    sources = Source.objects.all()
    # TODO: Cache languages, update once every N
    languages = list(UrlContent.objects.distinct('language').values_list('language', flat=True))
    # Null for visualization
    languages = ["Unknown"] + [l for l in languages if l is not None]
    valid_contents = ["True", "False", "Unknown"]

    # Get selected parameters
    selected_status = request.GET.getlist('status', ["null"])
    selected_search = request.GET.getlist('search', ["null"])
    selected_source = request.GET.getlist('source', ["null"])
    selected_language = request.GET.getlist('language', ["null"])
    selected_valid_contents = request.GET.getlist('valid_content', ["null"])
    selected_days = request.GET.get("days", 30)
    per_page = request.GET.get('per_page', 100)  # Default number of URLs per page
    page_number = request.GET.get('page')  # Get the current page number

    all_status = [str(status[0]) for status in statuses]
    all_search = [str(search.id) for search in searches]
    all_source = [str(source.id) for source in sources]
    all_languages = languages
    all_valid_contents = valid_contents

    # Override with default filters? [Case: no params update on URL] -> Only on "Home" click, or "Next page"
    if (len(request.GET.keys()) == 0) or ((len(request.GET.keys()) == 1) and ("page" in request.GET.keys())):
        selected_status = ["all"]
        selected_search = ["all"]
        selected_source = ["all"]
        selected_language = ["all"]
        selected_valid_contents = ["all"]
    else:
        # Non-default parameters: if a list contains all elements, replace it with "all" to avoid a heavy query
        if (set(selected_status) == set(all_status)):
            selected_status = ["all"]
        if (set(selected_search) == set(all_search)):
            selected_search = ["all"]
        if (set(selected_source) == set(all_source)):
            selected_source = ["all"]
        if (set(selected_language) == set(all_languages)):
            selected_language = ["all"]
        if (set(selected_valid_contents) == set(all_valid_contents)):
            selected_valid_contents = ["all"]

    # Filter URLs based on selected filters
    if ('null' in selected_status) or ('null' in selected_search) or ('null' in selected_source) or ('null' in selected_language) or ('null' in selected_valid_contents):
        urls = []
    else:
        # Filter by date
        query = Q(ts_fetch__gte=now() - timedelta(days=float(selected_days)))
        # Additional filters
        if ("all" not in selected_status):
            query &= Q(status__in=selected_status)
        if ("all" not in selected_source):
            query &= Q(urlssourcesearch__id_source__in=selected_source)
        if ("all" not in selected_search):
            query &= Q(urlssourcesearch__id_search__in=selected_search)
        if ("all" not in selected_language):
            # URLs with selected languages
            subquery = Q(urlcontent__language__in=selected_language)
            if ("Unknown" in selected_language):
                # URLs with NULL language
                subquery |= Q(urlcontent__language__isnull=True)
                # URLs with no UrlContent record at all (similar to URLs with NULL language)
                subquery |= Q(urlcontent__id_url__isnull=True)
            # Update query
            query &= (subquery)
        if ("all" not in selected_valid_contents):
            # Boolean array
            bool_array = []
            if ('True' in selected_valid_contents):
                bool_array.append(True)
            if ('False' in selected_valid_contents):
                bool_array.append(False)
            # URLs with selected valid_contents
            subquery = Q(urlcontent__valid_content__in=bool_array)
            if ("Unknown" in selected_valid_contents):
                # URLs with NULL valid_content
                subquery |= Q(urlcontent__valid_content__isnull=True)
                # URLs with no UrlContent record at all (similar to URLs with NULL valid_content)
                subquery |= Q(urlcontent__id_url__isnull=True)
            # Update query
            query &= (subquery)

        # Run query
        urls = Urls.objects.filter(query).distinct()  # .order_by('-ts_fetch')
        # print(urls.query)

    # Pagination
    paginator = Paginator(urls, per_page)  # Paginate the filtered URLs
    page_obj = paginator.get_page(page_number)  # Get the current page object

    # Map URL IDs to their sources & searches, only for subset of URLs (page of interest)
    sources_map = {
        url.id: list(Source.objects.filter(urlssourcesearch__id_url=url).distinct()) for url in page_obj.object_list
    }
    searches_map = {
        url.id: list(Search.objects.filter(urlssourcesearch__id_url=url).distinct()) for url in page_obj.object_list
    }
    url_content_map = {
        url.id: UrlContent.objects.filter(pk=url).first() for url in page_obj.object_list
    }
    # Custom replace search type text
    for s in searches:
        s.type = s.type.replace("rss_feed", "rss").replace("url_host", "url").replace("keyword_search", "keyword")

    context = {
        'urls': page_obj,  # Pass the paginated URLs
        'per_page': per_page,  # Send per_page value for dynamic pagination
        'statuses': statuses,
        'searches': sorted(searches, key=lambda x: (x.type, x.search)),
        'sources': sorted(sources, key=lambda x: x.source),
        'languages': sorted(languages, key=lambda x: (x is None, x)),
        'valid_contents': valid_contents,
        # Selection
        'selected_status': selected_status,
        'selected_search': selected_search,
        'selected_source': selected_source,
        'selected_language': selected_language,
        'selected_valid_contents': selected_valid_contents,
        "selected_days": selected_days,
        # Map
        "sources_map": sources_map,
        "searches_map": searches_map,
        "url_content_map": url_content_map,
        # "charts": charts,
        # "list_per_page": [15, 100, 500],
        # "list_days_text": ([0.25, 1, 7, 30, 365], ["Last 6 hours", "Last 24 hours", "Last 7 days", "Last 30 days", "Last 365 days"])
    }

    return render(request, 'filtered_urls.html', context)
####################################################################################################