Dockerization, whitenoise serving static, refactor

Luciano Gervasoni
2025-04-04 10:53:16 +02:00
parent 5addfa5ba9
commit 4dbe2e55ef
39 changed files with 708 additions and 1238 deletions


12
app_urls/fetcher/admin.py Normal file

@@ -0,0 +1,12 @@
from django.contrib import admin
# Register your models here.
from .models import Search, Source, StatusPatternMatching, UrlContent, Urls, UrlsDuplicate, UrlsSourceSearch
admin.site.register(Search)
admin.site.register(Source)
admin.site.register(StatusPatternMatching)
admin.site.register(UrlContent)
admin.site.register(Urls)
admin.site.register(UrlsDuplicate)
admin.site.register(UrlsSourceSearch)

6
app_urls/fetcher/apps.py Normal file

@@ -0,0 +1,6 @@
from django.apps import AppConfig
class FetcherConfig(AppConfig):
default_auto_field = 'django.db.models.BigAutoField'
name = 'fetcher'


@@ -0,0 +1,109 @@
# Generated by Django 5.2 on 2025-04-02 16:44
import django.contrib.postgres.fields
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
initial = True
dependencies = [
]
operations = [
migrations.CreateModel(
name='Search',
fields=[
('id', models.SmallAutoField(primary_key=True, serialize=False)),
('search', models.TextField(unique=True)),
('type', models.TextField(choices=[('rss_feed', 'RSS_Feed'), ('keyword_search', 'Keyword_Search'), ('url_host', 'URL_Host')])),
],
options={
'db_table': 'search',
'managed': False,
},
),
migrations.CreateModel(
name='Source',
fields=[
('id', models.SmallAutoField(primary_key=True, serialize=False)),
('source', models.TextField(unique=True)),
],
options={
'db_table': 'source',
'managed': False,
},
),
migrations.CreateModel(
name='StatusPatternMatching',
fields=[
('pattern', models.TextField(primary_key=True, serialize=False)),
('priority', models.SmallIntegerField()),
('status', models.TextField()),
],
options={
'db_table': 'status_pattern_matching',
'managed': False,
},
),
migrations.CreateModel(
name='Urls',
fields=[
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('url', models.TextField(unique=True)),
('ts_fetch', models.DateTimeField(auto_now_add=True)),
('status', models.TextField(choices=[('raw', 'Raw'), ('error', 'Error'), ('valid', 'Valid'), ('unknown', 'Unknown'), ('invalid', 'Invalid'), ('duplicate', 'Duplicate')], default='raw')),
],
options={
'db_table': 'urls',
'ordering': ['-ts_fetch'],
'managed': False,
},
),
migrations.CreateModel(
name='UrlContent',
fields=[
('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='fetcher.urls')),
('date_published', models.DateTimeField(blank=True, null=True)),
('title', models.TextField(blank=True, null=True)),
('description', models.TextField(blank=True, null=True)),
('content', models.TextField(blank=True, null=True)),
('valid_content', models.BooleanField(blank=True, null=True)),
('language', models.CharField(blank=True, max_length=2, null=True)),
('keywords', django.contrib.postgres.fields.ArrayField(base_field=models.TextField(blank=True, null=True), size=None)),
('tags', django.contrib.postgres.fields.ArrayField(base_field=models.TextField(blank=True, null=True), size=None)),
('authors', django.contrib.postgres.fields.ArrayField(base_field=models.TextField(blank=True, null=True), size=None)),
('image_main_url', models.TextField(blank=True, null=True)),
('images_url', django.contrib.postgres.fields.ArrayField(base_field=models.TextField(blank=True, null=True), size=None)),
('videos_url', django.contrib.postgres.fields.ArrayField(base_field=models.TextField(blank=True, null=True), size=None)),
('url_host', models.TextField(blank=True, null=True)),
('site_name', models.TextField(blank=True, null=True)),
],
options={
'db_table': 'url_content',
'managed': False,
},
),
migrations.CreateModel(
name='UrlsDuplicate',
fields=[
('id_url_canonical', models.OneToOneField(db_column='id_url_canonical', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='fetcher.urls')),
],
options={
'db_table': 'urls_duplicate',
'managed': False,
},
),
migrations.CreateModel(
name='UrlsSourceSearch',
fields=[
('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='fetcher.urls')),
],
options={
'db_table': 'urls_source_search',
'managed': False,
},
),
]
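
Because every model in this migration is declared with 'managed': False, applying it only records migration state and emits no DDL; the PostgreSQL tables are expected to exist already. A minimal sketch of applying it from code (not part of this commit; assumes DJANGO_SETTINGS_MODULE points at the project settings):

import django
from django.core.management import call_command

django.setup()                       # assumes DJANGO_SETTINGS_MODULE is configured
call_command("migrate", "fetcher")   # no-op at the database level for unmanaged models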


140
app_urls/fetcher/models.py Normal file

@@ -0,0 +1,140 @@
from django.db import models
from django.contrib.postgres.fields import ArrayField
# Create your models here.
class Search(models.Model):
class TYPE_ENUM(models.TextChoices):
RSS_FEED = "rss_feed", "RSS_Feed"
KEYWORD_SEARCH = "keyword_search", "Keyword_Search"
URL_HOST = "url_host", "URL_Host"
id = models.SmallAutoField(primary_key=True)
search = models.TextField(unique=True)
type = models.TextField(choices=TYPE_ENUM.choices) # This field type is a guess.
class Meta:
managed = False
db_table = 'search'
def __str__(self):
return "[{}: {}]".format(self.type, self.search)
class Source(models.Model):
id = models.SmallAutoField(primary_key=True)
source = models.TextField(unique=True)
class Meta:
managed = False
db_table = 'source'
def __str__(self):
return "[{}]".format(self.source)
class StatusPatternMatching(models.Model):
pattern = models.TextField(primary_key=True)
priority = models.SmallIntegerField()
status = models.TextField() # This field type is a guess.
class Meta:
managed = False
db_table = 'status_pattern_matching'
def __str__(self):
return "{} -> {} [Priority: {}]".format(self.pattern, self.status, self.priority)
class UrlContent(models.Model):
id_url = models.OneToOneField('Urls', models.DO_NOTHING, db_column='id_url', primary_key=True)
date_published = models.DateTimeField(blank=True, null=True)
title = models.TextField(blank=True, null=True)
description = models.TextField(blank=True, null=True)
content = models.TextField(blank=True, null=True)
valid_content = models.BooleanField(blank=True, null=True)
language = models.CharField(max_length=2, blank=True, null=True)
keywords = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
tags = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
authors = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
image_main_url = models.TextField(blank=True, null=True)
images_url = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
videos_url = ArrayField(models.TextField(blank=True, null=True)) # This field type is a guess.
url_host = models.TextField(blank=True, null=True)
site_name = models.TextField(blank=True, null=True)
class Meta:
managed = False
db_table = 'url_content'
class Urls(models.Model):
class STATUS_ENUM(models.TextChoices):
RAW = "raw", "Raw"
ERROR = "error", "Error"
VALID = "valid", "Valid"
UNKNOWN = "unknown", "Unknown"
INVALID = "invalid", "Invalid"
DUPLICATE = "duplicate", "Duplicate"
url = models.TextField(unique=True)
ts_fetch = models.DateTimeField(auto_now_add=True)
status = models.TextField(choices=STATUS_ENUM.choices, default=STATUS_ENUM.RAW) # This field type is a guess.
class Meta:
managed = False
db_table = 'urls'
ordering = ["-ts_fetch"]
def __str__(self):
return "URL: {} Fetch:{} Status:{}".format(self.url, self.ts_fetch, self.status)
class UrlsDuplicate(models.Model):
id_url_canonical = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url_canonical', primary_key=True) # Composite primary key (id_url_canonical, id_url_duplicated) is not supported here; the first column is used as the primary key.
id_url_duplicated = models.ForeignKey(Urls, models.DO_NOTHING, db_column='id_url_duplicated', related_name='urlsduplicate_id_url_duplicated_set')
class Meta:
managed = False
db_table = 'urls_duplicate'
unique_together = (('id_url_canonical', 'id_url_duplicated'),)
def __str__(self):
return "{} {} ".format(self.id_url_duplicated, self.id_url_canonical)
class UrlsSourceSearch(models.Model):
id_url = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url', primary_key=True) # Composite primary key (id_url, id_source, id_search) is not supported here; the first column is used as the primary key.
id_source = models.ForeignKey(Source, models.DO_NOTHING, db_column='id_source')
id_search = models.ForeignKey(Search, models.DO_NOTHING, db_column='id_search')
class Meta:
managed = False
db_table = 'urls_source_search'
unique_together = (('id_url', 'id_source', 'id_search'),)
def __str__(self):
return "{} {} {}".format(self.id_source, self.id_search, self.id_url)
""" # TODO: Migrate to django 5.2
class UrlsDuplicate(models.Model):
pk = models.CompositePrimaryKey('id_url_canonical', 'id_url_duplicated')
id_url_canonical = models.ForeignKey(Urls, models.DO_NOTHING, db_column='id_url_canonical')
id_url_duplicated = models.ForeignKey(Urls, models.DO_NOTHING, db_column='id_url_duplicated', related_name='urlsduplicate_id_url_duplicated_set')
class Meta:
managed = False
db_table = 'urls_duplicate'
unique_together = (('id_url_canonical', 'id_url_duplicated'),)
def __str__(self):
return "{} {} ".format(self.id_url_duplicated, self.id_url_canonical)
class UrlsSourceSearch(models.Model):
pk = models.CompositePrimaryKey('id_url', 'id_source', 'id_search')
id_url = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url')
id_source = models.ForeignKey(Source, models.DO_NOTHING, db_column='id_source')
id_search = models.ForeignKey(Search, models.DO_NOTHING, db_column='id_search')
class Meta:
managed = False
db_table = 'urls_source_search'
unique_together = (('id_url', 'id_source', 'id_search'),)
def __str__(self):
return "{} {} {}".format(self.id_source, self.id_search, self.id_url)
"""


@@ -0,0 +1,273 @@
from ..models import Urls, UrlContent, UrlsSourceSearch, UrlsDuplicate, StatusPatternMatching, Source, Search
from django.db.models import Q
from django.core.cache import cache
from django.db import IntegrityError
from .url_processor import process_url, get_with_protocol
import re
import traceback
from .logger import get_logger
logger = get_logger()
class DB_Handler():
def __init__(self):
# Inserting raw URL, cache time: 1 day
self._cache_timeout_insert_url = 86400
# Processing error URL, cache time: 2 days
self._cache_timeout_error_url = 86400*2
def insert_raw_urls(self, urls, obj_source, obj_search):
try:
logger.debug("Inserting raw URLs")
# Empty?
if (len(urls) == 0):
logger.debug("Empty batch of urls (not writing to DB) for source-search: {} - {}".format(obj_source.source, obj_search.search))
return
# Default protocol https://
urls_clean = [get_with_protocol(url) for url in urls]
urls_to_insert = []
# Per URL
for url in urls_clean:
### Already processed URL?
if (cache.get("insert_{}".format(url)) is not None):
logger.debug("Already cached URL: {}".format(url))
if (cache.get("insert_{}{}{}".format(url, obj_source.source, obj_search.search)) is not None):
logger.debug("Already cached (URL, source, search): {} {} {}".format(url, obj_source.source, obj_search.search))
else:
### Insert (URL_id, source_id, search_id), since not cached
# Get URL ID (should already be created)
obj_url, created = Urls.objects.get_or_create(url=url)
# Create (id_source, id_url) (shouldn't exist)
UrlsSourceSearch.objects.get_or_create(id_url=obj_url, id_source=obj_source, id_search=obj_search)
else:
# Add object to insert
# url_object_to_insert.append(Urls(url=url))
urls_to_insert.append(url)
### Insert URLs & (URL_id, source_id)
try:
### Bulk insert, fails on duplicated URLs (IDs are not returned when using ignore_conflicts=True)
# URLs (ignore_conflicts=False to return IDs)
bulk_created_urls = Urls.objects.bulk_create([Urls(url=url) for url in urls_to_insert], ignore_conflicts=False)
# (URL_id, source_id)
UrlsSourceSearch.objects.bulk_create([UrlsSourceSearch(id_url=obj_url, id_source=obj_source, id_search=obj_search) for obj_url in bulk_created_urls], ignore_conflicts=True)
except IntegrityError as e:
### Fallback to one-by-one insert
logger.debug("bulk_create exception while inserting raw URLs (fails if duplicated URL), falling back to non-bulk method")
# One by one
for url in urls_to_insert:
# URL
obj_url, created = Urls.objects.get_or_create(url=url)
if (created):
logger.debug("Inserted: {}".format(obj_url.url))
else:
logger.debug("Not inserted: {}".format(obj_url.url))
# (URL, source, search)
UrlsSourceSearch.objects.get_or_create(id_url=obj_url, id_source=obj_source, id_search=obj_search)
except Exception as e:
logger.warning("bulk_create unknown exception while inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
# Avoid caching due to error on insertion
urls_clean = []
# Insert or update cache
for url in urls_clean:
cache.set("insert_{}".format(url), True, timeout=self._cache_timeout_insert_url)
cache.set("insert_{}{}{}".format(url, obj_source.source, obj_search.search), True, timeout=self._cache_timeout_insert_url)
logger.info("Inserted #{} raw URLs, Source-Search {} - {}".format(len(urls_to_insert), obj_source.source, obj_search.search))
except Exception as e:
logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error):
def set_status(obj_url, status):
# Update status if setting a new value
if (obj_url.status != status):
obj_url.status = status
obj_url.save()
##### Filter URL? -> Invalid
if (status_pattern_match == "invalid"):
logger.debug("Domain filter applied to input URL: {}".format(obj_url.url))
# Update status
set_status(obj_url, Urls.STATUS_ENUM.INVALID)
# Next URL
return
##### Process URL
try:
# Get data
dict_url_data = process_url(obj_url.url)
except Exception as e:
if (raise_exception_on_error):
# Simply raise exception, handled in a different way
raise Exception("Error processing URL, raising exception as expected")
else:
logger.debug("Error processing URL: {}\n{}\n{}".format(obj_url.url, str(e), traceback.format_exc()))
# Set status to error
dict_url_data = None
# (dict_url_data is None) or (Exception while processing URL) ? -> Error status
if (dict_url_data is None):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.ERROR)
# Next URL
return
# Invalid? e.g. binary data
if (dict_url_data.get("override_status") == "invalid"):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.INVALID)
# Next URL
return
##### Canonical URL different? -> Duplicate
if (dict_url_data.get("url_canonical") is not None) and(dict_url_data.get("url") != dict_url_data.get("url_canonical")):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.DUPLICATE)
# Get or create URL with canonical form
obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
# Get the source-search IDs associated to obj_url.id
list_url_source_search = UrlsSourceSearch.objects.filter(id_url=obj_url)
for obj_url_source_search in list_url_source_search:
# Associate same sources to url_canonical (it might already exist)
UrlsSourceSearch.objects.get_or_create(id_url=obj_url_canonical, id_source=obj_url_source_search.id_source, id_search=obj_url_source_search.id_search)
# URL duplicate association
UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url)
# TODO: return obj_url_canonical so as to directly process the recently inserted URL
# Wherever this function is called, add:
# self._process_single_url(obj_url_canonical, status_pattern_match, raise_exception_on_error)
# Next URL
return
##### Valid URL
# Update status
set_status(obj_url, Urls.STATUS_ENUM.VALID)
# Create or update extracted URL data
UrlContent.objects.update_or_create(
id_url=obj_url,
defaults = {
"date_published" : dict_url_data.get("publish_date"),
"title" : dict_url_data.get("title"),
"description" : dict_url_data.get("description"),
"content" : dict_url_data.get("content"),
"valid_content" : dict_url_data.get("valid_content"),
"language" : dict_url_data.get("language"),
"keywords" : dict_url_data.get("keywords"),
"tags" : dict_url_data.get("tags"),
"authors" : dict_url_data.get("authors"),
"image_main_url" : dict_url_data.get("image_main_url"),
"images_url" : dict_url_data.get("images_url"),
"videos_url" : dict_url_data.get("videos_url"),
"url_host" : dict_url_data.get("url_host"),
"site_name" : dict_url_data.get("site_name"),
}
)
def process_raw_urls(self, batch_size):
def _get_status_pattern_matching(url, list_pattern_status_tuple):
""" Be careful: Regex pattern should update status on "valid", "invalid", and "unknown" status only
"""
# Sort pattern tuples by priority. (pattern, priority, status)
for regex_pattern, regex_priority, status_if_match in sorted(list_pattern_status_tuple, key=lambda tup: tup[1], reverse=True):
# Regular expression pattern matching: https://regexr.com/
if bool(re.match(regex_pattern, url)):
logger.debug("Regex pattern found, status '{}' for URL: {}".format(status_if_match, url))
return status_if_match
return None
try:
logger.debug("Processing raw URLs")
# Get batch of URLs, status='raw'
raw_urls = Urls.objects.order_by("-ts_fetch").filter(status=Urls.STATUS_ENUM.RAW)[:batch_size]
if (len(raw_urls) == 0):
logger.debug("No raw URLs to process")
return
# Get list of (pattern, priority, status) tuples to override status if required
list_pattern_status_tuple = list(StatusPatternMatching.objects.values_list("pattern", "priority", "status"))
# Per URL
for obj_url in raw_urls:
# Override status if pattern matching?
status_pattern_match = _get_status_pattern_matching(obj_url.url, list_pattern_status_tuple)
# Process URL
self._process_single_url(obj_url, status_pattern_match, raise_exception_on_error=False)
logger.info("Updated #{} raw URLs".format(len(raw_urls)))
except Exception as e:
logger.warning("Exception processing raw URLs: {}\n{}".format(e, traceback.format_exc()))
def process_error_urls(self, batch_size):
try:
logger.debug("Processing error URLs")
# Keep track of processed and skipped "error" URLs
num_urls_skipped, num_urls_processed = 0, 0
# Get batch of URLs, status='error'
error_urls = Urls.objects.order_by("-ts_fetch").filter(status=Urls.STATUS_ENUM.ERROR)[num_urls_skipped:batch_size+num_urls_skipped]
while ((len(error_urls) > 0) and (num_urls_processed < batch_size)):
# Per URL
for obj_url in error_urls:
# URL ID cached? -> Tried to process recently already, skip
if (cache.get("error_{}".format(obj_url.id)) is not None):
logger.debug("Already cached URL ID: {}".format(obj_url.id))
num_urls_skipped += 1
continue
try:
# Process URL
self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True)
num_urls_processed += 1
except Exception as e:
# Error, cache to avoid re-processing for X time
cache.set("error_{}".format(obj_url.id), True, timeout=self._cache_timeout_insert_url)
num_urls_skipped += 1
# Get following batch of URLs, status='error'
error_urls = Urls.objects.order_by("-ts_fetch").filter(status=Urls.STATUS_ENUM.ERROR)[num_urls_skipped:batch_size+num_urls_skipped]
logger.info("Updated #{}, skipped #{} error URLs".format(num_urls_processed, num_urls_skipped))
except Exception as e:
logger.warning("Exception processing error URLs: {}\n{}".format(e, traceback.format_exc()))
def process_missing_kids_urls(self, batch_size=None):
try:
logger.debug("Processing MissingKids URLs - batch_size={}".format(batch_size))
# Get batch of URLs matching missingkids.org/poster or /new-poster AND status IN ('valid', 'invalid', 'error')
missingkids_urls = Urls.objects.order_by("-ts_fetch").filter(
(Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster"))
&
(Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.ERROR))
)
# Get batch size
if (batch_size is not None):
missingkids_urls = missingkids_urls[:batch_size]
# Per URL
for obj_url in missingkids_urls:
try:
# Process URL. If no exception -> Valid
self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True)
except Exception as e:
# Raised exception -> Invalid (404 error)
obj_url.status = Urls.STATUS_ENUM.INVALID
obj_url.save()
logger.info("Verified status of #{} missingkids.org/poster URLs".format(len(missingkids_urls)))
except Exception as e:
logger.warning("Exception processing MissingKids URLs: {}\n{}".format(e, traceback.format_exc()))


@@ -0,0 +1,51 @@
from .db_utils import DB_Handler
from ..models import Search, Source
import feedparser
import dateutil.parser
import traceback
from .logger import get_logger
logger = get_logger()
class FetchFeeds():
def __init__(self) -> None:
logger.debug("Initializing Fetcher Feeds")
def run(self):
try:
logger.debug("Starting FetchFeeds.run()")
# Get source object
obj_source, created = Source.objects.get_or_create(source="feeds")
# Get feeds objects
list_obj_search_feeds = Search.objects.filter(type=Search.TYPE_ENUM.RSS_FEED)
logger.debug("Fetching from feeds: {}".format([e.search for e in list_obj_search_feeds]))
# Process via RSS feeds
for obj_search in list_obj_search_feeds:
# Initialize
urls_fetched, urls_publish_date = [], []
# Fetch feeds
feeds = feedparser.parse(obj_search.search)
# Parse
for f in feeds.get("entries", []):
# Get URL
url = f.get("link", None)
# Process?
if (url is not None):
# Available publish date?
publish_date_parsed = f.get("published_parsed")
if (publish_date_parsed is None):
publish_date = f.get("published", None)
if (publish_date is not None):
publish_date_parsed = dateutil.parser.parse(publish_date)
# Published date
urls_publish_date.append(publish_date_parsed)
# URL
urls_fetched.append(url)
# Write to DB
DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
except Exception as e:
logger.warning("Exception in FetchFeeds.run(): {}\n{}".format(e, traceback.format_exc()))


@@ -0,0 +1,42 @@
from .db_utils import DB_Handler
from ..models import Search, Source
import os
import requests
import json
import traceback
from .logger import get_logger
logger = get_logger()
class FetchMissingKids():
def __init__(self) -> None:
logger.debug("Initializing Fetcher MissingKids")
def run(self, number_pages=-1):
try:
logger.debug("Starting MissingKids.run(), processing #{} pages".format(number_pages))
# Get source object
obj_source, created = Source.objects.get_or_create(source="missingkids.org")
# Get search object
obj_search, created = Search.objects.get_or_create(search="missingkids.org/poster", type=Search.TYPE_ENUM.URL_HOST)
try:
# Missing kids fetching endpoint, parameter number of pages to fetch
missingkids_fetch_endpoint = os.path.join(os.getenv("SELENIUM_ENDPOINT", "http://localhost:80"), "get_missing_kids/?pages={}".format(number_pages))
# Timeout
if (number_pages > 15) or (number_pages == -1):
timeout = 60*90 # 1.5h
else:
timeout = 60*10 # 10 min
# Request
r = requests.get(missingkids_fetch_endpoint, timeout=timeout)
# Decode
urls_fetched = json.loads(r.text).get("list_urls", [])
except Exception as e:
logger.warning("Timeout on request: {}. {}".format(missingkids_fetch_endpoint, str(e)))
urls_fetched = []
# Write to DB
DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
except Exception as e:
logger.warning("Exception in MissingKids.run(): {}\n{}".format(e, traceback.format_exc()))


@@ -0,0 +1,46 @@
from .db_utils import DB_Handler
from ..models import Search, Source
from .url_processor import get_with_protocol, url_host_slowdown
import newspaper
import traceback
from .logger import get_logger
logger = get_logger()
class FetchParser():
def __init__(self) -> None:
logger.debug("Initializing Fetcher Parser")
def run(self):
try:
logger.debug("Starting FetchParser.run() for {}")
# Get source object
obj_source, created = Source.objects.get_or_create(source="newspaper4k")
# Get URL hosts
list_url_host = Search.objects.filter(type=Search.TYPE_ENUM.URL_HOST)
logger.debug("Fetching news by parsing URL hosts: {}".format([e.search for e in list_url_host]))
# Process newspaper4k build method
for obj_search in list_url_host:
# Protocol
url_host_protocol = get_with_protocol(obj_search.search)
logger.debug("Fetching newspaper4k parsing based on URL: {}".format(url_host_protocol))
# Make sure no requests made for the last X seconds
url_host_slowdown(url_host_protocol, url_host_slowdown_seconds=5)
try:
# Source object
url_host_built = newspaper.build(url_host_protocol)
# Get articles URL list
urls_fetched = url_host_built.article_urls()
except newspaper.exceptions.ArticleException as e:
logger.debug("ArticleException while parsing input URL {}\n{}".format(url_host_protocol, str(e.args)))
urls_fetched = []
except Exception as e:
logger.warning("Exception while parsing input URL {}\n{}".format(url_host_protocol, str(e)))
urls_fetched = []
# Write to DB
DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
except Exception as e:
logger.warning("Exception in FetchParser.run(): {}\n{}".format(e, traceback.format_exc()))


@@ -0,0 +1,57 @@
from .db_utils import DB_Handler
from ..models import Search
from django.db.models import Q
import traceback
import time
import os
from .fetch_search_instances import ListSearchInstances
from .logger import get_logger
logger = get_logger()
class FetchSearcher():
def __init__(self) -> None:
logger.debug("Initializing Fetcher Searcher")
def run(self):
try:
logger.debug("Starting FetchSearcher.run()")
# Get search objects of interest
list_search_obj = Search.objects.filter(Q(type=Search.TYPE_ENUM.URL_HOST) | Q(type=Search.TYPE_ENUM.KEYWORD_SEARCH))
logger.debug("Fetching from search: {}".format(["{} ({})".format(e.search, e.type) for e in list_search_obj]))
# Search
for obj_search in list_search_obj:
# TODO: language & country customization
# Search
keyword_search = "{}{}".format("site:" if obj_search.type == Search.TYPE_ENUM.URL_HOST else "", obj_search.search)
if (obj_search.type == Search.TYPE_ENUM.KEYWORD_SEARCH):
# Add search with intitle keyword
# TODO: allintitle: "child abuse"
# TODO: intitle: "child abuse"
pass
# language, country = obj_search.language_country.split("-")
logger.debug("Starting keyword search: {}".format(keyword_search))
logger.debug("Search type: {}".format(obj_search.type))
# DB writer
db_writer = DB_Handler()
# Keyword arguments
args = {
"language": "en",
"country": "US",
# "period": ["7d", "1d"], # TODO: List of periods to iterate
}
for SearchInstance in ListSearchInstances:
# Sleep between requests, avoid too many requests...
time.sleep(int(os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", 5)))
SearchInstance(args).fetch_articles(db_writer, obj_search)
# TODO: https://github.com/tasos-py/Search-Engines-Scraper/tree/master
except Exception as e:
logger.warning("Exception in FetchSearcher.run(): {}\n{}".format(e, traceback.format_exc()))


@@ -0,0 +1,308 @@
import time
import feedparser
import os
from django.utils import timezone
from datetime import timedelta
from ..models import Search, Source
from .fetch_utils import decode_gnews_urls
from .logger import get_logger
logger = get_logger()
from gnews import GNews
from duckduckgo_search import DDGS
from GoogleNews import GoogleNews
from search_engines import Yahoo, Aol
###########################################################################
###########################################################################
from abc import ABC, abstractmethod
# Generic fetcher (fetches articles, writes to DB)
class FetcherAbstract(ABC):
@abstractmethod
def _fetch_raw_urls(self):
pass
@abstractmethod
def _get_name(self):
pass
def _get_source_object(self, source):
# TODO: Cache
# self.cached_sources = {}
# Get source object
obj_source, created = Source.objects.get_or_create(source=source)
return obj_source
def _post_process_urls(self, raw_urls, obj_search):
# Searching URL Host based? Make sure results belong to that site
if (obj_search.type == Search.TYPE_ENUM.URL_HOST):
# Get clean URL host
url_host_clean = obj_search.search.replace("www.", "").replace("http://", "").replace("https://", "")
# Ensure URL host in URL
raw_urls = [u for u in raw_urls if url_host_clean in u]
return raw_urls
def fetch_articles(self, db_writer, obj_search):
# Source name
source_name = self._get_name()
# Search
keyword_search = obj_search.search
# URL Host search? -> site:${URL_HOST}
if (obj_search.type == Search.TYPE_ENUM.URL_HOST):
keyword_search = "{}{}".format("site:", keyword_search)
# Keyword search & using a General search? -> ${SEARCH} news after:${LAST_WEEK}
if ("general" in source_name) and (obj_search.type == Search.TYPE_ENUM.KEYWORD_SEARCH):
start_date = timezone.now() - timedelta(days=7)
keyword_search = "{}{}".format(keyword_search, "news after:{}-{}-{}".format(start_date.month, start_date.day, start_date.year))
logger.debug("Starting search: {} - {}".format(keyword_search, source_name))
# Fetch
raw_urls = self._fetch_raw_urls(keyword_search)
# Post-process
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
db_writer.insert_raw_urls(raw_urls, self._get_source_object(source_name), obj_search)
###########################################################################
class SearchGNews(FetcherAbstract):
def __init__(self, args={}):
super().__init__()
# Parameters
self.language = args.get("language", "en")
self.country = args.get("country", "US")
self.period = args.get("period", "7d")
self.max_results = args.get("max_results", 100)
def _get_name(self):
# [source] [period] [language-country] [max_results]
return "gnews {} {}-{} results={}".format(self.period, self.language, self.country, self.max_results).replace("results=None", "").strip()
def _fetch_raw_urls(self, keyword_search):
try:
# Get news
results_gnews = GNews(language=self.language, country=self.country, period=self.period, max_results=self.max_results).get_news(keyword_search)
# Get list of encoded urls
encoded_urls = [e.get("url") for e in results_gnews]
# Decode
urls = decode_gnews_urls(encoded_urls)
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchDuckDuckGoGeneral(FetcherAbstract):
def __init__(self, args={}):
super().__init__()
# Parameters
self.language = args.get("language", "wt")
self.country = args.get("country", "wt")
self.max_results = args.get("max_results", 20)
self.region = "{}-{}".format(self.language, self.country).lower()
self.period = None
def _get_name(self):
# [source] [language-country] [max_results]
return "ddg-general {} results={}".format(self.region, self.max_results).replace("results=None", "").strip()
def _fetch_raw_urls(self, keyword_search):
try:
news = DDGS().text(keyword_search, region=self.region, timelimit=self.period, max_results=self.max_results)
urls = [e.get("href") for e in news]
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchDuckDuckGoNews(FetcherAbstract):
def __init__(self, args={}):
super().__init__()
# Parameters
self.language = args.get("language", "wt")
self.country = args.get("country", "wt")
self.max_results = args.get("max_results", 100)
self.region = "{}-{}".format(self.language, self.country).lower()
self.period = None
def _get_name(self):
# [source] [language-country] [max_results]
return "ddg-news {} results={}".format(self.region, self.max_results).replace("results=None", "").strip()
def _fetch_raw_urls(self, keyword_search):
try:
news = DDGS().news(keyword_search, region=self.region, timelimit=self.period, max_results=self.max_results)
urls = [e.get("url") for e in news]
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchGoogleNews(FetcherAbstract):
def __init__(self, args={}):
super().__init__()
# Parameters
self.language = args.get("language", "en")
self.country = args.get("country", "US")
self.period = args.get("period", "7d")
def _get_name(self):
# [source] [period] [language-country]
return "googlenews {} {}-{}".format(self.period, self.language, self.country)
def _fetch_raw_urls(self, keyword_search):
try:
# Initialize
googlenews = GoogleNews(period=self.period, lang=self.language, region=self.country)
googlenews.enableException(True)
# Search
googlenews.get_news(keyword_search)
# Fetch
encoded_urls = googlenews.get_links()
# Decode
urls = decode_gnews_urls(encoded_urls)
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchGoogleGeneral(FetcherAbstract):
def __init__(self, args={}):
super().__init__()
# Parameters
self.language = args.get("language", "en")
self.country = args.get("country", "US")
self.period = args.get("period", "7d")
self.pages = args.get("pages", 1)
def _get_name(self):
# [source] [period] [language-country] [pages]
return "google-general {} {}-{} pages={}".format(self.period, self.language, self.country, self.pages).replace("pages=None", "").strip()
def _fetch_raw_urls(self, keyword_search):
try:
# Initialize
googlenews = GoogleNews(period=self.period, lang=self.language, region=self.country)
googlenews.enableException(True)
# Search
googlenews.search(keyword_search)
set_links = set()
# Iterate pages
for i in range(self.pages):
# Sleep between pages fetch
time.sleep(int(os.getenv("FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP", 4)))
# Number of URLs fetched so far
num_before = len(set_links)
# Get page
try:
links = googlenews.page_at(i+1)
except Exception as e:
logger.warning("Exception fetching page - {}: {}".format(self._get_name(), str(e)))
break
# Links
for l in links:
# 'link': 'https://uk.news.yahoo.com/leaving-neverland-2-michael-jackson-lawyer-channel-4-102017088.html&ved=2ahUKEwjl38eJm5aMAxVvqJUCHXgnGzwQxfQBegQICRAC&usg=AOvVaw1osa6b3o_xXfcNinMDpLoK'
set_links.add( l.get("link").split("&ved=")[0] )
# Finished?
if (num_before == len(set_links)):
break
# To list
urls = list(set_links)
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchGoogleNewsRSS(FetcherAbstract):
def __init__(self, args={}):
super().__init__()
# Parameters
self.language = args.get("language", "en")
self.country = args.get("country", "US")
def _get_name(self):
# [source] [language-country]
return "googlenews-rss {}-{}".format(self.language, self.country).strip()
def _fetch_raw_urls(self, keyword_search):
try:
# Search URL with parameters filled: https://news.google.com/rss/search?q={}&hl=en-US&gl=US&ceid=US:en
search_url = "https://news.google.com/rss/search?q={}&hl={}&gl={}&ceid={}:{}".format(keyword_search, "{}-{}".format(self.language, self.country.upper()), self.country.upper(), self.country.upper(), self.language)
# Control characters
search_url = search_url.replace(" ", "+") # urllib.parse.quote(search_url) # Issue: https%3A//news.google.com/rss/search%3Fq%3Dbreitbart.com%26hl%3Den-US%26gl%3DUS%26ceid%3DUS%3Aen
# Initialize
encoded_urls = []
# Fetch feeds
feeds = feedparser.parse(search_url)
# Parse
for f in feeds.get("entries", []):
# Encoded URL
encoded_url = f.get("link", None)
'''
# Available publish date?
publish_date_parsed = f.get("published_parsed")
if (publish_date_parsed is None):
publish_date = f.get("published", None)
if (publish_date is not None):
publish_date_parsed = dateutil.parser.parse(publish_date)
# Published date
urls_publish_date.append(publish_date_parsed)
'''
# Append
encoded_urls.append(encoded_url)
# Decode
urls = decode_gnews_urls(encoded_urls)
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchYahooGeneral(FetcherAbstract):
def __init__(self, args={}):
super().__init__()
# Parameters
self.pages = args.get("pages", 2)
def _get_name(self):
# [source] [language-country] [pages]
return "yahoo-general pages={}".format(self.pages).replace("pages=None", "").strip()
def _fetch_raw_urls(self, keyword_search):
try:
results = Yahoo().search(keyword_search, pages=self.pages)
urls = results.links()
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchAOLGeneral(FetcherAbstract):
def __init__(self, args={}):
super().__init__()
# Parameters
self.pages = args.get("pages", 2)
def _get_name(self):
# [source] [language-country] [pages]
return "aol-general pages={}".format(self.pages).replace("pages=None", "").strip()
def _fetch_raw_urls(self, keyword_search):
try:
results = Aol().search(keyword_search, pages=self.pages)
urls = results.links()
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
###########################################################################
# List of instances
ListSearchInstances = [SearchGNews, SearchDuckDuckGoNews, SearchGoogleNews, SearchAOLGeneral, SearchYahooGeneral, SearchDuckDuckGoGeneral, SearchGoogleGeneral, SearchGoogleNewsRSS]
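
Adding another search backend means subclassing FetcherAbstract with _get_name() and _fetch_raw_urls() and appending it to ListSearchInstances. A hedged sketch that would live alongside the classes above; the returned URL is a placeholder for whatever a real search client yields:

class SearchExampleEngine(FetcherAbstract):
    def __init__(self, args={}):
        super().__init__()
        self.max_results = args.get("max_results", 50)

    def _get_name(self):
        # [source] [max_results]
        return "example-engine results={}".format(self.max_results)

    def _fetch_raw_urls(self, keyword_search):
        try:
            # Placeholder: swap in any client that returns a list of result URLs
            urls = ["https://example.com/search-result?q={}".format(keyword_search)]
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []
        return urls

# ListSearchInstances.append(SearchExampleEngine)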


@@ -0,0 +1,35 @@
import os
from django.core.cache import cache
from .logger import get_logger
logger = get_logger()
from googlenewsdecoder import gnewsdecoder
def decode_gnews_urls(encoded_urls, interval=int(os.getenv("FETCHER_GNEWS_DECODE_SLEEP", 2))):
logger.debug("Decoding gnews URLs")
# DecodeURLs
list_decoded_urls = []
for url in encoded_urls:
# Already cached?
decoded_url = cache.get("gnews_decode_{}".format(url))
if (decoded_url is not None):
logger.debug("Already cached decoded URL: {} -> {}".format(url, decoded_url))
# Append decoded URL
list_decoded_urls.append(decoded_url)
else:
try:
# Decode URL, with interval time to avoid block
decoded_url_dict = gnewsdecoder(url, interval=interval)
# Ok?
if decoded_url_dict.get("status"):
# Append decoded URL
decoded_url = decoded_url_dict["decoded_url"]
list_decoded_urls.append(decoded_url)
# Cache decoded URL
cache.set("gnews_decode_{}".format(url), decoded_url, timeout=60*60*12)
else:
logger.info("Bad status while decoding news.google.com, URL {}\n{}".format(url, decoded_url_dict.get("message")))
except Exception as e:
logger.warning("Error decoding news.google.com, URL: {}".format(url))
return list_decoded_urls


@@ -0,0 +1,33 @@
import logging
import logging.handlers
import os
# Get env var
logs_directory = os.getenv("PATH_LOGS_DIRECTORY", "logs")
# Directory of logs
os.makedirs(logs_directory, exist_ok=True)
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("fetcher")
logger.setLevel(logging.DEBUG)
# File log (debug.log): DEBUG and above
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "debug.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)
# File log (info.log): INFO and above
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "info.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.INFO)
logger.addHandler(fh)
# File log (warning.log): WARNING and above
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "warning.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.WARNING)
logger.addHandler(fh)
def get_logger():
return logger


@@ -0,0 +1,127 @@
from django.core.cache import cache
from .logger import get_logger
logger = get_logger()
import newspaper
import time
import os
from urllib.parse import unquote
import langdetect
langdetect.DetectorFactory.seed = 0
def get_with_protocol(url):
# http:// -> https://
url = url.replace("http://", "https://")
# "" -> https://
if not (url.startswith("https://")):
url = "https://" + url
return url
def get_url_host(url):
# URL no protocol, first substring before '/'
url_host = url.replace("https://", "").replace("http://", "").split("/")[0]
return url_host
def url_host_slowdown(url, url_host_slowdown_seconds):
### Avoid (frequent) too many requests to the same URL host
# Get URL host
url_host = get_url_host(url)
# Recently processed URL host? -> Slow down required
last_cached_timestamp = cache.get("process_{}".format(url_host).encode("utf-8"), None)
if last_cached_timestamp:
# Get time since last processed URL host (in seconds)
time_since_last_processed = time.time() - last_cached_timestamp
# Amount of time required to sleep?
slowdown_required = max(0, url_host_slowdown_seconds - time_since_last_processed)
logger.debug("Slow down (sleeping {:.2f}) for URL host {}".format(slowdown_required, url_host))
# Sleep
time.sleep(slowdown_required)
# About to process URL host, cache time
cache.set("process_{}".format(url_host).encode("utf-8"), time.time(), timeout=60*5) # Expire after 5 minutes
def process_url(url):
try:
# Slow down if required to avoid too many requests error
url_host_slowdown(url, url_host_slowdown_seconds=int(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
# Process
article = newspaper.article(url)
except newspaper.ArticleBinaryDataException:
logger.warning("ArticleException for input URL {}".format(url))
return {"override_status": "invalid"}
except newspaper.ArticleException as e:
# Too many requests? Cool down...
if ("Status code 429" in str(e.args)):
# TODO: cool down and retry once?, proxy/VPN, ...
logger.debug("TODO: process_url Implement code 429")
# Unavailable for legal reasons
if ("Status code 451" in str(e.args)):
# TODO: Bypass with VPN
logger.debug("TODO: process_url Implement code 451")
# CloudFlare protection?
if ("Website protected with Cloudflare" in str(e.args)):
logger.debug("TODO: process_url Implement bypass CloudFlare")
# PerimeterX protection?
if ("Website protected with PerimeterX" in str(e.args)):
logger.debug("TODO: process_url Implement bypass PerimeterX")
logger.debug("ArticleException for input URL {}\n{}".format(url, str(e.args)))
return None
except Exception as e:
logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
return None
try:
content_merged = "\n".join([article.title, article.meta_description, article.text])
if (len(content_merged) > int(os.getenv("FETCHER_LANGUAGE_DETECTION_MIN_CHAR", 100))):
language = langdetect.detect(content_merged)
else:
language = None
except Exception as e:
logger.info("Could not detect language: {}\n{}".format(url, str(e)))
language = None
dict_data = {
"url": url,
"url_canonical": article.canonical_link,
"url_host": article.source_url,
"site_name": article.meta_site_name,
"publish_date": article.publish_date,
"language": language, # article.meta_lang -> Not always reliable
"title": article.title,
"description": article.meta_description,
"content": article.text,
"valid_content": article.is_valid_body(),
"keywords": [k for k in set(article.keywords + article.meta_keywords) if k!=""],
"tags": article.tags,
"authors": article.authors,
"image_main_url": article.top_image, # article.meta_img
"images_url": article.images,
"videos_url": article.movies,
}
'''
# TODO: If exists, add tags article.meta_data.get("classification-tags", "").split(",")
if (dict_data["tags"] is None):
dict_data["tags"] = []
for k in article.meta_data.keys():
if ("tags" in k):
dict_data["tags"] += article.meta_data[k].split(",")
'''
# Sanity check
for k in dict_data.keys():
if (type(dict_data[k]) is list):
# Remove empty string, unquote special characters, e.g. "%20" -> " "
dict_data[k] = [ unquote(e) for e in dict_data[k] if e != "" ]
# NULL instead of empty list
if (len(dict_data[k]) == 0):
dict_data[k] = None
elif (type(dict_data[k]) is str):
# Unquote special characters
if (dict_data[k] is not None):
dict_data[k] = unquote(dict_data[k])
# NULL instead of empty string
if (dict_data[k] == ""):
dict_data[k] = None
return dict_data
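
Illustrative call (assuming network access and the fetcher.src module path): process_url() returns None on fetch/parse errors, {"override_status": "invalid"} for binary content, and otherwise the metadata dict consumed by DB_Handler:

from fetcher.src.url_processor import get_with_protocol, process_url

url = get_with_protocol("example.com/some-article")  # -> "https://example.com/some-article"
dict_url_data = process_url(url)

if dict_url_data is None:
    print("fetch/parse error")
elif dict_url_data.get("override_status") == "invalid":
    print("binary or otherwise invalid content")
else:
    print(dict_url_data["title"], dict_url_data["language"], dict_url_data["url_canonical"])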

114
app_urls/fetcher/tasks.py Normal file

@@ -0,0 +1,114 @@
from scheduler import job
from .src.fetch_feed import FetchFeeds
from .src.fetch_parser import FetchParser
from .src.fetch_search import FetchSearcher
from .src.fetch_missing_kids import FetchMissingKids
from .src.db_utils import DB_Handler
from .src.logger import get_logger
logger = get_logger()
@job('default')
def fetch_feeds():
task = "Fetch Feeds"
logger.info("Task triggered: {}".format(task))
FetchFeeds().run()
logger.info("Task completed: {}".format(task))
@job('default')
def fetch_parser():
task = "Fetch Parser"
logger.info("Task triggered: {}".format(task))
FetchParser().run()
logger.info("Task completed: {}".format(task))
@job('default')
def fetch_search():
task = "Fetch Search"
logger.info("Task triggered: {}".format(task))
FetchSearcher().run()
logger.info("Task completed: {}".format(task))
@job('default')
def fetch_missing_kids(number_pages=5):
task = "Fetch MissingKids"
logger.info("Task triggered: {}".format(task))
FetchMissingKids().run(number_pages)
logger.info("Task completed: {}".format(task))
@job('default')
def fetch_missing_kids_all(number_pages=-1):
task = "Fetch MissingKids"
logger.info("Task triggered: {}".format(task))
FetchMissingKids().run(number_pages)
logger.info("Task completed: {}".format(task))
@job('default')
def process_raw_urls(batch_size=50):
task = "Process raw URLs"
logger.info("Task triggered: {}".format(task))
DB_Handler().process_raw_urls(batch_size=batch_size)
logger.info("Task completed: {}".format(task))
@job('default')
def process_error_urls(batch_size=50):
task = "Process error URLs"
logger.info("Task triggered: {}".format(task))
DB_Handler().process_error_urls(batch_size=batch_size)
logger.info("Task completed: {}".format(task))
@job('default')
def process_missing_kids_urls(batch_size=50):
task = "Process Missing Kids URLs"
logger.info("Task triggered: {}".format(task))
DB_Handler().process_missing_kids_urls(batch_size=batch_size)
logger.info("Task completed: {}".format(task))
@job('default')
def process_missing_kids_urls_all(batch_size=None):
task = "Process Missing Kids URLs ALL"
logger.info("Task triggered: {}".format(task))
DB_Handler().process_missing_kids_urls(batch_size=batch_size)
logger.info("Task completed: {}".format(task))
@job('default')
def background_task(process_type: str):
logger.info("Task triggered: {}".format(process_type))
try:
if (process_type == "fetch_feeds"):
FetchFeeds().run()
elif (process_type == "fetch_parser"):
FetchParser().run()
elif (process_type == "fetch_search"):
FetchSearcher().run()
elif (process_type == "fetch_missingkids_all"):
FetchMissingKids().run(number_pages=-1)
elif ("fetch_missingkids" in process_type):
# number_pages encoded in URL
try:
number_pages = int(process_type.split("_")[-1])
except Exception as e:
number_pages = -1
FetchMissingKids().run(number_pages=number_pages)
elif ("process_" in process_type):
# Batch size encoded in URL
try:
batch_size = int(process_type.split("_")[-1])
except Exception as e:
batch_size = None
# Task type
if ("process_raw_urls" in process_type):
DB_Handler().process_raw_urls(batch_size=batch_size)
elif ("process_error_urls" in process_type):
DB_Handler().process_error_urls(batch_size=batch_size)
elif ("process_missing_kids_urls" in process_type):
DB_Handler().process_missing_kids_urls(batch_size=batch_size)
else:
logger.info("Task unknown!: {}".format(process_type))
logger.info("Task completed: {}".format(process_type))
except Exception as e:
logger.error(e)
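
The @job('default') functions remain plain callables, so they can be run synchronously for debugging or enqueued for a worker; the .delay() call below assumes the scheduler's RQ-style decorator keeps that helper (a sketch, not part of this commit):

from fetcher.tasks import fetch_feeds, process_raw_urls

# Synchronous run, e.g. from `manage.py shell`, useful for debugging
process_raw_urls(batch_size=10)

# Enqueue on the 'default' queue for a worker to pick up
# (assumes the @job decorator exposes the RQ-style .delay helper)
fetch_feeds.delay()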


@@ -0,0 +1,179 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Charts</title>
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
<style>
body {
background-color: #333;
color: #fff;
font-family: Arial, sans-serif;
}
h2 {
color: #fff;
text-align: center;
margin-bottom: 40px;
}
.chart-container {
width: 45%;
display: inline-block;
margin: 20px;
background-color: #444;
border-radius: 10px;
padding: 5px;
}
canvas {
background-color: #2c2c2c;
border-radius: 5px;
}
.container {
display: flex;
justify-content: center;
flex-wrap: wrap;
}
.filter-container {
text-align: center;
margin-bottom: 20px;
}
select {
padding: 8px;
background-color: #555;
color: white;
border: 1px solid #444;
border-radius: 5px;
}
</style>
</head>
<body>
<h2>Data Visualizations</h2>
<!-- Filter for Number of Days -->
<div class="filter-container">
<label for="daysFilter">Select Number of Days:</label>
<select id="daysFilter">
<option value="0.0625">Last 90 Minutes</option>
<option value="0.25">Last 6 Hours</option>
<option value="1">Last 24 Hours</option>
<option value="7" selected>Last 7 Days</option>
<option value="30">Last 30 Days</option>
<option value="90">Last 90 Days</option>
<option value="365">Last 365 Days</option>
</select>
</div>
<div class="container">
<div class="chart-container">
<canvas id="urlFetchDateChart"></canvas>
</div>
<div class="chart-container">
<canvas id="urlStatusChart"></canvas>
</div>
<div class="chart-container">
<canvas id="urlsPerSourceChart"></canvas>
</div>
<div class="chart-container">
<canvas id="urlsPerSearchChart"></canvas>
</div>
</div>
<script>
$(document).ready(function () {
let chartInstances = {}; // Store chart instances
// Fetch initial data (default 7 days)
const defaultDays = 7;
fetchDataAndRenderCharts(defaultDays);
// Apply the filter automatically when the user changes the selection
$('#daysFilter').on('change', function () {
const selectedDays = $(this).val();
fetchDataAndRenderCharts(selectedDays);
});
function fetchDataAndRenderCharts(days) {
fetchAndRenderChart(`/urls-by-fetch-date/?days=${days}`, 'urlFetchDateChart', 'URLs by Fetch Date', 'bar');
fetchAndRenderChart(`/urls-per-status/?days=${days}`, 'urlStatusChart', 'URLs by Status', 'bar');
fetchAndRenderChart(`/urls-per-source/?days=${days}`, 'urlsPerSourceChart', 'URLs by Source', 'bar');
fetchAndRenderChart(`/urls-per-search/?days=${days}`, 'urlsPerSearchChart', 'URLs by Search', 'bar');
}
const categoryColors = {
'URLs by Fetch Date': '#4BC0C0', // Color for this category
'URLs by Status': '#36A2EB', // Color for this category
'URLs by Source': '#4BC0C0', // Color for this category
'URLs by Search': '#36A2EB' // Color for this category
};
const maxLabelLength = 35; // Truncate X-axis labels to 35 characters
function fetchAndRenderChart(url, canvasId, chartTitle, chartType) {
$.getJSON(url, function (data) {
if (chartInstances[canvasId]) {
chartInstances[canvasId].destroy(); // Destroy previous chart
}
const ctx = document.getElementById(canvasId).getContext('2d');
chartInstances[canvasId] = new Chart(ctx, {
type: chartType,
data: {
labels: data.labels, // Ensure labels are passed as strings
datasets: [{
label: chartTitle,
data: data.values,
backgroundColor: categoryColors[chartTitle], // Assign the same color based on category
}]
},
options: {
responsive: true,
plugins: {
legend: {
labels: { color: '#fff' }
}
},
scales: {
x: {
ticks: {
color: "#fff", // Set the color of x-axis ticks
callback: function (value) {
let label = data.labels[value];
if (label.length > maxLabelLength) { return label.slice(0, maxLabelLength) + '...'; }
return label;
}
},
grid: {
color: "#444" // Set the grid lines color
}
},
y: {
ticks: {
color: "#fff" // Set the color of y-axis ticks
},
grid: {
color: "#444" // Set the grid lines color
}
}
}
}
});
});
}
});
</script>
</body>
</html>
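
The template expects each endpoint (/urls-per-status/, /urls-by-fetch-date/, ...) to return JSON shaped as {"labels": [...], "values": [...]} and to honor the ?days= filter (fractional days allowed). A hedged sketch of one such view; the view name and URL wiring are assumptions, not part of this excerpt:

from datetime import timedelta

from django.db.models import Count
from django.http import JsonResponse
from django.utils import timezone

from .models import Urls

def urls_per_status(request):
    days = float(request.GET.get("days", 7))   # e.g. 0.25 == last 6 hours
    since = timezone.now() - timedelta(days=days)
    rows = (
        Urls.objects.filter(ts_fetch__gte=since)
        .values("status")
        .order_by()                            # clear default ordering before grouping
        .annotate(n=Count("id"))
        .order_by("-n")
    )
    return JsonResponse({
        "labels": [r["status"] for r in rows],
        "values": [r["n"] for r in rows],
    })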


@@ -0,0 +1,580 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>URLs</title>
<!--
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
-->
<style>
/* General Styling */
body {
font-family: Arial, sans-serif;
margin: 0;
padding: 0;
background-color: #fff;
color: #333;
/*transition: background 0.3s ease, color 0.3s ease;*/
}
/* Dark Mode Styles */
.dark-mode {
background-color: #121212;
color: #e0e0e0;
}
/* Default Link Style */
a {
color: #0066cc; /* Default color for links */
text-decoration: none;
}
a:hover {
text-decoration: underline;
}
/* Dark Mode Links */
.dark-mode a {
color: #52a8ff; /* Adjust this color to make the link more visible in dark mode */
}
.dark-mode a:hover {
color: #66ccff; /* Change the hover color to something lighter or a contrasting color */
}
/* Layout */
.container {
display: flex;
}
/* Sidebar */
.sidebar {
min-width: 110px; /* Minimum width */
max-width: 200px; /* Maximum width */
width: 100%; /* Make it take full width within the defined min and max */
padding: 5px;
box-sizing: border-box; /* Ensure padding doesn't increase the overall width */
transition: width 0.3s ease-in-out; /* Smooth transition for resizing */
background-color: #f4f4f4;
word-wrap: break-word; /* Allow wrapping of long words */
overflow-wrap: break-word; /* Ensures wrapping across browsers */
white-space: normal; /* Ensure normal word wrapping */
}
.dark-mode .sidebar {
background-color: #1e1e1e;
}
/* Sidebar Headers */
.sidebar h3 {
margin-top: 15px;
margin-bottom: 2px;
font-size: 16px;
}
/* Table Container */
.table-container {
flex-grow: 1;
}
/* Table */
table {
width: 97.5%;
border-collapse: collapse;
margin-top: 20px;
}
table, th, td {
border: 1px solid #ddd;
}
th, td {
padding: 10px;
text-align: left;
}
/* Dark Mode Table */
.dark-mode table {
border-color: #444;
}
.dark-mode th, .dark-mode td {
border-color: #555;
}
/* Dark Mode Checkbox Labels */
.dark-mode label {
color: #e0e0e0;
}
/* Checkbox Styling */
input[type="checkbox"] {
cursor: pointer;
}
/* Themed Toggle Button */
.theme-button, .home-button, .chart-button {
background-color: var(--sidebar);
border: 1px solid var(--sidebar);
border-radius: 50%;
width: 30px;
height: 45px;
font-size: 25px;
display: flex;
align-items: center;
justify-content: center;
transition: background-color 0.1s, color 0.1s, transform 0.1s;
cursor: pointer;
}
.theme-button:hover, .home-button:hover, .chart-button:hover {
transform: rotate(20deg);
}
.theme-button:active, .home-button:active, .chart-button:active {
transform: scale(0.95);
}
.button-container {
display: flex;
align-items: center;
gap: 10px; /* Space between buttons */
}
/* PAGINATION */
.pagination-container {
display: flex;
justify-content: center;
align-items: center;
gap: 10px;
font-family: Arial, sans-serif;
}
.pagination-link {
padding: 8px 15px;
background-color: #007bff;
color: white;
text-decoration: none;
border-radius: 25px;
font-size: 14px;
display: inline-block;
transition: background-color 0.3s ease, transform 0.2s ease;
}
.pagination-link:hover {
background-color: #0056b3;
transform: scale(1.1);
}
.pagination-link:active {
background-color: #003366;
transform: scale(0.95);
}
.first-page, .last-page {
font-weight: bold;
}
.prev-page, .next-page {
font-weight: normal;
}
/* ROUNDED SWITCH*/
/* Hide the default checkbox */
.checkbox-slider {
display: none;
}
/* Container for the toggle switch */
.slider-container {
display: inline-block;
width: 60px;
height: 30px;
position: relative;
}
/* Label for the slider */
.slider-container label {
display: block;
position: absolute;
top: 0;
left: 0;
width: 100%;
height: 100%;
background-color: #ccc;
border-radius: 30px;
cursor: pointer;
transition: background-color 0.3s ease;
}
/* The toggle circle */
.slider-container label::before {
content: '';
position: absolute;
top: 3px;
left: 3px;
width: 24px;
height: 24px;
background-color: white;
border-radius: 50%;
transition: transform 0.3s ease;
}
/* When the checkbox is checked */
.checkbox-slider:checked + .slider-container label {
background-color: #0940b8;
}
/* When the checkbox is checked, move the circle */
.checkbox-slider:checked + .slider-container label::before {
transform: translateX(30px);
}
</style>
</head>
<body>
{% load custom_filters %}
<div class="container">
<div class="sidebar">
<div class="button-container">
<button id="homeButton" class="home-button">🏠</button>
<button id="themeToggle" class="theme-button">🌙</button>
<button id="chartButton" class="chart-button">📊</button>
</div>
<form method="GET" action="" id="filterForm">
<!-- Switch: Table / Charts
<form>
<label>
<input type="radio" name="view" value="table" checked id="tableRadio"> Table
</label>
<label>
<input type="radio" name="view" value="chart" id="chartRadio"> Charts
</label>
</form>
-->
<!-- Rounded switch
<input type="checkbox" id="toggle" class="checkbox-slider">
<div class="slider-container">
<label for="toggle"></label>
<span class="slider-text">
<span id="onText" class="on-text">ON</span>
<span id="offText" class="off-text">OFF</span>
</span>
</div>
-->
<!-- Results Per Page Dropdown -->
<h3>Results Per Page</h3>
<select id="perPageSelect" name="per_page">
<option value="25" {% if per_page|stringformat:"s" == '25' %}selected{% endif %}>25</option>
<option value="100" {% if per_page|stringformat:"s" == '100' %}selected{% endif %}>100</option>
<option value="500" {% if per_page|stringformat:"s" == '500' %}selected{% endif %}>500</option>
</select>
<br>
<!-- Filter by Time Range -->
<h3>Fetch Date</h3>
<select id="timeFilterSelect" name="days">
<!--
{% for form_days in form_days_list %}
<option value=form_days.1|stringformat:"s" {% if selected_days|stringformat:"s" == form_days.1|stringformat:"s" %}selected{% endif %}>form_days.2</option>
{% endfor %}
-->
<option value="0.25" {% if selected_days|stringformat:"s" == '0.25' %}selected{% endif %}>Last 6 hours</option>
<option value="1" {% if selected_days|stringformat:"s" == '1' %}selected{% endif %}>Last 24 hours</option>
<option value="7" {% if selected_days|stringformat:"s" == '7' %}selected{% endif %}>Last 7 days</option>
<option value="30" {% if selected_days|stringformat:"s" == '30' %}selected{% endif %}>Last 30 days</option>
<option value="90" {% if selected_days|stringformat:"s" == '90' %}selected{% endif %}>Last 90 days</option>
<option value="365" {% if selected_days|stringformat:"s" == '365' %}selected{% endif %}>Last 365 days</option>
</select>
<br>
<!-- Filter by Status -->
<h3>Status</h3>
<button type="button" class="toggle-all-btn" data-toggle="status">Toggle All</button><br>
{% for status in statuses %}
<label>
<input type="checkbox" name="status" value="{{ status.0 }}"
{% if status.0 in selected_status or 'all' in selected_status %}checked{% endif %}>
{{ status.1 }}
</label><br>
{% endfor %}
<!-- Filter by valid content -->
<h3>Valid content</h3>
<button type="button" class="toggle-all-btn" data-toggle="valid_content">Toggle All</button><br>
{% for vc in valid_contents %}
<label>
<input type="checkbox" name="valid_content" value="{{ vc }}"
{% if vc|stringformat:"s" in selected_valid_contents or 'all' in selected_valid_contents%}checked{% endif %}>
{{ vc|truncatechars:50 }}
</label><br>
{% endfor %}
<!-- Filter by Search -->
<h3>Search</h3>
<button type="button" class="toggle-all-btn" data-toggle="search">Toggle All</button><br>
{% for search in searches %}
<label>
<input type="checkbox" name="search" value="{{ search.id }}"
{% if search.id|stringformat:"s" in selected_search or 'all' in selected_search %}checked{% endif %}>
[{{ search.type }}] {{ search.search|truncatechars:50 }}
</label><br>
{% endfor %}
<!-- Filter by Source -->
<h3>Source</h3>
<button type="button" class="toggle-all-btn" data-toggle="source">Toggle All</button><br>
{% for source in sources %}
<label>
<input type="checkbox" name="source" value="{{ source.id }}"
{% if source.id|stringformat:"s" in selected_source or 'all' in selected_source %}checked{% endif %}>
{{ source.source|truncatechars:50 }}
</label><br>
{% endfor %}
<!-- Filter by language -->
<h3>Language</h3>
<button type="button" class="toggle-all-btn" data-toggle="language">Toggle All</button><br>
{% for lang in languages %}
<label>
<input type="checkbox" name="language" value="{{ lang }}"
{% if lang|stringformat:"s" in selected_language or 'all' in selected_language%}checked{% endif %}>
{{ lang|truncatechars:50 }}
</label><br>
{% endfor %}
</form>
</div>
<!-- Table URLs data -->
<div class="table-container">
<table>
<thead>
<tr>
<th>ID</th>
<th>URL</th>
<th>Status</th>
<th>Fetch Date</th>
<th>Source</th>
<th>Search</th>
<th>Valid content?</th>
<th>Language</th>
</tr>
</thead>
<tbody>
{% for url in urls %}
<tr>
<td><a href="./{{ url.id }}" class="btn btn-primary btn-sm" target="_blank">{{ url.id }}</a></td>
<td><a href="{{ url.url }}/" target="_blank">{{ url.url }}</a></td>
<td>
{% if url.status == 'raw' %}
<span class="badge bg-secondary">{{ url.status|capfirst }}</span>
{% elif url.status == 'error' %}
<span class="badge bg-danger">{{ url.status|capfirst }}</span>
{% elif url.status == 'valid' %}
<span class="badge bg-success">{{ url.status|capfirst }}</span>
{% elif url.status == 'unknown' %}
<span class="badge bg-warning">{{ url.status|capfirst }}</span>
{% elif url.status == 'invalid' %}
<span class="badge bg-danger">{{ url.status|capfirst }}</span>
{% elif url.status == 'duplicate' %}
<span class="badge bg-info">{{ url.status|capfirst }}</span>
{% else %}
<span class="badge bg-light">Unknown</span>
{% endif %}
</td>
<td>
<span class="ts-fetch" data-ts="{{ url.ts_fetch|date:'c' }}"></span>
</td>
<td>
{% with sources_map|dict_get:url.id as sources %}
{% if sources %}
{% for source in sources %}
<span class="badge bg-secondary">{{ source }}</span>
{% endfor %}
{% else %}
<span class="text-muted">No sources</span>
{% endif %}
{% endwith %}
</td>
<td>
{% with searches_map|dict_get:url.id as searches %}
{% if searches %}
{% for search in searches %}
<span class="badge bg-secondary">{{ search }}</span>
{% endfor %}
{% else %}
<span class="text-muted">No searches</span>
{% endif %}
{% endwith %}
</td>
<td>
{% with url_content_map|dict_get:url.id as content %}
{{ content.valid_content }}
{% endwith %}
</td>
<td>
{% with url_content_map|dict_get:url.id as content %}
{{ content.language }}
{% endwith %}
</td>
</tr>
{% empty %}
<tr>
<td colspan="5">No URLs found for the selected filters.</td>
</tr>
{% endfor %}
</tbody>
</table>
<!-- Pagination Controls -->
<div class="pagination">
<!-- <div class="pagination-controls"> -->
<div class="pagination-container" style="margin-top: 20px;margin-bottom: 20px;">
{% if urls.has_previous %}
<a href="#" class="pagination-link" data-page="1">« First</a>
<a href="#" class="pagination-link" data-page="{{ urls.previous_page_number }}">Previous</a>
{% endif %}
<span>Page {{ urls.number }} of {{ urls.paginator.num_pages }}</span>
{% if urls.has_next %}
<a href="#" class="pagination-link" data-page="{{ urls.next_page_number }}">Next</a>
<a href="#" class="pagination-link" data-page="{{ urls.paginator.num_pages }}">Last »</a>
{% endif %}
</div>
</div>
</div>
</div>
<script>
//////////////////////////////////////////////////////////////////////
document.addEventListener("DOMContentLoaded", function () {
//////////////////////////////////////////////
// Theme & Home
const themeToggle = document.getElementById("themeToggle");
const body = document.body;
// Load theme from localStorage
if (localStorage.getItem("theme") === "dark") {
body.classList.add("dark-mode");
themeToggle.textContent = "🌞";
}
// Toggle theme on button click
themeToggle.addEventListener("click", function () {
if (body.classList.contains("dark-mode")) {
body.classList.remove("dark-mode");
localStorage.setItem("theme", "light");
themeToggle.textContent = "🌙";
} else {
body.classList.add("dark-mode");
localStorage.setItem("theme", "dark");
themeToggle.textContent = "🌞";
}
});
// Home
document.getElementById("homeButton").addEventListener("click", function () {
window.location.href = "./"; // Change this to your homepage URL if different
});
// Charts
document.getElementById("chartButton").addEventListener("click", function () {
window.location.href = "./charts"; // Change this to your homepage URL if different
});
//////////////////////////////////////////////
// Timestamp to local timezone
document.querySelectorAll(".ts-fetch").forEach(element => {
let utcDate = element.getAttribute("data-ts"); // Get timestamp from data attribute
let options = { year: 'numeric', month: 'numeric', day: 'numeric', hour: '2-digit', minute: '2-digit', second: '2-digit', hour12:false};
if (utcDate) {
let localDate = new Date(utcDate).toLocaleString("en-GB", options); // Convert to local timezone
element.textContent = localDate; // Update the text content
}
});
//////////////////////////////////////////////
});
//////////////////////////////////////////////////////////////////////
// Function to update pagination links
function updatePaginationLinks(pageNumber) {
// Get current URL and remove existing page parameter
const currentUrl = new URL(window.location.href);
currentUrl.searchParams.set('page', pageNumber); // Update page parameter
window.location.href = currentUrl.toString(); // Redirect to the updated URL
}
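// Example: from "?days=7&page=2", clicking a link with data-page="3" navigates to "?days=7&page=3".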
// Attach event listeners to pagination links
document.querySelectorAll('.pagination-link').forEach(link => {
link.addEventListener('click', function(e) {
e.preventDefault();
const pageNumber = this.getAttribute('data-page');
updatePaginationLinks(pageNumber); // Update the page number in the URL
});
});
// Function to update the form parameters for all sections before submitting
function updateFormParameters() {
// Get all distinct sections by selecting all checkboxes and extracting their "name" attributes
const sections = new Set([...document.querySelectorAll("input[type='checkbox']")].map(cb => cb.name));
sections.forEach(section => {
if (!section) return; // Skip any checkboxes without a name
const checkboxes = document.querySelectorAll(`[name='${section}']`);
const allChecked = Array.from(checkboxes).every(checkbox => checkbox.checked);
// If all checkboxes in a section are checked, remove them and add a hidden input
if (allChecked) {
checkboxes.forEach(checkbox => checkbox.removeAttribute("name"));
let hiddenInput = document.createElement("input");
hiddenInput.type = "hidden";
hiddenInput.name = section;
hiddenInput.value = "all";
document.getElementById("filterForm").appendChild(hiddenInput);
} else {
checkboxes.forEach(checkbox => checkbox.setAttribute("name", section));
document.querySelectorAll(`input[name="${section}"][type="hidden"]`).forEach(hiddenInput => hiddenInput.remove());
}
});
// Submit the form after updating all sections
document.getElementById("filterForm").submit();
}
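// Example: if every "status" checkbox is checked, the submitted query string contains a single
// "status=all" instead of one "status=<value>" pair per checkbox, keeping the URL short.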
//////////////////////////////////////////////////////////////////////
// Function to toggle all checkboxes in a section
function toggleCheckboxes(section) {
const checkboxes = document.querySelectorAll(`[name='${section}']`);
const allChecked = Array.from(checkboxes).every(checkbox => checkbox.checked);
checkboxes.forEach(cb => cb.checked = !allChecked);
updateFormParameters();
}
// Attach event listeners to "Toggle All" buttons
document.querySelectorAll('.toggle-all-btn').forEach(button => {
button.addEventListener('click', function() {
const section = this.getAttribute('data-toggle');
toggleCheckboxes(section);
});
});
//////////////////////////////////////////////////////////////////////
// Automatically submit the form when any checkbox changes
document.querySelectorAll('input[type="checkbox"]').forEach(function(checkbox) {
checkbox.addEventListener('change', function() {
updateFormParameters();
});
});
document.getElementById('perPageSelect').addEventListener('change', function() {
updateFormParameters();
});
document.getElementById('timeFilterSelect').addEventListener('change', function() {
updateFormParameters();
});
</script>
</body>
</html>

View File

@@ -0,0 +1,297 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{% block title %}News{% endblock %}</title>
<!-- Bootstrap CSS -->
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
<!-- Add jQuery from CDN (before other scripts) -->
<script src="https://code.jquery.com/jquery-3.6.4.min.js"></script>
<!-- Markdown -->
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
<!-- Bootstrap JS -->
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>
<!-- Custom Styles -->
<style>
body {
background-color: #f4f4f4;
}
.navbar-dark .navbar-nav .nav-link {
color: rgba(255,255,255,0.75);
}
.chat-box {
background-color: #fff;
border: 1px solid #ddd;
padding: 15px;
border-radius: 8px;
overflow-y: auto; /* Enable vertical scrolling */
max-width: 100%;
min-height: 150px;
max-height: 450px;
white-space: normal;
word-wrap: break-word;
word-break: break-word;
}
.table {
table-layout: auto;
width: 100%;
}
th {
white-space: nowrap;
}
td {
word-wrap: break-word;
overflow-wrap: break-word;
}
/* Sidebar */
.sidebar {
min-width: 110px; /* Minimum width */
max-width: 200px; /* Maximum width */
width: 100%; /* Make it take full width within the defined min and max */
padding: 5px;
box-sizing: border-box; /* Ensure padding doesn't increase the overall width */
transition: width 0.3s ease-in-out; /* Smooth transition for resizing */
background-color: #f4f4f4;
word-wrap: break-word; /* Allow wrapping of long words */
overflow-wrap: break-word; /* Ensures wrapping across browsers */
white-space: normal; /* Ensure normal word wrapping */
}
.dark-mode .sidebar {
background-color: #1e1e1e;
}
</style>
<script>
//////////////////////////////////////////////////////////////////////
document.addEventListener("DOMContentLoaded", function () {
//////////////////////////////////////////////
// Timestamp to local timezone
document.querySelectorAll(".ts-fetch").forEach(element => {
let utcDate = element.getAttribute("data-ts"); // Get timestamp from data attribute
let options = { year: 'numeric', month: 'numeric', day: 'numeric', hour: '2-digit', minute: '2-digit', second: '2-digit', hour12:false};
if (utcDate) {
let localDate = new Date(utcDate).toLocaleString("en-GB", options); // Convert to local timezone
element.textContent = localDate; // Update the text content
}
});
document.querySelectorAll(".ts-publish").forEach(element => {
let utcDate = element.getAttribute("data-ts"); // Get timestamp from data attribute
let options = { year: 'numeric', month: 'numeric', day: 'numeric', hour: '2-digit', minute: '2-digit', second: '2-digit', hour12:false};
if (utcDate) {
let localDate = new Date(utcDate).toLocaleString("en-GB", options); // Convert to local timezone
element.textContent = localDate; // Update the text content
}
});
});
function fetchDetails(urlId, url) {
// Show the loading spinner
document.getElementById("loading-spinner").style.display = "block";
// Get the input value
let inputText = document.getElementById(`custom-input-${urlId}`).value;
// Get the input model
let selectedModel = document.getElementById(`options-${urlId}`).value;
// Check if a model is selected
if (!selectedModel) {
alert("Please select a model before fetching details.");
document.getElementById("loading-spinner").style.display = "none"; // Hide the spinner again before bailing out
return;
}
// Fetch URL
let fetchUrl = `/urls/${urlId}/fetch/?url=${encodeURIComponent(url)}&model=${encodeURIComponent(selectedModel)}&text=${encodeURIComponent(inputText)}`;
let resultContainer = $("#chat-output");
resultContainer.html(""); // Clear previous content before fetching
let fetchButton = $("button[onclick^='fetchDetails']"); // Select the button
fetchButton.prop("disabled", true); // Disable button
fetch(fetchUrl/*, {
method: "POST",
body: JSON.stringify({
text: inputText
}),
headers: {
"Content-type": "application/json; charset=UTF-8"
}
}*/).then(response => {
if (!response.ok) {
throw new Error("Error on network response");
}
const reader = response.body.getReader();
const decoder = new TextDecoder();
let accumulatedText = ""; // Store streamed text before rendering Markdown
let messageContainer = $('<div class="chat-message"></div>'); // Create a temporary container for streaming response
resultContainer.append(messageContainer);
function read() {
return reader.read().then(({ done, value }) => {
if (done) {
messageContainer.html(marked.parse(accumulatedText));
fetchButton.prop("disabled", false); // Re-enable button when done
return;
}
// Decode the streamed chunk
let chunk = decoder.decode(value);
// Append to the accumulated text
accumulatedText += chunk;
// Render Markdown progressively (but safely)
messageContainer.html(marked.parse(accumulatedText));
// Auto-scroll to bottom
resultContainer.scrollTop(resultContainer[0].scrollHeight);
return read();
});
}
return read();
})
.catch(error => {
resultContainer.html(`<p class="text-danger">Error fetching details: ${error.message}</p>`);
fetchButton.prop("disabled", false); // Re-enable button on error
})
.finally(() => {
// Hide the loading spinner after request is complete
document.getElementById("loading-spinner").style.display = "none";
});
}
</script>
</head>
<body>
<!--
<div class="sidebar">
<div class="button-container">
<button id="homeButton" class="home-button">🏠</button>
<button id="themeToggle" class="theme-button">🌙</button>
</div>
</div>
-->
<!-- Main Content -->
<div class="container mt-4">
<!-- <h2>URL Details</h2> -->
<table class="table table-bordered">
<tr>
<th>URL</th>
<td><a href="{{ url_item.url|safe }}" target="_blank">{{ url_item.url }}</a></td>
</tr>
<tr>
<th>Fetch Date</th>
<td> <span class="ts-fetch" data-ts="{{ url_item.ts_fetch|date:'c' }}"></span> </td>
</tr>
<tr>
<th>Source</th>
<td>{{ sources|join:", " }}</td>
</tr>
<tr>
<th>Search</th>
<td>{{ searches|join:", " }}</td>
</tr>
<tr>
<th>Status</th>
<td>{{ url_item.status }}</td>
</tr>
<tr>
<th>URL host</th>
<td> <a href="{{ url_content.url_host|safe }}" target="_blank">{{ url_content.url_host }}</a> </td>
</tr>
<tr>
<th>Site name</th>
<td>{{ url_content.site_name|default:"" }}</td>
</tr>
<tr>
<th>Published Date</th>
<td> <span class="ts-publish" data-ts="{{ url_content.date_published|date:'c' }}"></span> </td>
</tr>
<tr>
<th>Valid news content?</th>
<td>{{ url_content.valid_content }}</td>
</tr>
<tr>
<th>Tags</th>
<td>{{ url_content.tags|default:"" }}</td>
</tr>
<tr>
<th>Authors</th>
<td>{{ url_content.authors|default:"" }}</td>
</tr>
<tr>
<th>Keywords</th>
<td>{{ url_content.keywords|default:"" }}</td>
</tr>
<tr>
<th>Language</th>
<td>{{ url_content.language|default:"" }}</td>
</tr>
<tr>
<th>Main image</th>
<td><a href="{{ url_content.image_main_url|safe }}" target="_blank">{{ url_content.image_main_url|default:"" }}</a></td>
</tr>
<tr>
<th>Image URLs</th>
<td>{{ url_content.images_url|default:"" }}</td>
</tr>
<tr>
<th>Video URLs</th>
<td>{{ url_content.videos_url|default:"" }}</td>
</tr>
<tr>
<th>Title</th>
<td>{{ url_content.title|default:"" }}</td>
</tr>
<tr>
<th>Description</th>
<td>{{ url_content.description|default:"" }}</td>
</tr>
<tr>
<th>Content</th>
<td>{{ url_content.content|default:"" }}</td>
</tr>
</table>
<!-- Independent form for optional values -->
<form onsubmit="fetchDetailsWithSelection(event, {{ url_item.id }}, '{{ url_item.url }}')">
<label for="options-{{ url_item.id }}">Model:</label>
<select id="options-{{ url_item.id }}" class="form-control mb-2">
{% for model in models %}
<option value="{{ model }}">{{ model }}</option>
{% endfor %}
</select>
</form>
<!-- Input field with a default value -->
<label for="custom-input-{{ url_item.id }}">Prompt:</label>
<textarea id="custom-input-{{ url_item.id }}" class="form-control mb-2" rows="5">{{ prompt }}
{{ url_item.url }}</textarea>
<div class="d-flex align-items-center">
<!-- Fetch details button -->
<button class="btn btn-primary" onclick="fetchDetails({{ url_item.id }}, '{{ url_item.url }}')">
Fetch Details
</button>
<!-- Loading Spinner (Hidden by Default) -->
<div id="loading-spinner" class="spinner-border text-primary ms-2" role="status" style="display: none;">
<span class="visually-hidden">Loading...</span>
</div>
</div>
<!-- Chatbot-style response box -->
<div class="chat-box mt-3 p-3 border rounded">
<div id="chat-output"></div>
</div>
</div>
{% block extra_js %}{% endblock %}
</body>
</html>

View File

@@ -0,0 +1,8 @@
from django import template
register = template.Library()
@register.filter
def dict_get(dictionary, key):
"""Custom filter to get a value from a dictionary in Django templates."""
return dictionary.get(key, [])
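# Template usage (as in filtered_urls.html):
#   {% with sources_map|dict_get:url.id as sources %} ... {% endwith %}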

View File

@@ -0,0 +1,3 @@
from django.test import TestCase
# Create your tests here.

20
app_urls/fetcher/urls.py Normal file
View File

@@ -0,0 +1,20 @@
from django.urls import path
from . import views
urlpatterns = [
path('', views.link_list, name='link_list'),
#
path('logs/<str:log_type>', views.logs, name='logs'),
#
path('task/<str:task>', views.trigger_task, name='trigger_task'),
#
path('urls/charts/', views.charts, name='charts'),
path('urls-by-fetch-date/', views.urls_by_fetch_date, name='urls_by_fetch_date'),
path('urls-per-status/', views.urls_per_status, name='urls_per_status'),
path('urls-per-source/', views.urls_per_source, name='urls_per_source'),
path('urls-per-search/', views.urls_per_search, name='urls_per_search'),
#
path('urls/', views.filtered_urls, name='filtered_urls'),
path('urls/<int:id>/', views.url_detail_view, name='url_detail'),
path('urls/<int:id>/fetch/', views.fetch_details, name='fetch_details'),
]

356
app_urls/fetcher/views.py Normal file
View File

@@ -0,0 +1,356 @@
from .tasks import background_task
from django.core.paginator import Paginator
from django.shortcuts import render, get_object_or_404
from django.http import StreamingHttpResponse, JsonResponse, HttpResponse
from django.contrib.auth.decorators import login_required
import ollama
from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch
import os
####################################################################################################
def trigger_task(request, task):
# Enqueue function in "default" queue
background_task.delay(task)
return JsonResponse({"message": "Task has been enqueued!", "task": task})
####################################################################################################
def link_list(request):
prefix = "http://localhost:8000/task"
links = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all", "process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"]
list_links = [
# DB
"http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id&limit=500",
# Admin panel
"http://localhost:8000/admin",
# Logs
"http://localhost:8000/logs/debug",
"http://localhost:8000/logs/info",
"http://localhost:8000/logs/error",
# URLs
"http://localhost:8000/urls",
# Charts
"http://localhost:8000/urls/charts",
# Fetcher tasks
] + [os.path.join(prefix, l) for l in links]
# Json
return JsonResponse({"links": list_links })
####################################################################################################
# @login_required(login_url='/admin')
def logs(request, log_type):
# Capture output: python manage.py rqstats
try:
with open( os.path.join( os.getenv("PATH_LOGS_DIRECTORY", "logs"), "{}.log".format(log_type) ), "r") as f:
file_content = f.read()
except Exception as e:
file_content = "Error reading logs for log type :{}".format(log_type)
return HttpResponse(file_content, content_type="text/plain")
####################################################################################################
class OllamaClient():
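"""Thin wrapper around the Ollama client used by the URL detail views.
The host is read from ENDPOINT_OLLAMA; get_models() lists the available models
and moves the default ("llama3.2:3b") to the front so the template pre-selects it.
"""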
def __init__(self):
self.client = ollama.Client(host=os.getenv("ENDPOINT_OLLAMA", "https://ollamamodel.matitos.org"))
def _get_default_model(self):
return "llama3.2:3b"
def get_models(self):
models = sorted([m.model for m in self.client.list().models])
if (self._get_default_model() in models):
return [self._get_default_model()] + [m for m in models if m != self._get_default_model()]
else:
return models
def get_prompt(self):
return "Rewrite the text below into a clear and concise summary, presenting the key points as if they are newly written insights. Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states'. Instead, write in a natural, standalone format that feels like an original explanation. Keep it brief, engaging, informative, in the style of a news article, and no longer than a paragraph:"
#return "Provide a summary of the content below, avoid mentioning the source of information, and only answer with the summary. The summary needs to be brief and compact, consisting of one paragraph."
#return "Explain in a single and compact paragraph the what, why, when, where, who, and how of the content below. Also provide a single paragraph summary of the content:"
#return "Provide in one paragraph the what, why, when, where, who, and how of the content below. Also provide a one paragraph summary of the content:"
#return "Provide two summaries of the content below, and avoid mentioning the source of information. First, provide a very brief and compact paragraph summary. Second, provide a larger and more detailed summary, which describe the what, why, when, where, who, and how of the content:"
# return "Imagine you are a journalist, TLDR in a paragraph. Only answer with the summary:"
#return "Below you will find the whole content of a news article:\n{}\nProvide a concise summary of one paragraph maximum of the content.".format(content)
# TODO: move to ollamajs...
def fetch_details(request, id):
url_item = get_object_or_404(Urls, id=id)
url_param = request.GET.get("url", "") # Get URL
model = request.GET.get("model", "") # Get LLM model
text = request.GET.get("text", "") # Get LLM prompt
# print(request)
# print(text)
# LLM
ollama = OllamaClient()
def stream_response():
msg_content = {
"role": "user",
"content": text,
}
response = ollama.client.chat(model=model, messages=[msg_content], stream=True)
for chunk in response:
yield chunk["message"]["content"] # Stream each chunk of text
return StreamingHttpResponse(stream_response(), content_type="text/plain")
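# The JS in url_detail.html consumes this plain-text stream with response.body.getReader()
# and renders the accumulated Markdown as chunks arrive.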
def url_detail_view(request, id):
url_item = get_object_or_404(Urls, id=id)
url_sources = list(Source.objects.filter(urlssourcesearch__id_url=url_item).distinct())
url_searches = list(Search.objects.filter(urlssourcesearch__id_url=url_item).distinct())
# url_source_search = UrlsSourceSearch.objects.filter(id_url=url_item)
try:
url_content = UrlContent.objects.get(pk=id)
except UrlContent.DoesNotExist:
url_content = {}
# TODO: https://github.com/ollama/ollama-python?tab=readme-ov-file#async-client
ollama = OllamaClient()
context = {
'url_item': url_item,
'sources': url_sources,
'searches': url_searches,
'models': ollama.get_models(),
'prompt': ollama.get_prompt(),
'url_content': url_content,
}
return render(request, 'url_detail.html', context)
####################################################################################################
from django.shortcuts import render
from django.http import JsonResponse
from django.db.models import Count
from datetime import timedelta
from django.utils import timezone
from .models import Urls, UrlsSourceSearch
def charts(request):
return render(request, 'charts.html')
def urls_by_fetch_date(request):
# Get the filtering date parameter
days = float(request.GET.get('days', 30)) # Default is 30 days
start_date = timezone.now() - timedelta(days=days)
# Count the number of URLs grouped by fetch date
urls_data = Urls.objects.filter(ts_fetch__gte=start_date) \
.values('ts_fetch__date') \
.annotate(count=Count('id')) \
.order_by('ts_fetch__date')
# Format data to return as JSON
data = {
'labels': [item['ts_fetch__date'] for item in urls_data],
'values': [item['count'] for item in urls_data],
}
return JsonResponse(data)
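# Illustrative response shape (dates serialize as ISO strings):
#   {"labels": ["2025-03-30", "2025-03-31"], "values": [12, 7]}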
def urls_per_status(request):
# Get the filtering date parameter
days = float(request.GET.get('days', 30)) # Default is 30 days
start_date = timezone.now() - timedelta(days=days)
# Count the number of URLs grouped by status within the date range
urls_data = Urls.objects.filter(ts_fetch__gte=start_date) \
.values('status') \
.annotate(count=Count('id')) \
.order_by('status')
# Format data for JSON
data = {
'labels': [item['status'] for item in urls_data],
'values': [item['count'] for item in urls_data],
}
return JsonResponse(data)
def urls_per_source(request):
# Get the filtering date parameter
days = float(request.GET.get('days', 30)) # Default is 30 days
start_date = timezone.now() - timedelta(days=days)
# Count the number of URLs grouped by source
urls_data = UrlsSourceSearch.objects \
.filter(id_url__ts_fetch__gte=start_date) \
.values('id_source__source') \
.annotate(count=Count('id_url')) \
.order_by('id_source__source')
# Format data for JSON
data = {
'labels': [item['id_source__source'] for item in urls_data],
'values': [item['count'] for item in urls_data],
}
return JsonResponse(data)
def urls_per_search(request):
# Get the filtering date parameter
days = float(request.GET.get('days', 30)) # Default is 30 days
start_date = timezone.now() - timedelta(days=days)
# Count the number of URLs grouped by search
urls_data = UrlsSourceSearch.objects \
.filter(id_url__ts_fetch__gte=start_date) \
.values('id_search__search') \
.annotate(count=Count('id_url')) \
.order_by('id_search__search')
# Format data for JSON
data = {
'labels': [item['id_search__search'] for item in urls_data],
'values': [item['count'] for item in urls_data],
}
return JsonResponse(data)
####################################################################################################
from django.shortcuts import render
from .models import Urls, Search, Source
from django.db.models import Q
from django.utils.timezone import now, timedelta
def filtered_urls(request):
statuses = Urls.STATUS_ENUM.choices
searches = Search.objects.all()
sources = Source.objects.all()
# TODO: Cache languages, update once every N
languages = list(UrlContent.objects.distinct('language').values_list('language', flat=True))
# Represent NULL language as "Unknown" for display and filtering
languages = ["Unknown"] + [l for l in languages if l is not None]
valid_contents = ["True", "False", "Unknown"]
# Get selected parameters
selected_status = request.GET.getlist('status', ["null"])
selected_search = request.GET.getlist('search', ["null"])
selected_source = request.GET.getlist('source', ["null"])
selected_language = request.GET.getlist('language', ["null"])
selected_valid_contents = request.GET.getlist('valid_content', ["null"])
selected_days = request.GET.get("days", 30)
per_page = request.GET.get('per_page', 100) # Default is 100 URLs per page
page_number = request.GET.get('page') # Get the current page number
all_status = [str(status[0]) for status in statuses]
all_search = [str(search.id) for search in searches]
all_source = [str(source.id) for source in sources]
all_languages = languages
all_valid_contents = valid_contents
# Apply default filters when no parameters are present (e.g. "Home" click) or when only "page" is set (pagination)
if (len(request.GET.keys()) == 0) or ((len(request.GET.keys()) == 1) and ("page" in request.GET.keys())):
selected_status = ["all"]
selected_search = ["all"]
selected_source = ["all"]
selected_language = ["all"]
selected_valid_contents = ["all"]
else:
# Non-default parameters: if a list contains every element, replace it with "all" to avoid a heavy query
if (set(selected_status) == set(all_status)):
selected_status = ["all"]
if (set(selected_search) == set(all_search)):
selected_search = ["all"]
if (set(selected_source) == set(all_source)):
selected_source = ["all"]
if (set(selected_language) == set(all_languages)):
selected_language = ["all"]
if (set(selected_valid_contents) == set(all_valid_contents)):
selected_valid_contents = ["all"]
# Filter URLs based on selected filters
if ('null' in selected_status) or ('null' in selected_search) or ('null' in selected_source) or ('null' in selected_language) or ('null' in selected_valid_contents):
urls = []
else:
# Filter by date
query = Q(ts_fetch__gte=now() - timedelta(days=float(selected_days)))
# Additional filters
if ("all" not in selected_status):
query &= Q(status__in=selected_status)
if ("all" not in selected_source):
query &= Q(urlssourcesearch__id_source__in=selected_source)
if ("all" not in selected_search):
query &= Q(urlssourcesearch__id_search__in=selected_search)
if ("all" not in selected_language):
# URLs with selected languages
subquery = Q(urlcontent__language__in=selected_language)
if ("Unknown" in selected_language):
# URLs with NULL language
subquery |= Q(urlcontent__language__isnull=True)
# URLs with no UrlContent record at all (similar to URLs with NULL language)
subquery |= Q(urlcontent__id_url__isnull=True)
# Update query
query &= (subquery)
if ("all" not in selected_valid_contents):
# Boolean array
bool_array = []
if ('True' in selected_valid_contents):
bool_array.append(True)
if ('False' in selected_valid_contents):
bool_array.append(False)
# URLs with selected valid_contents
subquery = Q(urlcontent__valid_content__in=bool_array)
if ("Unknown" in selected_valid_contents):
# URLs with NULL valid_content
subquery |= Q(urlcontent__valid_content__isnull=True)
# URLs with no UrlContent record at all (similar to URLs with NULL valid_content)
subquery |= Q(urlcontent__id_url__isnull=True)
# Update query
query &= (subquery)
# Run query
urls = Urls.objects.filter(query).distinct() # .order_by('-ts_fetch')
# print(urls.query)
# Pagination
paginator = Paginator(urls, per_page) # Paginate the filtered URLs
page_obj = paginator.get_page(page_number) # Get the current page object
# Map URL IDs to their sources & searches, only for subset of URLs (page of interest)
sources_map = {
url.id: list(Source.objects.filter(urlssourcesearch__id_url=url).distinct()) for url in page_obj.object_list
}
searches_map = {
url.id: list(Search.objects.filter(urlssourcesearch__id_url=url).distinct()) for url in page_obj.object_list
}
url_content_map = {
url.id: UrlContent.objects.filter(pk=url).first() for url in page_obj.object_list
}
# Custom replace search type text
for s in searches:
s.type = s.type.replace("rss_feed", "rss").replace("url_host", "url").replace("keyword_search", "keyword")
context = {
'urls': page_obj, # Pass the paginated URLs
'per_page': per_page, # Send per_page value for dynamic pagination
'statuses': statuses,
'searches': sorted(searches, key=lambda x: (x.type, x.search)),
'sources': sorted(sources, key=lambda x: x.source),
'languages': sorted(languages, key=lambda x: (x is None, x)),
'valid_contents': valid_contents,
# Selection
'selected_status': selected_status,
'selected_search': selected_search,
'selected_source': selected_source,
'selected_language': selected_language,
'selected_valid_contents': selected_valid_contents,
"selected_days": selected_days,
# Map
"sources_map": sources_map,
"searches_map": searches_map,
"url_content_map": url_content_map,
# "charts": charts,
# "list_per_page": [15, 100, 500],
# "list_days_text": ([0.25, 1, 7, 30, 365], ["Last 6 hours", "Last 24 hours", "Last 7 days", "Last 30 days", "Last 365 days"])
}
return render(request, 'filtered_urls.html', context)
####################################################################################################