Urls source search, cleaning code

Commit f84c7729f8 (parent 05e17266f1)
Author: Luciano Gervasoni
Date: 2025-03-20 17:19:52 +01:00
13 changed files with 241 additions and 300 deletions

File diff suppressed because one or more lines are too long


@@ -2,7 +2,7 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 33, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@@ -14,25 +14,16 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 34, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"Searching Bing \n",
" \r"
]
}
],
"source": [ "source": [
"results = engine.search('news: \"child abuse\"', pages=2)" "results = engine.search('news: \"child abuse\"', pages=2)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 40, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@@ -41,18 +32,9 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 43, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"Searching Brave \n",
" \r"
]
}
],
"source": [ "source": [
"query = 'news: child abuse'\n", "query = 'news: child abuse'\n",
"r = engine.search(query, pages=2)" "r = engine.search(query, pages=2)"
@@ -60,20 +42,9 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 44, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"data": {
"text/plain": [
"{'_results': []}"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"r.__dict__" "r.__dict__"
] ]
@@ -87,20 +58,9 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 46, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [],
{
"data": {
"text/plain": [
"newspaper.exceptions.ArticleBinaryDataException"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"import newspaper\n", "import newspaper\n",
"newspaper.ArticleBinaryDataException" "newspaper.ArticleBinaryDataException"


@@ -2,7 +2,7 @@
``` ```
conda create -n matitos_urls python=3.12 conda create -n matitos_urls python=3.12
conda activate matitos_urls conda activate matitos_urls
pip install django psycopg[binary] django-rq pip install django psycopg[binary] django-redis django-rq
pip install feedparser python-dateutil newspaper4k lxml[html_clean] googlenewsdecoder gnews duckduckgo_search GoogleNews pip install feedparser python-dateutil newspaper4k lxml[html_clean] googlenewsdecoder gnews duckduckgo_search GoogleNews
``` ```
@@ -77,8 +77,10 @@ DB_PORT=${DB_NAME:-5432}
REDIS_HOST=${REDIS_HOST:-localhost} REDIS_HOST=${REDIS_HOST:-localhost}
REDIS_PORT=${REDIS_PORT:-6379} REDIS_PORT=${REDIS_PORT:-6379}
# Default RQ queue timeout # Default RQ job timeout
RQ_DEFAULT_TIMEOUT=${RQ_DEFAULT_TIMEOUT:-900} RQ_DEFAULT_TIMEOUT=${RQ_DEFAULT_TIMEOUT:-900}
# Default RQ job result TTL
RQ_DEFAULT_RESULT_TTL=${RQ_DEFAULT_RESULT_TTL:-3600}
``` ```
* Django DB * Django DB
@@ -94,9 +96,9 @@ python manage.py makemigrations api; python manage.py migrate --fake-initial
# Server # Server
python manage.py runserver python manage.py runserver
# Worker # Workers
python manage.py rqworker default # python manage.py rqworker high default low
while true; do python manage.py rqworker default --burst -v 0; sleep 5; done python manage.py rqworker high default low
# Visualize DB # Visualize DB
http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public&select=urls&order%5B0%5D=id


@@ -0,0 +1,27 @@
# Generated by Django 4.2.20 on 2025-03-20 16:12
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
dependencies = [
('api', '0002_delete_feed_delete_websiteofinterest_and_more'),
]
operations = [
migrations.CreateModel(
name='UrlsSourceSearch',
fields=[
('id_url', models.OneToOneField(db_column='id_url', on_delete=django.db.models.deletion.DO_NOTHING, primary_key=True, serialize=False, to='api.urls')),
],
options={
'db_table': 'urls_source_search',
'managed': False,
},
),
migrations.DeleteModel(
name='UrlsSource',
),
]


@@ -87,11 +87,12 @@ class UrlsDuplicate(models.Model):
unique_together = (('id_url_canonical', 'id_url_duplicated'),) unique_together = (('id_url_canonical', 'id_url_duplicated'),)
class UrlsSource(models.Model): class UrlsSourceSearch(models.Model):
id_url = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url', primary_key=True) # The composite primary key (id_url, id_source) found, that is not supported. The first column is selected. id_url = models.OneToOneField(Urls, models.DO_NOTHING, db_column='id_url', primary_key=True) # The composite primary key (id_url, id_source, id_search) found, that is not supported. The first column is selected.
id_source = models.ForeignKey(Source, models.DO_NOTHING, db_column='id_source') id_source = models.ForeignKey(Source, models.DO_NOTHING, db_column='id_source')
id_search = models.ForeignKey(Search, models.DO_NOTHING, db_column='id_search')
class Meta: class Meta:
managed = False managed = False
db_table = 'urls_source' db_table = 'urls_source_search'
unique_together = (('id_url', 'id_source'),) unique_together = (('id_url', 'id_source', 'id_search'),)
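
Because `managed = False`, Django will not create or alter `urls_source_search` itself; a minimal sketch of a companion migration that would create the table the new model expects (column types, referenced table names, and the dependency name are assumptions, not taken from the commit):

```python
# Hypothetical data migration: creates the composite-primary-key table that the
# unmanaged UrlsSourceSearch model maps to. Referenced table names (urls, source,
# search) and the integer column type are assumptions.
from django.db import migrations

CREATE_TABLE = """
CREATE TABLE IF NOT EXISTS urls_source_search (
    id_url    integer NOT NULL REFERENCES urls (id),
    id_source integer NOT NULL REFERENCES source (id),
    id_search integer NOT NULL REFERENCES search (id),
    PRIMARY KEY (id_url, id_source, id_search)
);
"""

class Migration(migrations.Migration):
    # Dependency name is hypothetical; point it at the migration above.
    dependencies = [("api", "0003_urlssourcesearch")]
    operations = [
        migrations.RunSQL(CREATE_TABLE,
                          reverse_sql="DROP TABLE IF EXISTS urls_source_search;"),
    ]
```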


@@ -1,11 +1,9 @@
from ..models import Urls, UrlContent, UrlsSource, UrlsDuplicate, Source, StatusPatternMatching from ..models import Urls, UrlContent, UrlsSourceSearch, UrlsDuplicate, StatusPatternMatching, Source, Search
from django.db.models import Q from django.db.models import Q
from .url_processor import process_url
from django.core.cache import cache from django.core.cache import cache
from django.db import IntegrityError from django.db import IntegrityError
import hashlib from .url_processor import process_url, get_with_protocol
import re import re
import time
import traceback import traceback
from .logger import get_logger from .logger import get_logger
logger = get_logger() logger = get_logger()
@@ -19,61 +17,32 @@ class DB_Handler():
# URL host slowdown # URL host slowdown
self.url_host_slowdown_seconds = 5 self.url_host_slowdown_seconds = 5
def _get_safe_cache_key(self, raw_key): def insert_raw_urls(self, urls, obj_source, obj_search):
"""Generate a safe cache key using an MD5 hash"""
return hashlib.md5(raw_key.encode()).hexdigest()
def _cache_key(self, cache_key, hash_encode, cache_timeout):
if (hash_encode):
cache.set(self._get_safe_cache_key(cache_key), True, timeout=cache_timeout)
else:
cache.set(cache_key, True, timeout=cache_timeout)
def _is_cached_key(self, cache_key, hash_encoded):
# Returns True if cached
if (hash_encoded):
return cache.get(self._get_safe_cache_key(cache_key)) is not None
else:
return cache.get(cache_key) is not None
def _clean_protocol(self, url):
# http:// -> https://
url = url.replace("http://", "https://")
# "" -> https://
if not (url.startswith("https://")):
url = "https://" + url
return url
def insert_raw_urls(self, urls, source):
try: try:
logger.debug("Inserting raw URLs") logger.debug("Inserting raw URLs")
# Empty? # Empty?
if (len(urls) == 0): if (len(urls) == 0):
logger.debug("Empty batch of urls (not writing to DB) for source: {}".format(source)) logger.debug("Empty batch of urls (not writing to DB) for source-search: {} - {}".format(obj_source.source, obj_search.search))
return return
# Default protocol https:// # Default protocol https://
urls_clean = [self._clean_protocol(url) for url in urls] urls_clean = [get_with_protocol(url) for url in urls]
# Get the source (create if not exists)
source_obj, created = Source.objects.get_or_create(source=source)
urls_to_insert = [] urls_to_insert = []
# Per URL # Per URL
for url in urls_clean: for url in urls_clean:
### Already processed URL? ### Already processed URL?
if (self._is_cached_key(url, hash_encoded=True)): if (cache.get("insert_{}".format(url)) is not None):
logger.debug("Already cached URL: {}".format(url)) logger.debug("Already cached URL: {}".format(url))
if (self._is_cached_key("{}{}".format(source, url), hash_encoded=True)): if (cache.get("insert_{}{}{}".format(url, obj_source.source, obj_search.search)) is not None):
logger.debug("Already cached (source, URL): {} {}".format(source, url)) logger.debug("Already cached (URL, source, search): {} {} {}".format(url, obj_source.source, obj_search.search))
else: else:
### Insert (URL_id, source_id), since not cached ### Insert (URL_id, source_id, search_id), since not cached
# Get URL ID (should already be created) # Get URL ID (should already be created)
url_obj, created = Urls.objects.get_or_create(url=url) obj_url, created = Urls.objects.get_or_create(url=url)
# Create (id_source, id_url) (shouldn't exist) # Create (id_source, id_url) (shouldn't exist)
UrlsSource.objects.get_or_create(id_source=source_obj, id_url=url_obj) UrlsSourceSearch.objects.get_or_create(id_url=obj_url, id_source=obj_source, id_search=obj_search)
else: else:
# Add object to insert # Add object to insert
# url_object_to_insert.append(Urls(url=url)) # url_object_to_insert.append(Urls(url=url))
@@ -85,16 +54,20 @@ class DB_Handler():
# URLs (ignore_conflicts=False to return IDs) # URLs (ignore_conflicts=False to return IDs)
bulk_created_urls = Urls.objects.bulk_create([Urls(url=url) for url in urls_to_insert], ignore_conflicts=False) bulk_created_urls = Urls.objects.bulk_create([Urls(url=url) for url in urls_to_insert], ignore_conflicts=False)
# (URL_id, source_id) # (URL_id, source_id)
UrlsSource.objects.bulk_create([UrlsSource(id_source=source_obj, id_url=url_obj) for url_obj in bulk_created_urls], ignore_conflicts=True) UrlsSourceSearch.objects.bulk_create([UrlsSourceSearch(id_url=obj_url, id_source=obj_source, id_search=obj_search) for obj_url in bulk_created_urls], ignore_conflicts=True)
except IntegrityError as e: except IntegrityError as e:
### Fallback to one-by-one insert ### Fallback to one-by-one insert
logger.debug("bulk_create exception while inserting raw URLs (fails if duplicated URL), falling back to non-bulk method") logger.debug("bulk_create exception while inserting raw URLs (fails if duplicated URL), falling back to non-bulk method")
# One by one # One by one
for url in urls_to_insert: for url in urls_to_insert:
# URL # URL
url_obj, created = Urls.objects.get_or_create(url=url) obj_url, created = Urls.objects.get_or_create(url=url)
# (URL, source) if (created):
UrlsSource.objects.get_or_create(id_source=source_obj, id_url=url_obj) logger.info("CREATED: {}".format(obj_url.url))
else:
logger.info("NOT CREATED: {}".format(obj_url.url))
# (URL, source, search)
UrlsSourceSearch.objects.get_or_create(id_url=obj_url, id_source=obj_source, id_search=obj_search)
except Exception as e: except Exception as e:
logger.warning("bulk_create unknown exception while inserting raw URLs: {}\n{}".format(e, traceback.format_exc())) logger.warning("bulk_create unknown exception while inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
# Avoid caching due to error on insertion # Avoid caching due to error on insertion
@@ -102,37 +75,14 @@ class DB_Handler():
# Insert or update cache # Insert or update cache
for url in urls_clean: for url in urls_clean:
# Hash encode URLs for special characters cache.set("insert_{}".format(url), True, timeout=self._cache_timeout_insert_url)
self._cache_key(url, hash_encode=True, cache_timeout=self._cache_timeout_insert_url) cache.set("insert_{}{}{}".format(url, obj_source.source, obj_search.search), True, timeout=self._cache_timeout_insert_url)
self._cache_key("{}{}".format(source, url), hash_encode=True, cache_timeout=self._cache_timeout_insert_url)
logger.info("Inserted #{} raw URLs".format(len(urls_to_insert))) logger.info("Inserted #{} raw URLs".format(len(urls_to_insert)))
except Exception as e: except Exception as e:
logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc())) logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
def _get_url_host(self, url):
# URL no protocol, first substring before '/'
url_host = url.replace("https://", "").replace("http://", "").split("/")[0]
return url_host
def _url_host_slowdown(self, url, url_host_slowdown_seconds):
### Avoid (frequent) too many requests to the same URL host
# Get URL host
url_host = self._get_url_host(url)
# Recently processed URL host? -> Slow down required
last_cached_timestamp = cache.get("processed_{}".format(url_host), None)
if last_cached_timestamp:
# Get time since last processed URL host (in seconds)
time_since_last_processed = time.time() - last_cached_timestamp
# Amount of time required to sleep?
slowdown_required = max(0, url_host_slowdown_seconds - time_since_last_processed)
logger.debug("Slow down (sleeping {:.2f}) for URL host {}".format(slowdown_required, url_host))
# Sleep
time.sleep(slowdown_required)
# About to process URL host, cache time
cache.set("processed_{}".format(url_host), time.time(), timeout=60*5) # Expire after 5 minutes
def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error): def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error):
def set_status(obj_url, status): def set_status(obj_url, status):
@@ -158,8 +108,6 @@ class DB_Handler():
##### Process URL ##### Process URL
try: try:
# Slow down if required to avoid too many requests error
self._url_host_slowdown(obj_url.url, self.url_host_slowdown_seconds)
# Get data # Get data
dict_url_data = process_url(obj_url.url) dict_url_data = process_url(obj_url.url)
# Not none or handle as exception # Not none or handle as exception
@@ -190,17 +138,17 @@ class DB_Handler():
# Get or create URL with canonical form # Get or create URL with canonical form
obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical")) obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
# Get the sources id associated to obj_url.id # Get the source-search IDs associated to obj_url.id
url_sources = UrlsSource.objects.filter(id_url=obj_url) list_url_source_search = UrlsSourceSearch.objects.filter(id_url=obj_url)
for url_source_obj in url_sources: for obj_url_source_search in list_url_source_search:
# Associate same sources to url_canonical (it might already exist) # Associate same sources to url_canonical (it might already exist)
obj_urls_source, created = UrlsSource.objects.get_or_create(id_source=url_source_obj.id_source, id_url=obj_url_canonical) UrlsSourceSearch.objects.get_or_create(id_url=obj_url_canonical, id_source=obj_url_source_search.id_source, id_search=obj_url_source_search.id_search)
# URLs duplicate association # URLs duplicate association
obj_urls_duplicate, created = UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url) UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url)
# TODO: return obj_url_canonical so as to directly process the recently inserted URL # TODO: return obj_url_canonical so as to directly process the recently inserted URL
# Whever this function is called, add: # Wherever this function is called, add:
# self._process_single_url(obj_url_canonical, status_pattern_match, raise_exception_on_error) # self._process_single_url(obj_url_canonical, status_pattern_match, raise_exception_on_error)
# Next URL # Next URL
@@ -281,7 +229,7 @@ class DB_Handler():
# Per URL # Per URL
for obj_url in error_urls: for obj_url in error_urls:
# URL ID cached? -> Tried to process recently already, skip # URL ID cached? -> Tried to process recently already, skip
if (self._is_cached_key("error_{}".format(obj_url.id), hash_encoded=False)): if (cache.get("error_{}".format(obj_url.id)) is not None):
logger.debug("Already cached URL ID: {}".format(obj_url.id)) logger.debug("Already cached URL ID: {}".format(obj_url.id))
num_urls_skipped += 1 num_urls_skipped += 1
continue continue
@@ -292,7 +240,7 @@ class DB_Handler():
num_urls_processed += 1 num_urls_processed += 1
except Exception as e: except Exception as e:
# Error, cache to avoid re-processing for X time # Error, cache to avoid re-processing for X time
self._cache_key("error_{}".format(obj_url.id), hash_encode=False, cache_timeout=self._cache_timeout_error_url) cache.set("error_{}".format(obj_url.id), True, timeout=self._cache_timeout_insert_url)
num_urls_skipped += 1 num_urls_skipped += 1
# Get following batch of URLs, status='error' # Get following batch of URLs, status='error'
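
With the schema change, `insert_raw_urls` now takes `Source` and `Search` model objects instead of a free-form source string; a minimal calling sketch (the `api.src.db_utils` import path and the example URLs are assumptions) of how the fetchers below use it:

```python
# Hypothetical caller mirroring FetchFeeds/FetchSearcher: resolve the Source and
# Search rows first, then hand the raw URL batch to the handler.
from api.models import Source, Search
from api.src.db_utils import DB_Handler  # import path is an assumption

obj_source, _ = Source.objects.get_or_create(source="feeds")
obj_search = Search.objects.filter(type=Search.TYPE_ENUM.RSS_FEED).first()

urls = ["https://example.org/some-article", "example.org/other-article"]
# URLs are normalized to https:// and cached per (url, source, search),
# so repeated batches skip the database.
DB_Handler().insert_raw_urls(urls, obj_source, obj_search)
```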


@@ -1,5 +1,5 @@
from .db_utils import DB_Handler from .db_utils import DB_Handler
from ..models import Search from ..models import Search, Source
import feedparser import feedparser
import dateutil import dateutil
import traceback import traceback
@@ -14,16 +14,19 @@ class FetchFeeds():
try: try:
logger.debug("Starting FetchFeeds.run()") logger.debug("Starting FetchFeeds.run()")
# Get feeds # Get source object
list_url_feeds = list(Search.objects.filter(type=Search.TYPE_ENUM.RSS_FEED).values_list('search', flat=True)) obj_source, created = Source.objects.get_or_create(source="feeds")
logger.debug("Fetching from feeds: {}".format(list_url_feeds))
# Get feeds objects
list_obj_search_feeds = Search.objects.filter(type=Search.TYPE_ENUM.RSS_FEED)
logger.debug("Fetching from feeds: {}".format([e.search for e in list_obj_search_feeds]))
# Process via RSS feeds # Process via RSS feeds
for url_feed in list_url_feeds: for obj_search in list_obj_search_feeds:
# Initialize # Initialize
urls_fetched, urls_publish_date = [], [] urls_fetched, urls_publish_date = [], []
# Fetch feeds # Fetch feeds
feeds = feedparser.parse(url_feed) feeds = feedparser.parse(obj_search.search)
# Parse # Parse
for f in feeds.get("entries", []): for f in feeds.get("entries", []):
# Get URL # Get URL
@@ -41,10 +44,8 @@ class FetchFeeds():
urls_publish_date.append(publish_date_parsed) urls_publish_date.append(publish_date_parsed)
# URL # URL
urls_fetched.append(url) urls_fetched.append(url)
# URL fetching source
source = "feed {}".format(url_feed)
# Write to DB # Write to DB
DB_Handler().insert_raw_urls(urls_fetched, source) DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
except Exception as e: except Exception as e:
logger.warning("Exception in FetchFeeds.run(): {}\n{}".format(e, traceback.format_exc())) logger.warning("Exception in FetchFeeds.run(): {}\n{}".format(e, traceback.format_exc()))


@@ -1,5 +1,6 @@
from .db_utils import DB_Handler from .db_utils import DB_Handler
from ..models import Search from ..models import Search, Source
from .url_processor import get_with_protocol, url_host_slowdown
import newspaper import newspaper
import traceback import traceback
from .logger import get_logger from .logger import get_logger
@@ -13,27 +14,26 @@ class FetchParser():
try: try:
logger.debug("Starting FetchParser.run() for {}") logger.debug("Starting FetchParser.run() for {}")
# Get source object
obj_source, created = Source.objects.get_or_create(source="newspaper4k")
# Get URL hosts # Get URL hosts
list_url_host = list(Search.objects.filter(type=Search.TYPE_ENUM.URL_HOST).values_list('search', flat=True)) list_url_host = Search.objects.filter(type=Search.TYPE_ENUM.URL_HOST)
logger.debug("Fetching news by parsing URL hosts: {}".format(list_url_host)) logger.debug("Fetching news by parsing URL hosts: {}".format([e.search for e in list_url_host]))
# Process newspaper4k build method # Process newspaper4k build method
for url_host_feed in list_url_host: for obj_search in list_url_host:
# Protocol # Protocol
if not (url_host_feed.startswith("http")): url_host_protocol = get_with_protocol(obj_search.search)
url_host_feed_formatted = "https://" + url_host_feed logger.debug("Fetching newspaper4k parsing based on URL: {}".format(url_host_protocol))
else:
url_host_feed_formatted = url_host_feed # Make sure no requests made for the last X seconds
url_host_slowdown(url_host_protocol, url_host_slowdown_seconds=5)
logger.debug("Fetching newspaper4k parsing based on URL: {}".format(url_host_feed_formatted))
# Source object # Source object
url_host_built = newspaper.build(url_host_feed_formatted) url_host_built = newspaper.build(url_host_protocol)
# Get articles URL list # Get articles URL list
urls_fetched = url_host_built.article_urls() urls_fetched = url_host_built.article_urls()
# URL fetching source
source = "newspaper4k {}".format(url_host_feed)
# Write to DB # Write to DB
DB_Handler().insert_raw_urls(urls_fetched, source) DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
except Exception as e: except Exception as e:
logger.warning("Exception in FetchParser.run(): {}\n{}".format(e, traceback.format_exc())) logger.warning("Exception in FetchParser.run(): {}\n{}".format(e, traceback.format_exc()))


@@ -1,5 +1,6 @@
from .db_utils import DB_Handler from .db_utils import DB_Handler
from ..models import Search from ..models import Search, Source
from django.db.models import Q
import traceback import traceback
import time import time
from .fetch_search_utils import search_gnews, search_ddg, search_googlenews_general, search_googlenews_news from .fetch_search_utils import search_gnews, search_ddg, search_googlenews_general, search_googlenews_news
@@ -10,54 +11,59 @@ class FetchSearcher():
def __init__(self) -> None: def __init__(self) -> None:
logger.debug("Initializing Fetcher Searcher") logger.debug("Initializing Fetcher Searcher")
def _get_source_object(self, source):
# TODO: Cache
# self.cached_sources = {}
# Get source object
obj_source, created = Source.objects.get_or_create(source=source)
return obj_source
def run(self): def run(self):
try: try:
logger.debug("Starting FetchSearcher.run()") logger.debug("Starting FetchSearcher.run()")
# Get keyword searches of interest # Get search objects of interest
list_keyword_search = list(Search.objects.filter(type=Search.TYPE_ENUM.KEYWORD_SEARCH).values_list('search', flat=True)) list_search_obj = Search.objects.filter(Q(type=Search.TYPE_ENUM.URL_HOST) | Q(type=Search.TYPE_ENUM.KEYWORD_SEARCH))
# Get URL host of interest logger.debug("Fetching from search: {}".format(["{} ({})".format(e.search, e.type) for e in list_search_obj]))
list_url_host = list(Search.objects.filter(type=Search.TYPE_ENUM.URL_HOST).values_list('search', flat=True))
# TODO: allintitle: "child abuse"
# TODO: intitle: "child abuse"
# list_keyword_search + ['allintitle: "{}"'.format(s) for s in list_keyword_search] + ['intitle: "{}"'.format(s) for s in list_keyword_search]
# Merge searches
list_search = list_keyword_search + ["site:{}".format(u) for u in list_url_host]
logger.debug("Fetching from keyword search: {}".format(list_search))
# Search # Search
for keyword_search in list_search: for obj_search in list_search_obj:
# TODO: language & country customization # TODO: language & country customization
# TODO: allintitle: "child abuse"
# TODO: intitle: "child abuse"
# Search
keyword_search = "{}{}".format("site:" if obj_search.type is Search.TYPE_ENUM.URL_HOST else "", obj_search.search)
# DDG News # DDG News
time.sleep(5) time.sleep(5)
raw_urls, source = search_ddg(keyword_search, category="news", timelimit="d", max_results=None, region = "wt-wt") raw_urls, source = search_ddg(keyword_search, category="news", timelimit="d", max_results=None, region = "wt-wt")
# Write to DB # Write to DB
DB_Handler().insert_raw_urls(raw_urls, source) DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
# GNews # GNews
time.sleep(5) time.sleep(5)
raw_urls, source = search_gnews(keyword_search, language="en", country="US") raw_urls, source = search_gnews(keyword_search, language="en", country="US")
# Write to DB # Write to DB
DB_Handler().insert_raw_urls(raw_urls, source) DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
# DDG Text # DDG Text
time.sleep(5) time.sleep(5)
raw_urls, source = search_ddg(keyword_search, category="text", timelimit="d", max_results=None, region = "wt-wt") raw_urls, source = search_ddg(keyword_search, category="text", timelimit="d", max_results=None, region = "wt-wt")
# Write to DB # Write to DB
DB_Handler().insert_raw_urls(raw_urls, source) DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
# GoogleNews news # GoogleNews news
time.sleep(5) time.sleep(5)
raw_urls, source = search_googlenews_news(keyword_search, period="1d", language="en", country="US") raw_urls, source = search_googlenews_news(keyword_search, period="1d", language="en", country="US")
# Write to DB # Write to DB
DB_Handler().insert_raw_urls(raw_urls, source) DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
# GoogleNews general # GoogleNews general
time.sleep(5) time.sleep(5)
raw_urls, source = search_googlenews_general(keyword_search, period="1d", language="en", country="US", max_pages=5) raw_urls, source = search_googlenews_general(keyword_search, period="1d", language="en", country="US", max_pages=5)
# Write to DB # Write to DB
DB_Handler().insert_raw_urls(raw_urls, source) DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
# TODO: # TODO:
# SearxNG # SearxNG


@@ -1,3 +1,4 @@
from django.core.cache import cache
import traceback import traceback
import random import random
import time import time
@@ -10,20 +11,31 @@ from duckduckgo_search import DDGS
from GoogleNews import GoogleNews from GoogleNews import GoogleNews
########################################################################### ###########################################################################
def decode_gnews_urls(encoded_urls): def decode_gnews_urls(encoded_urls, interval=2):
# DecodeURLs # DecodeURLs
list_decoded_urls = [] list_decoded_urls = []
for url in encoded_urls: for url in encoded_urls:
try: # Already cached?
# Decode URL, with interval time to avoid block decoded_url = cache.get("gnews_decode_{}".format(url))
decoded_url = gnewsdecoder(url, interval=5) if (decoded_url is not None):
# Ok? logger.debug("Already cached decoded URL: {} -> {}".format(url, decoded_url))
if decoded_url.get("status"): # Append decoded URL
list_decoded_urls.append(decoded_url["decoded_url"]) list_decoded_urls.append(decoded_url)
else: else:
logger.warning("Error decoding news.google.com, URL {}\nMessage: {}".format(url, decoded_url["message"])) try:
except Exception as e: # Decode URL, with interval time to avoid block
logger.warning("Error decoding news.google.com, URL: {}\n{}".format(url, traceback.format_exc())) decoded_url_dict = gnewsdecoder(url, interval=interval)
# Ok?
if decoded_url_dict.get("status"):
# Append decoded URL
decoded_url = decoded_url_dict["decoded_url"]
list_decoded_urls.append(decoded_url)
# Cache decoded URL
cache.set("gnews_decode_{}".format(url), decoded_url, timeout=60*60*12)
else:
logger.warning("Error decoding news.google.com, URL {}\nMessage: {}".format(url, decoded_url["message"]))
except Exception as e:
logger.warning("Error decoding news.google.com, URL: {}\n{}".format(url, traceback.format_exc()))
return list_decoded_urls return list_decoded_urls
########################################################################### ###########################################################################
@@ -33,13 +45,18 @@ def search_gnews(keyword_search, period="1d", language="en", country="US", max_r
source = "gnews {} {} {}-{} max_results={}".format("news", period, language, country, max_results).replace("None", "").strip() source = "gnews {} {} {}-{} max_results={}".format("news", period, language, country, max_results).replace("None", "").strip()
logger.debug("Searching: {} --- Source:{}".format(keyword_search, source)) logger.debug("Searching: {} --- Source:{}".format(keyword_search, source))
# Get news try:
results_gnews = GNews(language=language, country=country).get_news(keyword_search) # Get news
# Get list of encoded urls results_gnews = GNews(language=language, country=country).get_news(keyword_search)
encoded_urls = [e.get("url") for e in results_gnews] # Get list of encoded urls
# Decode encoded_urls = [e.get("url") for e in results_gnews]
list_decoded_urls = decode_gnews_urls(encoded_urls) # Decode
return list_decoded_urls, source logger.debug("Decoding gnews URLs")
urls = decode_gnews_urls(encoded_urls)
except Exception as e:
logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc()))
urls = []
return urls, source
########################################################################### ###########################################################################
@@ -51,14 +68,18 @@ def search_ddg(keyword_search, category="news", timelimit="d", max_results=None,
# region="{}-{}".format(langauge, country.lower()) # region="{}-{}".format(langauge, country.lower())
# timelimit= # Options: d, w, m # timelimit= # Options: d, w, m
# max_results # max number of results. If None, returns results only from the first response. Defaults to None # max_results # max number of results. If None, returns results only from the first response. Defaults to None
if (category == "news"):
news = DDGS().news(keyword_search, region=region, timelimit=timelimit, max_results=max_results)
urls = [e.get("url") for e in news]
if (category == "text"):
news = DDGS().text(keyword_search, region=region, timelimit=timelimit, max_results=max_results)
urls = [e.get("href") for e in news]
try:
if (category == "news"):
news = DDGS().news(keyword_search, region=region, timelimit=timelimit, max_results=max_results)
urls = [e.get("url") for e in news]
if (category == "text"):
news = DDGS().text(keyword_search, region=region, timelimit=timelimit, max_results=max_results)
urls = [e.get("href") for e in news]
except Exception as e:
logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc()))
urls = []
return urls, source return urls, source
########################################################################### ###########################################################################
@@ -78,6 +99,7 @@ def search_googlenews_news(keyword_search, period="1d", language="en", country="
# Fetch # Fetch
encoded_urls = googlenews.get_links() encoded_urls = googlenews.get_links()
# Decode # Decode
logger.debug("Decoding gnews URLs")
urls = decode_gnews_urls(encoded_urls) urls = decode_gnews_urls(encoded_urls)
except Exception as e: except Exception as e:
logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc())) logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc()))


@@ -2,14 +2,46 @@ from django.core.cache import cache
from .logger import get_logger from .logger import get_logger
logger = get_logger() logger = get_logger()
import newspaper import newspaper
import time
from urllib.parse import unquote from urllib.parse import unquote
# pip install langdetect # pip install langdetect
#import langdetect #import langdetect
#langdetect.DetectorFactory.seed = 0 #langdetect.DetectorFactory.seed = 0
def get_with_protocol(url):
# http:// -> https://
url = url.replace("http://", "https://")
# "" -> https://
if not (url.startswith("https://")):
url = "https://" + url
return url
def get_url_host(url):
# URL no protocol, first substring before '/'
url_host = url.replace("https://", "").replace("http://", "").split("/")[0]
return url_host
def url_host_slowdown(url, url_host_slowdown_seconds):
### Avoid (frequent) too many requests to the same URL host
# Get URL host
url_host = get_url_host(url)
# Recently processed URL host? -> Slow down required
last_cached_timestamp = cache.get("process_{}".format(url_host).encode("utf-8"), None)
if last_cached_timestamp:
# Get time since last processed URL host (in seconds)
time_since_last_processed = time.time() - last_cached_timestamp
# Amount of time required to sleep?
slowdown_required = max(0, url_host_slowdown_seconds - time_since_last_processed)
logger.debug("Slow down (sleeping {:.2f}) for URL host {}".format(slowdown_required, url_host))
# Sleep
time.sleep(slowdown_required)
# About to process URL host, cache time
cache.set("process_{}".format(url_host).encode("utf-8"), time.time(), timeout=60*5) # Expire after 5 minutes
def process_url(url): def process_url(url):
try: try:
# Slow down if required to avoid too many requests error
url_host_slowdown(url, url_host_slowdown_seconds=2)
# Process # Process
article = newspaper.article(url) article = newspaper.article(url)
except newspaper.ArticleBinaryDataException: except newspaper.ArticleBinaryDataException:


@@ -13,6 +13,11 @@ from src.missing_kids_status import MissingKidsStatus
from .src.logger import get_logger from .src.logger import get_logger
logger = get_logger() logger = get_logger()
@job
def fetch_feeds():
logger.info("Task triggered: {}".format("FetchFeeds"))
FetchFeeds().run()
@job @job
def background_task(process_type: str): def background_task(process_type: str):
logger.info("Task triggered: {}".format(process_type)) logger.info("Task triggered: {}".format(process_type))


@@ -21,7 +21,7 @@ BASE_DIR = Path(__file__).resolve().parent.parent
# See https://docs.djangoproject.com/en/5.1/howto/deployment/checklist/ # See https://docs.djangoproject.com/en/5.1/howto/deployment/checklist/
# SECURITY WARNING: keep the secret key used in production secret! # SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = 'django-insecure-kc0jj#_=7i$_79p(n5)p3taxvhnq=w*ori-%%iu_a6wye@$(*n' SECRET_KEY = 'django-insecure-54mqLbW5NlO8OlVDsT3fcbg3Vf6C8Fgcoj8H0hXv3Pr8bpgqvOuiaeqvGn34sGwt'
# SECURITY WARNING: don't run with debug turned on in production! # SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True DEBUG = True
@@ -38,7 +38,6 @@ INSTALLED_APPS = [
'django.contrib.sessions', 'django.contrib.sessions',
'django.contrib.messages', 'django.contrib.messages',
'django.contrib.staticfiles', 'django.contrib.staticfiles',
# 'rest_framework',
'django_rq', 'django_rq',
'api', 'api',
] ]
@@ -93,11 +92,16 @@ DATABASES = {
CACHES = { CACHES = {
"default": { "default": {
"BACKEND": "django.core.cache.backends.redis.RedisCache", #"BACKEND": "django.core.cache.backends.redis.RedisCache",
"BACKEND": "django_redis.cache.RedisCache",
"LOCATION": "redis://{}:{}".format( "LOCATION": "redis://{}:{}".format(
os.environ.get("REDIS_HOST", "localhost"), os.environ.get("REDIS_HOST", "localhost"),
os.environ.get("REDIS_PORT", 6379) os.environ.get("REDIS_PORT", 6379)
), ),
"OPTIONS": {
"MEMCACHE_MAX_KEY_LENGTH": 2048,
"CLIENT_CLASS": "django_redis.client.DefaultClient",
},
} }
} }
@@ -107,6 +111,7 @@ RQ_QUEUES = {
'PORT': os.environ.get("REDIS_PORT", 6379), 'PORT': os.environ.get("REDIS_PORT", 6379),
'DB': os.environ.get("REDIS_DB", 0), 'DB': os.environ.get("REDIS_DB", 0),
'DEFAULT_TIMEOUT': os.environ.get("RQ_DEFAULT_TIMEOUT", 900), 'DEFAULT_TIMEOUT': os.environ.get("RQ_DEFAULT_TIMEOUT", 900),
'DEFAULT_RESULT_TTL': os.environ.get("RQ_DEFAULT_RESULT_TTL", 3600),
} }
} }