General search fix, status pattern match regex, find feeds on startup

Luciano Gervasoni
2025-04-09 15:52:35 +02:00
parent 296a8fe8a8
commit f369b23d81
22 changed files with 538 additions and 356 deletions

View File

@@ -4,7 +4,7 @@
- Fetch parsing URL host
- Fetch from RSS feed
- Fetch keyword search (Google search & news, DuckDuckGo, ...)
++ Sources -> Robustness to TooManyRequests block
- TODO: More sources -> Robustness to TooManyRequests block
- Selenium based
- Sites change their logic, request captcha, ...
- Brave Search API
@@ -12,22 +12,32 @@
- Bing API
- Subscription required
- Yandex. No API?
++ Proxy / VPN?
TooManyRequests, ...
++ Search per locale (nl-NL, fr-FR, en-GB)
- TODO: Proxy / VPN?
- TooManyRequests, ...
- TODO: Search per locale (nl-NL, fr-FR, en-GB)
- URLs Processing -> Updates raw URLs
- Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date
- Determines whether it is valid article content
++ Proxy / VPN?
Bypass geoblock
- TODO: Proxy / VPN?
- Bypass geoblock and TooManyRequests
- Visualization of URLs
- Filter URLs
- By status, search, source, language, ...
- By fetch date, status, search, source, language, valid content, minimum number of sources, ...
- Charts
- Valid URLs
- URLs selection
- Published (or fetch) date during last_week / last 24 hrs
- Language of interest
- Valid content
- Fetched by at least N sources
- Use classifications and summaries
- TODO: Manual inspection -> Improve automation
- Rules or patterns for invalid articles, e.g. "youtube.com/*"
- URL host with "priority" or "weight"
- Content generation
- Generate summary
- One paragraph
- At most three paragraphs
@@ -35,12 +45,4 @@
- 5W: Who, What, When, Where, Why of a Story
- Related to child abuse?
- ...
- Content generation
- URLs Selection
- Valid content
- Language of interest
- Published (or fetch) date during last_week
- Fetched by at least N sources
- Use classifications and summaries
- Merge summaries, ...
- Merge similar articles?

View File

@@ -7,12 +7,8 @@ logs_directory = os.getenv("PATH_LOGS_DIRECTORY", "logs")
# Directory of logs
os.makedirs(logs_directory, exist_ok=True)
# Too many logging entries otherwise
logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("selenium")
logger = logging.getLogger("app_selenium")
logger.setLevel(logging.DEBUG)
# To file log: INFO / WARNING / ERROR / CRITICAL

View File

@@ -39,7 +39,7 @@ class MissingKidsFetcher():
logger.debug("Processing page: {}...".format(i))
try:
time.sleep(int(os.getenv("SELENIUM_SLEEP_PER_PAGE", 4))) #driver.implicitly_wait(3)
time.sleep(float(os.getenv("SELENIUM_SLEEP_PER_PAGE", 4))) #driver.implicitly_wait(3)
# Fetch poster URLs
for element_type in ["a"]: # ["a", "p", "div"]:
for elem in driver.find_elements(By.TAG_NAME, element_type):
@@ -75,7 +75,7 @@ class MissingKidsFetcher():
logger.info(e.text)
# driver.refresh()
time.sleep(int(os.getenv("SELENIUM_SLEEP_PER_PAGE", 4)))
time.sleep(float(os.getenv("SELENIUM_SLEEP_PER_PAGE", 4)))
if (i == first_n_pages):
continue_iterating = False
@@ -86,6 +86,7 @@ class MissingKidsFetcher():
logger.warning("Exception while fetching MissingKids {}".format(str(e)), exc_info=True)
set_urls = set()
# Release memory
try:
driver.close()
except Exception as e:

View File

@@ -25,4 +25,4 @@ RUN chown -R appuser:appuser /opt
USER appuser
# Run Djangos server & workers
CMD ["sh", "-c", "/opt/app/script_initialize.sh && /opt/app/script_run.sh"]
CMD ["sh", "-c", "/opt/app/initialize.sh && /opt/app/run.sh"]

View File

@@ -67,6 +67,8 @@ class Meta:
* Database & initialization
* Check initialize.sh on Dockerfile
* init_data.json
Insert URL hosts of interest, RSS feeds, keyword searches, and regex (escaped) status patterns that mark URLs as "invalid" or "valid" (see the sketch after this list)
* Environment variables
* In docker-compose.yml
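For reference, a minimal sketch (not part of this commit) of how such an escaped status pattern can be built with Python's re.escape, mirroring the "invalid" host patterns inserted by init_db.py:
import re
# Hypothetical host fragment to block; re.escape protects regex metacharacters
host_fragment = "youtube.com/"
pattern = ".*{}.*".format(re.escape(host_fragment))
print(pattern)  # .*youtube\.com/.*
# URLs are later tested against the stored patterns with re.match(pattern, url)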

View File

@@ -1,202 +0,0 @@
import argparse
import os
import psycopg
import re
import time
connection_info = "host={} port={} dbname={} user={} password={} connect_timeout=60".format(
os.environ.get("DB_HOST", "localhost"),
os.environ.get("DB_PORT", "5432"),
os.environ.get("DB_NAME", "matitos"),
os.environ.get("DB_USER", "supermatitos"),
os.environ.get("DB_PASSWORD", "supermatitos")
)
def wait_connection():
connected = False
while (not connected):
try:
# Connect to an existing database
with psycopg.connect(connection_info) as conn:
# Open a cursor to perform database operations
with conn.cursor() as cur:
# Create URLs table
c = cur.execute("SELECT 1;").fetchall()
connected = True
except psycopg.OperationalError as e:
# Connection not ready...
# print(".", end="")
time.sleep(2)
except Exception as e:
# Connection not ready...
# print("e", end="")
time.sleep(2)
print("DB connection ready")
def initialize_tables():
# Connect to an existing database
with psycopg.connect(connection_info) as conn:
# Open a cursor to perform database operations
with conn.cursor() as cur:
# Autocommit at end of transaction (Atomic creation of tables)
with conn.transaction() as tx:
# Create URLs table
c = cur.execute("""
CREATE TYPE URL_STATUS AS ENUM ('raw', 'error', 'valid', 'unknown', 'invalid', 'duplicate');
CREATE TABLE URLS (
id SERIAL PRIMARY KEY,
url TEXT NOT NULL UNIQUE,
ts_fetch TIMESTAMPTZ NOT NULL DEFAULT NOW(),
status URL_STATUS NOT NULL DEFAULT 'raw' -- ,
-- status_wendy WENDY_STATUS DEFAULT NULL,
-- ts_wendy TIMESTAMPTZ DEFAULT NULL
);
CREATE INDEX idx_urls_status ON urls(status);
CREATE INDEX idx_urls_ts_fetch ON urls(ts_fetch);
CREATE TABLE URLS_DUPLICATE (
id_url_canonical INTEGER REFERENCES URLS(id),
id_url_duplicated INTEGER REFERENCES URLS(id),
PRIMARY KEY (id_url_canonical, id_url_duplicated)
);
CREATE TYPE SEARCH_TYPE AS ENUM ('rss_feed', 'keyword_search', 'url_host');
CREATE TABLE SEARCH (
id SMALLSERIAL PRIMARY KEY,
search TEXT NOT NULL UNIQUE,
type SEARCH_TYPE NOT NULL
-- language_country CHAR(5), -- Language: ISO 639-1 Code. Country: ISO 3166 ALPHA-2. e.g.: en-us. Required for search
-- UNIQUE(search, language_country)
);
CREATE INDEX idx_search_type ON SEARCH(type);
CREATE TABLE SOURCE (
id SMALLSERIAL PRIMARY KEY,
source TEXT NOT NULL UNIQUE
);
-- CREATE TABLE SEARCH_LANGUAGE (
-- language CHAR(2) NOT NULL, -- ISO 639-1 Code, e.g. "en"
-- country CHAR(2) NOT NULL, -- ISO 3166 ALPHA-2, e.g. "us"
-- PRIMARY KEY (language, country)
-- );
CREATE TABLE URLS_SOURCE_SEARCH (
id_url INTEGER REFERENCES URLS(id),
id_source SMALLINT REFERENCES SOURCE(id) ON UPDATE CASCADE ON DELETE RESTRICT,
id_search SMALLINT REFERENCES SEARCH(id) ON UPDATE CASCADE ON DELETE RESTRICT,
PRIMARY KEY(id_url, id_source, id_search)
);
CREATE INDEX idx_source ON URLS_SOURCE_SEARCH(id_source);
CREATE INDEX idx_search ON URLS_SOURCE_SEARCH(id_search);
CREATE TABLE STATUS_PATTERN_MATCHING (
pattern TEXT PRIMARY KEY,
priority SMALLINT NOT NULL,
status URL_STATUS NOT NULL
);
CREATE TABLE URL_CONTENT (
id_url INTEGER PRIMARY KEY REFERENCES URLS(id),
date_published TIMESTAMPTZ DEFAULT NOW(),
title TEXT,
description TEXT,
content TEXT,
valid_content BOOLEAN,
language CHAR(2), -- ISO 639-1 Code
keywords TEXT[],
tags TEXT[],
authors TEXT[],
image_main_url TEXT,
images_url TEXT[],
videos_url TEXT[],
url_host TEXT, -- www.breitbart.com
site_name TEXT -- Breitbart News
);
CREATE INDEX idx_tags ON URL_CONTENT USING GIN(tags);
CREATE INDEX idx_authors ON URL_CONTENT USING GIN(authors);
CREATE INDEX idx_date_published ON URL_CONTENT (date_published);
CREATE INDEX idx_valid_content ON URL_CONTENT (valid_content);
CREATE INDEX idx_language ON URL_CONTENT (language);
CREATE INDEX idx_url_host ON URL_CONTENT (url_host);
""")
def initialize_data():
# Connect to an existing database
with psycopg.connect(connection_info) as conn:
# Open a cursor to perform database operations
with conn.cursor() as cur:
# Autocommit at end of transaction (Atomic creation of data)
with conn.transaction() as tx:
# Feeds
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC', 'rss_feed');" )
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://feeds.feedburner.com/breitbart', 'rss_feed');" )
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('http://feeds.feedburner.com/zerohedge/feed', 'rss_feed');" )
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://moxie.foxnews.com/google-publisher/latest.xml', 'rss_feed');" )
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=15837362', 'rss_feed');" )
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100727362', 'rss_feed');" )
# Websites of interest
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/poster', 'url_host');" )
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('missingkids.org/new-poster', 'url_host');" )
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('breitbart.com', 'url_host');" )
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('zerohedge.com', 'url_host');" )
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('foxnews.com', 'url_host');" )
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('cnbc.com', 'url_host');" )
# Search keywords
cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search');" )
# TODO: Language per search
# cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search', 'en-us');" )
# cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search', 'en-gb');" )
# Status update based on pattern matching (with priority to apply in order). Regex test https://regex101.com/
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("youtube.com/"))) )
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("yewtu.be/"))) )
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("tiktok.com/"))) )
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("twitter.com/"))) )
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("reddit.com/"))) )
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("libreddit.de/"))) )
cur.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', 50, 'invalid');".format(".*{}.*".format(re.escape("radio.foxnews.com/"))) )
""" # TODO: To review with new scheme
# Status update based on pattern matching (with priority to apply in order)
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*cnbc.com/(video|quotes)/.*', 100, 'invalid');" )
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*foxnews.com/(video|category)/.*', 100, 'invalid');" )
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*breitbart.com/(tag|author)/.*', 100, 'invalid');" )
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*zerohedge.com/(economics|political|markets)/.*', 50, 'valid');" )
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*breitbart.com/(economy|entertainment|border|crime|clips)/.*', 50, 'valid');" )
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*foxnews.com/(lifestyle|opinion|sports|world)/.*', 50, 'valid');" )
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*cnbc.com/[0-9]{4}/[0-9]{2}/[0-9]{2}/.*', 50, 'valid');" )
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*bbc.com/news/.*', 50, 'valid');" )
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*msn.com/[A-z]{2}-[A-z]{2}/news/.*', 50, 'valid');" )
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*newschannel9.com/news/.*', 50, 'valid');" )
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*radaronline.com/p.*', 25, 'valid');" )
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*okmagazine.com/p.*', 25, 'valid');" )
cursor.execute( "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('.*9news.com.au/national.*', 25, 'valid');" )
"""
def main(name):
print('Hello, %s!' % name)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Database initialization')
parser.add_argument('--initialize_tables', help='Create DB tables', action='store_true', default=False)
parser.add_argument('--initialize_data', help='Insert data', action='store_true', default=False)
args = parser.parse_args()
# Wait for DB connection
wait_connection()
if (args.initialize_tables):
print("Initializing tables")
initialize_tables()
if (args.initialize_data):
print("Initializing data")
initialize_data()

View File

@@ -81,6 +81,7 @@ class DB_Handler():
except Exception as e:
logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error):
def set_status(obj_url, status):
@@ -89,17 +90,17 @@ class DB_Handler():
obj_url.status = status
obj_url.save()
##### Filter URL? -> Invalid
if (status_pattern_match == "invalid"):
logger.debug("Domain filter applied to input URL: {}".format(obj_url.url))
# Found a pattern match -> Override status
if (status_pattern_match is not None):
logger.debug("Pattern match, status '{}' for input URL: {}".format(status_pattern_match, obj_url.url))
# Update status
set_status(obj_url, Urls.STATUS_ENUM.INVALID)
# Next URL
return
set_status(obj_url, status_pattern_match)
##### Filter URL? -> Invalid (don't extract content)
if (status_pattern_match == "invalid"):
return
##### Process URL
try:
# Get data
# Extract URL content
dict_url_data = process_url(obj_url.url)
except Exception as e:
if (raise_exception_on_error):
@@ -110,25 +111,10 @@ class DB_Handler():
# Set status to error
dict_url_data = None
# (dict_url_data is None) or (Exception while processing URL) ? -> Error status
if (dict_url_data is None):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.ERROR)
# Next URL
return
# Invalid? e.g. binary data
if (dict_url_data.get("override_status") == "invalid"):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.INVALID)
# Next URL
return
##### Canonical URL different? -> Duplicate
if (dict_url_data.get("url_canonical") is not None) and(dict_url_data.get("url") != dict_url_data.get("url_canonical")):
if (dict_url_data is not None) and (dict_url_data.get("url_canonical") is not None) and (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.DUPLICATE)
# Get or create URL with canonical form
obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
# Get the source-search IDs associated to obj_url.id
@@ -136,42 +122,54 @@ class DB_Handler():
for obj_url_source_search in list_url_source_search:
# Associate same sources to url_canonical (it might already exist)
UrlsSourceSearch.objects.get_or_create(id_url=obj_url_canonical, id_source=obj_url_source_search.id_source, id_search=obj_url_source_search.id_search)
# URLs duplicate association
UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url)
# TODO: return obj_url_canonical so as to directly process the recently inserted URL
# Wherever this function is called, add:
# self._process_single_url(obj_url_canonical, status_pattern_match, raise_exception_on_error)
# Next URL
return
##### Valid URL
# Update status
set_status(obj_url, Urls.STATUS_ENUM.VALID)
# Not overriding status given pattern matching?
if (status_pattern_match is None):
# (dict_url_data is None) or (Exception while processing URL) ? -> Error status
if (dict_url_data is None):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.ERROR)
# Next URL
return
# Invalid? e.g. binary data
if (dict_url_data.get("override_status") == "invalid"):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.INVALID)
# Next URL
return
##### Valid URL
# Update status
set_status(obj_url, Urls.STATUS_ENUM.VALID)
try:
# Create or update extracted URL data
UrlContent.objects.update_or_create(
id_url=obj_url,
defaults = {
"date_published" : dict_url_data.get("publish_date"),
"title" : dict_url_data.get("title"),
"description" : dict_url_data.get("description"),
"content" : dict_url_data.get("content"),
"valid_content" : dict_url_data.get("valid_content"),
"language" : dict_url_data.get("language"),
"keywords" : dict_url_data.get("keywords"),
"tags" : dict_url_data.get("tags"),
"authors" : dict_url_data.get("authors"),
"image_main_url" : dict_url_data.get("image_main_url"),
"images_url" : dict_url_data.get("images_url"),
"videos_url" : dict_url_data.get("videos_url"),
"url_host" : dict_url_data.get("url_host"),
"site_name" : dict_url_data.get("site_name"),
}
)
if (dict_url_data is not None):
# Create or update extracted URL data
UrlContent.objects.update_or_create(
id_url=obj_url,
defaults = {
"date_published" : dict_url_data.get("publish_date"),
"title" : dict_url_data.get("title"),
"description" : dict_url_data.get("description"),
"content" : dict_url_data.get("content"),
"valid_content" : dict_url_data.get("valid_content"),
"language" : dict_url_data.get("language"),
"keywords" : dict_url_data.get("keywords"),
"tags" : dict_url_data.get("tags"),
"authors" : dict_url_data.get("authors"),
"image_main_url" : dict_url_data.get("image_main_url"),
"images_url" : dict_url_data.get("images_url"),
"videos_url" : dict_url_data.get("videos_url"),
"url_host" : dict_url_data.get("url_host"),
"site_name" : dict_url_data.get("site_name"),
}
)
except Exception as e:
logger.debug("Error in update_or_create UrlContent: {}\ndict_url_data: {}\n{}\n{}".format(obj_url.url, dict_url_data, str(e), traceback.format_exc()))
@@ -179,13 +177,12 @@ class DB_Handler():
def process_raw_urls(self, batch_size):
def _get_status_pattern_matching(url, list_pattern_status_tuple):
""" Be careful: Regex pattern should update status on "valid", "invalid", and "unknown" status only
"""
""" Be careful: Regex pattern should update status on "valid", "invalid", and "unknown" status only """
# Sort pattern tuples by priority. (pattern, priority, status)
for regex_pattern, regex_priority, status_if_match in sorted(list_pattern_status_tuple, key=lambda tup: tup[1], reverse=True):
# Regular expression pattern matching: https://regexr.com/
if bool(re.match(regex_pattern, obj_url.url)):
logger.debug("Regex pattern found, status '{}' for URL: {}".format(status_if_match, url))
# logger.debug("Regex pattern found, status '{}' for URL: {}".format(status_if_match, url))
return status_if_match
return None
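Taken together, the hunk above resolves a URL's status from the highest-priority matching pattern and, when that status is "invalid", skips content extraction. A minimal standalone sketch of the resolution step, assuming (pattern, priority, status) tuples as stored in STATUS_PATTERN_MATCHING and hypothetical example data:
import re
def resolve_status(url, pattern_status_tuples):
    # Highest priority first; the first regex match decides the status
    for pattern, priority, status in sorted(pattern_status_tuples, key=lambda t: t[1], reverse=True):
        if re.match(pattern, url):
            return status
    # No pattern matched: no override, normal processing applies
    return None
# Hypothetical entries mirroring init_data.json
patterns = [
    (".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", 50, "invalid"),
    (".*cnbc\\.com\\/(video|quotes)\\/.*", 75, "invalid"),
]
print(resolve_status("https://www.youtube.com/watch?v=abc", patterns))  # invalid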

View File

@@ -49,11 +49,11 @@ class FetchSearcher():
"language": "en",
"country": "US",
# "period": ["7d", "1d"], # TODO: List of periods to iterate
}
}
for SearchInstance in ListSearchInstances:
# Sleep between requests, avoid too many requests...
time.sleep(int(os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", 5)))
time.sleep(float(os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", 5)))
SearchInstance(args).fetch_articles(db_writer, obj_search)
# TODO: https://github.com/tasos-py/Search-Engines-Scraper/tree/master

View File

@@ -55,8 +55,8 @@ class FetcherAbstract(ABC):
keyword_search = "{}{}".format("site:", keyword_search)
# Keyword search & using a General search? -> ${SEARCH} news after:${LAST_WEEK}
if ("general" in source_name) and (obj_search.type == Search.TYPE_ENUM.KEYWORD_SEARCH):
start_date = timezone.now() - timedelta(days=7)
keyword_search = "{}{}".format(keyword_search, "news after:{}-{}-{}".format(start_date.month, start_date.day, start_date.year))
logger.debug("Appending news to general search")
keyword_search = "{}{}".format(keyword_search, "news")
logger.debug("Starting search: {} - {}".format(keyword_search, source_name))
# Fetch
@@ -194,7 +194,7 @@ class SearchGoogleGeneral(FetcherAbstract):
# Iterate pages
for i in range(self.pages):
# Sleep between pages fetch
time.sleep(int(os.getenv("FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP", 4)))
time.sleep(float(os.getenv("FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP", 4)))
# Number of URLs fetched so far
num_before = len(set_links)
# Get page

View File

@@ -6,7 +6,7 @@ logger = get_logger()
from googlenewsdecoder import gnewsdecoder
def decode_gnews_urls(encoded_urls, interval=int(os.getenv("FETCHER_GNEWS_DECODE_SLEEP", 2))):
def decode_gnews_urls(encoded_urls, interval=float(os.getenv("FETCHER_GNEWS_DECODE_SLEEP", 2))):
logger.debug("Decoding gnews URLs")
# DecodeURLs
list_decoded_urls = []

View File

@@ -41,7 +41,7 @@ def url_host_slowdown(url, url_host_slowdown_seconds):
def process_url(url):
try:
# Slow down if required to avoid too many requests error
url_host_slowdown(url, url_host_slowdown_seconds=int(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
url_host_slowdown(url, url_host_slowdown_seconds=float(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
# Process
article = newspaper.article(url)
except newspaper.ArticleBinaryDataException:

View File

@@ -4,6 +4,7 @@ from . import views
urlpatterns = [
path('', views.link_list, name='link_list'),
#
path('logs/database', views.log_db, name='log_db'),
path('logs/<str:log_type>', views.logs, name='logs'),
#
path('task/<str:task>', views.trigger_task, name='trigger_task'),
@@ -17,4 +18,5 @@ urlpatterns = [
path('urls/', views.filtered_urls, name='filtered_urls'),
path('urls/<int:id>/', views.url_detail_view, name='url_detail'),
path('urls/<int:id>/fetch/', views.fetch_details, name='fetch_details'),
path('urls/content_generation', views.content_generation, name='content_generation'),
]

View File

@@ -1,44 +1,16 @@
from .tasks import background_task
from .views_base import link_list, logs, log_db, trigger_task
from django.core.paginator import Paginator
from django.shortcuts import render, get_object_or_404
from django.http import StreamingHttpResponse, JsonResponse, HttpResponse
from django.contrib.auth.decorators import login_required
import ollama
from django.http import StreamingHttpResponse, JsonResponse
from django.db.models import Q, Count
from django.utils import timezone
from django.utils.timezone import now, timedelta
from .models import Urls, Source, Search, UrlContent, UrlsSourceSearch, UrlsDuplicate
import ollama
import os
from .src.logger import get_logger
logger = get_logger()
#from datetime import timedelta
####################################################################################################
def trigger_task(request, task):
# Enqueue function in "default" queue
background_task.delay(task)
return JsonResponse({"message": "Task has been enqueued!", "task": task})
####################################################################################################
def link_list(request):
# Base URL path
app_url = request.build_absolute_uri()
# Tasks
links_fetch = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all"]
links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"]
# List of links
list_links = \
[ os.path.join(app_url, "admin"), os.path.join(app_url, "urls") ] + \
[ os.path.join(app_url, "logs", log_type) for log_type in ["debug", "info", "warning"] ] + \
[ os.path.join(app_url, "task", l) for l in links_fetch + links_process ]
# Json
return JsonResponse({"links": list_links })
####################################################################################################
def logs(request, log_type):
# Capture output: python manage.py rqstats
try:
with open( os.path.join( os.getenv("PATH_LOGS_DIRECTORY", "logs"), "{}.log".format(log_type) ), "r") as f:
file_content = f.read()
except Exception as e:
file_content = "Error reading logs for log type :{}".format(log_type)
return HttpResponse(file_content, content_type="text/plain")
####################################################################################################
class OllamaClient():
@@ -57,13 +29,6 @@ class OllamaClient():
def get_prompt(self):
return "Rewrite the text below into a clear and concise summary, presenting the key points as if they are newly written insights. Do not mention or reference the original text, its source, or any phrases like 'According to' or 'The text states'. Instead, write in a natural, standalone format that feels like an original explanation. Keep it brief, engaging, informative, in the style of a news article, and no longer than a paragraph:"
#return "Provide a summary of the content below, avoid mentioning the source of information, and only answer with the summary. The summary needs to be brief and compact, consisting of one paragraph."
#return "Explain in a single and compact paragraph the what, why, when, where, who, and how of the content below. Also provide a single paragraph summary of the content:"
#return "Provide in one paragraph the what, why, when, where, who, and how of the content below. Also provide a one paragraph summary of the content:"
#return "Provide two summaries of the content below, and avoid mentioning the source of information. First, provide a very brief and compact paragraph summary. Second, provide a larger and more detailed summary, which describe the what, why, when, where, who, and how of the content:"
# return "Imagine you are a journalist, TLDR in a paragraph. Only answer with the summary:"
#return "Below you will find the whole content of a news article:\n{}\nProvide a concise summary of one paragraph maximum of the content.".format(content)
def fetch_details(request, id):
url_item = get_object_or_404(Urls, id=id)
@@ -83,7 +48,6 @@ def fetch_details(request, id):
return StreamingHttpResponse(stream_response(), content_type="text/plain")
def url_detail_view(request, id):
url_item = get_object_or_404(Urls, id=id)
url_sources = list(Source.objects.filter(urlssourcesearch__id_url=url_item).distinct())
@@ -114,13 +78,6 @@ def url_detail_view(request, id):
return render(request, 'url_detail.html', context)
####################################################################################################
from django.shortcuts import render
from django.http import JsonResponse
from django.db.models import Count
from datetime import timedelta
from django.utils import timezone
from .models import Urls, UrlsSourceSearch
def charts(request):
return render(request, 'charts.html')
@@ -202,14 +159,7 @@ def urls_per_search(request):
return JsonResponse(data)
####################################################################################################
from django.shortcuts import render
from .models import Urls, Search, Source
from django.db.models import Q
from django.utils.timezone import now, timedelta
def filtered_urls(request):
statuses = Urls.STATUS_ENUM.choices
@@ -342,4 +292,16 @@ def filtered_urls(request):
}
return render(request, 'filtered_urls.html', context)
####################################################################################################
def content_generation(request):
# https://fetcher.matitos.org/urls/?per_page=100&days=1&valid_content=True&min_sources=1&search=13&status=all&language=all&source=all
'''
# Get list of URLs ID
selected_urls = request.GET.getlist('urls', [])
# Sample URLs
selected_urls = [13460, 13455, 13454, 13452, 13210]
'''
####################################################################################################

View File

@@ -0,0 +1,74 @@
import os
import psycopg
from .tasks import background_task
from django.http import JsonResponse, HttpResponse
####################################################################################################
def trigger_task(request, task):
# Enqueue function in "default" queue
background_task.delay(task)
return JsonResponse({"message": "Task has been enqueued!", "task": task})
####################################################################################################
def link_list(request):
# Base URL path
app_url = request.build_absolute_uri()
# Tasks
links_fetch = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all"]
links_process = ["process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"]
# List of links
list_links = \
[ os.path.join(app_url, "admin"), os.path.join(app_url, "urls") ] + \
[ os.path.join(app_url, "logs", log_type) for log_type in ["database", "debug", "info", "warning"] ] + \
[ os.path.join(app_url, "task", l) for l in links_fetch + links_process ]
# Json
return JsonResponse({"links": list_links })
####################################################################################################
def logs(request, log_type):
# Capture output: python manage.py rqstats
try:
with open( os.path.join( os.getenv("PATH_LOGS_DIRECTORY", "logs"), "{}.log".format(log_type) ), "r") as f:
file_content = f.read()
except Exception as e:
file_content = "Error reading logs for log type :{}".format(log_type)
return HttpResponse(file_content, content_type="text/plain")
####################################################################################################
def log_db(request):
# TODO: Django connection
connection_info = "host={} port={} dbname={} user={} password={} connect_timeout=60".format(
os.environ.get("DB_HOST", "localhost"),
os.environ.get("DB_PORT", "5432"),
os.environ.get("DB_NAME", "matitos"),
os.environ.get("DB_USER", "supermatitos"),
os.environ.get("DB_PASSWORD", "supermatitos")
)
# Connect to an existing database
with psycopg.connect(connection_info) as conn:
# Open a cursor to perform database operations
with conn.cursor() as cur:
# Create URLs table
r = cur.execute("""
SELECT
relname AS "relation",
pg_size_pretty (
pg_total_relation_size (C .oid)
) AS "total_size"
FROM
pg_class C
LEFT JOIN pg_namespace N ON (N.oid = C .relnamespace)
WHERE
nspname NOT IN (
'pg_catalog',
'information_schema'
)
AND C .relkind <> 'i'
AND nspname !~ '^pg_toast'
ORDER BY
pg_total_relation_size (C .oid) DESC
LIMIT 100;
""").fetchall()
return HttpResponse( "\n".join([str(e) for e in r]) )
####################################################################################################

app_urls/init_data.json Normal file
View File

@@ -0,0 +1,34 @@
{
"SEARCH": {
"rss_feed": [
"https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC",
"https://feeds.feedburner.com/breitbart",
"https://feeds.feedburner.com/zerohedge/feed",
"https://moxie.foxnews.com/google-publisher/latest.xml",
"https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=15837362",
"https://search.cnbc.com/rs/search/combinedcms/view.xml?partnerId=wrss01&id=100727362"
],
"url_host": [
"missingkids.org/poster",
"missingkids.org/new-poster",
"breitbart.com",
"zerohedge.com",
"foxnews.com",
"cnbc.com"
],
"keyword_search": [
"child abuse"
]
},
"REGEX_PATTERN_STATUS_PRIORITY": [
[".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50],
[".*cnbc\\.com\\/(video|quotes)\\/.*", "invalid", 75],
[".*foxnews\\.com\\/(video|category)\\/.*", "invalid", 75],
[".*radio.foxnews\\.com\\/.*", "invalid", 75],
[".*breitbart\\.com\\/(tag|author)\\/.*", "invalid", 75],
[".*zerohedge\\.com\\/(user)\\/.*", "invalid", 75],
[".*zerohedge\\.com\\/(economics|political|markets|)\\/.*", "valid", 50],
[".*breitbart\\.com\\/(economy|entertainment|border|crime|clips)\\/.*", "valid", 50],
[".*foxnews\\.com\\/(lifestyle|opinion|sports|world)\\/.*", "valid", 50]
]
}

View File

@@ -0,0 +1,66 @@
{
"SEARCH": {
"rss_feed": [
],
"url_host": [
"johnpilger.com",
"lapenseeecologique.com",
"partage-le.com",
"reflets.info",
"rezo.net",
"consortiumnews.com",
"disclose.ngo/fr",
"energieetenvironnement.com",
"global-climat.com",
"lapenseeecologique.com",
"slashdot.org",
"lesamisdebartleby.wordpress.com",
"lundi.am",
"lvsl.fr",
"moderndiplomacy.eu",
"mrmondialisation.org",
"ourfiniteworld.com",
"southfront.org",
"simplicius76.substack.com",
"smoothiex12.blogspot.com",
"theintercept.com",
"wikileaks.org",
"contretemps.eu",
"indianpunchline.com",
"investigaction.net/fr",
"notechmagazine.com",
"terrestres.org",
"truthdig.com",
"tass.com",
"bastamag.net",
"counterpunch.org",
"energy-daily.com",
"fakirpresse.info",
"geopoliticalmonitor.com",
"huffingtonpost.fr",
"legrandsoir.info",
"les-crises.fr",
"liberation.fr",
"maitre-eolas.fr",
"marianne.net",
"mediapart.fr",
"metaefficient.com",
"monde-diplomatique.fr",
"paulcraigroberts.org",
"politis.fr",
"reporterre.net",
"rue89.com",
"theguardian.com/international",
"treehugger.com",
"unz.com",
"voltairenet.org",
"wsws.org"
],
"keyword_search": [
"society collapse"
]
},
"REGEX_PATTERN_STATUS_PRIORITY": [
[".*(youtube|tiktok|twitter|reddit)\\.com\\/.*", "invalid", 50]
]
}

app_urls/init_db.py Normal file
View File

@@ -0,0 +1,244 @@
import argparse
import os
import psycopg
import json
import time
import urllib.parse
import html5lib
import feedparser
import requests
connection_info = "host={} port={} dbname={} user={} password={} connect_timeout=60".format(
os.environ.get("DB_HOST", "localhost"),
os.environ.get("DB_PORT", "5432"),
os.environ.get("DB_NAME", "matitos"),
os.environ.get("DB_USER", "supermatitos"),
os.environ.get("DB_PASSWORD", "supermatitos")
)
def wait_connection():
connected = False
while (not connected):
try:
# Connect to an existing database
with psycopg.connect(connection_info) as conn:
# Open a cursor to perform database operations
with conn.cursor() as cur:
# Create URLs table
c = cur.execute("SELECT 1;").fetchall()
connected = True
except psycopg.OperationalError as e:
# Connection not ready...
# print(".", end="")
time.sleep(2)
except Exception as e:
# Connection not ready...
# print("e", end="")
time.sleep(2)
print("DB connection ready")
def initialize_tables():
# Connect to an existing database
with psycopg.connect(connection_info) as conn:
# Open a cursor to perform database operations
with conn.cursor() as cur:
# Autocommit at end of transaction (Atomic creation of tables)
with conn.transaction() as tx:
try:
# Create URLs table
c = cur.execute("""
CREATE TYPE URL_STATUS AS ENUM ('raw', 'error', 'valid', 'unknown', 'invalid', 'duplicate');
CREATE TABLE URLS (
id SERIAL PRIMARY KEY,
url TEXT NOT NULL UNIQUE,
ts_fetch TIMESTAMPTZ NOT NULL DEFAULT NOW(),
status URL_STATUS NOT NULL DEFAULT 'raw' -- ,
-- status_wendy WENDY_STATUS DEFAULT NULL,
-- ts_wendy TIMESTAMPTZ DEFAULT NULL
);
CREATE INDEX idx_urls_status ON urls(status);
CREATE INDEX idx_urls_ts_fetch ON urls(ts_fetch);
CREATE TABLE URLS_DUPLICATE (
id_url_canonical INTEGER REFERENCES URLS(id),
id_url_duplicated INTEGER REFERENCES URLS(id),
PRIMARY KEY (id_url_canonical, id_url_duplicated)
);
CREATE TYPE SEARCH_TYPE AS ENUM ('rss_feed', 'keyword_search', 'url_host');
CREATE TABLE SEARCH (
id SMALLSERIAL PRIMARY KEY,
search TEXT NOT NULL UNIQUE,
type SEARCH_TYPE NOT NULL
-- language_country CHAR(5), -- Language: ISO 639-1 Code. Country: ISO 3166 ALPHA-2. e.g.: en-us. Required for search
-- UNIQUE(search, language_country)
);
CREATE INDEX idx_search_type ON SEARCH(type);
CREATE TABLE SOURCE (
id SMALLSERIAL PRIMARY KEY,
source TEXT NOT NULL UNIQUE
);
-- CREATE TABLE SEARCH_LANGUAGE (
-- language CHAR(2) NOT NULL, -- ISO 639-1 Code, e.g. "en"
-- country CHAR(2) NOT NULL, -- ISO 3166 ALPHA-2, e.g. "us"
-- PRIMARY KEY (language, country)
-- );
CREATE TABLE URLS_SOURCE_SEARCH (
id_url INTEGER REFERENCES URLS(id),
id_source SMALLINT REFERENCES SOURCE(id) ON UPDATE CASCADE ON DELETE RESTRICT,
id_search SMALLINT REFERENCES SEARCH(id) ON UPDATE CASCADE ON DELETE RESTRICT,
PRIMARY KEY(id_url, id_source, id_search)
);
CREATE INDEX idx_source ON URLS_SOURCE_SEARCH(id_source);
CREATE INDEX idx_search ON URLS_SOURCE_SEARCH(id_search);
CREATE TABLE STATUS_PATTERN_MATCHING (
pattern TEXT PRIMARY KEY,
priority SMALLINT NOT NULL,
status URL_STATUS NOT NULL
);
CREATE TABLE URL_CONTENT (
id_url INTEGER PRIMARY KEY REFERENCES URLS(id),
date_published TIMESTAMPTZ DEFAULT NOW(),
title TEXT,
description TEXT,
content TEXT,
valid_content BOOLEAN,
language CHAR(2), -- ISO 639-1 Code
keywords TEXT[],
tags TEXT[],
authors TEXT[],
image_main_url TEXT,
images_url TEXT[],
videos_url TEXT[],
url_host TEXT, -- www.breitbart.com
site_name TEXT -- Breitbart News
);
CREATE INDEX idx_tags ON URL_CONTENT USING GIN(tags);
CREATE INDEX idx_authors ON URL_CONTENT USING GIN(authors);
CREATE INDEX idx_date_published ON URL_CONTENT (date_published);
CREATE INDEX idx_valid_content ON URL_CONTENT (valid_content);
CREATE INDEX idx_language ON URL_CONTENT (language);
CREATE INDEX idx_url_host ON URL_CONTENT (url_host);
""")
except Exception as e:
print(str(e))
def find_feeds(url):
list_feeds = []
try:
def get_with_protocol(url):
# http:// -> https://
url = url.replace("http://", "https://")
# "" -> https://
if not (url.startswith("https://")):
url = "https://" + url
return url
url = get_with_protocol(url)
response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
html = response.text
tree = html5lib.parse(html, namespaceHTMLElements=False)
# base for relative URLs
base = tree.findall('.//base')
base_url = base[0].attrib['href'] if base and 'href' in base[0].attrib else url
# prioritize Atom over RSS
links = tree.findall("""head/link[@rel='alternate'][@type='application/atom+xml']""") + tree.findall("""head/link[@rel='alternate'][@type='application/rss+xml']""")
for link in links:
href = link.attrib.get('href', '').strip()
if href:
r = requests.get(urllib.parse.urljoin(base_url, href), allow_redirects=True)
list_feeds.append(r.url)
# heuristic search for common feed paths
for suffix in [
'feed', 'feed/', 'rss', 'atom', 'feed.xml',
'/feed', '/feed/', '/rss', '/atom', '/feed.xml',
'index.atom', 'index.rss', 'index.xml', 'atom.xml', 'rss.xml',
'/index.atom', '/index.rss', '/index.xml', '/atom.xml', '/rss.xml',
'.rss', '/.rss', '?rss=1', '?feed=rss2',
]:
try:
potential_feed = urllib.parse.urljoin(base_url, suffix)
response = requests.get(potential_feed, allow_redirects=True)
if (response.status_code == 200) and (len(feedparser.parse(potential_feed).get("entries")) > 0):
list_feeds.append(response.url)
except Exception:
continue
except Exception as e:
print(f"An error occurred: {e}")
# Remove duplicates
return list(set(list_feeds))
def initialize_data():
# Read data
with open("init_data.json", "r") as f:
data_json = json.loads(f.read())
print("Initialization data:", data_json)
# Connect to an existing database
with psycopg.connect(connection_info) as conn:
# Open a cursor to perform database operations
with conn.cursor() as cur:
# Autocommit at end of transaction (Atomic creation of data)
# with conn.transaction() as tx:
# TODO: Language per search
# cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search', 'en-us');" )
# cur.execute( "INSERT INTO SEARCH (search, type) VALUES ('child abuse', 'keyword_search', 'en-gb');" )
for list_pattern_status_priority in data_json.get("REGEX_PATTERN_STATUS_PRIORITY", []):
# Decode
pattern, status, priority = list_pattern_status_priority
# Query
query = "INSERT INTO STATUS_PATTERN_MATCHING (pattern, priority, status) VALUES ('{}', {}, '{}');".format(pattern, priority, status)
print(query)
cur.execute(query)
# Feeds, URL host, keyword search
for search_type, list_searches in data_json.get("SEARCH", {}).items():
for search in list_searches:
query = "INSERT INTO SEARCH (search, type) VALUES ('{}', '{}');".format(search, search_type)
print(query)
cur.execute(query)
# Try finding RSS feed
if (search_type == "url_host"):
url_host = search
list_feeds = find_feeds(url_host)
# If not exists, insert feed
for feed in list_feeds:
query = "INSERT INTO SEARCH (search, type) VALUES ('{}', '{}') ON CONFLICT DO NOTHING;".format(feed, "rss_feed")
print(query)
cur.execute(query)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Database initialization')
parser.add_argument('--initialize_tables', help='Create DB tables', action='store_true', default=False)
parser.add_argument('--initialize_data', help='Insert data', action='store_true', default=False)
args = parser.parse_args()
# Wait for DB connection
wait_connection()
if (args.initialize_tables):
print("Initializing tables")
initialize_tables()
if (args.initialize_data):
print("Initializing data")
initialize_data()
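A minimal usage sketch (not part of the commit) of the feed discovery performed at startup: each configured url_host is probed with find_feeds, and any discovered feed is inserted as an extra 'rss_feed' search (ON CONFLICT DO NOTHING keeps re-runs idempotent). The hosts below are illustrative only:
# Assumes find_feeds as defined above; hosts are hypothetical examples
for url_host in ["zerohedge.com", "foxnews.com"]:
    for feed in find_feeds(url_host):
        print("Discovered feed for {}: {}".format(url_host, feed))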

View File

@@ -4,7 +4,7 @@ if [ "${INITIALIZE_DB}" = false ]; then
echo "Initialization not required"
else
echo "Initializating database"
python db.py --initialize_tables --initialize_data
python init_db.py --initialize_tables --initialize_data
python manage.py makemigrations fetcher; python manage.py migrate --fake-initial
python manage.py createsuperuser --noinput
python manage.py collectstatic --no-input

View File

@@ -5,6 +5,8 @@ psycopg[binary]
gunicorn
whitenoise
feedparser
html5lib
requests
python-dateutil
newspaper4k[all]
lxml[html_clean]

View File

@@ -48,7 +48,7 @@ services:
- DJANGO_ALLOWED_HOSTS=${DJANGO_ALLOWED_HOSTS:-*} # host1,host2
- DJANGO_ALLOWED_ORIGINS=${ALLOWED_ORIGINS:-https://fetcher.matitos.org} # Reverse proxy
- DJANGO_SECRET_KEY=${DJANGO_SECRET_KEY:-abc123456789qwerty}
- DJANGO_DEBUG=${DJANGO_DEBUG:-False}
- DJANGO_DEBUG=${DJANGO_DEBUG:-True}
- PATH_LOGS_DIRECTORY=${PATH_LOGS_DIRECTORY:-/opt/logs}
# Database
- DB_NAME=${DB_NAME:-matitos}
@@ -61,10 +61,11 @@ services:
# Job timeout: 30 min
- JOB_DEFAULT_TIMEOUT=${JOB_DEFAULT_TIMEOUT:-1800}
# Fetcher
- FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP-2}
- FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP-1.5}
- FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP:-5}
- FETCHER_BETWEEN_SEARCHES_SLEEP=${FETCHER_BETWEEN_SEARCHES_SLEEP:-1}
- FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP:-2}
- FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP:-1.5}
- FETCHER_LANGUAGE_DETECTION_MIN_CHAR=100
# Selenium
- SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT:-http://fetcher_app_selenium:80}
- ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA:-https://ollamamodel.matitos.org}
@@ -105,8 +106,8 @@ services:
POSTGRES_PASSWORD: ${DB_PASSWORD:-supermatitos}
POSTGRES_USER: ${DB_USER:-supermatitos}
POSTGRES_INITDB_ARGS: '--data-checksums'
volumes: # Persistent DB?
- ${PATH_DB_DATA:-.}/postgres:/var/lib/postgresql/data
#volumes: # Persistent DB?
# - ${PATH_DB_DATA:-.}/postgres:/var/lib/postgresql/data
ports:
- 5432 #:5432

View File

@@ -59,10 +59,11 @@ services:
# Job timeout: 30 min
- JOB_DEFAULT_TIMEOUT=${JOB_DEFAULT_TIMEOUT:-1800}
# Fetcher
- FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP-2}
- FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP-1.5}
- FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP:-5}
- FETCHER_BETWEEN_SEARCHES_SLEEP=${FETCHER_BETWEEN_SEARCHES_SLEEP:-1}
- FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP:-2}
- FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP:-1.5}
- FETCHER_LANGUAGE_DETECTION_MIN_CHAR=100
# Selenium
- SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT:-http://fetcher_app_selenium:80}
- ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA:-https://ollamamodel.matitos.org}