Valid content filter, language detect on min chars, fetch missingkids.org

This commit is contained in:
Luciano Gervasoni
2025-04-03 09:44:46 +02:00
parent 3b54e247e7
commit 5addfa5ba9
18 changed files with 533 additions and 66 deletions

1
.env Normal file
View File

@@ -0,0 +1 @@
# TODO...

View File

@@ -1 +1,33 @@
# Matitos
- Scheduled tasks
  - Fetcher -> Inserts raw URLs
    - Fetch by parsing a URL host
    - Fetch from RSS feeds
    - Fetch by searching (Google search & news, DuckDuckGo, ...)
  - Process URLs -> Updates raw URLs
    - Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date
    - Determines whether the content is a valid article
  - Valid URLs
    - Generate summary
    - Classification
      - 5W: Who, What, When, Where, Why of a Story (see the Georgia Institute of Technology writing resources, https://comm.gatech.edu)
      - Related to child abuse?
      - ...
- Visualization of URLs
  - Filter URLs
    - By status, search, source, language
  - Charts
- Content generation
  - Select URLs (see the query sketch below):
    - Valid content
    - language=en
    - published_date during last_week
  - Use classifications
  - Merge summaries, ...
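The "Select URLs" step above can be expressed as a single ORM query. A minimal sketch, assuming the `Urls`/`UrlContent` models and lookups used in `app_urls` (the import path and the `publish_date` field location are assumptions and may differ):

```python
from datetime import timedelta

from django.db.models import Q
from django.utils import timezone

from app_urls.models import Urls  # hypothetical import path; adjust to the project layout


def select_urls_for_generation():
    """URLs with valid content, English language, published during the last week."""
    one_week_ago = timezone.now() - timedelta(days=7)
    query = (
        Q(urlcontent__valid_content=True)
        & Q(urlcontent__language="en")
        & Q(urlcontent__publish_date__gte=one_week_ago)  # publish_date assumed to live on UrlContent
    )
    return Urls.objects.filter(query).distinct()
```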

46
app_selenium/Dev.ipynb Normal file
View File

@@ -0,0 +1,46 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"\n",
"endpoint = \"http://localhost:80/get_missing_kids?pages=2\"\n",
"r = requests.get(endpoint)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"r.text"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "matitos_urls",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

16
app_selenium/Dockerfile Normal file
View File

@@ -0,0 +1,16 @@
FROM python:3.12
RUN apt update && apt install -y --no-install-recommends chromium chromium-driver curl
RUN apt autoclean && rm -rf /var/lib/apt/lists/*
WORKDIR /opt/app
RUN pip install --no-cache-dir selenium fastapi "uvicorn[standard]"
COPY . /opt/app/
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "80"]
# docker build -f Dockerfile -t selenium_app .
# docker run --rm -it --shm-size=512m --name selenium_app selenium_app
# docker exec -it selenium_app bash -c "curl localhost:80/get_missing_kids/?pages=5"
# docker exec -it selenium_app bash -c "curl localhost:80/get_missing_kids/?pages=-1"

View File

@@ -1,3 +1,8 @@
# Selenium app
* Fetch missing kids posters (num_pages=X)
* ...
```
SELENIUM_SLEEP_PER_PAGE=4
PATH_LOGS_PARAMETERIZATION="logs/log_app_selenium_{}.log"
```

14
app_selenium/app.py Normal file
View File

@@ -0,0 +1,14 @@
from fastapi import FastAPI
from missing_kids import MissingKidsFetcher
from logger import get_logger

logger = get_logger()
app = FastAPI()


@app.get("/get_missing_kids/")
def get_missing_kids(pages: int = -1):
    try:
        # The returned set is JSON-encoded as a list by FastAPI
        res = {"list_urls": MissingKidsFetcher().get_missing_kids_urls(first_n_pages=pages)}
    except Exception as e:
        logger.warning("Exception while fetching missing kids URLs: {}".format(str(e)), exc_info=True)
        res = {}
    return res

34
app_selenium/logger.py Normal file
View File

@@ -0,0 +1,34 @@
import logging
import logging.handlers
import os

# Get env var
path_logs_parameterization = os.getenv("PATH_LOGS_PARAMETERIZATION", "logs/log_app_selenium_{}.log")
# Directory of logs
directory = '/'.join(path_logs_parameterization.split("/")[:-1])
os.makedirs(directory, exist_ok=True)

logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
logger.setLevel(logging.DEBUG)

# File log: DEBUG and above
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("debug"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)

# File log: INFO and above
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("info"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.INFO)
logger.addHandler(fh)

# File log: WARNING and above
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("warning"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.WARNING)
logger.addHandler(fh)


def get_logger():
    return logger

View File

@@ -0,0 +1,83 @@
from selenium import webdriver
from selenium.webdriver.common.by import By
from utils import get_chrome_options
import time
import os
from logger import get_logger

logger = get_logger()


class MissingKidsFetcher():
    def __init__(self) -> None:
        pass

    def get_missing_kids_urls(self, first_n_pages=-1):
        # Poster search URL
        url = "https://www.missingkids.org/gethelpnow/search/poster-search-results"
        # Collected poster URLs
        set_urls = set()
        try:
            # Initialize
            driver = webdriver.Chrome(options=get_chrome_options())
            # Go to URL
            driver.get(url)
            # Iterate over result pages
            i, continue_iterating, num_exceptions = 1, True, 0
            while (continue_iterating):
                logger.debug("Processing page: {}...".format(i))
                try:
                    time.sleep(int(os.getenv("SELENIUM_SLEEP_PER_PAGE", 4)))  # driver.implicitly_wait(3)
                    # Fetch poster URLs
                    for element_type in ["a"]:  # ["a", "p", "div"]:
                        for elem in driver.find_elements(By.TAG_NAME, element_type):
                            href = elem.get_attribute('href')
                            if (href is not None) and ("missingkids.org/poster" in href):
                                set_urls.add(href)
                    logger.debug("#URLS: {}".format(len(set_urls)))
                    # Next page
                    elem = driver.find_element(By.LINK_TEXT, str(i+1))
                    logger.debug("Clicking: {}...".format(elem.text))
                    elem.click()
                    # Ok
                    processed_ok = True
                except Exception as e:
                    # +1 exception
                    num_exceptions += 1
                    processed_ok = False
                    if (num_exceptions == 3):
                        continue_iterating = False
                    else:
                        logger.info("Exception while clicking page {}, retrying...".format(i+1))
                        # Log the visible pagination links (between "<<" and ">>")
                        start_print = False
                        for link in driver.find_elements(By.PARTIAL_LINK_TEXT, ""):
                            if (link.text == "<<"):
                                start_print = True
                            if (link.text == ">>"):
                                break
                            if (start_print):
                                logger.info(link.text)
                        # driver.refresh()
                        time.sleep(int(os.getenv("SELENIUM_SLEEP_PER_PAGE", 4)))
                if (i == first_n_pages):
                    continue_iterating = False
                if (processed_ok):
                    i += 1
                    num_exceptions = 0
        except Exception as e:
            logger.warning("Exception while fetching missing kids posters: {}".format(str(e)), exc_info=True)
        finally:
            try:
                driver.close()
            except Exception as e:
                pass
        return set_urls

14
app_selenium/utils.py Normal file
View File

@@ -0,0 +1,14 @@
from selenium.webdriver.chrome.options import Options


def get_chrome_options():
    """Returns Chrome options for Selenium with headless browsing enabled."""
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_prefs = {}
    chrome_options.experimental_options["prefs"] = chrome_prefs
    chrome_prefs["profile.default_content_settings"] = {"images": 2}  # Disable image loading
    return chrome_options

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -11,16 +11,42 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"db_postgres\n",
"db_redis\n",
"\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 2/0\n",
" ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 2/4\n",
" ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 4/4\u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_redis \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
" \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
"\u001b[?25h"
]
}
],
"source": [
"!docker rm -f db_postgres db_redis; docker compose -f ../docker/docker-compose.yml up -d ; sleep 5"
"!docker rm -f db_postgres db_redis; docker compose -f ../docker/docker-compose.yml up -d ; sleep 5\n",
"!rm logs/*"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -37,7 +63,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -163,9 +189,41 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\t urls\n",
"[]\n",
"\t urls_duplicate\n",
"[]\n",
"\t urls_source_search\n",
"[]\n",
"\t source\n",
"[]\n",
"\t search\n",
"[(1,\n",
" 'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC',\n",
" 'rss_feed'),\n",
" (2, 'missingkids.org/poster', 'url_host'),\n",
" (3, 'missingkids.org/new-poster', 'url_host'),\n",
" (4, 'breitbart.com', 'url_host'),\n",
" (5, 'child abuse', 'keyword_search')]\n",
"\t status_pattern_matching\n",
"[('.*youtube\\\\.com/.*', 50, 'invalid'),\n",
" ('.*tiktok\\\\.com/.*', 50, 'invalid'),\n",
" ('.*twitter\\\\.com/.*', 50, 'invalid'),\n",
" ('.*reddit\\\\.com/.*', 50, 'invalid'),\n",
" ('.*libreddit\\\\.de/.*', 50, 'invalid'),\n",
" ('.*radio\\\\.foxnews\\\\.com/.*', 50, 'invalid')]\n",
"\t url_content\n",
"[]\n"
]
}
],
"source": [
"# Connect to an existing database\n",
"with psycopg.connect(connection_info) as conn:\n",
@@ -182,9 +240,23 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(1,\n",
" 'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC',\n",
" 'rss_feed'),\n",
" (2, 'missingkids.org/poster', 'url_host'),\n",
" (3, 'missingkids.org/new-poster', 'url_host'),\n",
" (4, 'breitbart.com', 'url_host'),\n",
" (5, 'child abuse', 'keyword_search')]\n"
]
}
],
"source": [
"# Connect to an existing database\n",
"with psycopg.connect(connection_info) as conn:\n",
@@ -195,9 +267,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[]\n"
]
}
],
"source": [
"# Connect to an existing database\n",
"with psycopg.connect(connection_info) as conn:\n",
@@ -209,9 +289,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"'\\n!docker rm -f db_redis; docker compose -f ../docker/docker-compose.yml up -d\\n\\n# Connect to an existing database\\nwith psycopg.connect(connection_info) as conn:\\n # Open a cursor to perform database operations\\n with conn.cursor() as cur:\\n pprint( cur.execute(\"TRUNCATE URLS, URL_CONTENT, URLS_SOURCE_SEARCH, URLS_DUPLICATE;\") )\\n # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES (\\'missingkids.org\\', \\'url_host\\');\" )\\n'"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'''\n",
"!docker rm -f db_redis; docker compose -f ../docker/docker-compose.yml up -d\n",

View File

@@ -96,6 +96,9 @@ FETCHER_GNEWS_DECODE_SLEEP=2
FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=4
FETCHER_BETWEEN_SEARCHES_SLEEP=5
FETCHER_URL_HOST_SLEEP=5
FETCHER_LANGUAGE_DETECTION_MIN_CHAR=100
SELENIUM_ENDPOINT="http://selenium_app:80"
```
* Deploy

View File

@@ -0,0 +1,42 @@
from .db_utils import DB_Handler
from ..models import Search, Source
import os
import requests
import json
import traceback
from .logger import get_logger

logger = get_logger()


class FetchMissingKids():
    def __init__(self) -> None:
        logger.debug("Initializing Fetcher MissingKids")

    def run(self, number_pages=-1):
        try:
            logger.debug("Starting MissingKids.run(), processing #{} pages".format(number_pages))
            # Get source object
            obj_source, created = Source.objects.get_or_create(source="missingkids.org")
            # Get search object
            obj_search, created = Search.objects.get_or_create(search="missingkids.org/poster", type=Search.TYPE_ENUM.URL_HOST)
            try:
                # Missing kids fetching endpoint, parameter: number of pages to fetch
                missingkids_fetch_endpoint = os.path.join(os.getenv("SELENIUM_ENDPOINT", "http://localhost:80"), "get_missing_kids/?pages={}".format(number_pages))
                # Timeout
                if (number_pages > 15) or (number_pages == -1):
                    timeout = 60*90  # 1.5 h
                else:
                    timeout = 60*10  # 10 min
                # Request
                r = requests.get(missingkids_fetch_endpoint, timeout=timeout)
                # Decode
                urls_fetched = json.loads(r.text).get("list_urls", [])
            except Exception as e:
                logger.warning("Request failed (possibly timeout): {}. {}".format(missingkids_fetch_endpoint, str(e)))
                urls_fetched = []
            # Write to DB
            DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
        except Exception as e:
            logger.warning("Exception in MissingKids.run(): {}\n{}".format(e, traceback.format_exc()))

View File

@@ -1,4 +1,3 @@
import traceback
import os
from django.core.cache import cache
from .logger import get_logger
@@ -30,7 +29,7 @@ def decode_gnews_urls(encoded_urls, interval=int(os.getenv("FETCHER_GNEWS_DECODE
# Cache decoded URL
cache.set("gnews_decode_{}".format(url), decoded_url, timeout=60*60*12)
else:
logger.info("Bad status while decoding news.google.com, URL {}".format(url))
logger.info("Bad status while decoding news.google.com, URL {}\n{}".format(url, decoded_url_dict.get("message")))
except Exception as e:
logger.warning("Error decoding news.google.com, URL: {}".format(url))
return list_decoded_urls

View File

@@ -69,6 +69,16 @@ def process_url(url):
    except Exception as e:
        logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
        return None
    try:
        content_merged = "\n".join([article.title, article.meta_description, article.text])
        if (len(content_merged) > int(os.getenv("FETCHER_LANGUAGE_DETECTION_MIN_CHAR", 100))):
            language = langdetect.detect(content_merged)
        else:
            language = None
    except Exception as e:
        logger.info("Could not detect language: {}\n{}".format(url, str(e)))
        language = None
    dict_data = {
        "url": url,
@@ -76,8 +86,7 @@ def process_url(url):
"url_host": article.source_url,
"site_name": article.meta_site_name,
"publish_date": article.publish_date,
# article.meta_lang -> Not always reliable
"language": langdetect.detect("\n".join([article.title, article.meta_description, article.text]) ),
"language": language, # article.meta_lang -> Not always reliable
"title": article.title,
"description": article.meta_description,
"content": article.text,

View File

@@ -3,10 +3,8 @@ from scheduler import job
from .src.fetch_feed import FetchFeeds
from .src.fetch_parser import FetchParser
from .src.fetch_search import FetchSearcher
from .src.fetch_missing_kids import FetchMissingKids
from .src.db_utils import DB_Handler
'''
from src.missing_kids_fetch import MissingKidsFetch
'''
from .src.logger import get_logger
logger = get_logger()
@@ -32,7 +30,19 @@ def fetch_search():
    FetchSearcher().run()
    logger.info("Task completed: {}".format(task))
# TODO: fetch_missing_kids()
@job('default')
def fetch_missing_kids(number_pages=5):
    task = "Fetch MissingKids"
    logger.info("Task triggered: {}".format(task))
    FetchMissingKids().run(number_pages)
    logger.info("Task completed: {}".format(task))

@job('default')
def fetch_missing_kids_all(number_pages=-1):
    task = "Fetch MissingKids"
    logger.info("Task triggered: {}".format(task))
    FetchMissingKids().run(number_pages)
    logger.info("Task completed: {}".format(task))
@job('default')
def process_raw_urls(batch_size=50):
@@ -77,8 +87,15 @@ def background_task(process_type: str):
            FetchParser().run()
        elif (process_type == "fetch_search"):
            FetchSearcher().run()
        #elif (process_type == "fetch_missingkids"):
        #    FetchMissingKids().run()
        elif (process_type == "fetch_missingkids_all"):
            FetchMissingKids().run(number_pages=-1)
        elif ("fetch_missingkids" in process_type):
            # number_pages encoded in the URL (see the example request below)
            try:
                number_pages = int(process_type.split("_")[-1])
            except Exception as e:
                number_pages = -1
            FetchMissingKids().run(number_pages=number_pages)
        elif ("process_" in process_type):
            # Batch size encoded in the URL
            try:
@@ -95,14 +112,6 @@ def background_task(process_type: str):
        else:
            logger.info("Task unknown!: {}".format(process_type))
        '''
        # Selenium based
        elif (process_type == "fetch_missing_kids_reduced"):
            MissingKidsFetch(db_handler, num_pages=4).run()
        elif (process_type == "fetch_missing_kids_full"):
            MissingKidsFetch(db_handler, num_pages=100000).run()
        '''
        logger.info("Task completed: {}".format(process_type))
    except Exception as e:
        logger.error(e)
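As referenced in the `fetch_missingkids` branch above, the number of pages is encoded in the trigger name. A minimal sketch of firing that task over HTTP, assuming the `/task/<name>` route implied by `trigger_task` and the `http://localhost:8000/task` prefix from `link_list`:

```python
import requests

BASE = "http://localhost:8000/task"

# Fetch the first 5 pages of posters; "fetch_missingkids_all" would fetch every page.
r = requests.get("{}/fetch_missingkids_5".format(BASE), timeout=30)
print(r.status_code)
```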

View File

@@ -258,7 +258,7 @@ input[type="checkbox"] {
<span id="offText" class="off-text">OFF</span>
</span>
</div>
-->
<!-- Pages Per Page Dropdown -->
<h3>Pages Per Page</h3>
@@ -297,6 +297,17 @@ input[type="checkbox"] {
</label><br>
{% endfor %}
<!-- Filter by valid content -->
<h3>Valid content</h3>
<button type="button" class="toggle-all-btn" data-toggle="valid_content">Toggle All</button><br>
{% for vc in valid_contents %}
<label>
<input type="checkbox" name="valid_content" value="{{ vc }}"
{% if vc|stringformat:"s" in selected_valid_contents or 'all' in selected_valid_contents%}checked{% endif %}>
{{ vc|truncatechars:50 }}
</label><br>
{% endfor %}
<!-- Filter by Search -->
<h3>Search</h3>
<button type="button" class="toggle-all-btn" data-toggle="search">Toggle All</button><br>
@@ -329,7 +340,7 @@ input[type="checkbox"] {
{{ lang|truncatechars:50 }}
</label><br>
{% endfor %}
</form>
</div>
@@ -526,10 +537,6 @@ input[type="checkbox"] {
const checkboxes = document.querySelectorAll(`[name='${section}']`);
const allChecked = Array.from(checkboxes).every(checkbox => checkbox.checked);
checkboxes.forEach(cb => cb.checked = !allChecked);
/*
// Automatically submit the form when a checkbox is toggled
document.getElementById('filterForm').submit();
*/
updateFormParameter(section);
}
@@ -545,9 +552,6 @@ input[type="checkbox"] {
// Automatically submit the form when any checkbox changes
document.querySelectorAll('input[type="checkbox"]').forEach(function(checkbox) {
checkbox.addEventListener('change', function() {
/*
document.getElementById('filterForm').submit();
*/
updateFormParameter(this.name);
});
});

View File

@@ -15,7 +15,7 @@ def trigger_task(request, task):
####################################################################################################
def link_list(request):
prefix = "http://localhost:8000/task"
links = ["fetch_feeds", "fetch_parser", "fetch_search", "process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"]
links = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all", "process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"]
list_links = [
# DB
@@ -212,21 +212,26 @@ def filtered_urls(request):
# TODO: Cache languages, update once every N
languages = list(UrlContent.objects.distinct('language').values_list('language', flat=True))
# Null for visualization
languages = ["Null"] + [l for l in languages if l is not None]
languages = ["Unknown"] + [l for l in languages if l is not None]
valid_contents = ["True", "False", "Unknown"]
# Get selected parameters
selected_status = request.GET.getlist('status', ["null"])
selected_search = request.GET.getlist('search', ["null"])
selected_source = request.GET.getlist('source', ["null"])
selected_language = request.GET.getlist('language', ["null"])
selected_valid_contents = request.GET.getlist('valid_content', ["null"])
selected_days = request.GET.get("days", 30)
per_page = request.GET.get('per_page', 100) # Default: 100 URLs per page
page_number = request.GET.get('page') # Get the current page number
all_status = [str(status[0]) for status in statuses]
all_search = [str(search.id) for search in searches]
all_source = [str(source.id) for source in sources]
all_languages = languages
all_valid_contents = valid_contents
# Override with default filters? [Case: no params update on URL] -> Only on "Home" click, or "Next page"
if (len(request.GET.keys()) == 0) or ((len(request.GET.keys()) == 1) and ("page" in request.GET.keys())):
@@ -234,23 +239,22 @@ def filtered_urls(request):
selected_search = ["all"]
selected_source = ["all"]
selected_language = ["all"]
# print(set(selected_status), set(all_status))
"""
# List of TODO remove...
if (set(selected_status) == set(all_status)):
selected_status = ["all"]
if (set(selected_search) == set(all_search)):
selected_search = ["all"]
if (set(selected_source) == set(all_source)):
selected_source = ["all"]
if (set(selected_language) == set(languages)):
selected_language = ["all"]"
"""
selected_valid_contents = ["all"]
else:
# Non-default parameters: if a list contains all elements, replace it with "all" to avoid a heavy query
if (set(selected_status) == set(all_status)):
selected_status = ["all"]
if (set(selected_search) == set(all_search)):
selected_search = ["all"]
if (set(selected_source) == set(all_source)):
selected_source = ["all"]
if (set(selected_language) == set(all_languages)):
selected_language = ["all"]
if (set(selected_valid_contents) == set(all_valid_contents)):
selected_valid_contents = ["all"]
# Filter URLs based on selected filters
if ('null' in selected_status) or ('null' in selected_search) or ('null' in selected_source) or ('null' in selected_language):
if ('null' in selected_status) or ('null' in selected_search) or ('null' in selected_source) or ('null' in selected_language) or ('null' in selected_valid_contents):
urls = []
else:
# Filter by date
@@ -262,18 +266,36 @@ def filtered_urls(request):
query &= Q(urlssourcesearch__id_source__in=selected_source)
if ("all" not in selected_search):
query &= Q(urlssourcesearch__id_search__in=selected_search)
if ("all" not in selected_language):
if ("all" not in selected_language):
# URLs with selected languages
subquery = Q(urlcontent__language__in=selected_language)
if ("Null" in selected_language):
if ("Unknown" in selected_language):
# URLs with NULL language
subquery |= Q(urlcontent__language__isnull=True)
# URLs with no UrlContent record at all (similar to URLs with NULL language)
subquery |= Q(urlcontent__id_url__isnull=True)
# Update query
query &= (subquery)
if ("all" not in selected_valid_contents):
# Boolean array
bool_array = []
if ('True' in selected_valid_contents):
bool_array.append(True)
if ('False' in selected_valid_contents):
bool_array.append(False)
# URLs with selected valid_contents
subquery = Q(urlcontent__valid_content__in=bool_array)
if ("Unknown" in selected_valid_contents):
# URLs with NULL valid_content
subquery |= Q(urlcontent__valid_content__isnull=True)
# URLs with no UrlContent record at all (similar to URLs with NULL valid_content)
subquery |= Q(urlcontent__id_url__isnull=True)
# Update query
query &= (subquery)
# Run query
urls = Urls.objects.filter(query).distinct() # .order_by('-ts_fetch')
# print(urls.query)
# Pagination
paginator = Paginator(urls, per_page) # Paginate the filtered URLs
@@ -300,11 +322,13 @@ def filtered_urls(request):
'searches': sorted(searches, key=lambda x: (x.type, x.search)),
'sources': sorted(sources, key=lambda x: x.source),
'languages': sorted(languages, key=lambda x: (x is None, x)),
'valid_contents': valid_contents,
# Selection
'selected_status': selected_status,
'selected_search': selected_search,
'selected_source': selected_source,
'selected_language': selected_language,
'selected_valid_contents': selected_valid_contents,
"selected_days": selected_days,
# Map
"sources_map": sources_map,

View File

@@ -2,7 +2,48 @@ version: '3.9'
services:
matitos_db:
fetcher_selenium:
build:
context: ./app_selenium
container_name: selenium_app
restart: unless-stopped
shm_size: 512mb
environment:
- SELENIUM_SLEEP_PER_PAGE=4
- PATH_LOGS_PARAMETERIZATION=logs/log_app_selenium_{}.log
ports:
- 80
fetcher_urls_app:
build:
context: ./app_urls
container_name: urls_app
restart: unless-stopped
environment:
#- name=value
# Database
- DB_NAME=${DB_NAME:-matitos}
- DB_USER=${DB_USER:-supermatitos}
- DB_PASSWORD=${DB_PASSWORD:-supermatitos}
- DB_HOST=${DB_HOST:-localhost} # db_postgres
- DB_PORT=${DB_PORT:-5432}
- REDIS_HOST=${REDIS_HOST:-localhost}
- REDIS_PORT=${REDIS_PORT:-6379}
# Job timeout: 30 min
- JOB_DEFAULT_TIMEOUT=${RQ_DEFAULT_TIMEOUT:-1800}
# Logs path
- PATH_LOGS_PARAMETERIZATION=logs/log_app_fetcher_{}.log
# Fetcher
- FETCHER_GNEWS_DECODE_SLEEP=2
- FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=4
- FETCHER_BETWEEN_SEARCHES_SLEEP=5
- FETCHER_URL_HOST_SLEEP=5
# Selenium
- SELENIUM_ENDPOINT=http://selenium_app:80
ports:
- 80
fetcher_db:
image: postgres:17
container_name: db_postgres
restart: unless-stopped
@@ -18,7 +59,7 @@ services:
ports:
- 5432:5432
matitos_redis:
fetcher_redis:
image: redis:alpine
container_name: db_redis
restart: unless-stopped
@@ -27,7 +68,7 @@ services:
#expose:
# - 6379
matitos_adminer:
fetcher_adminer:
# http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public
image: adminer
container_name: adminer
@@ -41,7 +82,7 @@ services:
ports:
- 8080:8080
matitos_dozzle:
fetcher_dozzle:
container_name: dozzle
image: amir20/dozzle:latest
volumes: