Valid content filter, language detect on min chars, fetch missingkids.org
README.md | 34
@@ -1 +1,33 @@
# Matitos

- Scheduled tasks
  - Fetcher -> Inserts raw URLs
    - Fetch parsing URL host
    - Fetch from RSS feed
    - Fetch searching (Google search & news, DuckDuckGo, ...)
  - Process URLs -> Updates raw URLs
    - Extracts title, description, content, image and video URLs, main image URL, language, keywords, authors, tags, published date
    - Determines whether the content is a valid article
  - Valid URLs
    - Generate summary
    - Classification
      - 5W: Who, What, When, Where, Why of a story
      - Related to child abuse?
    - ...
- Visualization of URLs
  - Filter URLs
    - By status, search, source, language
  - Charts

- Content generation
  - Select URLs:
    - Valid content
    - language=en
    - published_date during last_week
  - Use classifications
  - Merge summaries, ...
app_selenium/Dev.ipynb | 46 (new file)
@@ -0,0 +1,46 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "\n",
    "endpoint = \"http://localhost:80/get_missing_kids?pages=2\"\n",
    "r = requests.get(endpoint)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "r.text"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "matitos_urls",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
app_selenium/Dockerfile | 16 (new file)
@@ -0,0 +1,16 @@
FROM python:3.12

RUN apt update && apt install -y --no-install-recommends chromium chromium-driver curl
RUN apt autoclean && rm -rf /var/lib/apt/lists/*

WORKDIR /opt/app
RUN pip install --no-cache-dir selenium fastapi "uvicorn[standard]"
COPY . /opt/app/

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "80"]

# docker build -f Dockerfile -t selenium_app .
# docker run --rm -it --shm-size=512m --name selenium_app selenium_app

# docker exec -it selenium_app bash -c "curl localhost:80/get_missing_kids/?pages=5"
# docker exec -it selenium_app bash -c "curl localhost:80/get_missing_kids/?pages=-1"
app_selenium/README.md
@@ -1,3 +1,8 @@
 # Selenium app

+* Missing kids posters fetch (num_pages=X)
 * ...

+```
+SELENIUM_SLEEP_PER_PAGE=4
+PATH_LOGS_PARAMETERIZATION="logs/log_app_selenium_{}.log"
+```
app_selenium/app.py | 14 (new file)
@@ -0,0 +1,14 @@
from fastapi import FastAPI
from missing_kids import MissingKidsFetcher

from logger import get_logger
logger = get_logger()


app = FastAPI()


@app.get("/get_missing_kids/")
def get_missing_kids(pages: int = -1):
    try:
        res = {"list_urls": MissingKidsFetcher().get_missing_kids_urls(first_n_pages=pages)}
    except Exception as e:
        # Log instead of failing silently; return an empty payload on error
        logger.warning("get_missing_kids failed: {}".format(e))
        res = {}
    return res
app_selenium/logger.py | 34 (new file)
@@ -0,0 +1,34 @@
import logging
import logging.handlers
import os

# Get env var
path_logs_parameterization = os.getenv("PATH_LOGS_PARAMETERIZATION", "logs/log_app_selenium_{}.log")

# Directory of logs
directory = '/'.join(path_logs_parameterization.split("/")[:-1])
os.makedirs(directory, exist_ok=True)

logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
logger.setLevel(logging.DEBUG)

# File log: DEBUG and above
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("debug"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)

# File log: INFO and above
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("info"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.INFO)
logger.addHandler(fh)

# File log: WARNING and above
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("warning"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.WARNING)
logger.addHandler(fh)


def get_logger():
    return logger
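A quick usage sketch of the level routing set up above (filenames resolve via PATH_LOGS_PARAMETERIZATION; the messages are illustrative):

```
from logger import get_logger

logger = get_logger()
logger.debug("lands in the debug file only")      # below the INFO and WARNING handler levels
logger.info("lands in the debug and info files")  # below the WARNING handler level
logger.warning("lands in all three files")        # passes every handler's level
```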
app_selenium/missing_kids.py | 83 (new file)
@@ -0,0 +1,83 @@
from selenium import webdriver
from selenium.webdriver.common.by import By
from utils import get_chrome_options
import time
import os

from logger import get_logger
logger = get_logger()


class MissingKidsFetcher():

    def __init__(self) -> None:
        pass

    def get_missing_kids_urls(self, first_n_pages=-1):
        # Poster URL
        url = "https://www.missingkids.org/gethelpnow/search/poster-search-results"
        # URLs
        set_urls = set()

        try:
            # Initialize
            driver = webdriver.Chrome(options=get_chrome_options())
            # Go to URL
            driver.get(url)
            # Iterate
            i, continue_iterating, num_exceptions = 1, True, 0
            while (continue_iterating):
                logger.debug("Processing page: {}...".format(i))

                try:
                    # Env vars are strings: cast before sleeping
                    time.sleep(int(os.getenv("SELENIUM_SLEEP_PER_PAGE", 4)))  # driver.implicitly_wait(3)
                    # Fetch poster URLs
                    for element_type in ["a"]:  # ["a", "p", "div"]:
                        for elem in driver.find_elements(By.TAG_NAME, element_type):
                            href = elem.get_attribute('href')
                            if (href is not None) and ("missingkids.org/poster" in href):
                                set_urls.add(href)

                    logger.debug("#URLS: {}".format(len(set_urls)))

                    # Next page
                    elem = driver.find_element(By.LINK_TEXT, str(i+1))
                    logger.debug("Clicking: {}...".format(elem.text))
                    elem.click()
                    # Ok
                    processed_ok = True
                except Exception as e:
                    # +1 exception
                    num_exceptions += 1
                    processed_ok = False

                    if (num_exceptions == 3):
                        continue_iterating = False
                    else:
                        logger.info("Exception while clicking page {}, retrying...".format(i+1))

                    # Log the pagination links currently visible (between "<<" and ">>")
                    start_print = False
                    for e in driver.find_elements(By.PARTIAL_LINK_TEXT, ""):
                        if (e.text == "<<"):
                            start_print = True
                        if (e.text == ">>"):
                            break
                        if (start_print):
                            logger.info(e.text)

                    # driver.refresh()
                    time.sleep(int(os.getenv("SELENIUM_SLEEP_PER_PAGE", 4)))

                if (i == first_n_pages):
                    continue_iterating = False
                if (processed_ok):
                    i += 1
                    num_exceptions = 0

        except Exception as e:
            logger.warning("Exception while clicking page {}. {}".format(i+1, str(e)), exc_info=True)
        finally:
            try:
                # quit() ends the session (close() would leave chromedriver running)
                driver.quit()
            except Exception:
                pass

        return set_urls
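A minimal local usage sketch of the fetcher above (assumes Chromium and chromedriver are available, as installed in the Dockerfile):

```
from missing_kids import MissingKidsFetcher

# Crawl only the first two result pages; -1 would iterate until pagination fails
urls = MissingKidsFetcher().get_missing_kids_urls(first_n_pages=2)
print(len(urls), "poster URLs")
```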
app_selenium/utils.py | 14 (new file)
@@ -0,0 +1,14 @@
from selenium.webdriver.chrome.options import Options


def get_chrome_options():
    """Sets Chrome options for Selenium.

    Headless browser mode is enabled and image loading is disabled.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_prefs = {}
    chrome_options.experimental_options["prefs"] = chrome_prefs
    chrome_prefs["profile.default_content_settings"] = {"images": 2}
    return chrome_options
@@ -2,7 +2,7 @@
  "cells": [
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -11,16 +11,42 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "db_postgres\n",
+      "db_redis\n",
+      "\u001b[1A\u001b[1B\u001b[0G\u001b[?25l[+] Running 2/0\n",
+      " ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
+      " ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.1s \u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
+      "\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l[+] Running 2/4\n",
+      " ⠿ Container db_redis \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
+      " ⠿ Container db_postgres \u001b[39mStarting\u001b[0m \u001b[34m0.2s \u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
+      "\u001b[?25h\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[1A\u001b[0G\u001b[?25l\u001b[34m[+] Running 4/4\u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container db_redis \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container db_postgres \u001b[32mStarted\u001b[0m \u001b[34m0.3s \u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container dozzle \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
+      " \u001b[32m✔\u001b[0m Container adminer \u001b[32mRunning\u001b[0m \u001b[34m0.0s \u001b[0m\n",
+      "\u001b[?25h"
+     ]
+    }
+   ],
   "source": [
-    "!docker rm -f db_postgres db_redis; docker compose -f ../docker/docker-compose.yml up -d ; sleep 5"
+    "!docker rm -f db_postgres db_redis; docker compose -f ../docker/docker-compose.yml up -d ; sleep 5\n",
+    "!rm logs/*"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -37,7 +63,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -163,9 +189,41 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\t urls\n",
+      "[]\n",
+      "\t urls_duplicate\n",
+      "[]\n",
+      "\t urls_source_search\n",
+      "[]\n",
+      "\t source\n",
+      "[]\n",
+      "\t search\n",
+      "[(1,\n",
+      "  'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC',\n",
+      "  'rss_feed'),\n",
+      " (2, 'missingkids.org/poster', 'url_host'),\n",
+      " (3, 'missingkids.org/new-poster', 'url_host'),\n",
+      " (4, 'breitbart.com', 'url_host'),\n",
+      " (5, 'child abuse', 'keyword_search')]\n",
+      "\t status_pattern_matching\n",
+      "[('.*youtube\\\\.com/.*', 50, 'invalid'),\n",
+      " ('.*tiktok\\\\.com/.*', 50, 'invalid'),\n",
+      " ('.*twitter\\\\.com/.*', 50, 'invalid'),\n",
+      " ('.*reddit\\\\.com/.*', 50, 'invalid'),\n",
+      " ('.*libreddit\\\\.de/.*', 50, 'invalid'),\n",
+      " ('.*radio\\\\.foxnews\\\\.com/.*', 50, 'invalid')]\n",
+      "\t url_content\n",
+      "[]\n"
+     ]
+    }
+   ],
   "source": [
    "# Connect to an existing database\n",
    "with psycopg.connect(connection_info) as conn:\n",
@@ -182,9 +240,23 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[(1,\n",
+      "  'https://api.missingkids.org/missingkids/servlet/XmlServlet?act=rss&LanguageCountry=en_US&orgPrefix=NCMC',\n",
+      "  'rss_feed'),\n",
+      " (2, 'missingkids.org/poster', 'url_host'),\n",
+      " (3, 'missingkids.org/new-poster', 'url_host'),\n",
+      " (4, 'breitbart.com', 'url_host'),\n",
+      " (5, 'child abuse', 'keyword_search')]\n"
+     ]
+    }
+   ],
   "source": [
    "# Connect to an existing database\n",
    "with psycopg.connect(connection_info) as conn:\n",
@@ -195,9 +267,17 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[]\n"
+     ]
+    }
+   ],
   "source": [
    "# Connect to an existing database\n",
    "with psycopg.connect(connection_info) as conn:\n",
@@ -209,9 +289,20 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'\\n!docker rm -f db_redis; docker compose -f ../docker/docker-compose.yml up -d\\n\\n# Connect to an existing database\\nwith psycopg.connect(connection_info) as conn:\\n # Open a cursor to perform database operations\\n with conn.cursor() as cur:\\n pprint( cur.execute(\"TRUNCATE URLS, URL_CONTENT, URLS_SOURCE_SEARCH, URLS_DUPLICATE;\") )\\n # cur.execute( \"INSERT INTO SEARCH (search, type) VALUES (\\'missingkids.org\\', \\'url_host\\');\" )\\n'"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
    "'''\n",
    "!docker rm -f db_redis; docker compose -f ../docker/docker-compose.yml up -d\n",
@@ -96,6 +96,9 @@ FETCHER_GNEWS_DECODE_SLEEP=2
 FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=4
 FETCHER_BETWEEN_SEARCHES_SLEEP=5
 FETCHER_URL_HOST_SLEEP=5
+FETCHER_LANGUAGE_DETECTION_MIN_CHAR=100
+
+SELENIUM_ENDPOINT="http://selenium_app:80"
 ```

 * Deploy
app_urls/api/src/fetch_missing_kids.py | 42 (new file)
@@ -0,0 +1,42 @@
from .db_utils import DB_Handler
from ..models import Search, Source
import os
import requests
import json
import traceback
from .logger import get_logger
logger = get_logger()


class FetchMissingKids():

    def __init__(self) -> None:
        logger.debug("Initializing Fetcher MissingKids")

    def run(self, number_pages=-1):
        try:
            logger.debug("Starting MissingKids.run(), processing #{} pages".format(number_pages))

            # Get source object
            obj_source, created = Source.objects.get_or_create(source="missingkids.org")
            # Get search object
            obj_search, created = Search.objects.get_or_create(search="missingkids.org/poster", type=Search.TYPE_ENUM.URL_HOST)

            try:
                # Missing kids fetching endpoint, parameterized by the number of pages to fetch
                # (plain string formatting: os.path.join is meant for filesystem paths, not URLs)
                missingkids_fetch_endpoint = "{}/get_missing_kids/?pages={}".format(os.getenv("SELENIUM_ENDPOINT", "http://localhost:80"), number_pages)
                # Timeout
                if (number_pages > 15) or (number_pages == -1):
                    timeout = 60*90  # 1.5h
                else:
                    timeout = 60*10  # 10 min
                # Request
                r = requests.get(missingkids_fetch_endpoint, timeout=timeout)
                # Decode
                urls_fetched = json.loads(r.text).get("list_urls", [])
            except Exception as e:
                logger.warning("Request failed: {}. {}".format(missingkids_fetch_endpoint, str(e)))
                urls_fetched = []

            # Write to DB
            DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
        except Exception as e:
            logger.warning("Exception in MissingKids.run(): {}\n{}".format(e, traceback.format_exc()))
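For reference, a hypothetical manual trigger (the import path below is a guess; it depends on how the Django project is laid out):

```
# Hypothetical: run the fetcher by hand, e.g. from a Django shell
from api.src.fetch_missing_kids import FetchMissingKids

FetchMissingKids().run(number_pages=5)  # 5 pages -> 10 min request timeout
```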
@@ -1,4 +1,3 @@
-import traceback
 import os
 from django.core.cache import cache
 from .logger import get_logger
@@ -30,7 +29,7 @@ def decode_gnews_urls(encoded_urls, interval=int(os.getenv("FETCHER_GNEWS_DECODE
                 # Cache decoded URL
                 cache.set("gnews_decode_{}".format(url), decoded_url, timeout=60*60*12)
             else:
-                logger.info("Bad status while decoding news.google.com, URL {}".format(url))
+                logger.info("Bad status while decoding news.google.com, URL {}\n{}".format(url, decoded_url_dict.get("message")))
         except Exception as e:
             logger.warning("Error decoding news.google.com, URL: {}".format(url))
     return list_decoded_urls
@@ -69,6 +69,16 @@ def process_url(url):
     except Exception as e:
         logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
         return None

+    try:
+        content_merged = "\n".join([article.title, article.meta_description, article.text])
+        if (len(content_merged) > int(os.getenv("FETCHER_LANGUAGE_DETECTION_MIN_CHAR", 100))):
+            language = langdetect.detect(content_merged)
+        else:
+            language = None
+    except Exception as e:
+        logger.info("Could not detect language: {}\n{}".format(url, str(e)))
+        language = None

     dict_data = {
         "url": url,
@@ -76,8 +86,7 @@ def process_url(url):
         "url_host": article.source_url,
         "site_name": article.meta_site_name,
         "publish_date": article.publish_date,
-        # article.meta_lang -> Not always reliable
-        "language": langdetect.detect("\n".join([article.title, article.meta_description, article.text])),
+        "language": language,  # article.meta_lang -> Not always reliable
         "title": article.title,
         "description": article.meta_description,
         "content": article.text,
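One note on the guard added above: langdetect is noisy on very short inputs, which is why detection is skipped below FETCHER_LANGUAGE_DETECTION_MIN_CHAR. A standalone sketch of the same logic (sample strings are illustrative):

```
import langdetect
from langdetect.lang_detect_exception import LangDetectException

MIN_CHAR = 100  # mirrors FETCHER_LANGUAGE_DETECTION_MIN_CHAR

def detect_language(text):
    if len(text) <= MIN_CHAR:
        return None  # too short: detection would be unreliable
    try:
        return langdetect.detect(text)
    except LangDetectException:
        return None

print(detect_language("A longer English paragraph about a news article. " * 4))  # likely 'en'
print(detect_language("ok"))  # None
```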
@@ -3,10 +3,8 @@ from scheduler import job
 from .src.fetch_feed import FetchFeeds
 from .src.fetch_parser import FetchParser
 from .src.fetch_search import FetchSearcher
+from .src.fetch_missing_kids import FetchMissingKids
 from .src.db_utils import DB_Handler
-'''
-from src.missing_kids_fetch import MissingKidsFetch
-'''

 from .src.logger import get_logger
 logger = get_logger()
@@ -32,7 +30,19 @@ def fetch_search():
     FetchSearcher().run()
     logger.info("Task completed: {}".format(task))


-# TODO: fetch_missing_kids()
+@job('default')
+def fetch_missing_kids(number_pages=5):
+    task = "Fetch MissingKids"
+    logger.info("Task triggered: {}".format(task))
+    FetchMissingKids().run(number_pages)
+    logger.info("Task completed: {}".format(task))
+
+
+@job('default')
+def fetch_missing_kids_all(number_pages=-1):
+    task = "Fetch MissingKids"
+    logger.info("Task triggered: {}".format(task))
+    FetchMissingKids().run(number_pages)
+    logger.info("Task completed: {}".format(task))


 @job('default')
 def process_raw_urls(batch_size=50):
@@ -77,8 +87,15 @@ def background_task(process_type: str):
             FetchParser().run()
         elif (process_type == "fetch_search"):
             FetchSearcher().run()
+        #elif (process_type == "fetch_missingkids"):
+        #    FetchMissingKids().run()
+        elif (process_type == "fetch_missingkids_all"):
+            FetchMissingKids().run(number_pages=-1)
+        elif ("fetch_missingkids" in process_type):
+            # number_pages encoded in URL
+            try:
+                number_pages = int(process_type.split("_")[-1])
+            except Exception as e:
+                number_pages = -1
+            FetchMissingKids().run(number_pages=number_pages)
         elif ("process_" in process_type):
             # Batch size encoded in URL
             try:
@@ -95,14 +112,6 @@ def background_task(process_type: str):
         else:
             logger.info("Task unknown!: {}".format(process_type))

-        '''
-        # Selenium based
-        elif (process_type == "fetch_missing_kids_reduced"):
-            MissingKidsFetch(db_handler, num_pages=4).run()
-        elif (process_type == "fetch_missing_kids_full"):
-            MissingKidsFetch(db_handler, num_pages=100000).run()
-        '''

         logger.info("Task completed: {}".format(process_type))
     except Exception as e:
         logger.error(e)
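Since the page count rides along in the task name, here is a standalone sketch of the decoding used above:

```
# How background_task recovers number_pages from a task name like "fetch_missingkids_5"
for process_type in ["fetch_missingkids_5", "fetch_missingkids"]:
    try:
        number_pages = int(process_type.split("_")[-1])
    except ValueError:
        number_pages = -1  # no numeric suffix: fetch all pages
    print(process_type, "->", number_pages)  # 5, then -1
```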
@@ -258,7 +258,7 @@ input[type="checkbox"] {
 <span id="offText" class="off-text">OFF</span>
 </span>
 </div>
--->
+-->

 <!-- Pages Per Page Dropdown -->
 <h3>Pages Per Page</h3>
@@ -297,6 +297,17 @@ input[type="checkbox"] {
 </label><br>
 {% endfor %}

+<!-- Filter by valid content -->
+<h3>Valid content</h3>
+<button type="button" class="toggle-all-btn" data-toggle="valid_content">Toggle All</button><br>
+{% for vc in valid_contents %}
+<label>
+    <input type="checkbox" name="valid_content" value="{{ vc }}"
+    {% if vc|stringformat:"s" in selected_valid_contents or 'all' in selected_valid_contents %}checked{% endif %}>
+    {{ vc|truncatechars:50 }}
+</label><br>
+{% endfor %}

 <!-- Filter by Search -->
 <h3>Search</h3>
 <button type="button" class="toggle-all-btn" data-toggle="search">Toggle All</button><br>
@@ -329,7 +340,7 @@ input[type="checkbox"] {
     {{ lang|truncatechars:50 }}
 </label><br>
 {% endfor %}


 </form>
 </div>
@@ -526,10 +537,6 @@ input[type="checkbox"] {
 const checkboxes = document.querySelectorAll(`[name='${section}']`);
 const allChecked = Array.from(checkboxes).every(checkbox => checkbox.checked);
 checkboxes.forEach(cb => cb.checked = !allChecked);
-/*
-// Automatically submit the form when a checkbox is toggled
-document.getElementById('filterForm').submit();
-*/
 updateFormParameter(section);
 }
@@ -545,9 +552,6 @@ input[type="checkbox"] {
 // Automatically submit the form when any checkbox changes
 document.querySelectorAll('input[type="checkbox"]').forEach(function(checkbox) {
     checkbox.addEventListener('change', function() {
-        /*
-        document.getElementById('filterForm').submit();
-        */
         updateFormParameter(this.name);
     });
 });
@@ -15,7 +15,7 @@ def trigger_task(request, task):
 ####################################################################################################
 def link_list(request):
     prefix = "http://localhost:8000/task"
-    links = ["fetch_feeds", "fetch_parser", "fetch_search", "process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"]
+    links = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all", "process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"]

     list_links = [
         # DB
@@ -212,21 +212,26 @@ def filtered_urls(request):
     # TODO: Cache languages, update once every N
     languages = list(UrlContent.objects.distinct('language').values_list('language', flat=True))
     # Null for visualization
-    languages = ["Null"] + [l for l in languages if l is not None]
+    languages = ["Unknown"] + [l for l in languages if l is not None]
+    valid_contents = ["True", "False", "Unknown"]

     # Get selected parameters
     selected_status = request.GET.getlist('status', ["null"])
     selected_search = request.GET.getlist('search', ["null"])
     selected_source = request.GET.getlist('source', ["null"])
     selected_language = request.GET.getlist('language', ["null"])
+    selected_valid_contents = request.GET.getlist('valid_content', ["null"])
     selected_days = request.GET.get("days", 30)
     per_page = request.GET.get('per_page', 100)  # Default is X URLs per page
     page_number = request.GET.get('page')  # Get the current page number

     all_status = [str(status[0]) for status in statuses]
     all_search = [str(search.id) for search in searches]
     all_source = [str(source.id) for source in sources]
     all_languages = languages
+    all_valid_contents = valid_contents

     # Override with default filters? [Case: no params update on URL] -> Only on "Home" click, or "Next page"
     if (len(request.GET.keys()) == 0) or ((len(request.GET.keys()) == 1) and ("page" in request.GET.keys())):
@@ -234,23 +239,22 @@ def filtered_urls(request):
         selected_search = ["all"]
         selected_source = ["all"]
         selected_language = ["all"]
-
-        # print(set(selected_status), set(all_status))
-        """
-        # List of TODO remove...
-        if (set(selected_status) == set(all_status)):
-            selected_status = ["all"]
-        if (set(selected_search) == set(all_search)):
-            selected_search = ["all"]
-        if (set(selected_source) == set(all_source)):
-            selected_source = ["all"]
-        if (set(selected_language) == set(languages)):
-            selected_language = ["all"]"
-        """
+        selected_valid_contents = ["all"]
+    else:
+        # Non-default parameters: if a list contains all elements, replace it with "all" to avoid a heavy query
+        if (set(selected_status) == set(all_status)):
+            selected_status = ["all"]
+        if (set(selected_search) == set(all_search)):
+            selected_search = ["all"]
+        if (set(selected_source) == set(all_source)):
+            selected_source = ["all"]
+        if (set(selected_language) == set(all_languages)):
+            selected_language = ["all"]
+        if (set(selected_valid_contents) == set(all_valid_contents)):
+            selected_valid_contents = ["all"]

     # Filter URLs based on selected filters
-    if ('null' in selected_status) or ('null' in selected_search) or ('null' in selected_source) or ('null' in selected_language):
+    if ('null' in selected_status) or ('null' in selected_search) or ('null' in selected_source) or ('null' in selected_language) or ('null' in selected_valid_contents):
         urls = []
     else:
         # Filter by date
@@ -262,18 +266,36 @@ def filtered_urls(request):
             query &= Q(urlssourcesearch__id_source__in=selected_source)
         if ("all" not in selected_search):
             query &= Q(urlssourcesearch__id_search__in=selected_search)
         if ("all" not in selected_language):
             # URLs with selected languages
             subquery = Q(urlcontent__language__in=selected_language)
-            if ("Null" in selected_language):
+            if ("Unknown" in selected_language):
                 # URLs with NULL language
                 subquery |= Q(urlcontent__language__isnull=True)
                 # URLs with no UrlContent record at all (similar to URLs with NULL language)
                 subquery |= Q(urlcontent__id_url__isnull=True)
             # Update query
             query &= (subquery)
+        if ("all" not in selected_valid_contents):
+            # Boolean array
+            bool_array = []
+            if ('True' in selected_valid_contents):
+                bool_array.append(True)
+            if ('False' in selected_valid_contents):
+                bool_array.append(False)
+            # URLs with selected valid_contents
+            subquery = Q(urlcontent__valid_content__in=bool_array)
+            if ("Unknown" in selected_valid_contents):
+                # URLs with NULL valid_content
+                subquery |= Q(urlcontent__valid_content__isnull=True)
+                # URLs with no UrlContent record at all (similar to URLs with NULL valid_content)
+                subquery |= Q(urlcontent__id_url__isnull=True)
+            # Update query
+            query &= (subquery)

         # Run query
         urls = Urls.objects.filter(query).distinct()  # .order_by('-ts_fetch')
         # print(urls.query)

         # Pagination
         paginator = Paginator(urls, per_page)  # Paginate the filtered URLs
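A compact illustration of how the Unknown option above folds NULL values and missing UrlContent rows into one OR-ed filter (Q objects compose without hitting the database):

```
from django.db.models import Q

selected_language = ["en", "Unknown"]
subquery = Q(urlcontent__language__in=selected_language)
if "Unknown" in selected_language:
    subquery |= Q(urlcontent__language__isnull=True)   # NULL language
    subquery |= Q(urlcontent__id_url__isnull=True)     # no UrlContent row at all
print(subquery)  # later applied via Urls.objects.filter(query).distinct()
```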
@@ -300,11 +322,13 @@ def filtered_urls(request):
         'searches': sorted(searches, key=lambda x: (x.type, x.search)),
         'sources': sorted(sources, key=lambda x: x.source),
         'languages': sorted(languages, key=lambda x: (x is None, x)),
+        'valid_contents': valid_contents,
         # Selection
         'selected_status': selected_status,
         'selected_search': selected_search,
         'selected_source': selected_source,
         'selected_language': selected_language,
+        'selected_valid_contents': selected_valid_contents,
         "selected_days": selected_days,
         # Map
         "sources_map": sources_map,
@@ -2,7 +2,48 @@ version: '3.9'
 services:

-  matitos_db:
+  fetcher_selenium:
+    build:
+      context: ./app_selenium
+    container_name: selenium_app
+    restart: unless-stopped
+    shm_size: 512mb
+    environment:
+      - SELENIUM_SLEEP_PER_PAGE=4
+      # Unquoted: in list-style environment entries, quotes become part of the value
+      - PATH_LOGS_PARAMETERIZATION=logs/log_app_selenium_{}.log
+    ports:
+      - 80
+
+  fetcher_urls_app:
+    build:
+      context: ./app_urls
+    container_name: urls_app
+    restart: unless-stopped
+    environment:
+      #- name=value
+      # Database
+      - DB_NAME=${DB_NAME:-matitos}
+      - DB_USER=${DB_USER:-supermatitos}
+      - DB_PASSWORD=${DB_PASSWORD:-supermatitos}
+      - DB_HOST=${DB_HOST:-localhost} # db_postgres
+      - DB_PORT=${DB_PORT:-5432}
+      - REDIS_HOST=${REDIS_HOST:-localhost}
+      - REDIS_PORT=${REDIS_PORT:-6379}
+      # Job timeout: 30 min
+      - JOB_DEFAULT_TIMEOUT=${RQ_DEFAULT_TIMEOUT:-1800}
+      # Logs path
+      - PATH_LOGS_PARAMETERIZATION=logs/log_app_fetcher_{}.log
+      # Fetcher
+      - FETCHER_GNEWS_DECODE_SLEEP=2
+      - FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=4
+      - FETCHER_BETWEEN_SEARCHES_SLEEP=5
+      - FETCHER_URL_HOST_SLEEP=5
+      # Selenium
+      - SELENIUM_ENDPOINT=http://selenium_app:80
+    ports:
+      - 80
+
+  fetcher_db:
     image: postgres:17
     container_name: db_postgres
     restart: unless-stopped
@@ -18,7 +59,7 @@ services:
     ports:
       - 5432:5432

-  matitos_redis:
+  fetcher_redis:
     image: redis:alpine
     container_name: db_redis
     restart: unless-stopped
@@ -27,7 +68,7 @@ services:
     #expose:
     #  - 6379

-  matitos_adminer:
+  fetcher_adminer:
     # http://localhost:8080/?pgsql=matitos_db&username=supermatitos&db=matitos&ns=public
     image: adminer
     container_name: adminer
@@ -41,7 +82,7 @@ services:
     ports:
       - 8080:8080

-  matitos_dozzle:
+  fetcher_dozzle:
     container_name: dozzle
     image: amir20/dozzle:latest
     volumes:
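With the wiring above, the urls app reaches the Selenium service by container name. A hypothetical smoke test from inside the urls_app container:

```
# Hypothetical connectivity check against the Selenium service
import requests

r = requests.get("http://selenium_app:80/get_missing_kids/", params={"pages": 1}, timeout=600)
print(r.status_code, len(r.json().get("list_urls", [])))
```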