Valid content filter, language detect on min chars, fetch missingkids.org
This commit is contained in:
42
app_urls/api/src/fetch_missing_kids.py
Normal file
42
app_urls/api/src/fetch_missing_kids.py
Normal file
@@ -0,0 +1,42 @@
|
||||
from .db_utils import DB_Handler
|
||||
from ..models import Search, Source
|
||||
import os
|
||||
import requests
|
||||
import json
|
||||
import traceback
|
||||
from .logger import get_logger
|
||||
logger = get_logger()
|
||||
|
||||
class FetchMissingKids():
    """Fetch missing-kids poster URLs from the Selenium scraping service and
    store them as raw URLs in the database."""

    def __init__(self) -> None:
        logger.debug("Initializing Fetcher MissingKids")

    def run(self, number_pages=-1):
        """Fetch up to ``number_pages`` result pages (-1 = all pages) from the
        missingkids.org scraper endpoint and insert the returned URLs.

        Best-effort: any failure is logged and the method returns normally;
        nothing is raised to the caller.
        """
        try:
            logger.debug("Starting MissingKids.run(), processing #{} pages".format(number_pages))

            # Get (or create) the Source / Search rows the fetched URLs attach to.
            obj_source, created = Source.objects.get_or_create(source="missingkids.org")
            obj_search, created = Search.objects.get_or_create(search="missingkids.org/poster", type=Search.TYPE_ENUM.URL_HOST)

            # Build the endpoint BEFORE the inner try so the except handler can
            # always reference it. Plain string joining replaces os.path.join,
            # which uses a platform-dependent separator and is wrong for URLs.
            missingkids_fetch_endpoint = "{}/{}".format(
                os.getenv("SELENIUM_ENDPOINT", "http://localhost:80").rstrip("/"),
                "get_missing_kids/?pages={}".format(number_pages))

            # Large (or unbounded, -1) crawls get a longer request timeout.
            if (number_pages > 15) or (number_pages == -1):
                timeout = 60*90  # 1.5h
            else:
                timeout = 60*10  # 10 min

            try:
                # Request + decode; the service returns {"list_urls": [...]}.
                r = requests.get(missingkids_fetch_endpoint, timeout=timeout)
                urls_fetched = json.loads(r.text).get("list_urls", [])
            except Exception as e:
                # Not necessarily a timeout (DNS, connection refused, bad JSON,
                # ...): log the actual error instead of mislabeling it.
                logger.warning("Request failed: {}. {}".format(missingkids_fetch_endpoint, str(e)))
                urls_fetched = []

            # Write to DB
            DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
        except Exception as e:
            logger.warning("Exception in MissingKids.run(): {}\n{}".format(e, traceback.format_exc()))
|
||||
@@ -1,4 +1,3 @@
|
||||
import traceback
|
||||
import os
|
||||
from django.core.cache import cache
|
||||
from .logger import get_logger
|
||||
@@ -30,7 +29,7 @@ def decode_gnews_urls(encoded_urls, interval=int(os.getenv("FETCHER_GNEWS_DECODE
|
||||
# Cache decoded URL
|
||||
cache.set("gnews_decode_{}".format(url), decoded_url, timeout=60*60*12)
|
||||
else:
|
||||
logger.info("Bad status while decoding news.google.com, URL {}".format(url))
|
||||
logger.info("Bad status while decoding news.google.com, URL {}\n{}".format(url, decoded_url_dict.get("message")))
|
||||
except Exception as e:
|
||||
logger.warning("Error decoding news.google.com, URL: {}".format(url))
|
||||
return list_decoded_urls
|
||||
@@ -69,6 +69,16 @@ def process_url(url):
|
||||
except Exception as e:
|
||||
logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
|
||||
return None
|
||||
|
||||
try:
|
||||
content_merged = "\n".join([article.title, article.meta_description, article.text])
|
||||
if (len(content_merged) > int(os.getenv("FETCHER_LANGUAGE_DETECTION_MIN_CHAR", 100))):
|
||||
language = langdetect.detect(content_merged)
|
||||
else:
|
||||
language = None
|
||||
except Exception as e:
|
||||
logger.info("Could not detect language: {}\n{}".format(url, str(e)))
|
||||
language = None
|
||||
|
||||
dict_data = {
|
||||
"url": url,
|
||||
@@ -76,8 +86,7 @@ def process_url(url):
|
||||
"url_host": article.source_url,
|
||||
"site_name": article.meta_site_name,
|
||||
"publish_date": article.publish_date,
|
||||
# article.meta_lang -> Not always reliable
|
||||
"language": langdetect.detect("\n".join([article.title, article.meta_description, article.text]) ),
|
||||
"language": language, # article.meta_lang -> Not always reliable
|
||||
"title": article.title,
|
||||
"description": article.meta_description,
|
||||
"content": article.text,
|
||||
|
||||
@@ -3,10 +3,8 @@ from scheduler import job
|
||||
from .src.fetch_feed import FetchFeeds
|
||||
from .src.fetch_parser import FetchParser
|
||||
from .src.fetch_search import FetchSearcher
|
||||
from .src.fetch_missing_kids import FetchMissingKids
|
||||
from .src.db_utils import DB_Handler
|
||||
'''
|
||||
from src.missing_kids_fetch import MissingKidsFetch
|
||||
'''
|
||||
|
||||
from .src.logger import get_logger
|
||||
logger = get_logger()
|
||||
@@ -32,7 +30,19 @@ def fetch_search():
|
||||
FetchSearcher().run()
|
||||
logger.info("Task completed: {}".format(task))
|
||||
|
||||
# TODO: fetch_missing_kids()
|
||||
@job('default')
|
||||
def fetch_missing_kids(number_pages=5):
|
||||
task = "Fetch MissingKids"
|
||||
logger.info("Task triggered: {}".format(task))
|
||||
FetchMissingKids().run(number_pages)
|
||||
logger.info("Task completed: {}".format(task))
|
||||
|
||||
@job('default')
|
||||
def fetch_missing_kids_all(number_pages=-1):
|
||||
task = "Fetch MissingKids"
|
||||
logger.info("Task triggered: {}".format(task))
|
||||
FetchMissingKids().run(number_pages)
|
||||
logger.info("Task completed: {}".format(task))
|
||||
|
||||
@job('default')
|
||||
def process_raw_urls(batch_size=50):
|
||||
@@ -77,8 +87,15 @@ def background_task(process_type: str):
|
||||
FetchParser().run()
|
||||
elif (process_type == "fetch_search"):
|
||||
FetchSearcher().run()
|
||||
#elif (process_type == "fetch_missingkids"):
|
||||
# FetchMissingKids().run()
|
||||
elif (process_type == "fetch_missingkids_all"):
|
||||
FetchMissingKids().run(number_pages=-1)
|
||||
elif ("fetch_missingkids" in process_type):
|
||||
# number_pages encoded in URL
|
||||
try:
|
||||
number_pages = int(process_type.split("_")[-1])
|
||||
except Exception as e:
|
||||
number_pages = -1
|
||||
FetchMissingKids().run(number_pages=number_pages)
|
||||
elif ("process_" in process_type):
|
||||
# Batch size encoded in URL
|
||||
try:
|
||||
@@ -95,14 +112,6 @@ def background_task(process_type: str):
|
||||
else:
|
||||
logger.info("Task unknown!: {}".format(process_type))
|
||||
|
||||
'''
|
||||
# Selenium based
|
||||
elif (process_type == "fetch_missing_kids_reduced"):
|
||||
MissingKidsFetch(db_handler, num_pages=4).run()
|
||||
elif (process_type == "fetch_missing_kids_full"):
|
||||
MissingKidsFetch(db_handler, num_pages=100000).run()
|
||||
'''
|
||||
|
||||
logger.info("Task completed: {}".format(process_type))
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
|
||||
@@ -258,7 +258,7 @@ input[type="checkbox"] {
|
||||
<span id="offText" class="off-text">OFF</span>
|
||||
</span>
|
||||
</div>
|
||||
-->
|
||||
-->
|
||||
|
||||
<!-- Pages Per Page Dropdown -->
|
||||
<h3>Pages Per Page</h3>
|
||||
@@ -297,6 +297,17 @@ input[type="checkbox"] {
|
||||
</label><br>
|
||||
{% endfor %}
|
||||
|
||||
<!-- Filter by valid content -->
|
||||
<h3>Valid content</h3>
|
||||
<button type="button" class="toggle-all-btn" data-toggle="valid_content">Toggle All</button><br>
|
||||
{% for vc in valid_contents %}
|
||||
<label>
|
||||
<input type="checkbox" name="valid_content" value="{{ vc }}"
|
||||
{% if vc|stringformat:"s" in selected_valid_contents or 'all' in selected_valid_contents%}checked{% endif %}>
|
||||
{{ vc|truncatechars:50 }}
|
||||
</label><br>
|
||||
{% endfor %}
|
||||
|
||||
<!-- Filter by Search -->
|
||||
<h3>Search</h3>
|
||||
<button type="button" class="toggle-all-btn" data-toggle="search">Toggle All</button><br>
|
||||
@@ -329,7 +340,7 @@ input[type="checkbox"] {
|
||||
{{ lang|truncatechars:50 }}
|
||||
</label><br>
|
||||
{% endfor %}
|
||||
|
||||
|
||||
</form>
|
||||
</div>
|
||||
|
||||
@@ -526,10 +537,6 @@ input[type="checkbox"] {
|
||||
const checkboxes = document.querySelectorAll(`[name='${section}']`);
|
||||
const allChecked = Array.from(checkboxes).every(checkbox => checkbox.checked);
|
||||
checkboxes.forEach(cb => cb.checked = !allChecked);
|
||||
/*
|
||||
// Automatically submit the form when a checkbox is toggled
|
||||
document.getElementById('filterForm').submit();
|
||||
*/
|
||||
updateFormParameter(section);
|
||||
}
|
||||
|
||||
@@ -545,9 +552,6 @@ input[type="checkbox"] {
|
||||
// Automatically submit the form when any checkbox changes
|
||||
document.querySelectorAll('input[type="checkbox"]').forEach(function(checkbox) {
|
||||
checkbox.addEventListener('change', function() {
|
||||
/*
|
||||
document.getElementById('filterForm').submit();
|
||||
*/
|
||||
updateFormParameter(this.name);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -15,7 +15,7 @@ def trigger_task(request, task):
|
||||
####################################################################################################
|
||||
def link_list(request):
|
||||
prefix = "http://localhost:8000/task"
|
||||
links = ["fetch_feeds", "fetch_parser", "fetch_search", "process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"]
|
||||
links = ["fetch_feeds", "fetch_parser", "fetch_search", "fetch_missingkids_5", "fetch_missingkids_all", "process_raw_urls_50", "process_error_urls_50", "process_missing_kids_urls_50", "process_missing_kids_urls_all"]
|
||||
|
||||
list_links = [
|
||||
# DB
|
||||
@@ -212,21 +212,26 @@ def filtered_urls(request):
|
||||
# TODO: Cache languages, update once every N
|
||||
languages = list(UrlContent.objects.distinct('language').values_list('language', flat=True))
|
||||
# Null for visualization
|
||||
languages = ["Null"] + [l for l in languages if l is not None]
|
||||
languages = ["Unknown"] + [l for l in languages if l is not None]
|
||||
valid_contents = ["True", "False", "Unknown"]
|
||||
|
||||
# Get selected parameters
|
||||
selected_status = request.GET.getlist('status', ["null"])
|
||||
selected_search = request.GET.getlist('search', ["null"])
|
||||
selected_source = request.GET.getlist('source', ["null"])
|
||||
selected_language = request.GET.getlist('language', ["null"])
|
||||
selected_valid_contents = request.GET.getlist('valid_content', ["null"])
|
||||
selected_days = request.GET.get("days", 30)
|
||||
per_page = request.GET.get('per_page', 100) # Default is X URLs per page
|
||||
page_number = request.GET.get('page') # Get the current page number
|
||||
|
||||
|
||||
all_status = [str(status[0]) for status in statuses]
|
||||
all_search = [str(search.id) for search in searches]
|
||||
all_source = [str(source.id) for source in sources]
|
||||
all_languages = languages
|
||||
all_valid_contents = valid_contents
|
||||
|
||||
|
||||
# Override with default filters? [Case: no params update on URL] -> Only on "Home" click, or "Next page"
|
||||
if (len(request.GET.keys()) == 0) or ((len(request.GET.keys()) == 1) and ("page" in request.GET.keys())):
|
||||
@@ -234,23 +239,22 @@ def filtered_urls(request):
|
||||
selected_search = ["all"]
|
||||
selected_source = ["all"]
|
||||
selected_language = ["all"]
|
||||
|
||||
|
||||
# print(set(selected_status), set(all_status))
|
||||
"""
|
||||
# List of TODO remove...
|
||||
if (set(selected_status) == set(all_status)):
|
||||
selected_status = ["all"]
|
||||
if (set(selected_search) == set(all_search)):
|
||||
selected_search = ["all"]
|
||||
if (set(selected_source) == set(all_source)):
|
||||
selected_source = ["all"]
|
||||
if (set(selected_language) == set(languages)):
|
||||
selected_language = ["all"]"
|
||||
"""
|
||||
selected_valid_contents = ["all"]
|
||||
else:
|
||||
# Non-default parameters: if the list contains all elements, replace it with "all" to avoid a heavy query
|
||||
if (set(selected_status) == set(all_status)):
|
||||
selected_status = ["all"]
|
||||
if (set(selected_search) == set(all_search)):
|
||||
selected_search = ["all"]
|
||||
if (set(selected_source) == set(all_source)):
|
||||
selected_source = ["all"]
|
||||
if (set(selected_language) == set(all_languages)):
|
||||
selected_language = ["all"]
|
||||
if (set(selected_valid_contents) == set(all_valid_contents)):
|
||||
selected_valid_contents = ["all"]
|
||||
|
||||
# Filter URLs based on selected filters
|
||||
if ('null' in selected_status) or ('null' in selected_search) or ('null' in selected_source) or ('null' in selected_language):
|
||||
if ('null' in selected_status) or ('null' in selected_search) or ('null' in selected_source) or ('null' in selected_language) or ('null' in selected_valid_contents):
|
||||
urls = []
|
||||
else:
|
||||
# Filter by date
|
||||
@@ -262,18 +266,36 @@ def filtered_urls(request):
|
||||
query &= Q(urlssourcesearch__id_source__in=selected_source)
|
||||
if ("all" not in selected_search):
|
||||
query &= Q(urlssourcesearch__id_search__in=selected_search)
|
||||
if ("all" not in selected_language):
|
||||
if ("all" not in selected_language):
|
||||
# URLs with selected languages
|
||||
subquery = Q(urlcontent__language__in=selected_language)
|
||||
if ("Null" in selected_language):
|
||||
if ("Unknown" in selected_language):
|
||||
# URLs with NULL language
|
||||
subquery |= Q(urlcontent__language__isnull=True)
|
||||
# URLs with no UrlContent record at all (similar to URLs with NULL language)
|
||||
subquery |= Q(urlcontent__id_url__isnull=True)
|
||||
# Update query
|
||||
query &= (subquery)
|
||||
if ("all" not in selected_valid_contents):
|
||||
# Boolean array
|
||||
bool_array = []
|
||||
if ('True' in selected_valid_contents):
|
||||
bool_array.append(True)
|
||||
if ('False' in selected_valid_contents):
|
||||
bool_array.append(False)
|
||||
# URLs with selected valid_contents
|
||||
subquery = Q(urlcontent__valid_content__in=bool_array)
|
||||
if ("Unknown" in selected_valid_contents):
|
||||
# URLs with NULL valid_content
|
||||
subquery |= Q(urlcontent__valid_content__isnull=True)
|
||||
# URLs with no UrlContent record at all (similar to URLs with NULL valid_content)
|
||||
subquery |= Q(urlcontent__id_url__isnull=True)
|
||||
# Update query
|
||||
query &= (subquery)
|
||||
|
||||
# Run query
|
||||
urls = Urls.objects.filter(query).distinct() # .order_by('-ts_fetch')
|
||||
# print(urls.query)
|
||||
|
||||
# Pagination
|
||||
paginator = Paginator(urls, per_page) # Paginate the filtered URLs
|
||||
@@ -300,11 +322,13 @@ def filtered_urls(request):
|
||||
'searches': sorted(searches, key=lambda x: (x.type, x.search)),
|
||||
'sources': sorted(sources, key=lambda x: x.source),
|
||||
'languages': sorted(languages, key=lambda x: (x is None, x)),
|
||||
'valid_contents': valid_contents,
|
||||
# Selection
|
||||
'selected_status': selected_status,
|
||||
'selected_search': selected_search,
|
||||
'selected_source': selected_source,
|
||||
'selected_language': selected_language,
|
||||
'selected_valid_contents': selected_valid_contents,
|
||||
"selected_days": selected_days,
|
||||
# Map
|
||||
"sources_map": sources_map,
|
||||
|
||||
Reference in New Issue
Block a user