Dockerization, WhiteNoise static file serving, refactoring
app_urls/fetcher/src/db_utils.py (new file)
@@ -0,0 +1,273 @@
from ..models import Urls, UrlContent, UrlsSourceSearch, UrlsDuplicate, StatusPatternMatching, Source, Search
from django.db.models import Q
from django.core.cache import cache
from django.db import IntegrityError
from .url_processor import process_url, get_with_protocol
import re
import traceback
from .logger import get_logger
logger = get_logger()


class DB_Handler():
    def __init__(self):
        # Inserting raw URL, cache time: 1 day
        self._cache_timeout_insert_url = 86400
        # Processing error URL, cache time: 2 days
        self._cache_timeout_error_url = 86400*2

    def insert_raw_urls(self, urls, obj_source, obj_search):
        try:
            logger.debug("Inserting raw URLs")
            # Empty?
            if (len(urls) == 0):
                logger.debug("Empty batch of urls (not writing to DB) for source-search: {} - {}".format(obj_source.source, obj_search.search))
                return
            # Default protocol https://
            urls_clean = [get_with_protocol(url) for url in urls]

            urls_to_insert = []
            # Per URL
            for url in urls_clean:

                ### Already processed URL?
                if (cache.get("insert_{}".format(url)) is not None):
                    logger.debug("Already cached URL: {}".format(url))

                    if (cache.get("insert_{}{}{}".format(url, obj_source.source, obj_search.search)) is not None):
                        logger.debug("Already cached (URL, source, search): {} {} {}".format(url, obj_source.source, obj_search.search))
                    else:
                        ### Insert (URL_id, source_id, search_id), since not cached
                        # Get URL ID (should already be created)
                        obj_url, created = Urls.objects.get_or_create(url=url)
                        # Create (id_source, id_url) (shouldn't exist)
                        UrlsSourceSearch.objects.get_or_create(id_url=obj_url, id_source=obj_source, id_search=obj_search)
                else:
                    # Add object to insert
                    # url_object_to_insert.append(Urls(url=url))
                    urls_to_insert.append(url)

            ### Insert URLs & (URL_id, source_id)
            try:
                ### Bulk insert, fails if a URL is duplicated (IDs are not returned when using ignore_conflicts=True)
                # URLs (ignore_conflicts=False to return IDs)
                bulk_created_urls = Urls.objects.bulk_create([Urls(url=url) for url in urls_to_insert], ignore_conflicts=False)
                # (URL_id, source_id)
                UrlsSourceSearch.objects.bulk_create([UrlsSourceSearch(id_url=obj_url, id_source=obj_source, id_search=obj_search) for obj_url in bulk_created_urls], ignore_conflicts=True)
            except IntegrityError as e:
                ### Fallback to one-by-one insert
                logger.debug("bulk_create exception while inserting raw URLs (fails if duplicated URL), falling back to non-bulk method")
                # One by one
                for url in urls_to_insert:
                    # URL
                    obj_url, created = Urls.objects.get_or_create(url=url)
                    if (created):
                        logger.debug("Inserted: {}".format(obj_url.url))
                    else:
                        logger.debug("Not inserted: {}".format(obj_url.url))
                    # (URL, source, search)
                    UrlsSourceSearch.objects.get_or_create(id_url=obj_url, id_source=obj_source, id_search=obj_search)
            except Exception as e:
                logger.warning("bulk_create unknown exception while inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
                # Avoid caching due to error on insertion
                urls_clean = []

            # Insert or update cache
            for url in urls_clean:
                cache.set("insert_{}".format(url), True, timeout=self._cache_timeout_insert_url)
                cache.set("insert_{}{}{}".format(url, obj_source.source, obj_search.search), True, timeout=self._cache_timeout_insert_url)

            logger.info("Inserted #{} raw URLs, Source-Search {} - {}".format(len(urls_to_insert), obj_source.source, obj_search.search))

        except Exception as e:
            logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))

    def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error):

        def set_status(obj_url, status):
            # Update status if setting a new value
            if (obj_url.status != status):
                obj_url.status = status
                obj_url.save()

        ##### Filter URL? -> Invalid
        if (status_pattern_match == "invalid"):
            logger.debug("Domain filter applied to input URL: {}".format(obj_url.url))
            # Update status
            set_status(obj_url, Urls.STATUS_ENUM.INVALID)
            # Next URL
            return

        ##### Process URL
        try:
            # Get data
            dict_url_data = process_url(obj_url.url)
        except Exception as e:
            if (raise_exception_on_error):
                # Simply raise exception, handled in a different way
                raise Exception("Error processing URL, raising exception as expected")
            else:
                logger.debug("Error processing URL: {}\n{}\n{}".format(obj_url.url, str(e), traceback.format_exc()))
                # Set status to error
                dict_url_data = None

        # (dict_url_data is None) or (exception while processing URL)? -> Error status
        if (dict_url_data is None):
            # Update status
            set_status(obj_url, Urls.STATUS_ENUM.ERROR)
            # Next URL
            return

        # Invalid? e.g. binary data
        if (dict_url_data.get("override_status") == "invalid"):
            # Update status
            set_status(obj_url, Urls.STATUS_ENUM.INVALID)
            # Next URL
            return

        ##### Canonical URL different? -> Duplicate
        if (dict_url_data.get("url_canonical") is not None) and (dict_url_data.get("url") != dict_url_data.get("url_canonical")):
            # Update status
            set_status(obj_url, Urls.STATUS_ENUM.DUPLICATE)

            # Get or create URL with canonical form
            obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
            # Get the source-search IDs associated to obj_url.id
            list_url_source_search = UrlsSourceSearch.objects.filter(id_url=obj_url)
            for obj_url_source_search in list_url_source_search:
                # Associate same sources to url_canonical (it might already exist)
                UrlsSourceSearch.objects.get_or_create(id_url=obj_url_canonical, id_source=obj_url_source_search.id_source, id_search=obj_url_source_search.id_search)

            # URLs duplicate association
            UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url)

            # TODO: return obj_url_canonical so as to directly process the recently inserted URL
            # Wherever this function is called, add:
            # self._process_single_url(obj_url_canonical, status_pattern_match, raise_exception_on_error)

            # Next URL
            return

        ##### Valid URL
        # Update status
        set_status(obj_url, Urls.STATUS_ENUM.VALID)

        # Create or update extracted URL data
        UrlContent.objects.update_or_create(
            id_url=obj_url,
            defaults={
                "date_published": dict_url_data.get("publish_date"),
                "title": dict_url_data.get("title"),
                "description": dict_url_data.get("description"),
                "content": dict_url_data.get("content"),
                "valid_content": dict_url_data.get("valid_content"),
                "language": dict_url_data.get("language"),
                "keywords": dict_url_data.get("keywords"),
                "tags": dict_url_data.get("tags"),
                "authors": dict_url_data.get("authors"),
                "image_main_url": dict_url_data.get("image_main_url"),
                "images_url": dict_url_data.get("images_url"),
                "videos_url": dict_url_data.get("videos_url"),
                "url_host": dict_url_data.get("url_host"),
                "site_name": dict_url_data.get("site_name"),
            }
        )

    def process_raw_urls(self, batch_size):

        def _get_status_pattern_matching(url, list_pattern_status_tuple):
            """ Be careful: Regex pattern should update status on "valid", "invalid", and "unknown" status only
            """
            # Sort pattern tuples by priority. (pattern, priority, status)
            for regex_pattern, regex_priority, status_if_match in sorted(list_pattern_status_tuple, key=lambda tup: tup[1], reverse=True):
                # Regular expression pattern matching: https://regexr.com/
                if bool(re.match(regex_pattern, url)):
                    logger.debug("Regex pattern found, status '{}' for URL: {}".format(status_if_match, url))
                    return status_if_match
            return None

        try:
            logger.debug("Processing raw URLs")

            # Get batch of URLs, status='raw'
            raw_urls = Urls.objects.order_by("-ts_fetch").filter(status=Urls.STATUS_ENUM.RAW)[:batch_size]

            if (len(raw_urls) == 0):
                logger.debug("No raw URLs to process")
                return

            # Get list of (pattern, priority, status) tuples to override status if required
            list_pattern_status_tuple = list(StatusPatternMatching.objects.values_list("pattern", "priority", "status"))

            # Per URL
            for obj_url in raw_urls:
                # Override status if pattern matching?
                status_pattern_match = _get_status_pattern_matching(obj_url.url, list_pattern_status_tuple)
                # Process URL
                self._process_single_url(obj_url, status_pattern_match, raise_exception_on_error=False)

            logger.info("Updated #{} raw URLs".format(len(raw_urls)))
        except Exception as e:
            logger.warning("Exception processing raw URLs: {}\n{}".format(e, traceback.format_exc()))

    def process_error_urls(self, batch_size):
        try:
            logger.debug("Processing error URLs")

            # Keep track of processed and skipped "error" URLs
            num_urls_skipped, num_urls_processed = 0, 0
            # Get batch of URLs, status='error'
            error_urls = Urls.objects.order_by("-ts_fetch").filter(status=Urls.STATUS_ENUM.ERROR)[num_urls_skipped:batch_size+num_urls_skipped]

            while ((len(error_urls) > 0) and (num_urls_processed < batch_size)):
                # Per URL
                for obj_url in error_urls:
                    # URL ID cached? -> Tried to process recently already, skip
                    if (cache.get("error_{}".format(obj_url.id)) is not None):
                        logger.debug("Already cached URL ID: {}".format(obj_url.id))
                        num_urls_skipped += 1
                        continue

                    try:
                        # Process URL
                        self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True)
                        num_urls_processed += 1
                    except Exception as e:
                        # Error, cache to avoid re-processing for some time
                        cache.set("error_{}".format(obj_url.id), True, timeout=self._cache_timeout_error_url)
                        num_urls_skipped += 1

                # Get following batch of URLs, status='error'
                error_urls = Urls.objects.order_by("-ts_fetch").filter(status=Urls.STATUS_ENUM.ERROR)[num_urls_skipped:batch_size+num_urls_skipped]

            logger.info("Updated #{}, skipped #{} error URLs".format(num_urls_processed, num_urls_skipped))
        except Exception as e:
            logger.warning("Exception processing error URLs: {}\n{}".format(e, traceback.format_exc()))

    def process_missing_kids_urls(self, batch_size=None):
        try:
            logger.debug("Processing MissingKids URLs - batch_size={}".format(batch_size))
            # Get batch of URLs, %missingkids.org/poster% AND (status='valid' OR status='invalid' OR status='error')
            missingkids_urls = Urls.objects.order_by("-ts_fetch").filter(
                (Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster"))
                &
                (Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.ERROR))
            )

            # Get batch size
            if (batch_size is not None):
                missingkids_urls = missingkids_urls[:batch_size]

            # Per URL
            for obj_url in missingkids_urls:
                try:
                    # Process URL. If no exception -> Valid
                    self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True)
                except Exception as e:
                    # Raised exception -> Invalid (404 error)
                    obj_url.status = Urls.STATUS_ENUM.INVALID
                    obj_url.save()

            logger.info("Verified status of #{} missingkids.org/poster URLs".format(len(missingkids_urls)))
        except Exception as e:
            logger.warning("Exception processing MissingKids URLs: {}\n{}".format(e, traceback.format_exc()))
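For context, a minimal sketch of how DB_Handler could be driven from a scheduler via a Django management command. The command name, file path, import path (derived from the app_urls/fetcher layout), and batch sizes are illustrative assumptions, not part of this commit.

# fetcher/management/commands/process_urls.py (hypothetical)
from django.core.management.base import BaseCommand

from fetcher.src.db_utils import DB_Handler  # import path assumed from the directory layout


class Command(BaseCommand):
    help = "Process fetched URLs: raw first, then retry errored ones"

    def handle(self, *args, **options):
        handler = DB_Handler()
        # Classify newly fetched URLs (valid / invalid / duplicate / error)
        handler.process_raw_urls(batch_size=100)
        # Retry URLs that previously failed, skipping recently cached failures
        handler.process_error_urls(batch_size=50)
        # Re-verify MissingKids poster URLs (posters that now 404 become invalid)
        handler.process_missing_kids_urls(batch_size=20)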
app_urls/fetcher/src/fetch_feed.py (new file)
@@ -0,0 +1,51 @@
from .db_utils import DB_Handler
from ..models import Search, Source
import feedparser
import dateutil.parser
import traceback
from .logger import get_logger
logger = get_logger()


class FetchFeeds():
    def __init__(self) -> None:
        logger.debug("Initializing Fetcher Feeds")

    def run(self):
        try:
            logger.debug("Starting FetchFeeds.run()")

            # Get source object
            obj_source, created = Source.objects.get_or_create(source="feeds")

            # Get feeds objects
            list_obj_search_feeds = Search.objects.filter(type=Search.TYPE_ENUM.RSS_FEED)
            logger.debug("Fetching from feeds: {}".format([e.search for e in list_obj_search_feeds]))

            # Process via RSS feeds
            for obj_search in list_obj_search_feeds:
                # Initialize
                urls_fetched, urls_publish_date = [], []
                # Fetch feeds
                feeds = feedparser.parse(obj_search.search)
                # Parse
                for f in feeds.get("entries", []):
                    # Get URL
                    url = f.get("link", None)
                    # Process?
                    if (url is not None):
                        # Available publish date?
                        publish_date_parsed = f.get("published_parsed")
                        if (publish_date_parsed is None):
                            publish_date = f.get("published", None)
                            if (publish_date is not None):
                                publish_date_parsed = dateutil.parser.parse(publish_date)

                        # Published date
                        urls_publish_date.append(publish_date_parsed)
                        # URL
                        urls_fetched.append(url)

                # Write to DB
                DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
        except Exception as e:
            logger.warning("Exception in FetchFeeds.run(): {}\n{}".format(e, traceback.format_exc()))
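A small sketch of registering a feed so FetchFeeds picks it up. The Search fields and TYPE_ENUM.RSS_FEED come from this commit's models; the import paths and the concrete feed URL are assumptions for illustration.

# Run from `python manage.py shell`
from fetcher.models import Search                 # import path assumed
from fetcher.src.fetch_feed import FetchFeeds     # import path assumed

# Register an RSS feed (the URL is only an example)
Search.objects.get_or_create(
    search="https://example.com/news/rss.xml",
    type=Search.TYPE_ENUM.RSS_FEED,
)

# Fetch every registered feed and insert the discovered links as raw URLs
FetchFeeds().run()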
app_urls/fetcher/src/fetch_missing_kids.py (new file)
@@ -0,0 +1,42 @@
from .db_utils import DB_Handler
from ..models import Search, Source
import os
import requests
import json
import traceback
from .logger import get_logger
logger = get_logger()


class FetchMissingKids():
    def __init__(self) -> None:
        logger.debug("Initializing Fetcher MissingKids")

    def run(self, number_pages=-1):
        try:
            logger.debug("Starting MissingKids.run(), processing #{} pages".format(number_pages))

            # Get source object
            obj_source, created = Source.objects.get_or_create(source="missingkids.org")
            # Get search object
            obj_search, created = Search.objects.get_or_create(search="missingkids.org/poster", type=Search.TYPE_ENUM.URL_HOST)

            try:
                # MissingKids fetching endpoint; parameter: number of pages to fetch
                missingkids_fetch_endpoint = os.path.join(os.getenv("SELENIUM_ENDPOINT", "http://localhost:80"), "get_missing_kids/?pages={}".format(number_pages))
                # Timeout
                if (number_pages > 15) or (number_pages == -1):
                    timeout = 60*90  # 1.5h
                else:
                    timeout = 60*10  # 10 min
                # Request
                r = requests.get(missingkids_fetch_endpoint, timeout=timeout)
                # Decode
                urls_fetched = json.loads(r.text).get("list_urls", [])
            except Exception as e:
                logger.warning("Request failed (e.g. timeout): {}. {}".format(missingkids_fetch_endpoint, str(e)))
                urls_fetched = []

            # Write to DB
            DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
        except Exception as e:
            logger.warning("Exception in MissingKids.run(): {}\n{}".format(e, traceback.format_exc()))
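The Selenium service's response is only read through json.loads(r.text).get("list_urls", []), so the expected contract looks roughly like the sketch below; the host, page count, and the example entries are illustrative, not defined by this commit.

import requests

resp = requests.get("http://localhost:80/get_missing_kids/?pages=2", timeout=600)
resp.json()
# Expected shape: {"list_urls": ["https://www.missingkids.org/poster/...", ...]}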
app_urls/fetcher/src/fetch_parser.py (new file)
@@ -0,0 +1,46 @@
from .db_utils import DB_Handler
from ..models import Search, Source
from .url_processor import get_with_protocol, url_host_slowdown
import newspaper
import traceback
from .logger import get_logger
logger = get_logger()


class FetchParser():
    def __init__(self) -> None:
        logger.debug("Initializing Fetcher Parser")

    def run(self):
        try:
            logger.debug("Starting FetchParser.run()")

            # Get source object
            obj_source, created = Source.objects.get_or_create(source="newspaper4k")
            # Get URL hosts
            list_url_host = Search.objects.filter(type=Search.TYPE_ENUM.URL_HOST)
            logger.debug("Fetching news by parsing URL hosts: {}".format([e.search for e in list_url_host]))

            # Process newspaper4k build method
            for obj_search in list_url_host:
                # Protocol
                url_host_protocol = get_with_protocol(obj_search.search)
                logger.debug("Fetching newspaper4k parsing based on URL: {}".format(url_host_protocol))

                # Make sure no requests were made to this host in the last X seconds
                url_host_slowdown(url_host_protocol, url_host_slowdown_seconds=5)
                try:
                    # Source object
                    url_host_built = newspaper.build(url_host_protocol)
                    # Get articles URL list
                    urls_fetched = url_host_built.article_urls()
                except newspaper.exceptions.ArticleException as e:
                    logger.debug("ArticleException while parsing input URL {}\n{}".format(url_host_protocol, str(e.args)))
                    urls_fetched = []
                except Exception as e:
                    logger.warning("Exception while parsing input URL {}\n{}".format(url_host_protocol, str(e)))
                    urls_fetched = []

                # Write to DB
                DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
        except Exception as e:
            logger.warning("Exception in FetchParser.run(): {}\n{}".format(e, traceback.format_exc()))
app_urls/fetcher/src/fetch_search.py (new file)
@@ -0,0 +1,57 @@
from .db_utils import DB_Handler
from ..models import Search
from django.db.models import Q
import traceback
import time
import os
from .fetch_search_instances import ListSearchInstances
from .logger import get_logger
logger = get_logger()


class FetchSearcher():
    def __init__(self) -> None:
        logger.debug("Initializing Fetcher Searcher")

    def run(self):
        try:
            logger.debug("Starting FetchSearcher.run()")

            # Get search objects of interest
            list_search_obj = Search.objects.filter(Q(type=Search.TYPE_ENUM.URL_HOST) | Q(type=Search.TYPE_ENUM.KEYWORD_SEARCH))
            logger.debug("Fetching from search: {}".format(["{} ({})".format(e.search, e.type) for e in list_search_obj]))

            # Search
            for obj_search in list_search_obj:
                # TODO: language & country customization

                # Search
                keyword_search = "{}{}".format("site:" if obj_search.type == Search.TYPE_ENUM.URL_HOST else "", obj_search.search)

                if (obj_search.type == Search.TYPE_ENUM.KEYWORD_SEARCH):
                    # Add search with intitle keyword
                    # TODO: allintitle: "child abuse"
                    # TODO: intitle: "child abuse"
                    pass
                    # language, country = obj_search.language_country.split("-")

                logger.debug("Starting keyword search: {}".format(keyword_search))
                logger.debug("Search type: {}".format(obj_search.type))

                # DB writer
                db_writer = DB_Handler()

                # Keyword arguments
                args = {
                    "language": "en",
                    "country": "US",
                    # "period": ["7d", "1d"],  # TODO: List of periods to iterate
                }

                for SearchInstance in ListSearchInstances:
                    # Sleep between requests, avoid too many requests...
                    time.sleep(int(os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", 5)))
                    SearchInstance(args).fetch_articles(db_writer, obj_search)

                # TODO: https://github.com/tasos-py/Search-Engines-Scraper/tree/master
        except Exception as e:
            logger.warning("Exception in FetchSearcher.run(): {}\n{}".format(e, traceback.format_exc()))
app_urls/fetcher/src/fetch_search_instances.py (new file)
@@ -0,0 +1,308 @@
import time
import feedparser
import os
from django.utils import timezone
from datetime import timedelta
from ..models import Search, Source
from .fetch_utils import decode_gnews_urls
from .logger import get_logger
logger = get_logger()

from gnews import GNews
from duckduckgo_search import DDGS
from GoogleNews import GoogleNews
from search_engines import Yahoo, Aol

###########################################################################
###########################################################################
from abc import ABC, abstractmethod


# Generic fetcher (fetches articles, writes to DB)
class FetcherAbstract(ABC):
    @abstractmethod
    def _fetch_raw_urls(self):
        pass

    @abstractmethod
    def _get_name(self):
        pass

    def _get_source_object(self, source):
        # TODO: Cache
        # self.cached_sources = {}
        # Get source object
        obj_source, created = Source.objects.get_or_create(source=source)
        return obj_source

    def _post_process_urls(self, raw_urls, obj_search):
        # Searching URL-host based? Make sure results belong to that site
        if (obj_search.type == Search.TYPE_ENUM.URL_HOST):
            # Get clean URL host
            url_host_clean = obj_search.search.replace("www.", "").replace("http://", "").replace("https://", "")
            # Ensure URL host in URL
            raw_urls = [u for u in raw_urls if url_host_clean in u]

        return raw_urls

    def fetch_articles(self, db_writer, obj_search):
        # Source name
        source_name = self._get_name()

        # Search
        keyword_search = obj_search.search
        # URL Host search? -> site:${URL_HOST}
        if (obj_search.type == Search.TYPE_ENUM.URL_HOST):
            keyword_search = "{}{}".format("site:", keyword_search)
        # Keyword search & using a general search engine? -> ${SEARCH} news after:${LAST_WEEK}
        if ("general" in source_name) and (obj_search.type == Search.TYPE_ENUM.KEYWORD_SEARCH):
            start_date = timezone.now() - timedelta(days=7)
            keyword_search = "{} {}".format(keyword_search, "news after:{}-{}-{}".format(start_date.month, start_date.day, start_date.year))

        logger.debug("Starting search: {} - {}".format(keyword_search, source_name))
        # Fetch
        raw_urls = self._fetch_raw_urls(keyword_search)
        # Post-process
        raw_urls = self._post_process_urls(raw_urls, obj_search)

        # Write to DB
        db_writer.insert_raw_urls(raw_urls, self._get_source_object(source_name), obj_search)


###########################################################################

class SearchGNews(FetcherAbstract):
    def __init__(self, args={}):
        super().__init__()
        # Parameters
        self.language = args.get("language", "en")
        self.country = args.get("country", "US")
        self.period = args.get("period", "7d")
        self.max_results = args.get("max_results", 100)

    def _get_name(self):
        # [source] [period] [language-country] [max_results]
        return "gnews {} {}-{} results={}".format(self.period, self.language, self.country, self.max_results).replace("results=None", "").strip()

    def _fetch_raw_urls(self, keyword_search):
        try:
            # Get news
            results_gnews = GNews(language=self.language, country=self.country, period=self.period, max_results=self.max_results).get_news(keyword_search)
            # Get list of encoded urls
            encoded_urls = [e.get("url") for e in results_gnews]
            # Decode
            urls = decode_gnews_urls(encoded_urls)
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []
        return urls


class SearchDuckDuckGoGeneral(FetcherAbstract):
    def __init__(self, args={}):
        super().__init__()
        # Parameters
        self.language = args.get("language", "wt")
        self.country = args.get("country", "wt")
        self.max_results = args.get("max_results", 20)
        self.region = "{}-{}".format(self.language, self.country).lower()
        self.period = None

    def _get_name(self):
        # [source] [language-country] [max_results]
        return "ddg-general {} results={}".format(self.region, self.max_results).replace("results=None", "").strip()

    def _fetch_raw_urls(self, keyword_search):
        try:
            news = DDGS().text(keyword_search, region=self.region, timelimit=self.period, max_results=self.max_results)
            urls = [e.get("href") for e in news]
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []
        return urls


class SearchDuckDuckGoNews(FetcherAbstract):
    def __init__(self, args={}):
        super().__init__()
        # Parameters
        self.language = args.get("language", "wt")
        self.country = args.get("country", "wt")
        self.max_results = args.get("max_results", 100)
        self.region = "{}-{}".format(self.language, self.country).lower()
        self.period = None

    def _get_name(self):
        # [source] [language-country] [max_results]
        return "ddg-news {} results={}".format(self.region, self.max_results).replace("results=None", "").strip()

    def _fetch_raw_urls(self, keyword_search):
        try:
            news = DDGS().news(keyword_search, region=self.region, timelimit=self.period, max_results=self.max_results)
            urls = [e.get("url") for e in news]
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []
        return urls


class SearchGoogleNews(FetcherAbstract):
    def __init__(self, args={}):
        super().__init__()
        # Parameters
        self.language = args.get("language", "en")
        self.country = args.get("country", "US")
        self.period = args.get("period", "7d")

    def _get_name(self):
        # [source] [period] [language-country]
        return "googlenews {} {}-{}".format(self.period, self.language, self.country)

    def _fetch_raw_urls(self, keyword_search):
        try:
            # Initialize
            googlenews = GoogleNews(period=self.period, lang=self.language, region=self.country)
            googlenews.enableException(True)
            # Search
            googlenews.get_news(keyword_search)
            # Fetch
            encoded_urls = googlenews.get_links()
            # Decode
            urls = decode_gnews_urls(encoded_urls)
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []
        return urls


class SearchGoogleGeneral(FetcherAbstract):
    def __init__(self, args={}):
        super().__init__()
        # Parameters
        self.language = args.get("language", "en")
        self.country = args.get("country", "US")
        self.period = args.get("period", "7d")
        self.pages = args.get("pages", 1)

    def _get_name(self):
        # [source] [period] [language-country] [pages]
        return "google-general {} {}-{} pages={}".format(self.period, self.language, self.country, self.pages).replace("pages=None", "").strip()

    def _fetch_raw_urls(self, keyword_search):
        try:
            # Initialize
            googlenews = GoogleNews(period=self.period, lang=self.language, region=self.country)
            googlenews.enableException(True)
            # Search
            googlenews.search(keyword_search)

            set_links = set()
            # Iterate pages
            for i in range(self.pages):
                # Sleep between page fetches
                time.sleep(int(os.getenv("FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP", 4)))
                # Number of URLs fetched so far
                num_before = len(set_links)
                # Get page
                try:
                    links = googlenews.page_at(i+1)
                except Exception as e:
                    logger.warning("Exception fetching page - {}: {}".format(self._get_name(), str(e)))
                    break
                # Links
                for l in links:
                    # 'link': 'https://uk.news.yahoo.com/leaving-neverland-2-michael-jackson-lawyer-channel-4-102017088.html&ved=2ahUKEwjl38eJm5aMAxVvqJUCHXgnGzwQxfQBegQICRAC&usg=AOvVaw1osa6b3o_xXfcNinMDpLoK'
                    set_links.add(l.get("link").split("&ved=")[0])
                # Finished?
                if (num_before == len(set_links)):
                    break
            # To list
            urls = list(set_links)
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []
        return urls


class SearchGoogleNewsRSS(FetcherAbstract):
    def __init__(self, args={}):
        super().__init__()
        # Parameters
        self.language = args.get("language", "en")
        self.country = args.get("country", "US")

    def _get_name(self):
        # [source] [language-country]
        return "googlenews-rss {}-{}".format(self.language, self.country).strip()

    def _fetch_raw_urls(self, keyword_search):
        try:
            # Search URL with parameters filled: https://news.google.com/rss/search?q={}&hl=en-US&gl=US&ceid=US:en
            search_url = "https://news.google.com/rss/search?q={}&hl={}&gl={}&ceid={}:{}".format(keyword_search, "{}-{}".format(self.language, self.country.upper()), self.country.upper(), self.country.upper(), self.language)
            # Control characters
            search_url = search_url.replace(" ", "+")  # urllib.parse.quote(search_url) # Issue: https%3A//news.google.com/rss/search%3Fq%3Dbreitbart.com%26hl%3Den-US%26gl%3DUS%26ceid%3DUS%3Aen
            # Initialize
            encoded_urls = []
            # Fetch feeds
            feeds = feedparser.parse(search_url)
            # Parse
            for f in feeds.get("entries", []):
                # Encoded URL
                encoded_url = f.get("link", None)
                '''
                # Available publish date?
                publish_date_parsed = f.get("published_parsed")
                if (publish_date_parsed is None):
                    publish_date = f.get("published", None)
                    if (publish_date is not None):
                        publish_date_parsed = dateutil.parser.parse(publish_date)

                # Published date
                urls_publish_date.append(publish_date_parsed)
                '''
                # Append
                encoded_urls.append(encoded_url)

            # Decode
            urls = decode_gnews_urls(encoded_urls)

        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []

        return urls


class SearchYahooGeneral(FetcherAbstract):
    def __init__(self, args={}):
        super().__init__()
        # Parameters
        self.pages = args.get("pages", 2)

    def _get_name(self):
        # [source] [pages]
        return "yahoo-general pages={}".format(self.pages).replace("pages=None", "").strip()

    def _fetch_raw_urls(self, keyword_search):
        try:
            results = Yahoo().search(keyword_search, pages=self.pages)
            urls = results.links()
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []
        return urls


class SearchAOLGeneral(FetcherAbstract):
    def __init__(self, args={}):
        super().__init__()
        # Parameters
        self.pages = args.get("pages", 2)

    def _get_name(self):
        # [source] [pages]
        return "aol-general pages={}".format(self.pages).replace("pages=None", "").strip()

    def _fetch_raw_urls(self, keyword_search):
        try:
            results = Aol().search(keyword_search, pages=self.pages)
            urls = results.links()
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []
        return urls

###########################################################################

# List of search fetcher classes
ListSearchInstances = [SearchGNews, SearchDuckDuckGoNews, SearchGoogleNews, SearchAOLGeneral, SearchYahooGeneral, SearchDuckDuckGoGeneral, SearchGoogleGeneral, SearchGoogleNewsRSS]
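Adding another engine follows the same pattern: subclass FetcherAbstract, implement the two abstract hooks, and append the class to ListSearchInstances. The sketch below is purely illustrative; "SearchExampleEngine" and its internals are not part of this commit.

class SearchExampleEngine(FetcherAbstract):
    def __init__(self, args={}):
        super().__init__()
        self.max_results = args.get("max_results", 50)

    def _get_name(self):
        # [source] [max_results]
        return "example-engine results={}".format(self.max_results)

    def _fetch_raw_urls(self, keyword_search):
        try:
            # Call whatever client the engine provides and return a flat list of URLs
            urls = []  # e.g. [r["url"] for r in some_client.search(keyword_search)]
        except Exception as e:
            logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
            urls = []
        return urls

# ListSearchInstances.append(SearchExampleEngine)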
app_urls/fetcher/src/fetch_utils.py (new file)
@@ -0,0 +1,35 @@
import os
from django.core.cache import cache
from .logger import get_logger
logger = get_logger()
from googlenewsdecoder import gnewsdecoder


def decode_gnews_urls(encoded_urls, interval=int(os.getenv("FETCHER_GNEWS_DECODE_SLEEP", 2))):
    logger.debug("Decoding gnews URLs")
    # Decode URLs
    list_decoded_urls = []
    for url in encoded_urls:
        # Already cached?
        decoded_url = cache.get("gnews_decode_{}".format(url))

        if (decoded_url is not None):
            logger.debug("Already cached decoded URL: {} -> {}".format(url, decoded_url))
            # Append decoded URL
            list_decoded_urls.append(decoded_url)
        else:
            try:
                # Decode URL, with interval time to avoid block
                decoded_url_dict = gnewsdecoder(url, interval=interval)
                # Ok?
                if decoded_url_dict.get("status"):
                    # Append decoded URL
                    decoded_url = decoded_url_dict["decoded_url"]
                    list_decoded_urls.append(decoded_url)
                    # Cache decoded URL
                    cache.set("gnews_decode_{}".format(url), decoded_url, timeout=60*60*12)
                else:
                    logger.info("Bad status while decoding news.google.com URL {}\n{}".format(url, decoded_url_dict.get("message")))
            except Exception as e:
                logger.warning("Error decoding news.google.com URL: {}".format(url))
    return list_decoded_urls
app_urls/fetcher/src/logger.py (new file)
@@ -0,0 +1,33 @@
import logging
import logging.handlers
import os

# Get env var
logs_directory = os.getenv("PATH_LOGS_DIRECTORY", "logs")

# Directory of logs
os.makedirs(logs_directory, exist_ok=True)

logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("fetcher")
logger.setLevel(logging.DEBUG)

# To file log: DEBUG and above
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "debug.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)

# To file log: INFO and above
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "info.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.INFO)
logger.addHandler(fh)

# To file log: WARNING / ERROR / CRITICAL
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "warning.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.WARNING)
logger.addHandler(fh)


def get_logger():
    return logger
app_urls/fetcher/src/url_processor.py (new file)
@@ -0,0 +1,127 @@
from django.core.cache import cache
from .logger import get_logger
logger = get_logger()
import newspaper
import time
import os
from urllib.parse import unquote
import langdetect
langdetect.DetectorFactory.seed = 0


def get_with_protocol(url):
    # http:// -> https://
    url = url.replace("http://", "https://")
    # "" -> https://
    if not (url.startswith("https://")):
        url = "https://" + url
    return url


def get_url_host(url):
    # URL without protocol, first substring before '/'
    url_host = url.replace("https://", "").replace("http://", "").split("/")[0]
    return url_host


def url_host_slowdown(url, url_host_slowdown_seconds):
    ### Avoid (frequent) too many requests to the same URL host
    # Get URL host
    url_host = get_url_host(url)
    # Recently processed URL host? -> Slow down required
    last_cached_timestamp = cache.get("process_{}".format(url_host).encode("utf-8"), None)
    if last_cached_timestamp:
        # Get time since last processed URL host (in seconds)
        time_since_last_processed = time.time() - last_cached_timestamp
        # Amount of time required to sleep?
        slowdown_required = max(0, url_host_slowdown_seconds - time_since_last_processed)
        logger.debug("Slow down (sleeping {:.2f}) for URL host {}".format(slowdown_required, url_host))
        # Sleep
        time.sleep(slowdown_required)
    # About to process URL host, cache time
    cache.set("process_{}".format(url_host).encode("utf-8"), time.time(), timeout=60*5)  # Expire after 5 minutes


def process_url(url):
    try:
        # Slow down if required to avoid too-many-requests errors
        url_host_slowdown(url, url_host_slowdown_seconds=int(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
        # Process
        article = newspaper.article(url)
    except newspaper.ArticleBinaryDataException:
        logger.warning("ArticleBinaryDataException for input URL {}".format(url))
        return {"override_status": "invalid"}
    except newspaper.ArticleException as e:

        # Too many requests? Cool down...
        if ("Status code 429" in str(e.args)):
            # TODO: cool down and retry once?, proxy/VPN, ...
            logger.debug("TODO: process_url Implement code 429")
        # Unavailable for legal reasons
        if ("Status code 451" in str(e.args)):
            # TODO: Bypass with VPN
            logger.debug("TODO: process_url Implement code 451")
        # CloudFlare protection?
        if ("Website protected with Cloudflare" in str(e.args)):
            logger.debug("TODO: process_url Implement bypass CloudFlare")
        # PerimeterX protection?
        if ("Website protected with PerimeterX" in str(e.args)):
            logger.debug("TODO: process_url Implement bypass PerimeterX")

        logger.debug("ArticleException for input URL {}\n{}".format(url, str(e.args)))
        return None
    except Exception as e:
        logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
        return None

    try:
        content_merged = "\n".join([article.title, article.meta_description, article.text])
        if (len(content_merged) > int(os.getenv("FETCHER_LANGUAGE_DETECTION_MIN_CHAR", 100))):
            language = langdetect.detect(content_merged)
        else:
            language = None
    except Exception as e:
        logger.info("Could not detect language: {}\n{}".format(url, str(e)))
        language = None

    dict_data = {
        "url": url,
        "url_canonical": article.canonical_link,
        "url_host": article.source_url,
        "site_name": article.meta_site_name,
        "publish_date": article.publish_date,
        "language": language,  # article.meta_lang -> Not always reliable
        "title": article.title,
        "description": article.meta_description,
        "content": article.text,
        "valid_content": article.is_valid_body(),
        "keywords": [k for k in set(article.keywords + article.meta_keywords) if k != ""],
        "tags": article.tags,
        "authors": article.authors,
        "image_main_url": article.top_image,  # article.meta_img
        "images_url": article.images,
        "videos_url": article.movies,
    }

    '''
    # TODO: If exists, add tags article.meta_data.get("classification-tags", "").split(",")
    if (dict_data["tags"] is None):
        dict_data["tags"] = []
    for k in article.meta_data.keys():
        if ("tags" in k):
            dict_data["tags"] += article.meta_data[k].split(",")
    '''

    # Sanity check
    for k in dict_data.keys():
        if (type(dict_data[k]) is list):
            # Remove empty strings, unquote special characters, e.g. "%20" -> " "
            dict_data[k] = [unquote(e) for e in dict_data[k] if e != ""]
            # NULL instead of empty list
            if (len(dict_data[k]) == 0):
                dict_data[k] = None
        elif (type(dict_data[k]) is str):
            # Unquote special characters
            if (dict_data[k] is not None):
                dict_data[k] = unquote(dict_data[k])
            # NULL instead of empty string
            if (dict_data[k] == ""):
                dict_data[k] = None

    return dict_data
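A quick check of the pure URL helpers above; the expected values follow directly from the code, and "example.com" is just a placeholder host.

assert get_with_protocol("example.com/a") == "https://example.com/a"
assert get_with_protocol("http://example.com/a") == "https://example.com/a"
assert get_url_host("https://example.com/a/b") == "example.com"

# process_url() returns None on parser errors, {"override_status": "invalid"} for
# binary content, and otherwise a dict of extracted fields, e.g.:
# process_url("https://example.com/article")["title"]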