Dockerization, WhiteNoise static file serving, refactor

Luciano Gervasoni
2025-04-04 10:53:16 +02:00
parent 5addfa5ba9
commit 4dbe2e55ef
39 changed files with 708 additions and 1238 deletions

View File

@@ -0,0 +1,273 @@
from ..models import Urls, UrlContent, UrlsSourceSearch, UrlsDuplicate, StatusPatternMatching, Source, Search
from django.db.models import Q
from django.core.cache import cache
from django.db import IntegrityError
from .url_processor import process_url, get_with_protocol
import re
import traceback
from .logger import get_logger
logger = get_logger()
class DB_Handler():
def __init__(self):
# Inserting raw URL, cache time: 1 day
self._cache_timeout_insert_url = 86400
# Processing error URL, cache time: 2 days
self._cache_timeout_error_url = 86400*2
def insert_raw_urls(self, urls, obj_source, obj_search):
try:
logger.debug("Inserting raw URLs")
# Empty?
if (len(urls) == 0):
logger.debug("Empty batch of urls (not writing to DB) for source-search: {} - {}".format(obj_source.source, obj_search.search))
return
# Default protocol https://
urls_clean = [get_with_protocol(url) for url in urls]
urls_to_insert = []
# Per URL
for url in urls_clean:
### Already processed URL?
if (cache.get("insert_{}".format(url)) is not None):
logger.debug("Already cached URL: {}".format(url))
if (cache.get("insert_{}{}{}".format(url, obj_source.source, obj_search.search)) is not None):
logger.debug("Already cached (URL, source, search): {} {} {}".format(url, obj_source.source, obj_search.search))
else:
### Insert (URL_id, source_id, search_id), since not cached
# Get URL ID (should already be created)
obj_url, created = Urls.objects.get_or_create(url=url)
# Create (id_source, id_url) (shouldn't exist)
UrlsSourceSearch.objects.get_or_create(id_url=obj_url, id_source=obj_source, id_search=obj_search)
else:
# Add object to insert
# url_object_to_insert.append(Urls(url=url))
urls_to_insert.append(url)
### Insert URLs & (URL_id, source_id)
try:
### Bulk insert, fails if duplicated URL (not returning IDs when using ignore_conflicts=True)
# URLs (ignore_conflicts=False to return IDs)
bulk_created_urls = Urls.objects.bulk_create([Urls(url=url) for url in urls_to_insert], ignore_conflicts=False)
# (URL_id, source_id)
UrlsSourceSearch.objects.bulk_create([UrlsSourceSearch(id_url=obj_url, id_source=obj_source, id_search=obj_search) for obj_url in bulk_created_urls], ignore_conflicts=True)
except IntegrityError as e:
### Fallback to one-by-one insert
logger.debug("bulk_create exception while inserting raw URLs (fails if duplicated URL), falling back to non-bulk method")
# One by one
for url in urls_to_insert:
# URL
obj_url, created = Urls.objects.get_or_create(url=url)
if (created):
logger.debug("Inserted: {}".format(obj_url.url))
else:
logger.debug("Not inserted: {}".format(obj_url.url))
# (URL, source, search)
UrlsSourceSearch.objects.get_or_create(id_url=obj_url, id_source=obj_source, id_search=obj_search)
except Exception as e:
logger.warning("bulk_create unknown exception while inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
# Avoid caching due to error on insertion
urls_clean = []
# Insert or update cache
for url in urls_clean:
cache.set("insert_{}".format(url), True, timeout=self._cache_timeout_insert_url)
cache.set("insert_{}{}{}".format(url, obj_source.source, obj_search.search), True, timeout=self._cache_timeout_insert_url)
logger.info("Inserted #{} raw URLs, Source-Search {} - {}".format(len(urls_to_insert), obj_source.source, obj_search.search))
except Exception as e:
logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error):
def set_status(obj_url, status):
# Update status if setting a new value
if (obj_url.status != status):
obj_url.status = status
obj_url.save()
##### Filter URL? -> Invalid
if (status_pattern_match == "invalid"):
logger.debug("Domain filter applied to input URL: {}".format(obj_url.url))
# Update status
set_status(obj_url, Urls.STATUS_ENUM.INVALID)
# Next URL
return
##### Process URL
try:
# Get data
dict_url_data = process_url(obj_url.url)
except Exception as e:
if (raise_exception_on_error):
# Simply raise exception, handled in a different way
raise Exception("Error processing URL, raising exception as expected")
else:
logger.debug("Error processing URL: {}\n{}\n{}".format(obj_url.url, str(e), traceback.format_exc()))
# Set status to error
dict_url_data = None
# (dict_url_data is None) or (Exception while processing URL) ? -> Error status
if (dict_url_data is None):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.ERROR)
# Next URL
return
# Invalid? e.g. binary data
if (dict_url_data.get("override_status") == "invalid"):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.INVALID)
# Next URL
return
##### Canonical URL different? -> Duplicate
if (dict_url_data.get("url_canonical") is not None) and(dict_url_data.get("url") != dict_url_data.get("url_canonical")):
# Update status
set_status(obj_url, Urls.STATUS_ENUM.DUPLICATE)
# Get or create URL with canonical form
obj_url_canonical, created = Urls.objects.get_or_create(url=dict_url_data.get("url_canonical"))
# Get the source-search IDs associated to obj_url.id
list_url_source_search = UrlsSourceSearch.objects.filter(id_url=obj_url)
for obj_url_source_search in list_url_source_search:
# Associate same sources to url_canonical (it might already exist)
UrlsSourceSearch.objects.get_or_create(id_url=obj_url_canonical, id_source=obj_url_source_search.id_source, id_search=obj_url_source_search.id_search)
# URLs duplicate association
UrlsDuplicate.objects.get_or_create(id_url_canonical=obj_url_canonical, id_url_duplicated=obj_url)
# TODO: return obj_url_canonical so as to directly process the recently inserted URL
# Wherever this function is called, add:
# self._process_single_url(obj_url_canonical, status_pattern_match, raise_exception_on_error)
# Next URL
return
##### Valid URL
# Update status
set_status(obj_url, Urls.STATUS_ENUM.VALID)
# Create or update extracted URL data
UrlContent.objects.update_or_create(
id_url=obj_url,
defaults = {
"date_published" : dict_url_data.get("publish_date"),
"title" : dict_url_data.get("title"),
"description" : dict_url_data.get("description"),
"content" : dict_url_data.get("content"),
"valid_content" : dict_url_data.get("valid_content"),
"language" : dict_url_data.get("language"),
"keywords" : dict_url_data.get("keywords"),
"tags" : dict_url_data.get("tags"),
"authors" : dict_url_data.get("authors"),
"image_main_url" : dict_url_data.get("image_main_url"),
"images_url" : dict_url_data.get("images_url"),
"videos_url" : dict_url_data.get("videos_url"),
"url_host" : dict_url_data.get("url_host"),
"site_name" : dict_url_data.get("site_name"),
}
)
def process_raw_urls(self, batch_size):
def _get_status_pattern_matching(url, list_pattern_status_tuple):
""" Be careful: Regex pattern should update status on "valid", "invalid", and "unknown" status only
"""
# Sort pattern tuples by priority. (pattern, priority, status)
for regex_pattern, regex_priority, status_if_match in sorted(list_pattern_status_tuple, key=lambda tup: tup[1], reverse=True):
# Regular expression pattern matching: https://regexr.com/
if re.match(regex_pattern, url):
logger.debug("Regex pattern found, status '{}' for URL: {}".format(status_if_match, url))
return status_if_match
return None
try:
logger.debug("Processing raw URLs")
# Get batch of URLs, status='raw'
raw_urls = Urls.objects.order_by("-ts_fetch").filter(status=Urls.STATUS_ENUM.RAW)[:batch_size]
if (len(raw_urls) == 0):
logger.debug("No raw URLs to process")
return
# Get list of (pattern, priority, status) tuples to override status if required
list_pattern_status_tuple = list(StatusPatternMatching.objects.values_list("pattern", "priority", "status"))
# Per URL
for obj_url in raw_urls:
# Override status if pattern matching?
status_pattern_match = _get_status_pattern_matching(obj_url.url, list_pattern_status_tuple)
# Process URL
self._process_single_url(obj_url, status_pattern_match, raise_exception_on_error=False)
logger.info("Updated #{} raw URLs".format(len(raw_urls)))
except Exception as e:
logger.warning("Exception processing raw URLs: {}\n{}".format(e, traceback.format_exc()))
def process_error_urls(self, batch_size):
try:
logger.debug("Processing error URLs")
# Keep track of processed and skipped "error" URLs
num_urls_skipped, num_urls_processed = 0, 0
# Get batch of URLs, status='error'
error_urls = Urls.objects.order_by("-ts_fetch").filter(status=Urls.STATUS_ENUM.ERROR)[num_urls_skipped:batch_size+num_urls_skipped]
while ((len(error_urls) > 0) and (num_urls_processed < batch_size)):
# Per URL
for obj_url in error_urls:
# URL ID cached? -> Tried to process recently already, skip
if (cache.get("error_{}".format(obj_url.id)) is not None):
logger.debug("Already cached URL ID: {}".format(obj_url.id))
num_urls_skipped += 1
continue
try:
# Process URL
self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True)
num_urls_processed += 1
except Exception as e:
# Error, cache to avoid re-processing for X time
cache.set("error_{}".format(obj_url.id), True, timeout=self._cache_timeout_insert_url)
num_urls_skipped += 1
# Get following batch of URLs, status='error'
error_urls = Urls.objects.order_by("-ts_fetch").filter(status=Urls.STATUS_ENUM.ERROR)[num_urls_skipped:batch_size+num_urls_skipped]
logger.info("Updated #{}, skipped #{} error URLs".format(num_urls_processed, num_urls_skipped))
except Exception as e:
logger.warning("Exception processing error URLs: {}\n{}".format(e, traceback.format_exc()))
def process_missing_kids_urls(self, batch_size=None):
try:
logger.debug("Processing MissingKids URLs - batch_size={}".format(batch_size))
# Get batch of URLs, %missingkids.org/poster% or %missingkids.org/new-poster%, with status 'valid', 'invalid' or 'error'
missingkids_urls = Urls.objects.order_by("-ts_fetch").filter(
(Q(url__contains="missingkids.org/poster") | Q(url__contains="missingkids.org/new-poster"))
&
(Q(status=Urls.STATUS_ENUM.VALID) | Q(status=Urls.STATUS_ENUM.INVALID) | Q(status=Urls.STATUS_ENUM.ERROR))
)
# Get batch size
if (batch_size is not None):
missingkids_urls = missingkids_urls[:batch_size]
# Per URL
for obj_url in missingkids_urls:
try:
# Process URL. If no exception -> Valid
self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True)
except Exception as e:
# Raised exception -> Invalid (404 error)
obj_url.status = Urls.STATUS_ENUM.INVALID
obj_url.save()
logger.info("Verified status of #{} missingkids.org/poster URLs".format(len(missingkids_urls)))
except Exception as e:
logger.warning("Exception processing MissingKids URLs: {}\n{}".format(e, traceback.format_exc()))

View File

@@ -0,0 +1,51 @@
from .db_utils import DB_Handler
from ..models import Search, Source
import feedparser
import dateutil.parser
import traceback
from .logger import get_logger
logger = get_logger()
class FetchFeeds():
def __init__(self) -> None:
logger.debug("Initializing Fetcher Feeds")
def run(self):
try:
logger.debug("Starting FetchFeeds.run()")
# Get source object
obj_source, created = Source.objects.get_or_create(source="feeds")
# Get feeds objects
list_obj_search_feeds = Search.objects.filter(type=Search.TYPE_ENUM.RSS_FEED)
logger.debug("Fetching from feeds: {}".format([e.search for e in list_obj_search_feeds]))
# Process via RSS feeds
for obj_search in list_obj_search_feeds:
# Initialize
urls_fetched, urls_publish_date = [], []
# Fetch feeds
feeds = feedparser.parse(obj_search.search)
# Parse
for f in feeds.get("entries", []):
# Get URL
url = f.get("link", None)
# Process?
if (url is not None):
# Available publish date?
publish_date_parsed = f.get("published_parsed")
if (publish_date_parsed is None):
publish_date = f.get("published", None)
if (publish_date is not None):
publish_date_parsed = dateutil.parser.parse(publish_date)
# Published date
urls_publish_date.append(publish_date_parsed)
# URL
urls_fetched.append(url)
# Write to DB
DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
except Exception as e:
logger.warning("Exception in FetchFeeds.run(): {}\n{}".format(e, traceback.format_exc()))

View File

@@ -0,0 +1,42 @@
from .db_utils import DB_Handler
from ..models import Search, Source
import os
import requests
import json
import traceback
from .logger import get_logger
logger = get_logger()
class FetchMissingKids():
def __init__(self) -> None:
logger.debug("Initializing Fetcher MissingKids")
def run(self, number_pages=-1):
try:
logger.debug("Starting MissingKids.run(), processing #{} pages".format(number_pages))
# Get source object
obj_source, created = Source.objects.get_or_create(source="missingkids.org")
# Get search object
obj_search, created = Search.objects.get_or_create(search="missingkids.org/poster", type=Search.TYPE_ENUM.URL_HOST)
try:
# Missing kids fetching endpoint, parameter number of pages to fetch
missingkids_fetch_endpoint = os.path.join(os.getenv("SELENIUM_ENDPOINT", "http://localhost:80"), "get_missing_kids/?pages={}".format(number_pages))
# Timeout
if (number_pages > 15) or (number_pages == -1):
timeout = 60*90 # 1.5h
else:
timeout = 60*10 # 10 min
# Request
r = requests.get(missingkids_fetch_endpoint, timeout=timeout)
# Decode
urls_fetched = json.loads(r.text).get("list_urls", [])
except Exception as e:
logger.warning("Timeout on request: {}. {}".format(missingkids_fetch_endpoint, str(e)))
urls_fetched = []
# Write to DB
DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
except Exception as e:
logger.warning("Exception in MissingKids.run(): {}\n{}".format(e, traceback.format_exc()))

View File

@@ -0,0 +1,46 @@
from .db_utils import DB_Handler
from ..models import Search, Source
from .url_processor import get_with_protocol, url_host_slowdown
import newspaper
import traceback
from .logger import get_logger
logger = get_logger()
class FetchParser():
def __init__(self) -> None:
logger.debug("Initializing Fetcher Parser")
def run(self):
try:
logger.debug("Starting FetchParser.run() for {}")
# Get source object
obj_source, created = Source.objects.get_or_create(source="newspaper4k")
# Get URL hosts
list_url_host = Search.objects.filter(type=Search.TYPE_ENUM.URL_HOST)
logger.debug("Fetching news by parsing URL hosts: {}".format([e.search for e in list_url_host]))
# Process newspaper4k build method
for obj_search in list_url_host:
# Protocol
url_host_protocol = get_with_protocol(obj_search.search)
logger.debug("Fetching newspaper4k parsing based on URL: {}".format(url_host_protocol))
# Make sure no requests made for the last X seconds
url_host_slowdown(url_host_protocol, url_host_slowdown_seconds=5)
try:
# Source object
url_host_built = newspaper.build(url_host_protocol)
# Get articles URL list
urls_fetched = url_host_built.article_urls()
except newspaper.exceptions.ArticleException as e:
logger.debug("ArticleException while parsing input URL {}\n{}".format(url_host_protocol, str(e.args)))
urls_fetched = []
except Exception as e:
logger.warning("Exception while parsing input URL {}\n{}".format(url_host_protocol, str(e)))
urls_fetched = []
# Write to DB
DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
except Exception as e:
logger.warning("Exception in FetchParser.run(): {}\n{}".format(e, traceback.format_exc()))

View File

@@ -0,0 +1,57 @@
from .db_utils import DB_Handler
from ..models import Search
from django.db.models import Q
import traceback
import time
import os
from .fetch_search_instances import ListSearchInstances
from .logger import get_logger
logger = get_logger()
class FetchSearcher():
def __init__(self) -> None:
logger.debug("Initializing Fetcher Searcher")
def run(self):
try:
logger.debug("Starting FetchSearcher.run()")
# Get search objects of interest
list_search_obj = Search.objects.filter(Q(type=Search.TYPE_ENUM.URL_HOST) | Q(type=Search.TYPE_ENUM.KEYWORD_SEARCH))
logger.debug("Fetching from search: {}".format(["{} ({})".format(e.search, e.type) for e in list_search_obj]))
# Search
for obj_search in list_search_obj:
# TODO: language & country customization
# Search
keyword_search = "{}{}".format("site:" if obj_search.type == Search.TYPE_ENUM.URL_HOST else "", obj_search.search)
if (obj_search.type == Search.TYPE_ENUM.KEYWORD_SEARCH):
# Add search with intitle keyword
# TODO: allintitle: "child abuse"
# TODO: intitle: "child abuse"
pass
# language, country = obj_search.language_country.split("-")
logger.debug("Starting keyword search: {}".format(keyword_search))
logger.debug("Search type: {}".format(obj_search.type))
# DB writer
db_writer = DB_Handler()
# Keyword arguments
args = {
"language": "en",
"country": "US",
# "period": ["7d", "1d"], # TODO: List of periods to iterate
}
for SearchInstance in ListSearchInstances:
# Sleep between requests, avoid too many requests...
time.sleep(int(os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", 5)))
SearchInstance(args).fetch_articles(db_writer, obj_search)
# TODO: https://github.com/tasos-py/Search-Engines-Scraper/tree/master
except Exception as e:
logger.warning("Exception in FetchSearcher.run(): {}\n{}".format(e, traceback.format_exc()))

View File

@@ -0,0 +1,308 @@
import time
import feedparser
import os
from django.utils import timezone
from datetime import timedelta
from ..models import Search, Source
from .fetch_utils import decode_gnews_urls
from .logger import get_logger
logger = get_logger()
from gnews import GNews
from duckduckgo_search import DDGS
from GoogleNews import GoogleNews
from search_engines import Yahoo, Aol
###########################################################################
###########################################################################
from abc import ABC, abstractmethod
# Generic fetcher (fetches articles, writes to DB)
class FetcherAbstract(ABC):
@abstractmethod
def _fetch_raw_urls(self):
pass
@abstractmethod
def _get_name(self):
pass
def _get_source_object(self, source):
# TODO: Cache
# self.cached_sources = {}
# Get source object
obj_source, created = Source.objects.get_or_create(source=source)
return obj_source
def _post_process_urls(self, raw_urls, obj_search):
# Searching URL Host based? Make sure results belong to that site
if (obj_search.type == Search.TYPE_ENUM.URL_HOST):
# Get clean URL host
url_host_clean = obj_search.search.replace("www.", "").replace("http://", "").replace("https://", "")
# Ensure URL host in URL
raw_urls = [u for u in raw_urls if url_host_clean in u]
return raw_urls
def fetch_articles(self, db_writer, obj_search):
# Source name
source_name = self._get_name()
# Search
keyword_search = obj_search.search
# URL Host search? -> site:${URL_HOST}
if (obj_search.type == Search.TYPE_ENUM.URL_HOST):
keyword_search = "{}{}".format("site:", keyword_search)
# Keyword search & using a General search? -> ${SEARCH} news after:${LAST_WEEK}
if ("general" in source_name) and (obj_search.type == Search.TYPE_ENUM.KEYWORD_SEARCH):
start_date = timezone.now() - timedelta(days=7)
keyword_search = "{}{}".format(keyword_search, "news after:{}-{}-{}".format(start_date.month, start_date.day, start_date.year))
logger.debug("Starting search: {} - {}".format(keyword_search, source_name))
# Fetch
raw_urls = self._fetch_raw_urls(keyword_search)
# Post-process
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
db_writer.insert_raw_urls(raw_urls, self._get_source_object(source_name), obj_search)
###########################################################################
class SearchGNews(FetcherAbstract):
def __init__(self, args={}):
super().__init__()
# Parameters
self.language = args.get("language", "en")
self.country = args.get("country", "US")
self.period = args.get("period", "7d")
self.max_results = args.get("max_results", 100)
def _get_name(self):
# [source] [period] [language-country] [max_results]
return "gnews {} {}-{} results={}".format(self.period, self.language, self.country, self.max_results).replace("results=None", "").strip()
def _fetch_raw_urls(self, keyword_search):
try:
# Get news
results_gnews = GNews(language=self.language, country=self.country, period=self.period, max_results=self.max_results).get_news(keyword_search)
# Get list of encoded urls
encoded_urls = [e.get("url") for e in results_gnews]
# Decode
urls = decode_gnews_urls(encoded_urls)
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchDuckDuckGoGeneral(FetcherAbstract):
def __init__(self, args={}):
super().__init__()
# Parameters
self.language = args.get("language", "wt")
self.country = args.get("country", "wt")
self.max_results = args.get("max_results", 20)
self.region = "{}-{}".format(self.language, self.country).lower()
self.period = None
def _get_name(self):
# [source] [language-country] [max_results]
return "ddg-general {} results={}".format(self.region, self.max_results).replace("results=None", "").strip()
def _fetch_raw_urls(self, keyword_search):
try:
news = DDGS().text(keyword_search, region=self.region, timelimit=self.period, max_results=self.max_results)
urls = [e.get("href") for e in news]
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchDuckDuckGoNews(FetcherAbstract):
def __init__(self, args={}):
super().__init__()
# Parameters
self.language = args.get("language", "wt")
self.country = args.get("country", "wt")
self.max_results = args.get("max_results", 100)
self.region = "{}-{}".format(self.language, self.country).lower()
self.period = None
def _get_name(self):
# [source] [language-country] [max_results]
return "ddg-news {} results={}".format(self.region, self.max_results).replace("results=None", "").strip()
def _fetch_raw_urls(self, keyword_search):
try:
news = DDGS().news(keyword_search, region=self.region, timelimit=self.period, max_results=self.max_results)
urls = [e.get("url") for e in news]
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchGoogleNews(FetcherAbstract):
def __init__(self, args={}):
super().__init__()
# Parameters
self.language = args.get("language", "en")
self.country = args.get("country", "US")
self.period = args.get("period", "7d")
def _get_name(self):
# [source] [period] [language-country]
return "googlenews {} {}-{}".format(self.period, self.language, self.country)
def _fetch_raw_urls(self, keyword_search):
try:
# Initialize
googlenews = GoogleNews(period=self.period, lang=self.language, region=self.country)
googlenews.enableException(True)
# Search
googlenews.get_news(keyword_search)
# Fetch
encoded_urls = googlenews.get_links()
# Decode
urls = decode_gnews_urls(encoded_urls)
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchGoogleGeneral(FetcherAbstract):
def __init__(self, args={}):
super().__init__()
# Parameters
self.language = args.get("language", "en")
self.country = args.get("country", "US")
self.period = args.get("period", "7d")
self.pages = args.get("pages", 1)
def _get_name(self):
# [source] [period] [language-country] [pages]
return "google-general {} {}-{} pages={}".format(self.period, self.language, self.country, self.pages).replace("pages=None", "").strip()
def _fetch_raw_urls(self, keyword_search):
try:
# Initialize
googlenews = GoogleNews(period=self.period, lang=self.language, region=self.country)
googlenews.enableException(True)
# Search
googlenews.search(keyword_search)
set_links = set()
# Iterate pages
for i in range(self.pages):
# Sleep between pages fetch
time.sleep(int(os.getenv("FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP", 4)))
# Number of URLs fetched so far
num_before = len(set_links)
# Get page
try:
links = googlenews.page_at(i+1)
except Exception as e:
logger.warning("Exception fetching page - {}: {}".format(self._get_name(), str(e)))
break
# Links
for l in links:
# 'link': 'https://uk.news.yahoo.com/leaving-neverland-2-michael-jackson-lawyer-channel-4-102017088.html&ved=2ahUKEwjl38eJm5aMAxVvqJUCHXgnGzwQxfQBegQICRAC&usg=AOvVaw1osa6b3o_xXfcNinMDpLoK'
set_links.add( l.get("link").split("&ved=")[0] )
# Finished?
if (num_before == len(set_links)):
break
# To list
urls = list(set_links)
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchGoogleNewsRSS(FetcherAbstract):
def __init__(self, args={}):
super().__init__()
# Parameters
self.language = args.get("language", "en")
self.country = args.get("country", "US")
def _get_name(self):
# [source] [language-country]
return "googlenews-rss {}-{}".format(self.language, self.country).strip()
def _fetch_raw_urls(self, keyword_search):
try:
# Search URL with parameters filled: https://news.google.com/rss/search?q={}&hl=en-US&gl=US&ceid=US:en
search_url = "https://news.google.com/rss/search?q={}&hl={}&gl={}&ceid={}:{}".format(keyword_search, "{}-{}".format(self.language, self.country.upper()), self.country.upper(), self.country.upper(), self.language)
# Control characters
search_url = search_url.replace(" ", "+") # urllib.parse.quote(search_url) # Issue: https%3A//news.google.com/rss/search%3Fq%3Dbreitbart.com%26hl%3Den-US%26gl%3DUS%26ceid%3DUS%3Aen
# Initialize
encoded_urls = []
# Fetch feeds
feeds = feedparser.parse(search_url)
# Parse
for f in feeds.get("entries", []):
# Encoded URL
encoded_url = f.get("link", None)
'''
# Available publish date?
publish_date_parsed = f.get("published_parsed")
if (publish_date_parsed is None):
publish_date = f.get("published", None)
if (publish_date is not None):
publish_date_parsed = dateutil.parser.parse(publish_date)
# Published date
urls_publish_date.append(publish_date_parsed)
'''
# Append
encoded_urls.append(encoded_url)
# Decode
urls = decode_gnews_urls(encoded_urls)
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchYahooGeneral(FetcherAbstract):
def __init__(self, args={}):
super().__init__()
# Parameters
self.pages = args.get("pages", 2)
def _get_name(self):
# [source] [language-country] [pages]
return "yahoo-general pages={}".format(self.pages).replace("pages=None", "").strip()
def _fetch_raw_urls(self, keyword_search):
try:
results = Yahoo().search(keyword_search, pages=self.pages)
urls = results.links()
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchAOLGeneral(FetcherAbstract):
def __init__(self, args={}):
super().__init__()
# Parameters
self.pages = args.get("pages", 2)
def _get_name(self):
# [source] [language-country] [pages]
return "aol-general pages={}".format(self.pages).replace("pages=None", "").strip()
def _fetch_raw_urls(self, keyword_search):
try:
results = Aol().search(keyword_search, pages=self.pages)
urls = results.links()
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
###########################################################################
# List of instances
ListSearchInstances = [SearchGNews, SearchDuckDuckGoNews, SearchGoogleNews, SearchAOLGeneral, SearchYahooGeneral, SearchDuckDuckGoGeneral, SearchGoogleGeneral, SearchGoogleNewsRSS]
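Adding another engine only requires implementing the two abstract hooks of FetcherAbstract and registering the class in ListSearchInstances; fetch_articles() already handles query building, URL-host post-filtering and DB insertion. A hedged sketch (the class and its backend are hypothetical):

class SearchExampleEngine(FetcherAbstract):
    def __init__(self, args={}):
        super().__init__()
        self.max_results = args.get("max_results", 20)
    def _get_name(self):
        # [source] [max_results]
        return "example-engine results={}".format(self.max_results)
    def _fetch_raw_urls(self, keyword_search):
        # Must return a plain list of URL strings; catch and log backend
        # errors here so fetch_articles() always receives a list.
        return []

ListSearchInstances.append(SearchExampleEngine)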

View File

@@ -0,0 +1,35 @@
import os
from django.core.cache import cache
from .logger import get_logger
logger = get_logger()
from googlenewsdecoder import gnewsdecoder
def decode_gnews_urls(encoded_urls, interval=int(os.getenv("FETCHER_GNEWS_DECODE_SLEEP", 2))):
logger.debug("Decoding gnews URLs")
# DecodeURLs
list_decoded_urls = []
for url in encoded_urls:
# Already cached?
decoded_url = cache.get("gnews_decode_{}".format(url))
if (decoded_url is not None):
logger.debug("Already cached decoded URL: {} -> {}".format(url, decoded_url))
# Append decoded URL
list_decoded_urls.append(decoded_url)
else:
try:
# Decode URL, with interval time to avoid block
decoded_url_dict = gnewsdecoder(url, interval=interval)
# Ok?
if decoded_url_dict.get("status"):
# Append decoded URL
decoded_url = decoded_url_dict["decoded_url"]
list_decoded_urls.append(decoded_url)
# Cache decoded URL
cache.set("gnews_decode_{}".format(url), decoded_url, timeout=60*60*12)
else:
logger.info("Bad status while decoding news.google.com, URL {}\n{}".format(url, decoded_url_dict.get("message")))
except Exception as e:
logger.warning("Error decoding news.google.com, URL: {}".format(url))
return list_decoded_urls
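A hedged usage sketch: decoding is throttled by the interval argument and memoized in the Django cache for 12 hours, so repeated calls with the same encoded URL are cheap (the article id below is illustrative, not a real one):

encoded = ["https://news.google.com/rss/articles/CBMiEWlsbHVzdHJhdGl2ZS1pZA"]
decoded = decode_gnews_urls(encoded, interval=2)
# Only successfully decoded URLs are appended, so len(decoded) <= len(encoded).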

View File

@@ -0,0 +1,33 @@
import logging
import logging.handlers
import os
# Get env var
logs_directory = os.getenv("PATH_LOGS_DIRECTORY", "logs")
# Directory of logs
os.makedirs(logs_directory, exist_ok=True)
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("fetcher")
logger.setLevel(logging.DEBUG)
# To file log: DEBUG / INFO / WARNING / ERROR / CRITICAL
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "debug.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)
# To file log: INFO / WARNING / ERROR / CRITICAL
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "info.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.INFO)
logger.addHandler(fh)
# To file log: WARNING / ERROR / CRITICAL
fh = logging.handlers.RotatingFileHandler(filename=os.path.join(logs_directory, "warning.log"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.WARNING)
logger.addHandler(fh)
def get_logger():
return logger

View File

@@ -0,0 +1,127 @@
from django.core.cache import cache
from .logger import get_logger
logger = get_logger()
import newspaper
import time
import os
from urllib.parse import unquote
import langdetect
langdetect.DetectorFactory.seed = 0
def get_with_protocol(url):
# http:// -> https://
url = url.replace("http://", "https://")
# "" -> https://
if not (url.startswith("https://")):
url = "https://" + url
return url
def get_url_host(url):
# URL no protocol, first substring before '/'
url_host = url.replace("https://", "").replace("http://", "").split("/")[0]
return url_host
def url_host_slowdown(url, url_host_slowdown_seconds):
### Avoid (frequent) too many requests to the same URL host
# Get URL host
url_host = get_url_host(url)
# Recently processed URL host? -> Slow down required
last_cached_timestamp = cache.get("process_{}".format(url_host).encode("utf-8"), None)
if last_cached_timestamp:
# Get time since last processed URL host (in seconds)
time_since_last_processed = time.time() - last_cached_timestamp
# Amount of time required to sleep?
slowdown_required = max(0, url_host_slowdown_seconds - time_since_last_processed)
logger.debug("Slow down (sleeping {:.2f}) for URL host {}".format(slowdown_required, url_host))
# Sleep
time.sleep(slowdown_required)
# About to process URL host, cache time
cache.set("process_{}".format(url_host).encode("utf-8"), time.time(), timeout=60*5) # Expire after 5 minutes
def process_url(url):
try:
# Slow down if required to avoid too many requests error
url_host_slowdown(url, url_host_slowdown_seconds=int(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
# Process
article = newspaper.article(url)
except newspaper.ArticleBinaryDataException:
logger.warning("ArticleException for input URL {}".format(url))
return {"override_status": "invalid"}
except newspaper.ArticleException as e:
# Too many requests? Cool down...
if ("Status code 429" in str(e.args)):
# TODO: cool down and retry once?, proxy/VPN, ...
logger.debug("TODO: process_url Implement code 429")
# Unavailable for legal reasons
if ("Status code 451" in str(e.args)):
# TODO: Bypass with VPN
logger.debug("TODO: process_url Implement code 451")
# CloudFlare protection?
if ("Website protected with Cloudflare" in str(e.args)):
logger.debug("TODO: process_url Implement bypass CloudFlare")
# PerimeterX protection?
if ("Website protected with PerimeterX" in str(e.args)):
logger.debug("TODO: process_url Implement bypass PerimeterX")
logger.debug("ArticleException for input URL {}\n{}".format(url, str(e.args)))
return None
except Exception as e:
logger.warning("Exception for input URL {}\n{}".format(url, str(e)))
return None
try:
content_merged = "\n".join([article.title, article.meta_description, article.text])
if (len(content_merged) > int(os.getenv("FETCHER_LANGUAGE_DETECTION_MIN_CHAR", 100))):
language = langdetect.detect(content_merged)
else:
language = None
except Exception as e:
logger.info("Could not detect language: {}\n{}".format(url, str(e)))
language = None
dict_data = {
"url": url,
"url_canonical": article.canonical_link,
"url_host": article.source_url,
"site_name": article.meta_site_name,
"publish_date": article.publish_date,
"language": language, # article.meta_lang -> Not always reliable
"title": article.title,
"description": article.meta_description,
"content": article.text,
"valid_content": article.is_valid_body(),
"keywords": [k for k in set(article.keywords + article.meta_keywords) if k!=""],
"tags": article.tags,
"authors": article.authors,
"image_main_url": article.top_image, # article.meta_img
"images_url": article.images,
"videos_url": article.movies,
}
'''
# TODO: If exists, add tags article.meta_data.get("classification-tags", "").split(",")
if (dict_data["tags"] is None):
dict_data["tags"] = []
for k in article.meta_data.keys():
if ("tags" in k):
dict_data["tags"] += article.meta_data[k].split(",")
'''
# Sanity check
for k in dict_data.keys():
if (type(dict_data[k]) is list):
# Remove empty string, unquote special characters, e.g. "%20" -> " "
dict_data[k] = [ unquote(e) for e in dict_data[k] if e != "" ]
# NULL instead of empty list
if (len(dict_data[k]) == 0):
dict_data[k] = None
elif (type(dict_data[k]) is str):
# Unquote special characters
if (dict_data[k] is not None):
dict_data[k] = unquote(dict_data[k])
# NULL instead of empty string
if (dict_data[k] == ""):
dict_data[k] = None
return dict_data
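A hedged sketch of how callers are expected to interpret process_url() results, mirroring the handling in DB_Handler._process_single_url (the URL is illustrative):

data = process_url("https://example.com/some-article")
if data is None:
    pass  # fetch/parse error -> status ERROR
elif data.get("override_status") == "invalid":
    pass  # e.g. binary content -> status INVALID
elif data.get("url_canonical") and data["url"] != data["url_canonical"]:
    pass  # canonical URL differs -> status DUPLICATE
else:
    print(data["title"], data["language"], data["valid_content"])  # usable article data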