Refactor searches, env var fetcher config, URLs webpage update

This commit is contained in:
Luciano Gervasoni
2025-04-02 18:45:43 +02:00
parent 077219fcb6
commit 84da104dc8
22 changed files with 676 additions and 1521 deletions

View File

@@ -1,59 +1,17 @@
from .db_utils import DB_Handler
from ..models import Search, Source
from ..models import Search
from django.db.models import Q
import traceback
import time
from .fetch_search_utils import search_gnews, search_ddg, search_googlenews_general, search_googlenews_news, search_googlenews_rss
import os
from .fetch_search_instances import ListSearchInstances
from .logger import get_logger
logger = get_logger()
'''
from abc import ABC, abstractmethod
# Generic fetcher (fetches articles, writes to DB)
class FetcherAbstract(ABC):
@abstractmethod
def _fetch_raw_urls_list(self):
pass
def fetch_articles(self, db_writer):
logger.debug("Starting fetch() for {}".format(self.name))
# Fetch articles
list_news = self._fetch()
logger.info("Found #{} articles for search: {}".format(len(list_news), self.name))
# Write to DB
db_writer.write_batch(list_news, self.name)
self._fetch_raw_urls_list()
raw_urls, source = search_googlenews_rss(keyword_search, language="en", country="US")
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
'''
class FetchSearcher():
def __init__(self) -> None:
logger.debug("Initializing Fetcher Searcher")
def _get_source_object(self, source):
# TODO: Cache
# self.cached_sources = {}
# Get source object
obj_source, created = Source.objects.get_or_create(source=source)
return obj_source
def _post_process_urls(self, raw_urls, obj_search):
# Searching URL Host based? Make sure results belong to that site
if (obj_search.type == Search.TYPE_ENUM.URL_HOST):
# Get clean URL host
url_host_clean = obj_search.search.replace("www.", "").replace("http://", "").replace("https://", "")
# Ensure URL host in URL
raw_urls = [u for u in raw_urls if url_host_clean in u]
return raw_urls
def run(self):
try:
logger.debug("Starting FetchSearcher.run()")
@@ -65,58 +23,36 @@ class FetchSearcher():
# Search
for obj_search in list_search_obj:
# TODO: language & country customization
# TODO: allintitle: "child abuse"
# TODO: intitle: "child abuse"
# Search
keyword_search = "{}{}".format("site:" if obj_search.type == Search.TYPE_ENUM.URL_HOST else "", obj_search.search)
if (obj_search.type == Search.TYPE_ENUM.KEYWORD_SEARCH):
# Add search with intitle keyword
# TODO: allintitle: "child abuse"
# TODO: intitle: "child abuse"
pass
# language, country = obj_search.language_country.split("-")
logger.debug("Starting keyword search: {}".format(keyword_search))
logger.debug("Search type: {}".format(obj_search.type))
# news.google.com/rss
time.sleep(5)
raw_urls, source = search_googlenews_rss(keyword_search, language="en", country="US")
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
# DDG News
time.sleep(5)
raw_urls, source = search_ddg(keyword_search, category="news", timelimit="d", max_results=None, region = "en-US")
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
# DB writer
db_writer = DB_Handler()
# GNews
time.sleep(5)
raw_urls, source = search_gnews(keyword_search, language="en", country="US")
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
# Keyword arguments
args = {
"language": "en",
"country": "US",
"period": "7d",
"max_results": 100,
"max_pages": 1,
}
# DDG Text (week, 20 results)
time.sleep(5)
raw_urls, source = search_ddg(keyword_search, category="text", timelimit="d", max_results=20, region = "en-US")
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
# GoogleNews news
time.sleep(5)
raw_urls, source = search_googlenews_news(keyword_search, period="1d", language="en", country="US")
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
# GoogleNews general
time.sleep(5)
raw_urls, source = search_googlenews_general(keyword_search, period="1d", language="en", country="US", max_pages=2)
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
for SearchInstance in ListSearchInstances:
# Sleep between requests, avoid too many requests...
time.sleep(int(os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", 5)))
SearchInstance(args).fetch_articles(db_writer, obj_search)
# TODO: https://github.com/tasos-py/Search-Engines-Scraper/tree/master
except Exception as e:

View File

@@ -0,0 +1,259 @@
import time
import feedparser
import os
from ..models import Search, Source
from .fetch_utils import decode_gnews_urls
from .logger import get_logger
logger = get_logger()
from gnews import GNews
from duckduckgo_search import DDGS
from GoogleNews import GoogleNews
###########################################################################
###########################################################################
from abc import ABC, abstractmethod
# Generic fetcher (fetches articles, writes to DB)
class FetcherAbstract(ABC):
@abstractmethod
def _fetch_raw_urls(self, keyword_search):
pass
@abstractmethod
def _get_name(self):
pass
def _get_source_object(self, source):
# TODO: Cache
# self.cached_sources = {}
# Get source object
obj_source, created = Source.objects.get_or_create(source=source)
return obj_source
def _post_process_urls(self, raw_urls, obj_search):
# Searching URL Host based? Make sure results belong to that site
if (obj_search.type == Search.TYPE_ENUM.URL_HOST):
# Get clean URL host
url_host_clean = obj_search.search.replace("www.", "").replace("http://", "").replace("https://", "")
# Ensure URL host in URL
raw_urls = [u for u in raw_urls if url_host_clean in u]
return raw_urls
def fetch_articles(self, db_writer, obj_search):
# Search
keyword_search = "{}{}".format("site:" if obj_search.type == Search.TYPE_ENUM.URL_HOST else "", obj_search.search)
# Source name
source_name = self._get_name()
logger.debug("Starting search: {} - {}".format(keyword_search, source_name))
# Fetch
raw_urls = self._fetch_raw_urls(keyword_search)
# Post-process
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
db_writer.insert_raw_urls(raw_urls, self._get_source_object(source_name), obj_search)
###########################################################################
class SearchGNews(FetcherAbstract):
def __init__(self, args={"language":"en", "country":"US", "period":"7d", "max_results":100}):
super().__init__()
# Parameters
self.language = args.get("language")
self.country = args.get("country")
self.period = args.get("period")
self.max_results = args.get("max_results")
def _get_name(self):
# [source] [period] [language-country] [max_results]
return "gnews {} {}-{} results={}".format("news", self.period, self.language, self.country, self.max_results).replace("results=None", "").strip()
def _fetch_raw_urls(self, keyword_search):
try:
# Get news
results_gnews = GNews(language=self.language, country=self.country, period=self.period, max_results=self.max_results).get_news(keyword_search)
# Get list of encoded urls
encoded_urls = [e.get("url") for e in results_gnews]
# Decode
urls = decode_gnews_urls(encoded_urls)
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchDuckDuckGoGeneral(FetcherAbstract):
def __init__(self, args={"language":"wt", "country":"wt", "max_results":100}):
super().__init__()
# Parameters
self.language = args.get("language")
self.country = args.get("country")
self.max_results = args.get("max_results")
self.region = "{}-{}".format(self.language, self.country).lower()
self.period = None
def _get_name(self):
# [source] [language-country] [max_results]
return "ddg-general {} results={}".format(self.region, self.max_results).replace("results=None", "").strip()
def _fetch_raw_urls(self, keyword_search):
try:
news = DDGS().text(keyword_search, region=self.region, timelimit=self.period, max_results=self.max_results)
urls = [e.get("href") for e in news]
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchDuckDuckGoNews(FetcherAbstract):
def __init__(self, args={"language":"wt", "country":"wt", "max_results":100}):
super().__init__()
# Parameters
self.language = args.get("language")
self.country = args.get("country")
self.max_results = args.get("max_results")
self.region = "{}-{}".format(self.language, self.country).lower()
self.period = None
def _get_name(self):
# [source] [language-country] [max_results]
return "ddg-news {} results={}".format(self.region, self.max_results).replace("results=None", "").strip()
def _fetch_raw_urls(self, keyword_search):
try:
news = DDGS().news(keyword_search, region=self.region, timelimit=self.period, max_results=self.max_results)
urls = [e.get("url") for e in news]
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchGoogleNews(FetcherAbstract):
def __init__(self, args={"language":"en", "country":"US", "period":"7d"}):
super().__init__()
# Parameters
self.language = args.get("language")
self.country = args.get("country")
self.period = args.get("period")
def _get_name(self):
# [source] [period] [language-country]
return "googlenews {} {}-{}".format(self.period, self.language, self.country)
def _fetch_raw_urls(self, keyword_search):
try:
# Initialize
googlenews = GoogleNews(period=self.period, lang=self.language, region=self.country)
googlenews.enableException(True)
# Search
googlenews.get_news(keyword_search)
# Fetch
encoded_urls = googlenews.get_links()
# Decode
urls = decode_gnews_urls(encoded_urls)
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchGoogleGeneral(FetcherAbstract):
def __init__(self, args={"language":"en", "country":"US", "period":"7d", "max_pages":1}):
super().__init__()
# Parameters
self.language = args.get("language")
self.country = args.get("country")
self.period = args.get("period")
self.max_pages = args.get("max_pages")
def _get_name(self):
# [source] [period] [language-country] [pages]
return "google-general {} {}-{} pages={}".format(self.period, self.language, self.country, self.max_pages).replace("pages=None", "").strip()
def _fetch_raw_urls(self, keyword_search):
try:
# Initialize
googlenews = GoogleNews(period=self.period, lang=self.language, region=self.country)
googlenews.enableException(True)
# Search
googlenews.search(keyword_search)
set_links = set()
# Iterate pages
for i in range(self.max_pages):
# Sleep between pages fetch
time.sleep(int(os.getenv("FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP", 4)))
# Number of URLs fetched so far
num_before = len(set_links)
# Get page
try:
links = googlenews.page_at(i+1)
except Exception as e:
logger.warning("Exception fetching page - {}: {}".format(self._get_name(), str(e)))
break
# Links
for l in links:
# 'link': 'https://uk.news.yahoo.com/leaving-neverland-2-michael-jackson-lawyer-channel-4-102017088.html&ved=2ahUKEwjl38eJm5aMAxVvqJUCHXgnGzwQxfQBegQICRAC&usg=AOvVaw1osa6b3o_xXfcNinMDpLoK'
set_links.add( l.get("link").split("&ved=")[0] )
# Finished?
if (num_before == len(set_links)):
break
# To list
urls = list(set_links)
except Exception as e:
logger.warning("Exception fetching {}: {}\n{}".format(self._get_name(), str(e)))
urls = []
return urls
class SearchGoogleNewsRSS(FetcherAbstract):
def __init__(self, args={"language":"en", "country":"US"}):
super().__init__()
# Parameters
self.language = args.get("language")
self.country = args.get("country")
def _get_name(self):
# [source] [language-country]
return "googlenews-rss {}-{}".format(self.language, self.country).strip()
def _fetch_raw_urls(self, keyword_search):
try:
# Search URL with parameters filled: https://news.google.com/rss/search?q={}&hl=en-US&gl=US&ceid=US:en
search_url = "https://news.google.com/rss/search?q={}&hl={}&gl={}&ceid={}:{}".format(keyword_search, "{}-{}".format(self.language, self.country.upper()), self.country.upper(), self.country.upper(), self.language)
# Control characters
search_url = search_url.replace(" ", "+") # urllib.parse.quote(search_url) # Issue: https%3A//news.google.com/rss/search%3Fq%3Dbreitbart.com%26hl%3Den-US%26gl%3DUS%26ceid%3DUS%3Aen
# Initialize
encoded_urls = []
# Fetch feeds
feeds = feedparser.parse(search_url)
# Parse
for f in feeds.get("entries", []):
# Encoded URL
encoded_url = f.get("link", None)
'''
# Available publish date?
publish_date_parsed = f.get("published_parsed")
if (publish_date_parsed is None):
publish_date = f.get("published", None)
if (publish_date is not None):
publish_date_parsed = dateutil.parser.parse(publish_date)
# Published date
urls_publish_date.append(publish_date_parsed)
'''
# Append
encoded_urls.append(encoded_url)
# Decode
urls = decode_gnews_urls(encoded_urls)
except Exception as e:
logger.warning("Exception fetching {}: {}".format(self._get_name(), str(e)))
urls = []
return urls
###########################################################################
# List of instances
ListSearchInstances = [SearchGNews, SearchDuckDuckGoNews, SearchGoogleNews, SearchDuckDuckGoGeneral, SearchGoogleGeneral, SearchGoogleNewsRSS]
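For orientation, the list above is what FetchSearcher.run() (first diff) iterates: it sleeps FETCHER_BETWEEN_SEARCHES_SLEEP seconds, instantiates each class with the shared args dict, and calls fetch_articles(db_writer, obj_search). Adding a backend therefore means subclassing FetcherAbstract and registering the class. A minimal sketch under those assumptions; the module path and the SearchExample class are hypothetical, not part of the commit:

# Hypothetical illustration only; the real module path is not shown in this diff view.
from app.utils.fetch_search_instances import FetcherAbstract, ListSearchInstances

class SearchExample(FetcherAbstract):
    def __init__(self, args={"language": "en", "country": "US", "max_results": 50}):
        super().__init__()
        # Parameters follow the same args-dict convention as the other backends
        self.language = args.get("language")
        self.country = args.get("country")
        self.max_results = args.get("max_results")
    def _get_name(self):
        # [source] [language-country] [max_results]
        return "example {}-{} results={}".format(self.language, self.country, self.max_results)
    def _fetch_raw_urls(self, keyword_search):
        # A real backend would query its client library here and return plain URLs.
        return []

# Registering the class (before run() executes) is enough for it to be picked up.
ListSearchInstances.append(SearchExample)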

View File

@@ -1,197 +0,0 @@
from django.core.cache import cache
import traceback
import random
import time
import feedparser
import urllib
import dateutil
from .logger import get_logger
logger = get_logger()
from googlenewsdecoder import gnewsdecoder
from gnews import GNews
from duckduckgo_search import DDGS
from GoogleNews import GoogleNews
###########################################################################
def decode_gnews_urls(encoded_urls, interval=2):
# DecodeURLs
list_decoded_urls = []
for url in encoded_urls:
# Already cached?
decoded_url = cache.get("gnews_decode_{}".format(url))
if (decoded_url is not None):
logger.debug("Already cached decoded URL: {} -> {}".format(url, decoded_url))
# Append decoded URL
list_decoded_urls.append(decoded_url)
else:
try:
# Decode URL, with interval time to avoid block
decoded_url_dict = gnewsdecoder(url, interval=interval)
# Ok?
if decoded_url_dict.get("status"):
# Append decoded URL
decoded_url = decoded_url_dict["decoded_url"]
list_decoded_urls.append(decoded_url)
# Cache decoded URL
cache.set("gnews_decode_{}".format(url), decoded_url, timeout=60*60*12)
else:
logger.warning("Error decoding news.google.com, URL {}\nMessage: {}".format(url, str(decoded_url)))
except Exception as e:
logger.warning("Error decoding news.google.com, URL: {}\n{}".format(url, traceback.format_exc()))
return list_decoded_urls
###########################################################################
def search_gnews(keyword_search, period="1d", language="en", country="US", max_results=100):
# [source] [category] [period] [language-country] [max_results]
source = "gnews {} {} {}-{} max_results={}".format("news", period, language, country, max_results).replace("None", "").strip()
logger.debug("Searching: {} --- Source:{}".format(keyword_search, source))
try:
# Get news
results_gnews = GNews(language=language, country=country).get_news(keyword_search)
# Get list of encoded urls
encoded_urls = [e.get("url") for e in results_gnews]
# Decode
logger.debug("Decoding gnews URLs")
urls = decode_gnews_urls(encoded_urls)
except Exception as e:
logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc()))
urls = []
return urls, source
###########################################################################
def search_ddg(keyword_search, category="news", timelimit="d", max_results=None, region="wt-wt"):
# [source] [category] [period] [language-country] [max_results]
source = "ddg {} {} {} max_results={}".format(category, timelimit, region, max_results).replace("max_results=None", "").strip()
logger.debug("Searching: {} --- Source:{}".format(keyword_search, source))
# region="{}-{}".format(langauge, country.lower())
# timelimit= # Options: d, w, m
# max_results # max number of results. If None, returns results only from the first response. Defaults to None
try:
if (category == "news"):
news = DDGS().news(keyword_search, region=region, timelimit=timelimit, max_results=max_results)
urls = [e.get("url") for e in news]
if (category == "text"):
news = DDGS().text(keyword_search, region=region, timelimit=timelimit, max_results=max_results)
urls = [e.get("href") for e in news]
except Exception as e:
logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc()))
urls = []
return urls, source
###########################################################################
def search_googlenews_news(keyword_search, period="1d", language="en", country="US"):
category = "news"
# [source] [category] [period] [language-country]
source = "googlenews {} {} {}-{}".format(category, period, language, country).replace("None", "").strip()
logger.debug("Searching: {} --- Source:{}".format(keyword_search, source))
# Initialize
googlenews = GoogleNews(period=period, lang=language, region=country)
googlenews.enableException(True)
try:
# Search
googlenews.get_news(keyword_search)
# Fetch
encoded_urls = googlenews.get_links()
# Decode
logger.debug("Decoding gnews URLs")
urls = decode_gnews_urls(encoded_urls)
except Exception as e:
logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc()))
urls = []
return urls, source
def search_googlenews_general(keyword_search, period="1d", language="en", country="US", max_pages=5):
category="general"
# [source] [category] [period] [language-country] [max_results]
source = "googlenews {} {} {}-{} max_pages={}".format(category, period, language, country, max_pages).replace("None", "").strip()
logger.debug("Searching: {} --- Source:{}".format(keyword_search, source))
# Initialize
googlenews = GoogleNews(period=period, lang=language, region=country)
googlenews.enableException(True)
try:
set_links = set()
# Search
googlenews.search(keyword_search)
# Iterate pages
for i in range(max_pages):
time.sleep(random.uniform(2, 4.5))
num_before = len(set_links)
# Get page
try:
links = googlenews.page_at(i+1)
except Exception as e:
logger.warning("Exception fetching page in GoogleNews {}: {}".format(source, str(e)))
break
# Links
for l in links:
# 'link': 'https://uk.news.yahoo.com/leaving-neverland-2-michael-jackson-lawyer-channel-4-102017088.html&ved=2ahUKEwjl38eJm5aMAxVvqJUCHXgnGzwQxfQBegQICRAC&usg=AOvVaw1osa6b3o_xXfcNinMDpLoK'
set_links.add( l.get("link").split("&ved=")[0] )
# Finished?
if (num_before == len(set_links)):
break
# To list
urls = list(set_links)
except Exception as e:
logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc()))
urls = []
return urls, source
###########################################################################
def search_googlenews_rss(keyword_search, language="en", country="US"):
# [source] [category] [period] [language-country] [max_results]
source = "googlenews-rss {}-{}".format(language, country).replace("None", "").strip()
logger.debug("Searching: {} --- Source:{}".format(keyword_search, source))
# https://news.google.com/rss/search?q={}&hl=en-US&gl=US&ceid=US:en
try:
# Search URL with parameters filled
search_url = "https://news.google.com/rss/search?q={}&hl={}&gl={}&ceid={}:{}".format(keyword_search, "{}-{}".format(language, country.upper()), country.upper(), country.upper(), language)
# Control characters
search_url = search_url.replace(" ", "+") # urllib.parse.quote(search_url) # Issue: https%3A//news.google.com/rss/search%3Fq%3Dbreitbart.com%26hl%3Den-US%26gl%3DUS%26ceid%3DUS%3Aen
# Initialize
encoded_urls = []
# Fetch feeds
feeds = feedparser.parse(search_url)
# Parse
for f in feeds.get("entries", []):
# Encoded URL
encoded_url = f.get("link", None)
'''
# Available publish date?
publish_date_parsed = f.get("published_parsed")
if (publish_date_parsed is None):
publish_date = f.get("published", None)
if (publish_date is not None):
publish_date_parsed = dateutil.parser.parse(publish_date)
# Published date
urls_publish_date.append(publish_date_parsed)'
'''
# Append
encoded_urls.append(encoded_url)
# Decode
urls = decode_gnews_urls(encoded_urls)
except Exception as e:
logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc()))
urls = []
return urls, source

View File

@@ -0,0 +1,35 @@
import traceback
import os
from django.core.cache import cache
from .logger import get_logger
logger = get_logger()
from googlenewsdecoder import gnewsdecoder
def decode_gnews_urls(encoded_urls, interval=int(os.getenv("FETCHER_GNEWS_DECODE_SLEEP", 2))):
logger.debug("Decoding gnews URLs")
# DecodeURLs
list_decoded_urls = []
for url in encoded_urls:
# Already cached?
decoded_url = cache.get("gnews_decode_{}".format(url))
if (decoded_url is not None):
logger.debug("Already cached decoded URL: {} -> {}".format(url, decoded_url))
# Append decoded URL
list_decoded_urls.append(decoded_url)
else:
try:
# Decode URL, with interval time to avoid block
decoded_url_dict = gnewsdecoder(url, interval=interval)
# Ok?
if decoded_url_dict.get("status"):
# Append decoded URL
decoded_url = decoded_url_dict["decoded_url"]
list_decoded_urls.append(decoded_url)
# Cache decoded URL
cache.set("gnews_decode_{}".format(url), decoded_url, timeout=60*60*12)
else:
logger.warning("Error decoding news.google.com, URL {}".format(url))
except Exception as e:
logger.warning("Error decoding news.google.com, URL: {}\n{}".format(url, traceback.format_exc()))
return list_decoded_urls
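A short usage sketch of the helper above. The import path is assumed (file names are not shown in this view), the encoded URL is a made-up placeholder, and the call has to run inside the Django app because the helper uses django.core.cache. Note that the interval default is read from FETCHER_GNEWS_DECODE_SLEEP at import time, so the variable must be set before the module is first imported:

import os
os.environ.setdefault("FETCHER_GNEWS_DECODE_SLEEP", "3")  # must be set before the module is imported
from app.utils.fetch_utils import decode_gnews_urls  # assumed path

encoded = ["https://news.google.com/rss/articles/<opaque-token>?oc=5"]  # hypothetical encoded URL
decoded = decode_gnews_urls(encoded)  # publisher URLs; failures are logged and skipped, successes cached for 12h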

View File

@@ -1,34 +1,34 @@
import logging
import logging.handlers
import os
''' TODO: PATH LOGS
PATH_LOGS_ERROR=logs/log_app_fetcher_error.log
PATH_LOGS_INFO=logs/log_app_fetcher_info.log
PATH_LOGS_DEBUG=logs/log_app_fetcher_debug.log
# PATH_LOGS=logs/log_app_fetcher.log
'''
os.makedirs("logs", exist_ok=True)
# Get env var
path_logs_parameterization = os.getenv("PATH_LOGS_PARAMETERIZATION", "logs/log_app_fetcher_{}.log")
# Directory of logs
directory = '/'.join(path_logs_parameterization.split("/")[:-1])
os.makedirs(directory, exist_ok=True)
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
logger.setLevel(logging.DEBUG)
# To file log: INFO / WARNING / ERROR / CRITICAL
fh = logging.handlers.RotatingFileHandler(filename="logs/log_app_fetcher_debug.log", mode="a", maxBytes=10000000, backupCount=4)
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("debug"), mode="a", maxBytes=10000000, backupCount=4)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)
# To file log: INFO / WARNING / ERROR
fh_ = logging.handlers.RotatingFileHandler(filename="logs/log_app_fetcher_info.log", mode="a", maxBytes=10000000, backupCount=2)
fh_.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh_.setLevel(logging.INFO)
logger.addHandler(fh_)
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("info"), mode="a", maxBytes=10000000, backupCount=2)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.INFO)
logger.addHandler(fh)
# To file log: WARNING / ERROR / CRITICAL
fh_ = logging.handlers.RotatingFileHandler(filename="logs/log_app_fetcher_error.log", mode="a", maxBytes=10000000, backupCount=1)
fh_.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh_.setLevel(logging.WARNING)
logger.addHandler(fh_)
fh = logging.handlers.RotatingFileHandler(filename=path_logs_parameterization.format("warning"), mode="a", maxBytes=10000000, backupCount=1)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh.setLevel(logging.WARNING)
logger.addHandler(fh)
def get_logger():
return logger
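With the log paths now derived from a single template, the three rotating handlers land wherever PATH_LOGS_PARAMETERIZATION points. A sketch of the intended effect, assuming a hypothetical path and the defaults shown above; the variable has to be set before this module is imported, since the handlers are created at import time:

import os
os.environ["PATH_LOGS_PARAMETERIZATION"] = "/var/log/fetcher/log_app_fetcher_{}.log"  # hypothetical path
# Importing the module then creates three rotating files from the template:
#   log_app_fetcher_debug.log    DEBUG and up,   10 MB x 4 backups
#   log_app_fetcher_info.log     INFO and up,    10 MB x 2 backups
#   log_app_fetcher_warning.log  WARNING and up, 10 MB x 1 backup
from app.utils.logger import get_logger  # assumed path
logger = get_logger()
logger.info("fetcher started")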

View File

@@ -3,6 +3,7 @@ from .logger import get_logger
logger = get_logger()
import newspaper
import time
import os
from urllib.parse import unquote
import langdetect
langdetect.DetectorFactory.seed = 0
@@ -40,11 +41,11 @@ def url_host_slowdown(url, url_host_slowdown_seconds):
def process_url(url):
try:
# Slow down if required to avoid too many requests error
url_host_slowdown(url, url_host_slowdown_seconds=5)
url_host_slowdown(url, url_host_slowdown_seconds=int(os.getenv("FETCHER_URL_HOST_SLEEP", 5)))
# Process
article = newspaper.article(url)
except newspaper.ArticleBinaryDataException:
logger.warning("ArticleException for input URL {}\n{}".format(url, str(e)))
logger.warning("ArticleException for input URL {}".format(url))
return {"override_status": "invalid"}
except newspaper.ArticleException as e:
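Taken together, the commit moves the hard-coded sleeps and log paths behind environment variables. A minimal sketch of the knobs read across the diffs above, using only the fallback values visible in the code; nothing here is new configuration, it just collects what the code reads:

import os
fetcher_env_defaults = {
    "FETCHER_BETWEEN_SEARCHES_SLEEP": os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", 5),                    # seconds between search backends
    "FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP": os.getenv("FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP", 4),    # seconds between Google result pages
    "FETCHER_GNEWS_DECODE_SLEEP": os.getenv("FETCHER_GNEWS_DECODE_SLEEP", 2),                            # seconds between news.google.com URL decodes
    "FETCHER_URL_HOST_SLEEP": os.getenv("FETCHER_URL_HOST_SLEEP", 5),                                    # per-host slowdown when downloading articles
    "PATH_LOGS_PARAMETERIZATION": os.getenv("PATH_LOGS_PARAMETERIZATION", "logs/log_app_fetcher_{}.log"),  # rotating log file template
}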