import logging

import newspaper

from .db_utils import DB_Handler

logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")


class NewsSiteParsing():
    """Discover article URLs on known news hosts with newspaper4k and store them.

    The hosts are read from the injected ``db_handler`` and the discovered
    URLs are written back through the same handler.
    """

    def __init__(self, db_handler: DB_Handler) -> None:
        """Keep a reference to the DB handler used for reading hosts and writing URLs.

        Args:
            db_handler: Object providing ``_get_url_hosts()`` and
                ``write_batch(urls, source)``.
        """
        logger.debug("Initializing News SiteParsing newspaper4k")
        self.db_handler = db_handler

    # TODO: MOVE LOGIC ELSEWHERE!
    def _postprocess(self, article_urls):
        """Return ``article_urls`` with every "#comment-stream" occurrence removed.

        Args:
            article_urls: Iterable of URL strings.

        Returns:
            A new list of cleaned URL strings, in the same order.
        """
        return [url.replace("#comment-stream", "") for url in article_urls]

    @staticmethod
    def _ensure_scheme(url_host):
        """Return ``url_host`` prefixed with "https://" unless it already has an HTTP(S) scheme."""
        # Match the full scheme separator: a bare "http" prefix check would
        # wrongly treat hosts such as "httpbin.org" as already having a scheme.
        if url_host.startswith(("http://", "https://")):
            return url_host
        return "https://" + url_host

    def run(self):
        """Fetch article URLs for every stored host and write them to the DB.

        Failures are logged as warnings and never propagated. An error while
        processing one host does not prevent the remaining hosts from being
        processed.
        """
        try:
            logger.debug("Starting NewsSiteParsing.run()")

            # Get URL hosts (presumably an iterable of host strings; verify against DB_Handler)
            list_url_hosts = self.db_handler._get_url_hosts()
            logger.info("Fetching news by parsing URL hosts: {}".format(str(list_url_hosts)))

            # Process newspaper4k build method
            for url_host_feed in list_url_hosts:
                # Isolate each host so that one failing site does not abort the whole run
                try:
                    # Protocol
                    url_host_feed_formatted = self._ensure_scheme(url_host_feed)

                    logger.debug("Fetching newspaper4k parsing based on URL: {}".format(url_host_feed_formatted))
                    # Source object
                    url_host_built = newspaper.build(url_host_feed_formatted)
                    # Get articles URL list
                    urls_fetched = url_host_built.article_urls()

                    # TODO: MOVE!
                    # Post-processing
                    urls_fetched = self._postprocess(urls_fetched)

                    # URL fetching source (labelled with the host as stored, not the formatted URL)
                    source = "newspaper4k {}".format(url_host_feed)
                    # Write to DB
                    self.db_handler.write_batch(urls_fetched, source)
                except Exception as e:
                    logger.warning("Exception in NewsSiteParsing.run() for {}: {}".format(url_host_feed, str(e)))
        except Exception as e:
            # Covers failures outside the per-host work (e.g. reading the host list)
            logger.warning("Exception in NewsSiteParsing.run(): {}".format(str(e)))