import newspaper

from .db_utils import DB_Handler
from .logger import get_logger

logger = get_logger()
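
# Note (added for clarity): newspaper4k is the maintained fork of
# newspaper3k; it is installed as "newspaper4k" but keeps the original
# "newspaper" import name, so `import newspaper` above resolves to
# newspaper4k when that package is installed.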


class FetchParser:
    """Fetch article URLs from configured news site hosts via newspaper4k
    and persist them through a DB_Handler."""

    def __init__(self, db_handler: DB_Handler) -> None:
        logger.debug("Initializing FetchParser (newspaper4k news site parsing)")
        self.db_handler = db_handler

    # TODO: MOVE LOGIC ELSEWHERE!
    def _postprocess(self, article_urls):
        # Strip "#comment-stream" fragments so comment-section links
        # collapse to their parent article URL.
        return [url.replace("#comment-stream", "") for url in article_urls]
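
    # Illustration (hypothetical URL):
    # _postprocess(["https://example.com/story#comment-stream"])
    # returns ["https://example.com/story"].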

    def run(self):
        try:
            logger.debug("Starting FetchParser.run()")

            # Get URL hosts to parse from the DB
            list_url_hosts = self.db_handler._get_url_hosts()
            logger.info("Fetching news by parsing URL hosts: {}".format(str(list_url_hosts)))

            # Build a newspaper4k Source for each host
            for url_host_feed in list_url_hosts:
                # Ensure the host has a protocol; default to HTTPS
                if not url_host_feed.startswith("http"):
                    url_host_feed_formatted = "https://" + url_host_feed
                else:
                    url_host_feed_formatted = url_host_feed

                logger.debug("Fetching newspaper4k parsing based on URL: {}".format(url_host_feed_formatted))

                # Source object: crawls the host's pages for article links
                url_host_built = newspaper.build(url_host_feed_formatted)

                # Get the list of article URLs discovered on the host
                urls_fetched = url_host_built.article_urls()

                # TODO: MOVE!
                # Post-processing
                urls_fetched = self._postprocess(urls_fetched)

                # Tag rows with the URL fetching source
                source = "newspaper4k {}".format(url_host_feed)

                # Write the fetched URLs to the DB in one batch
                self.db_handler.write_batch(urls_fetched, source)
        except Exception as e:
            logger.warning("Exception in FetchParser.run(): {}".format(str(e)))
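

# Usage sketch (illustrative, not part of the original module): shows how
# FetchParser might be driven. It assumes DB_Handler can be constructed
# without arguments, which is an assumption -- adapt to the real
# constructor in db_utils.
if __name__ == "__main__":
    db_handler = DB_Handler()  # assumption: no-arg constructor
    FetchParser(db_handler).run()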