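"""Fetch article URLs by building each stored URL host with newspaper4k and
persisting the discovered URLs through DB_Handler."""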
from .db_utils import DB_Handler
from ..models import Search, Source
from .url_processor import get_with_protocol, url_host_slowdown

import newspaper
import traceback

from .logger import get_logger

logger = get_logger()


class FetchParser:

    def __init__(self) -> None:
        logger.debug("Initializing FetchParser")

    def run(self):
        try:
            logger.debug("Starting FetchParser.run()")

            # Get or create the Source row for this fetcher
            obj_source, created = Source.objects.get_or_create(source="newspaper4k")

            # Get the URL hosts to be parsed
            list_url_host = Search.objects.filter(type=Search.TYPE_ENUM.URL_HOST)
            logger.debug("Fetching news by parsing URL hosts: {}".format([e.search for e in list_url_host]))

            # Process each URL host with the newspaper4k build method
            for obj_search in list_url_host:
                # Make sure the host URL carries a protocol (e.g. https://)
                url_host_protocol = get_with_protocol(obj_search.search)
                logger.debug("Fetching newspaper4k parsing based on URL: {}".format(url_host_protocol))

                # Make sure no requests were made to this host for the last X seconds
                url_host_slowdown(url_host_protocol, url_host_slowdown_seconds=5)

                # Build the newspaper4k source object for the host
                url_host_built = newspaper.build(url_host_protocol)

                # Get the list of article URLs discovered on the host
                urls_fetched = url_host_built.article_urls()

                # Write the fetched URLs to the DB
                DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)

        except Exception as e:
            logger.warning("Exception in FetchParser.run(): {}\n{}".format(e, traceback.format_exc()))
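
# Usage sketch (an assumption, not part of the original module): FetchParser is
# presumably driven by a scheduler, management command, or cron-style task, e.g.:
#
#     FetchParser().run()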