from .db_utils import DB_Handler
from ..models import Search, Source
from .url_processor import get_with_protocol, url_host_slowdown
import newspaper
import traceback
from .logger import get_logger

logger = get_logger()


class FetchParser:
    """Fetches article URLs by building newspaper4k sources from the configured URL hosts."""

    def __init__(self) -> None:
        logger.debug("Initializing FetchParser")

    def run(self):
        try:
            logger.debug("Starting FetchParser.run()")

            # Get (or create) the source object for this fetcher
            obj_source, created = Source.objects.get_or_create(source="newspaper4k")

            # Get the URL hosts to parse
            list_url_host = Search.objects.filter(type=Search.TYPE_ENUM.URL_HOST)
            logger.debug("Fetching news by parsing URL hosts: {}".format([e.search for e in list_url_host]))

            # Build each URL host with newspaper4k and collect its article URLs
            for obj_search in list_url_host:
                # Ensure the host has a protocol prefix
                url_host_protocol = get_with_protocol(obj_search.search)
                logger.debug("Fetching newspaper4k parsing based on URL: {}".format(url_host_protocol))

                # Throttle: make sure no request was made to this host in the last 5 seconds
                url_host_slowdown(url_host_protocol, url_host_slowdown_seconds=5)

                # Build the newspaper4k source object for this host
                url_host_built = newspaper.build(url_host_protocol)

                # Get the list of article URLs found on the host
                urls_fetched = url_host_built.article_urls()

                # Write the fetched URLs to the DB
                DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
        except Exception as e:
            logger.warning("Exception in FetchParser.run(): {}\n{}".format(e, traceback.format_exc()))
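
# Usage sketch (an assumption, not part of the original module): FetchParser is
# presumably instantiated and run by a scheduler or a Django management command,
# for example:
#
#     FetchParser().run()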