import newspaper

from .db_utils import DB_Handler
from .logger import get_logger

logger = get_logger()
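
# Note (added for clarity): newspaper4k is the maintained fork of
# newspaper3k; it is installed as "newspaper4k" but keeps the original
# "newspaper" import name, so `import newspaper` above resolves to
# newspaper4k when that package is installed.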


class FetchParser:
    """Fetch article URLs from configured news site hosts via newspaper4k
    and persist them through a DB_Handler."""

    def __init__(self, db_handler: DB_Handler) -> None:
        logger.debug("Initializing FetchParser (newspaper4k news site parsing)")
        self.db_handler = db_handler

    # TODO: MOVE LOGIC ELSEWHERE!
    def _postprocess(self, article_urls):
        # Strip "#comment-stream" fragments so comment-section links
        # collapse to their parent article URL.
        return [url.replace("#comment-stream", "") for url in article_urls]
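
    # Illustration (hypothetical URL):
    # _postprocess(["https://example.com/story#comment-stream"])
    # returns ["https://example.com/story"].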

    def run(self):
        try:
            logger.debug("Starting FetchParser.run()")

            # Get URL hosts to parse from the DB
            list_url_hosts = self.db_handler._get_url_hosts()
            logger.info("Fetching news by parsing URL hosts: {}".format(str(list_url_hosts)))

            # Build a newspaper4k Source for each host
            for url_host_feed in list_url_hosts:
                # Ensure the host has a protocol; default to HTTPS
                if not url_host_feed.startswith("http"):
                    url_host_feed_formatted = "https://" + url_host_feed
                else:
                    url_host_feed_formatted = url_host_feed

                logger.debug("Fetching newspaper4k parsing based on URL: {}".format(url_host_feed_formatted))

                # Source object: crawls the host's pages for article links
                url_host_built = newspaper.build(url_host_feed_formatted)

                # Get the list of article URLs discovered on the host
                urls_fetched = url_host_built.article_urls()

                # TODO: MOVE!
                # Post-processing
                urls_fetched = self._postprocess(urls_fetched)

                # Tag rows with the URL fetching source
                source = "newspaper4k {}".format(url_host_feed)

                # Write the fetched URLs to the DB in one batch
                self.db_handler.write_batch(urls_fetched, source)
        except Exception as e:
            logger.warning("Exception in FetchParser.run(): {}".format(str(e)))
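

# Usage sketch (illustrative, not part of the original module): shows how
# FetchParser might be driven. It assumes DB_Handler can be constructed
# without arguments, which is an assumption -- adapt to the real
# constructor in db_utils.
if __name__ == "__main__":
    db_handler = DB_Handler()  # assumption: no-arg constructor
    FetchParser(db_handler).run()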