Refactoring fetcher WIP

Luciano Gervasoni
2025-03-07 11:52:35 +01:00
parent ec4a2cad15
commit 95b9766245
10 changed files with 124 additions and 55 deletions


@@ -1,27 +1,15 @@
-from .db_utils import URL_DB_Writer
+from .db_utils import DB_Handler
 import newspaper
-import psycopg
 import logging
 
 logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
 logger = logging.getLogger("news_fetcher")
 
 class NewsSiteParsing():
-    def __init__(self, db_connect_info, redis_connect_info) -> None:
-        logger.debug("Initializing News SiteParsing newspaper3k")
-        self.db_connect_info = db_connect_info
-        self.redis_connect_info = redis_connect_info
+    def __init__(self, db_handler: DB_Handler) -> None:
+        logger.debug("Initializing NewsSiteParsing newspaper4k")
+        self.db_handler = db_handler
 
-    def _get_url_hosts(self):
-        try:
-            with psycopg.connect(self.db_connect_info) as conn:
-                list_url_hosts = conn.execute("SELECT url_host FROM WEBSITE_OF_INTEREST;").fetchall()
-            # Decode (tuple with 1 element)
-            list_url_hosts = [l[0] for l in list_url_hosts]
-        except Exception as e:
-            logger.warning("Exception fetching RSS sites: " + str(e))
-            list_url_hosts = []
-        return list_url_hosts
     # TODO: MOVE LOGIC ELSEWHERE!
     def _postprocess(self, article_urls):
         return [url.replace("#comment-stream", "") for url in article_urls]
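
Net effect of this first hunk: the fetcher no longer builds its own database access from raw connection info; a DB_Handler is injected instead. A minimal before/after sketch of a call site, assuming DB_Handler takes over the old constructor arguments (names here are illustrative, not from the diff):

    # Before this commit: raw connection info passed straight to the fetcher
    parser = NewsSiteParsing(db_connect_info, redis_connect_info)

    # After: one DB_Handler is built once and shared across fetchers
    from db_utils import DB_Handler

    db_handler = DB_Handler(db_connect_info, redis_connect_info)  # assumed signature
    parser = NewsSiteParsing(db_handler)
    parser.run()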
@@ -29,11 +17,11 @@ class NewsSiteParsing():
 
     def run(self):
         try:
             logger.debug("Starting NewsSiteParsing.run()")
-            # Get feeds
-            list_url_hosts = self._get_url_hosts()
+            # Get URL hosts
+            list_url_hosts = self.db_handler._get_url_hosts()
             logger.info("Fetching news by parsing URL hosts: {}".format(str(list_url_hosts)))
-            # Process newspaper3k build method
+            # Process newspaper4k build method
             for url_host_feed in list_url_hosts:
                 # Protocol
                 if not (url_host_feed.startswith("http")):
@@ -41,18 +29,18 @@ class NewsSiteParsing():
             else:
                 url_host_feed_formatted = url_host_feed
-            logger.debug("Fetching newspaper3k parsing based on URL: {}".format(url_host_feed_formatted))
+            logger.debug("Fetching newspaper4k parsing based on URL: {}".format(url_host_feed_formatted))
             # Source object
             url_host_built = newspaper.build(url_host_feed_formatted)
             # Get articles URL list
             urls_fetched = url_host_built.article_urls()
             # TODO: MOVE!
             # Post-processing
             urls_fetched = self._postprocess(urls_fetched)
             # URL fetching source
-            source = "newspaper3k {}".format(url_host_feed)
+            source = "newspaper4k {}".format(url_host_feed)
             # Write to DB
-            db_writer = URL_DB_Writer(self.db_connect_info, self.redis_connect_info)
-            db_writer.write_batch(urls_fetched, source)
+            self.db_handler.write_batch(urls_fetched, source)
         except Exception as e:
             logger.warning("Exception in NewsSiteParsing.run(): {}".format(str(e)))