Url content
This commit is contained in:
58
app_fetcher/src/news_parsing.py
Normal file
58
app_fetcher/src/news_parsing.py
Normal file
@@ -0,0 +1,58 @@
|
||||
from .db_utils import URL_DB_Writer
|
||||
import newspaper
|
||||
import psycopg
|
||||
import logging
|
||||
# Module-level logging: configure the root handler's line format once at import
# time, then create this module's named logger. Note basicConfig is a no-op if
# the embedding application already configured logging — intentional.
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
|
||||
|
||||
class NewsSiteParsing():
    """Discover news-article URLs by crawling configured sites with newspaper3k.

    The hosts to crawl come from the ``WEBSITE_OF_INTEREST`` table (via
    psycopg); every article URL found is written out through
    ``URL_DB_Writer``. All failures are logged and swallowed so a single
    bad host or DB hiccup cannot kill the fetch cycle.
    """

    def __init__(self, db_connect_info, redis_connect_info) -> None:
        """Store connection info; no connections are opened here.

        Args:
            db_connect_info: psycopg connection string (DSN) for Postgres.
            redis_connect_info: Redis connection info, forwarded untouched
                to ``URL_DB_Writer``.
        """
        logger.debug("Initializing News SiteParsing newspaper3k")
        self.db_connect_info = db_connect_info
        self.redis_connect_info = redis_connect_info

    def _get_url_hosts(self):
        """Return the list of URL hosts to crawl, or ``[]`` on any DB error."""
        try:
            with psycopg.connect(self.db_connect_info) as conn:
                rows = conn.execute("SELECT url_host FROM WEBSITE_OF_INTEREST;").fetchall()
            # Each row is a 1-tuple; unwrap to plain host strings.
            return [row[0] for row in rows]
        except Exception as e:
            # Best-effort: log and fall back to an empty crawl list.
            logger.warning("Exception fetching RSS sites: " + str(e))
            return []

    def _postprocess(self, article_urls):
        """Strip the '#comment-stream' fragment some sites append to article URLs."""
        return [url.replace("#comment-stream", "") for url in article_urls]

    def run(self):
        """Crawl every configured host and persist the article URLs found."""
        try:
            logger.debug("Starting NewsSiteParsing.run()")

            # Get the hosts to crawl.
            list_url_hosts = self._get_url_hosts()
            logger.info("Fetching news by parsing URL hosts: {}".format(str(list_url_hosts)))

            # Writer depends only on connection info, so build it once,
            # not once per host (fix: was re-created on every iteration).
            db_writer = URL_DB_Writer(self.db_connect_info, self.redis_connect_info)

            for url_host_feed in list_url_hosts:
                # Ensure an explicit scheme. Checking for a real scheme
                # prefix (not just the letters "http") so a host such as
                # "httpnews.example.com" still gets "https://" prepended.
                if url_host_feed.startswith(("http://", "https://")):
                    url_host_feed_formatted = url_host_feed
                else:
                    url_host_feed_formatted = "https://" + url_host_feed

                logger.debug("Fetching newspaper3k parsing based on URL: {}".format(url_host_feed_formatted))

                # Build the newspaper3k Source and collect its article URLs.
                url_host_built = newspaper.build(url_host_feed_formatted)
                urls_fetched = url_host_built.article_urls()
                urls_fetched = self._postprocess(urls_fetched)

                # Tag the batch with its fetching source, then persist.
                source = "newspaper3k {}".format(url_host_feed)
                db_writer.write_batch(urls_fetched, source)
        except Exception as e:
            logger.warning("Exception in NewsSiteParsing.run(): {}".format(str(e)))
|
||||
Reference in New Issue
Block a user