Refactoring fetcher, working feeds, and raw URL writer
app_urls/api/obsolete_src/fetch_feed.py (new file, 48 lines)
@@ -0,0 +1,48 @@
from .db_utils import DB_Handler
from .logger import get_logger

import dateutil.parser  # "import dateutil" alone does not expose dateutil.parser
import feedparser

logger = get_logger()


class FetchFeeds:

    def __init__(self, db_handler: DB_Handler) -> None:
        logger.debug("Initializing FetchFeeds")
        self.db_handler = db_handler

    def run(self):
        try:
            logger.debug("Starting FetchFeeds.run()")
            # Get the list of feed URLs to poll
            list_url_feeds = self.db_handler._get_feed_urls()
            logger.debug("Fetching news from feeds: {}".format(str(list_url_feeds)))

            # Process each RSS feed
            for url_feed in list_url_feeds:
                # Per-feed accumulators
                urls_fetched, urls_publish_date = [], []
                # Fetch and parse the feed
                feeds = feedparser.parse(url_feed)
                for f in feeds.get("entries", []):
                    # Entries without a link are skipped
                    url = f.get("link", None)
                    if url is not None:
                        # Prefer feedparser's pre-parsed publish date;
                        # otherwise fall back to parsing the raw string
                        publish_date_parsed = f.get("published_parsed")
                        if publish_date_parsed is None:
                            publish_date = f.get("published", None)
                            if publish_date is not None:
                                publish_date_parsed = dateutil.parser.parse(publish_date)

                        # May be None if the entry carries no date at all
                        urls_publish_date.append(publish_date_parsed)
                        urls_fetched.append(url)

                # Record which feed the URLs came from
                source = "feed {}".format(url_feed)
                # Write the batch of fetched URLs to the DB
                # (publish dates are collected but not yet persisted)
                self.db_handler.write_batch(urls_fetched, source)
        except Exception as e:
            logger.warning("Exception in FetchFeeds.run(): {}".format(str(e)))
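
For context, a minimal usage sketch of the class above. Neither DB_Handler's constructor nor the package import path is shown in this diff, so both are assumptions here:

    # Hypothetical wiring; adjust imports and DB_Handler arguments to the real project layout.
    from app_urls.api.obsolete_src.db_utils import DB_Handler
    from app_urls.api.obsolete_src.fetch_feed import FetchFeeds

    db_handler = DB_Handler()  # assumed zero-argument constructor, not shown in this commit
    fetcher = FetchFeeds(db_handler)
    fetcher.run()  # polls every stored feed and writes each URL batch via write_batch()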
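
One subtlety in run(): feedparser's published_parsed is a time.struct_time (feedparser normalizes parsed dates to UTC), while dateutil.parser.parse returns a datetime, so urls_publish_date ends up holding mixed types, and it is never passed to write_batch. If the dates are persisted later, a small normalizer along these lines could unify them first (to_datetime is a hypothetical helper, not part of this commit):

    import calendar
    import time
    from datetime import datetime, timezone

    def to_datetime(value):
        # Normalize the mixed values collected in urls_publish_date.
        if value is None:
            return None
        if isinstance(value, time.struct_time):
            # calendar.timegm treats the tuple as UTC, matching feedparser's
            # convention (time.mktime would wrongly assume local time)
            return datetime.fromtimestamp(calendar.timegm(value), tz=timezone.utc)
        return value  # already a datetime from dateutil.parser.parse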