Working fetch feeds and parser; process raw and error URLs
39
app_urls/api/src/fetch_parser.py
Normal file
@@ -0,0 +1,39 @@
from .db_utils import DB_Handler
from ..models import WebsiteOfInterest
import newspaper
import traceback
from .logger import get_logger

logger = get_logger()


class FetchParser():

    def __init__(self) -> None:
        logger.debug("Initializing FetchParser")

    def run(self):
        try:
            logger.debug("Starting FetchParser.run()")

            # Get the URL hosts of every website of interest
            list_url_host = list(WebsiteOfInterest.objects.values_list('url_host', flat=True))
            logger.debug("Fetching news by parsing URL hosts: {}".format(list_url_host))

            # Build a newspaper4k Source for each URL host
            for url_host_feed in list_url_host:
                # Ensure the URL carries a protocol; default to HTTPS
                if not url_host_feed.startswith("http"):
                    url_host_feed_formatted = "https://" + url_host_feed
                else:
                    url_host_feed_formatted = url_host_feed

                logger.debug("Fetching newspaper4k parsing based on URL: {}".format(url_host_feed_formatted))
                # Source object
                url_host_built = newspaper.build(url_host_feed_formatted)
                # Get the list of article URLs discovered on the source
                urls_fetched = url_host_built.article_urls()

                # Tag the fetched URLs with their origin
                source = "newspaper4k {}".format(url_host_feed)
                # Write to DB
                DB_Handler().insert_raw_urls(urls_fetched, source)
        except Exception as e:
            logger.warning("Exception in FetchParser.run(): {}\n{}".format(e, traceback.format_exc()))
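
Review note: because the try/except wraps the whole loop, one failing host aborts fetching for every remaining site. A minimal sketch of moving the error handling inside the loop, reusing the names from the diff above; this is a suggestion, not part of the commit:

    def run(self):
        logger.debug("Starting FetchParser.run()")
        list_url_host = list(WebsiteOfInterest.objects.values_list('url_host', flat=True))

        for url_host_feed in list_url_host:
            try:
                # Ensure the URL carries a protocol; default to HTTPS
                url = url_host_feed if url_host_feed.startswith("http") else "https://" + url_host_feed
                # Discover article URLs on the source and persist them
                urls_fetched = newspaper.build(url).article_urls()
                DB_Handler().insert_raw_urls(urls_fetched, "newspaper4k {}".format(url_host_feed))
            except Exception as e:
                # Log and continue with the next host instead of aborting the whole run
                logger.warning("Failed to fetch {}: {}\n{}".format(url_host_feed, e, traceback.format_exc()))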
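
For context, a minimal sketch of how the fetcher could be triggered, e.g. from a Django management command; the command name, module path, and import path are assumptions, not part of this commit:

    # app_urls/api/management/commands/fetch_urls.py (hypothetical path)
    from django.core.management.base import BaseCommand
    from api.src.fetch_parser import FetchParser  # assumed import path

    class Command(BaseCommand):
        help = "Fetch article URLs from every WebsiteOfInterest"

        def handle(self, *args, **options):
            # Delegate to the fetcher added in this commit
            FetchParser().run()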