Working feed fetching and parsing; process raw and error URLs

Luciano Gervasoni
2025-03-18 14:49:12 +01:00
parent 7d7bce1e72
commit fb4b30f05e
26 changed files with 270 additions and 364 deletions


@@ -0,0 +1,39 @@
import traceback

import newspaper

from ..models import WebsiteOfInterest
from .db_utils import DB_Handler
from .logger import get_logger

logger = get_logger()


class FetchParser:
    def __init__(self) -> None:
        logger.debug("Initializing FetchParser")

    def run(self):
        try:
            logger.debug("Starting FetchParser.run()")
            # Get the URL hosts of the websites of interest
            list_url_host = list(WebsiteOfInterest.objects.values_list('url_host', flat=True))
            logger.debug("Fetching news by parsing URL hosts: {}".format(list_url_host))
            # Build a newspaper4k source for each URL host
            for url_host_feed in list_url_host:
                # Prepend the protocol if it is missing
                if not url_host_feed.startswith("http"):
                    url_host_feed_formatted = "https://" + url_host_feed
                else:
                    url_host_feed_formatted = url_host_feed
                logger.debug("Fetching newspaper4k parsing based on URL: {}".format(url_host_feed_formatted))
                # Source object
                url_host_built = newspaper.build(url_host_feed_formatted)
                # Get the list of article URLs found on the source
                urls_fetched = url_host_built.article_urls()
                # URL fetching source label
                source = "newspaper4k {}".format(url_host_feed)
                # Write to DB
                DB_Handler().insert_raw_urls(urls_fetched, source)
        except Exception as e:
            logger.warning("Exception in FetchParser.run(): {}\n{}".format(e, traceback.format_exc()))
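
For context, a fetcher like this is typically wired into a Django management command or a scheduled task so it can be run manually or on a cron-like schedule. A minimal sketch of such a command is below; the command name and the import path `news.utils.fetch_parser` are illustrative assumptions and not part of this commit.

```python
# news/management/commands/fetch_parser.py  (illustrative path, not part of this commit)
from django.core.management.base import BaseCommand

from news.utils.fetch_parser import FetchParser  # assumed module path


class Command(BaseCommand):
    help = "Fetch article URLs from the configured websites of interest"

    def handle(self, *args, **options):
        # Delegates fetching, parsing and DB insertion to FetchParser.run()
        FetchParser().run()
```

With this in place the run could be triggered with `python manage.py fetch_parser`, keeping the fetching logic itself inside `FetchParser` and out of the command layer.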