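"""Fetch article URLs by building each stored URL host with newspaper4k and
persisting the discovered URLs through DB_Handler."""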
from .db_utils import DB_Handler
from ..models import Search, Source
from .url_processor import get_with_protocol, url_host_slowdown

import newspaper
import traceback

from .logger import get_logger

logger = get_logger()


class FetchParser:

    def __init__(self) -> None:
        logger.debug("Initializing FetchParser")

    def run(self):
        try:
            logger.debug("Starting FetchParser.run()")

            # Get or create the Source row for this fetcher
            obj_source, created = Source.objects.get_or_create(source="newspaper4k")

            # Get the URL hosts to be parsed
            list_url_host = Search.objects.filter(type=Search.TYPE_ENUM.URL_HOST)
            logger.debug("Fetching news by parsing URL hosts: {}".format([e.search for e in list_url_host]))

            # Process each URL host with the newspaper4k build method
            for obj_search in list_url_host:
                # Make sure the host URL carries a protocol (e.g. https://)
                url_host_protocol = get_with_protocol(obj_search.search)
                logger.debug("Fetching newspaper4k parsing based on URL: {}".format(url_host_protocol))

                # Make sure no requests were made to this host for the last X seconds
                url_host_slowdown(url_host_protocol, url_host_slowdown_seconds=5)

                # Build the newspaper4k source object for the host
                url_host_built = newspaper.build(url_host_protocol)

                # Get the list of article URLs discovered on the host
                urls_fetched = url_host_built.article_urls()

                # Write the fetched URLs to the DB
                DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)

        except Exception as e:
            logger.warning("Exception in FetchParser.run(): {}\n{}".format(e, traceback.format_exc()))
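
# Usage sketch (an assumption, not part of the original module): FetchParser is
# presumably driven by a scheduler, management command, or cron-style task, e.g.:
#
#     FetchParser().run()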