URLs source search, code cleanup

This commit is contained in:
Luciano Gervasoni
2025-03-20 17:19:52 +01:00
parent 05e17266f1
commit f84c7729f8
13 changed files with 241 additions and 300 deletions

View File

@@ -1,5 +1,6 @@
from .db_utils import DB_Handler
from ..models import Search
from ..models import Search, Source
from .url_processor import get_with_protocol, url_host_slowdown
import newspaper
import traceback
from .logger import get_logger
@@ -13,27 +14,26 @@ class FetchParser():
try:
logger.debug("Starting FetchParser.run() for {}")
# Get source object
obj_source, created = Source.objects.get_or_create(source="newspaper4k")
# Get URL hosts
list_url_host = list(Search.objects.filter(type=Search.TYPE_ENUM.URL_HOST).values_list('search', flat=True))
logger.debug("Fetching news by parsing URL hosts: {}".format(list_url_host))
list_url_host = Search.objects.filter(type=Search.TYPE_ENUM.URL_HOST)
logger.debug("Fetching news by parsing URL hosts: {}".format([e.search for e in list_url_host]))
# Process newspaper4k build method
for url_host_feed in list_url_host:
for obj_search in list_url_host:
# Protocol
if not (url_host_feed.startswith("http")):
url_host_feed_formatted = "https://" + url_host_feed
else:
url_host_feed_formatted = url_host_feed
logger.debug("Fetching newspaper4k parsing based on URL: {}".format(url_host_feed_formatted))
url_host_protocol = get_with_protocol(obj_search.search)
logger.debug("Fetching newspaper4k parsing based on URL: {}".format(url_host_protocol))
# Make sure no requests made for the last X seconds
url_host_slowdown(url_host_protocol, url_host_slowdown_seconds=5)
# Source object
url_host_built = newspaper.build(url_host_feed_formatted)
url_host_built = newspaper.build(url_host_protocol)
# Get articles URL list
urls_fetched = url_host_built.article_urls()
# URL fetching source
source = "newspaper4k {}".format(url_host_feed)
# Write to DB
DB_Handler().insert_raw_urls(urls_fetched, source)
DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
except Exception as e:
logger.warning("Exception in FetchParser.run(): {}\n{}".format(e, traceback.format_exc()))