Refactoring fetcher WIP

Luciano Gervasoni
2025-03-07 11:52:35 +01:00
parent ec4a2cad15
commit 95b9766245
10 changed files with 124 additions and 55 deletions

1
.gitignore vendored

@@ -1,3 +1,4 @@
__pycache__/
*.pyc
**/credentials.py
logs/

README.md

@@ -34,5 +34,5 @@ docker run --rm -it -p 12343:80 image_generation
# Deploy
```
python manage.py runserver
python app_web/manage.py runserver
```

46
app_fetcher/Dev.ipynb Normal file

@@ -0,0 +1,46 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"conda create -n matitos_fetcher python=3.12\n",
"conda activate matitos_fetcher\n",
"conda install -c conda-forge curl\n",
"pip install ipykernel \"psycopg[binary]\" git+https://github.com/ranahaani/GNews.git GoogleNews duckduckgo_search newspaper4k numpy beautifulsoup4 requests feedparser pytz redis fastapi uvicorn fastapi-utils lxml[html_clean]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!uvicorn app:app --host 0.0.0.0 --port 5000 --reload"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "matitos_fetcher",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
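
The notebook's last cell starts the service locally on port 5000. A quick way to verify it came up — a minimal sketch, assuming the app is running as in the cell above (`requests` is already in the install line):

```python
import requests

# The root endpoint in app.py answers with a plain status message
response = requests.get("http://localhost:5000/")
print(response.json())  # expected: a {"message": ...} acknowledgement
```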

app_fetcher/Dockerfile

@@ -1,4 +1,4 @@
FROM continuumio/miniconda3:23.10.0-1
FROM continuumio/miniconda3:25.1.1-2
# App repository
COPY . /opt/app/
@@ -10,6 +10,7 @@ RUN pip freeze
WORKDIR /opt/app
# https://www.uvicorn.org/settings/#resource-limits
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "80"]
# docker build -t fetch_app .

app_fetcher/README.md

@@ -1,5 +1,13 @@
# Fetcher
```
conda create -n matitos_fetcher python=3.12
conda activate matitos_fetcher
conda install -c conda-forge curl
pip install ipykernel "psycopg[binary]" git+https://github.com/ranahaani/GNews.git GoogleNews duckduckgo_search newspaper4k numpy beautifulsoup4 requests feedparser pytz redis fastapi uvicorn fastapi-utils lxml[html_clean]
```
* Fetcher app
- Contains several endpoints, each performing a specific type of fetch task
- For more details, see the /{fetch_type} endpoint in [app.py](app.py)
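
Triggering a fetch is a single GET to the type's path — a sketch, assuming the service from Dev.ipynb is listening on localhost:5000 ("feeds" and "parser" are two of the types handled in app.py):

```python
import requests

BASE = "http://localhost:5000"  # assumed local dev address

# Each path segment selects one branch of the /{fetch_type} handler
for fetch_type in ("feeds", "parser"):
    requests.get("{}/{}".format(BASE, fetch_type))
```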

app_fetcher/app.py

@@ -30,6 +30,8 @@ from src.missing_kids_status import MissingKidsStatus
from src.url_status import UpdateErrorURLs
from src.fetcher_status import FetcherStatus
from src.db_utils import DB_Handler
from fastapi import FastAPI, BackgroundTasks
# import requests
# from fastapi_utils.tasks import repeat_every
@@ -37,11 +39,13 @@ from fastapi import FastAPI, BackgroundTasks
# time.sleep(10)
# import gc
db_handler = DB_Handler(cred.db_connect_info, cred.redis_connect_info)
app = FastAPI()
@app.get("/")
def hello_world():
return {"message": "OK"}
return {"message": "Ok"}
@app.get("/{fetch_type}")
async def fetch(background_tasks: BackgroundTasks, fetch_type: str):
@@ -49,9 +53,9 @@ async def fetch(background_tasks: BackgroundTasks, fetch_type: str):
logger.info("Triggered fetch: {}".format(fetch_type))
if (fetch_type == "feeds"):
task_run = NewsFeed(cred.db_connect_info, cred.redis_connect_info).run
task_run = NewsFeed(db_handler).run
elif (fetch_type == "parser"):
task_run = NewsSiteParsing(cred.db_connect_info, cred.redis_connect_info).run
task_run = NewsSiteParsing(db_handler).run
elif (fetch_type == "fetch_missing_kids_reduced"):
task_run = NewsMissingKids(cred.db_connect_info, cred.redis_connect_info, num_pages=4).run
elif (fetch_type == "fetch_missing_kids_full"):
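
The handler now shares one module-level DB_Handler across requests instead of rebuilding connections per call. A dispatch-table variant of the same idea — a sketch, not the commit's code; the FETCH_TASKS dict is an assumption, while db_handler, NewsFeed, and NewsSiteParsing are as in the hunk above:

```python
from fastapi import BackgroundTasks, FastAPI

app = FastAPI()

# Hypothetical table mapping each fetch type to a factory for its task callable;
# db_handler, NewsFeed, NewsSiteParsing are constructed/imported as in app.py
FETCH_TASKS = {
    "feeds": lambda: NewsFeed(db_handler).run,
    "parser": lambda: NewsSiteParsing(db_handler).run,
}

@app.get("/{fetch_type}")
async def fetch(background_tasks: BackgroundTasks, fetch_type: str):
    factory = FETCH_TASKS.get(fetch_type)
    if factory is None:
        return {"message": "Unknown fetch type: {}".format(fetch_type)}
    # Queue the work so the HTTP response returns immediately
    background_tasks.add_task(factory())
    return {"message": "Triggered {}".format(fetch_type)}
```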

app_fetcher/src/db_utils.py

@@ -13,11 +13,11 @@ logger = logging.getLogger("news_fetcher")
# TODO: URL_DB_HANDLER, _get_search_list, _get_url_host, _get_url_host_list, ...
# The rest, elsewhere
class URL_DB_Writer():
class DB_Handler():
def __init__(self, db_connect_info, redis_connect_info):
logger.debug("Initializing URL DB writer")
self.db_connect_info = db_connect_info
self.redis_instance = redis.Redis(host=redis_connect_info.get("redis_host"), port=redis_connect_info.get("redis_port"))
self.redis_instance = redis.Redis(host=redis_connect_info.get("host"), port=redis_connect_info.get("port"))
self.redis_expiry_seconds = redis_connect_info.get("expiry_seconds", 172800) # Default: 48 hours
try:
@@ -41,6 +41,28 @@ class URL_DB_Writer():
num_urls = None
return num_urls
def _get_feed_urls(self):
try:
with psycopg.connect(self.db_connect_info) as conn:
list_url_feeds = conn.execute("SELECT rss_feed FROM FEED;").fetchall()
# Decode (tuple with 1 element)
list_url_feeds = [l[0] for l in list_url_feeds]
except Exception as e:
logger.warning("Exception fetching RSS sites: " + str(e))
list_url_feeds = []
return list_url_feeds
def _get_url_hosts(self):
try:
with psycopg.connect(self.db_connect_info) as conn:
list_url_hosts = conn.execute("SELECT url_host FROM WEBSITE_OF_INTEREST;").fetchall()
# Decode (tuple with 1 element)
list_url_hosts = [l[0] for l in list_url_hosts]
except Exception as e:
logger.warning("Exception fetching URL hosts: " + str(e))
list_url_hosts = []
return list_url_hosts
def _format(self, values):
# Replace single quote ' with ''. Based on https://stackoverflow.com/a/12320729
# String -> 'string', Int -> '1' (string-based), None -> NULL (no quotes for pgSQL to interpret Null value)
@@ -352,6 +374,7 @@ class URL_DB_Writer():
# Decode source id
id_source = c[0]
# Cache
print("*"*10, source, id_source)
self.redis_instance.set(source, id_source, ex=self.redis_expiry_seconds)
return id_source
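
After the key rename, redis_connect_info is expected to carry plain host/port entries plus an optional expiry_seconds. A construction sketch with placeholder values (the real ones live in the git-ignored credentials.py):

```python
from src.db_utils import DB_Handler

# Placeholder connection details, not the project's credentials
db_connect_info = "host=localhost port=5432 dbname=matitos user=postgres password=secret"
redis_connect_info = {"host": "localhost", "port": 6379, "expiry_seconds": 172800}

# Built once and shared by every fetcher, per this refactor
db_handler = DB_Handler(db_connect_info, redis_connect_info)
```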


@@ -1,34 +1,20 @@
from .db_utils import URL_DB_Writer
from .db_utils import DB_Handler
import feedparser
import dateutil.parser
import psycopg
import logging
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
class NewsFeed():
def __init__(self, db_connect_info, redis_connect_info) -> None:
def __init__(self, db_handler: DB_Handler) -> None:
logger.debug("Initializing News feed")
self.db_connect_info = db_connect_info
self.redis_connect_info = redis_connect_info
def _get_feed_urls(self):
try:
with psycopg.connect(self.db_connect_info) as conn:
list_url_feeds = conn.execute("SELECT rss_feed FROM FEED;").fetchall()
# Decode (tuple with 1 element)
list_url_feeds = [l[0] for l in list_url_feeds]
except Exception as e:
logger.warning("Exception fetching RSS sites: " + str(e))
list_url_feeds = []
return list_url_feeds
self.db_handler = db_handler
def run(self):
try:
logger.debug("Starting NewsFeed.run()")
# Get feeds
list_url_feeds = self._get_feed_urls()
list_url_feeds = self.db_handler._get_feed_urls()
logger.debug("Fetching news from feeds: {}".format(str(list_url_feeds)))
# Process via RSS feeds
@@ -44,17 +30,20 @@ class NewsFeed():
# Process?
if (url is not None):
# Available publish date?
publish_date_parsed = f.get("published_parsed")
if (publish_date_parsed is None):
publish_date = f.get("published", None)
if (publish_date is not None):
publish_date = dateutil.parser.parse(publish_date)
urls_publish_date.append(publish_date)
publish_date_parsed = dateutil.parser.parse(publish_date)
# Published date
urls_publish_date.append(publish_date_parsed)
# URL
urls_fetched.append(url)
# URL fetching source
source = "feed {}".format(url_feed)
# Write to DB
db_writer = URL_DB_Writer(self.db_connect_info, self.redis_connect_info)
db_writer.write_batch(urls_fetched, source)
self.db_handler.write_batch(urls_fetched, source)
except Exception as e:
logger.warning("Exception in NewsFeed.run(): {}".format(str(e)))
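
The new branch keeps feedparser's pre-parsed published_parsed (a time.struct_time) and only falls back to dateutil for the raw published string. Stand-alone, the fallback looks roughly like this; the fromtimestamp normalization is an illustration-only addition (the hunk appends the struct as-is):

```python
import time
from datetime import datetime

import dateutil.parser
import feedparser

feed = feedparser.parse("https://example.com/rss")  # placeholder feed URL
for entry in feed.entries:
    publish_date = entry.get("published_parsed")  # struct_time or None
    if publish_date is not None:
        publish_date = datetime.fromtimestamp(time.mktime(publish_date))
    elif entry.get("published") is not None:
        publish_date = dateutil.parser.parse(entry["published"])
    print(entry.get("link"), publish_date)
```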


@@ -1,27 +1,15 @@
from .db_utils import URL_DB_Writer
from .db_utils import DB_Handler
import newspaper
import psycopg
import logging
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
class NewsSiteParsing():
def __init__(self, db_connect_info, redis_connect_info) -> None:
logger.debug("Initializing News SiteParsing newspaper3k")
self.db_connect_info = db_connect_info
self.redis_connect_info = redis_connect_info
def _get_url_hosts(self):
try:
with psycopg.connect(self.db_connect_info) as conn:
list_url_hosts = conn.execute("SELECT url_host FROM WEBSITE_OF_INTEREST;").fetchall()
# Decode (tuple with 1 element)
list_url_hosts = [l[0] for l in list_url_hosts]
except Exception as e:
logger.warning("Exception fetching RSS sites: " + str(e))
list_url_hosts = []
return list_url_hosts
def __init__(self, db_handler: DB_Handler) -> None:
logger.debug("Initializing News SiteParsing newspaper4k")
self.db_handler = db_handler
# TODO: MOVE LOGIC ELSEWHERE!
def _postprocess(self, article_urls):
return [url.replace("#comment-stream", "") for url in article_urls]
@@ -29,11 +17,11 @@ class NewsSiteParsing():
try:
logger.debug("Starting NewsSiteParsing.run()")
# Get feeds
list_url_hosts = self._get_url_hosts()
# Get URL hosts
list_url_hosts = self.db_handler._get_url_hosts()
logger.info("Fetching news by parsing URL hosts: {}".format(str(list_url_hosts)))
# Process newspaper3k build method
# Process newspaper4k build method
for url_host_feed in list_url_hosts:
# Protocol
if not (url_host_feed.startswith("http")):
@@ -41,18 +29,18 @@ class NewsSiteParsing():
else:
url_host_feed_formatted = url_host_feed
logger.debug("Fetching newspaper3k parsing based on URL: {}".format(url_host_feed_formatted))
logger.debug("Fetching newspaper4k parsing based on URL: {}".format(url_host_feed_formatted))
# Source object
url_host_built = newspaper.build(url_host_feed_formatted)
# Get articles URL list
urls_fetched = url_host_built.article_urls()
# TODO: MOVE!
# Post-processing
urls_fetched = self._postprocess(urls_fetched)
# URL fetching source
source = "newspaper3k {}".format(url_host_feed)
source = "newspaper4k {}".format(url_host_feed)
# Write to DB
db_writer = URL_DB_Writer(self.db_connect_info, self.redis_connect_info)
db_writer.write_batch(urls_fetched, source)
self.db_handler.write_batch(urls_fetched, source)
except Exception as e:
logger.warning("Exception in NewsSiteParsing.run(): {}".format(str(e)))
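
newspaper4k keeps the newspaper3k surface used here: build() crawls the host's category and feed pages, and the resulting source object exposes the discovered links via article_urls(). A minimal sketch with a placeholder host, including the #comment-stream cleanup from _postprocess:

```python
import newspaper

url_host = "https://example-news-site.com"  # placeholder, not a real site of interest
source = newspaper.build(url_host)

# Strip comment-anchor fragments, as _postprocess does above
urls = [u.replace("#comment-stream", "") for u in source.article_urls()]
print(len(urls), "article URLs found")
```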

docker-compose.yml

@@ -18,6 +18,15 @@ services:
ports:
- 5432:5432
matitos_redis:
image: redis:alpine
container_name: db_redis
restart: unless-stopped
ports:
- 6379:6379
#expose:
# - 6379
# django:
# Env: DB_HOST=matitos_db
# DJANGO_DB_NAME=${DB_DATABASE_NAME:-matitos}
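
The new matitos_redis service publishes Redis on its default port, matching the host/port keys DB_Handler now reads. A quick connectivity check, assuming the stack is up (from the host use localhost; from another container on the compose network, the service name matitos_redis resolves instead):

```python
import redis

# Host-side check against the published port
r = redis.Redis(host="localhost", port=6379)
print(r.ping())  # True once the container accepts connections
```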