Refactoring fetcher WIP

Luciano Gervasoni
2025-03-07 11:52:35 +01:00
parent ec4a2cad15
commit 95b9766245
10 changed files with 124 additions and 55 deletions

1
.gitignore vendored

@@ -1,3 +1,4 @@
__pycache__/
*.pyc
**/credentials.py
logs/

README.md

@@ -34,5 +34,5 @@ docker run --rm -it -p 12343:80 image_generation
# Deploy
```
python manage.py runserver
python app_web/manage.py runserver
```

46
app_fetcher/Dev.ipynb Normal file

@@ -0,0 +1,46 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"conda create -n matitos_fetcher python=3.12\n",
"conda activate matitos_fetcher\n",
"conda install -c conda-forge curl\n",
"pip install ipykernel \"psycopg[binary]\" git+https://github.com/ranahaani/GNews.git GoogleNews duckduckgo_search newspaper4k numpy beautifulsoup4 requests feedparser pytz redis fastapi uvicorn fastapi-utils lxml[html_clean]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!uvicorn app:app --host 0.0.0.0 --port 5000 --reload"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "matitos_fetcher",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
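
The notebook's last cell starts the service locally on port 5000. A quick way to verify it came up — a minimal sketch, assuming the app is running as in the cell above (`requests` is already in the install line):

```python
import requests

# The root endpoint in app.py answers with a plain status message
response = requests.get("http://localhost:5000/")
print(response.json())  # expected: a {"message": ...} acknowledgement
```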

app_fetcher/Dockerfile

@@ -1,4 +1,4 @@
FROM continuumio/miniconda3:23.10.0-1
FROM continuumio/miniconda3:25.1.1-2
# App repository
COPY . /opt/app/
@@ -10,6 +10,7 @@ RUN pip freeze
WORKDIR /opt/app
# https://www.uvicorn.org/settings/#resource-limits
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "80"]
# docker build -t fetch_app .

app_fetcher/README.md

@@ -1,5 +1,13 @@
# Fetcher
```
conda create -n matitos_fetcher python=3.12
conda activate matitos_fetcher
conda install -c conda-forge curl
pip install ipykernel "psycopg[binary]" git+https://github.com/ranahaani/GNews.git GoogleNews duckduckgo_search newspaper4k numpy beautifulsoup4 requests feedparser pytz redis fastapi uvicorn fastapi-utils lxml[html_clean]
```
* Fetcher app
- Contains several endpoints, each performing a specific type of fetch task
- For more details, see the /{fetch_type} endpoint in [app.py](app.py)
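
Triggering a fetch is a single GET to the type's path — a sketch, assuming the service from Dev.ipynb is listening on localhost:5000 ("feeds" and "parser" are two of the types handled in app.py):

```python
import requests

BASE = "http://localhost:5000"  # assumed local dev address

# Each path segment selects one branch of the /{fetch_type} handler
for fetch_type in ("feeds", "parser"):
    requests.get("{}/{}".format(BASE, fetch_type))
```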

app_fetcher/app.py

@@ -30,6 +30,8 @@ from src.missing_kids_status import MissingKidsStatus
from src.url_status import UpdateErrorURLs
from src.fetcher_status import FetcherStatus
from src.db_utils import DB_Handler
from fastapi import FastAPI, BackgroundTasks
# import requests
# from fastapi_utils.tasks import repeat_every
@@ -37,11 +39,13 @@ from fastapi import FastAPI, BackgroundTasks
# time.sleep(10)
# import gc
db_handler = DB_Handler(cred.db_connect_info, cred.redis_connect_info)
app = FastAPI()
@app.get("/")
def hello_world():
return {"message": "OK"}
return {"message": "Ok"}
@app.get("/{fetch_type}")
async def fetch(background_tasks: BackgroundTasks, fetch_type: str):
@@ -49,9 +53,9 @@ async def fetch(background_tasks: BackgroundTasks, fetch_type: str):
logger.info("Triggered fetch: {}".format(fetch_type))
if (fetch_type == "feeds"):
task_run = NewsFeed(cred.db_connect_info, cred.redis_connect_info).run
task_run = NewsFeed(db_handler).run
elif (fetch_type == "parser"):
task_run = NewsSiteParsing(cred.db_connect_info, cred.redis_connect_info).run
task_run = NewsSiteParsing(db_handler).run
elif (fetch_type == "fetch_missing_kids_reduced"):
task_run = NewsMissingKids(cred.db_connect_info, cred.redis_connect_info, num_pages=4).run
elif (fetch_type == "fetch_missing_kids_full"):
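
The handler now shares one module-level DB_Handler across requests instead of rebuilding connections per call. A dispatch-table variant of the same idea — a sketch, not the commit's code; the FETCH_TASKS dict is an assumption, while db_handler, NewsFeed, and NewsSiteParsing are as in the hunk above:

```python
from fastapi import BackgroundTasks, FastAPI

app = FastAPI()

# Hypothetical table mapping each fetch type to a factory for its task callable;
# db_handler, NewsFeed, NewsSiteParsing are constructed/imported as in app.py
FETCH_TASKS = {
    "feeds": lambda: NewsFeed(db_handler).run,
    "parser": lambda: NewsSiteParsing(db_handler).run,
}

@app.get("/{fetch_type}")
async def fetch(background_tasks: BackgroundTasks, fetch_type: str):
    factory = FETCH_TASKS.get(fetch_type)
    if factory is None:
        return {"message": "Unknown fetch type: {}".format(fetch_type)}
    # Queue the work so the HTTP response returns immediately
    background_tasks.add_task(factory())
    return {"message": "Triggered {}".format(fetch_type)}
```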

app_fetcher/src/db_utils.py

@@ -13,11 +13,11 @@ logger = logging.getLogger("news_fetcher")
# TODO: URL_DB_HANDLER, _get_search_list, _get_url_host, _get_url_host_list, ...
# The rest, elsewhere
class URL_DB_Writer():
class DB_Handler():
def __init__(self, db_connect_info, redis_connect_info):
logger.debug("Initializing URL DB writer")
self.db_connect_info = db_connect_info
self.redis_instance = redis.Redis(host=redis_connect_info.get("redis_host"), port=redis_connect_info.get("redis_port"))
self.redis_instance = redis.Redis(host=redis_connect_info.get("host"), port=redis_connect_info.get("port"))
self.redis_expiry_seconds = redis_connect_info.get("expiry_seconds", 172800) # Default: 48 hours
try:
@@ -41,6 +41,28 @@ class URL_DB_Writer():
num_urls = None
return num_urls
def _get_feed_urls(self):
try:
with psycopg.connect(self.db_connect_info) as conn:
list_url_feeds = conn.execute("SELECT rss_feed FROM FEED;").fetchall()
# Decode (tuple with 1 element)
list_url_feeds = [l[0] for l in list_url_feeds]
except Exception as e:
logger.warning("Exception fetching RSS sites: " + str(e))
list_url_feeds = []
return list_url_feeds
def _get_url_hosts(self):
try:
with psycopg.connect(self.db_connect_info) as conn:
list_url_hosts = conn.execute("SELECT url_host FROM WEBSITE_OF_INTEREST;").fetchall()
# Decode (tuple with 1 element)
list_url_hosts = [l[0] for l in list_url_hosts]
except Exception as e:
logger.warning("Exception fetching URL hosts: " + str(e))
list_url_hosts = []
return list_url_hosts
def _format(self, values):
# Replace single quote ' with ''. Based on https://stackoverflow.com/a/12320729
# String -> 'string', Int -> '1' (string-based), None -> NULL (no quotes for pgSQL to interpret Null value)
@@ -352,6 +374,7 @@ class URL_DB_Writer():
# Decode source id
id_source = c[0]
# Cache
print("*"*10, source, id_source)
self.redis_instance.set(source, id_source, ex=self.redis_expiry_seconds)
return id_source
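
After the key rename, redis_connect_info is expected to carry plain host/port entries plus an optional expiry_seconds. A construction sketch with placeholder values (the real ones live in the git-ignored credentials.py):

```python
from src.db_utils import DB_Handler

# Placeholder connection details, not the project's credentials
db_connect_info = "host=localhost port=5432 dbname=matitos user=postgres password=secret"
redis_connect_info = {"host": "localhost", "port": 6379, "expiry_seconds": 172800}

# Built once and shared by every fetcher, per this refactor
db_handler = DB_Handler(db_connect_info, redis_connect_info)
```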


@@ -1,34 +1,20 @@
from .db_utils import URL_DB_Writer
from .db_utils import DB_Handler
import feedparser
import dateutil.parser
import psycopg
import logging
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
class NewsFeed():
def __init__(self, db_connect_info, redis_connect_info) -> None:
def __init__(self, db_handler: DB_Handler) -> None:
logger.debug("Initializing News feed")
self.db_connect_info = db_connect_info
self.redis_connect_info = redis_connect_info
def _get_feed_urls(self):
try:
with psycopg.connect(self.db_connect_info) as conn:
list_url_feeds = conn.execute("SELECT rss_feed FROM FEED;").fetchall()
# Decode (tuple with 1 element)
list_url_feeds = [l[0] for l in list_url_feeds]
except Exception as e:
logger.warning("Exception fetching RSS sites: " + str(e))
list_url_feeds = []
return list_url_feeds
self.db_handler = db_handler
def run(self):
try:
logger.debug("Starting NewsFeed.run()")
# Get feeds
list_url_feeds = self._get_feed_urls()
list_url_feeds = self.db_handler._get_feed_urls()
logger.debug("Fetching news from feeds: {}".format(str(list_url_feeds)))
# Process via RSS feeds
@@ -44,17 +30,20 @@ class NewsFeed():
# Process?
if (url is not None):
# Available publish date?
publish_date_parsed = f.get("published_parsed")
if (publish_date_parsed is None):
publish_date = f.get("published", None)
if (publish_date is not None):
publish_date = dateutil.parser.parse(publish_date)
urls_publish_date.append(publish_date)
publish_date_parsed = dateutil.parser.parse(publish_date)
# Published date
urls_publish_date.append(publish_date_parsed)
# URL
urls_fetched.append(url)
# URL fetching source
source = "feed {}".format(url_feed)
# Write to DB
db_writer = URL_DB_Writer(self.db_connect_info, self.redis_connect_info)
db_writer.write_batch(urls_fetched, source)
self.db_handler.write_batch(urls_fetched, source)
except Exception as e:
logger.warning("Exception in NewsFeed.run(): {}".format(str(e)))
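
The new branch keeps feedparser's pre-parsed published_parsed (a time.struct_time) and only falls back to dateutil for the raw published string. Stand-alone, the fallback looks roughly like this; the fromtimestamp normalization is an illustration-only addition (the hunk appends the struct as-is):

```python
import time
from datetime import datetime

import dateutil.parser
import feedparser

feed = feedparser.parse("https://example.com/rss")  # placeholder feed URL
for entry in feed.entries:
    publish_date = entry.get("published_parsed")  # struct_time or None
    if publish_date is not None:
        publish_date = datetime.fromtimestamp(time.mktime(publish_date))
    elif entry.get("published") is not None:
        publish_date = dateutil.parser.parse(entry["published"])
    print(entry.get("link"), publish_date)
```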


@@ -1,27 +1,15 @@
from .db_utils import URL_DB_Writer
from .db_utils import DB_Handler
import newspaper
import psycopg
import logging
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
class NewsSiteParsing():
def __init__(self, db_connect_info, redis_connect_info) -> None:
logger.debug("Initializing News SiteParsing newspaper3k")
self.db_connect_info = db_connect_info
self.redis_connect_info = redis_connect_info
def _get_url_hosts(self):
try:
with psycopg.connect(self.db_connect_info) as conn:
list_url_hosts = conn.execute("SELECT url_host FROM WEBSITE_OF_INTEREST;").fetchall()
# Decode (tuple with 1 element)
list_url_hosts = [l[0] for l in list_url_hosts]
except Exception as e:
logger.warning("Exception fetching RSS sites: " + str(e))
list_url_hosts = []
return list_url_hosts
def __init__(self, db_handler: DB_Handler) -> None:
logger.debug("Initializing News SiteParsing newspaper4k")
self.db_handler = db_handler
# TODO: MOVE LOGIC ELSEWHERE!
def _postprocess(self, article_urls):
return [url.replace("#comment-stream", "") for url in article_urls]
@@ -29,11 +17,11 @@ class NewsSiteParsing():
try:
logger.debug("Starting NewsSiteParsing.run()")
# Get feeds
list_url_hosts = self._get_url_hosts()
# Get URL hosts
list_url_hosts = self.db_handler._get_url_hosts()
logger.info("Fetching news by parsing URL hosts: {}".format(str(list_url_hosts)))
# Process newspaper3k build method
# Process newspaper4k build method
for url_host_feed in list_url_hosts:
# Protocol
if not (url_host_feed.startswith("http")):
@@ -41,18 +29,18 @@ class NewsSiteParsing():
else:
url_host_feed_formatted = url_host_feed
logger.debug("Fetching newspaper3k parsing based on URL: {}".format(url_host_feed_formatted))
logger.debug("Fetching newspaper4k parsing based on URL: {}".format(url_host_feed_formatted))
# Source object
url_host_built = newspaper.build(url_host_feed_formatted)
# Get articles URL list
urls_fetched = url_host_built.article_urls()
# TODO: MOVE!
# Post-processing
urls_fetched = self._postprocess(urls_fetched)
# URL fetching source
source = "newspaper3k {}".format(url_host_feed)
source = "newspaper4k {}".format(url_host_feed)
# Write to DB
db_writer = URL_DB_Writer(self.db_connect_info, self.redis_connect_info)
db_writer.write_batch(urls_fetched, source)
self.db_handler.write_batch(urls_fetched, source)
except Exception as e:
logger.warning("Exception in NewsSiteParsing.run(): {}".format(str(e)))
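
newspaper4k keeps the newspaper3k surface used here: build() crawls the host's category and feed pages, and the resulting source object exposes the discovered links via article_urls(). A minimal sketch with a placeholder host, including the #comment-stream cleanup from _postprocess:

```python
import newspaper

url_host = "https://example-news-site.com"  # placeholder, not a real site of interest
source = newspaper.build(url_host)

# Strip comment-anchor fragments, as _postprocess does above
urls = [u.replace("#comment-stream", "") for u in source.article_urls()]
print(len(urls), "article URLs found")
```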

docker-compose.yml

@@ -18,6 +18,15 @@ services:
ports:
- 5432:5432
matitos_redis:
image: redis:alpine
container_name: db_redis
restart: unless-stopped
ports:
- 6379:6379
#expose:
# - 6379
# django:
# Env: DB_HOST=matitos_db
# DJANGO_DB_NAME=${DB_DATABASE_NAME:-matitos}
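
The new matitos_redis service publishes Redis on its default port, matching the host/port keys DB_Handler now reads. A quick connectivity check, assuming the stack is up (from the host use localhost; from another container on the compose network, the service name matitos_redis resolves instead):

```python
import redis

# Host-side check against the published port
r = redis.Redis(host="localhost", port=6379)
print(r.ping())  # True once the container accepts connections
```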