Better language detection, fetch parser error handling, Google News RSS search

This commit is contained in:
Luciano Gervasoni
2025-03-31 17:44:53 +02:00
parent b3f896b35a
commit 077219fcb6
6 changed files with 201 additions and 140 deletions

View File

@@ -28,11 +28,18 @@ class FetchParser():
# Make sure no requests made for the last X seconds
url_host_slowdown(url_host_protocol, url_host_slowdown_seconds=5)
# Source object
url_host_built = newspaper.build(url_host_protocol)
# Get articles URL list
urls_fetched = url_host_built.article_urls()
try:
# Source object
url_host_built = newspaper.build(url_host_protocol)
# Get articles URL list
urls_fetched = url_host_built.article_urls()
except newspaper.exceptions.ArticleException as e:
logger.debug("ArticleException while parsing input URL {}\n{}".format(url_host_protocol, str(e.args)))
urls_fetched = []
except Exception as e:
logger.warning("Exception while parsing input URL {}\n{}".format(url_host_protocol, str(e)))
urls_fetched = []
# Write to DB
DB_Handler().insert_raw_urls(urls_fetched, obj_source, obj_search)
except Exception as e:
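A minimal standalone sketch of the new guard around newspaper.build(), mirroring the try/except added above; the fetch_source_urls wrapper name is illustrative, and newspaper.exceptions.ArticleException is assumed to match the newspaper version used in this repo:

import logging
import newspaper

logger = logging.getLogger(__name__)

def fetch_source_urls(url_host_protocol):
    # Build the Source object and collect its article URLs,
    # falling back to an empty list instead of aborting the whole run.
    try:
        url_host_built = newspaper.build(url_host_protocol)
        urls_fetched = url_host_built.article_urls()
    except newspaper.exceptions.ArticleException as e:
        # Known parse failure for this host: log at debug level only
        logger.debug("ArticleException while parsing input URL {}\n{}".format(url_host_protocol, str(e.args)))
        urls_fetched = []
    except Exception as e:
        # Anything unexpected: warn, but keep the fetch pipeline alive
        logger.warning("Exception while parsing input URL {}\n{}".format(url_host_protocol, str(e)))
        urls_fetched = []
    return urls_fetched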

View File

@@ -3,10 +3,36 @@ from ..models import Search, Source
from django.db.models import Q
import traceback
import time
from .fetch_search_utils import search_gnews, search_ddg, search_googlenews_general, search_googlenews_news
from .fetch_search_utils import search_gnews, search_ddg, search_googlenews_general, search_googlenews_news, search_googlenews_rss
from .logger import get_logger
logger = get_logger()
'''
from abc import ABC, abstractmethod
# Generic fetcher (fetches articles, writes to DB)
class FetcherAbstract(ABC):
@abstractmethod
def _fetch_raw_urls_list(self):
pass
def fetch_articles(self, db_writer):
logger.debug("Starting fetch() for {}".format(self.name))
# Fetch articles
list_news = self._fetch()
logger.info("Found #{} articles for search: {}".format(len(list_news), self.name))
# Write to DB
db_writer.write_batch(list_news, self.name)
self._fetch_raw_urls_list()
raw_urls, source = search_googlenews_rss(keyword_search, language="en", country="US")
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
'''
class FetchSearcher():
def __init__(self) -> None:
logger.debug("Initializing Fetcher Searcher")
@@ -18,6 +44,16 @@ class FetchSearcher():
obj_source, created = Source.objects.get_or_create(source=source)
return obj_source
def _post_process_urls(self, raw_urls, obj_search):
# URL-host based search? Make sure results belong to that site
if (obj_search.type == Search.TYPE_ENUM.URL_HOST):
# Get clean URL host
url_host_clean = obj_search.search.replace("www.", "").replace("http://", "").replace("https://", "")
# Ensure URL host in URL
raw_urls = [u for u in raw_urls if url_host_clean in u]
return raw_urls
def run(self):
try:
logger.debug("Starting FetchSearcher.run()")
@@ -33,49 +69,55 @@ class FetchSearcher():
# TODO: intitle: "child abuse"
# Search
keyword_search = "{}{}".format("site:" if obj_search.type is Search.TYPE_ENUM.URL_HOST else "", obj_search.search)
keyword_search = "{}{}".format("site:" if obj_search.type == Search.TYPE_ENUM.URL_HOST else "", obj_search.search)
logger.debug("Starting keyword search: {}".format(keyword_search))
logger.debug("Search type: {}".format(obj_search.type))
# news.google.com/rss
time.sleep(5)
raw_urls, source = search_googlenews_rss(keyword_search, language="en", country="US")
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
# DDG News
time.sleep(5)
raw_urls, source = search_ddg(keyword_search, category="news", timelimit="d", max_results=None, region = "wt-wt")
raw_urls, source = search_ddg(keyword_search, category="news", timelimit="d", max_results=None, region = "en-US")
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
# GNews
time.sleep(5)
raw_urls, source = search_gnews(keyword_search, language="en", country="US")
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
# DDG Text
# DDG Text (day, 20 results)
time.sleep(5)
raw_urls, source = search_ddg(keyword_search, category="text", timelimit="d", max_results=None, region = "wt-wt")
raw_urls, source = search_ddg(keyword_search, category="text", timelimit="d", max_results=20, region = "en-US")
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
# GoogleNews news
time.sleep(5)
raw_urls, source = search_googlenews_news(keyword_search, period="1d", language="en", country="US")
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
# GoogleNews general
time.sleep(5)
raw_urls, source = search_googlenews_general(keyword_search, period="1d", language="en", country="US", max_pages=5)
raw_urls, source = search_googlenews_general(keyword_search, period="1d", language="en", country="US", max_pages=2)
raw_urls = self._post_process_urls(raw_urls, obj_search)
# Write to DB
DB_Handler().insert_raw_urls(raw_urls, self._get_source_object(source), obj_search)
# TODO:
# SearxNG
"""
period = "day"
for searx_instance in get_searxng_instances():
dict_params_news = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "news", "period": period}
dict_params_general = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "general", "period": period}
# Append thread
FetcherSearxNews(**dict_params_news).fetch_articles(self.db_handler)
FetcherSearxNews(**dict_params_general).fetch_articles(self.db_handler)
"""
# TODO: https://github.com/tasos-py/Search-Engines-Scraper/tree/master
except Exception as e:
logger.warning("Exception in FetchSearcher.run(): {}\n{}".format(e, traceback.format_exc()))

View File

@@ -2,6 +2,9 @@ from django.core.cache import cache
import traceback
import random
import time
import feedparser
import urllib
import dateutil
from .logger import get_logger
logger = get_logger()
@@ -33,7 +36,7 @@ def decode_gnews_urls(encoded_urls, interval=2):
# Cache decoded URL
cache.set("gnews_decode_{}".format(url), decoded_url, timeout=60*60*12)
else:
logger.warning("Error decoding news.google.com, URL {}\nMessage: {}".format(url, decoded_url["message"]))
logger.warning("Error decoding news.google.com, URL {}\nMessage: {}".format(url, str(decoded_url)))
except Exception as e:
logger.warning("Error decoding news.google.com, URL: {}\n{}".format(url, traceback.format_exc()))
return list_decoded_urls
@@ -124,7 +127,7 @@ def search_googlenews_general(keyword_search, period="1d", language="en", countr
# Iterate pages
for i in range(max_pages):
time.sleep(random.uniform(1, 2.5))
time.sleep(random.uniform(2, 4.5))
num_before = len(set_links)
# Get page
@@ -148,4 +151,47 @@ def search_googlenews_general(keyword_search, period="1d", language="en", countr
return urls, source
###########################################################################
###########################################################################
def search_googlenews_rss(keyword_search, language="en", country="US"):
# [source] [category] [period] [language-country] [max_results]
source = "googlenews-rss {}-{}".format(language, country).replace("None", "").strip()
logger.debug("Searching: {} --- Source:{}".format(keyword_search, source))
# https://news.google.com/rss/search?q={}&hl=en-US&gl=US&ceid=US:en
try:
# Search URL with parameters filled
search_url = "https://news.google.com/rss/search?q={}&hl={}&gl={}&ceid={}:{}".format(keyword_search, "{}-{}".format(language, country.upper()), country.upper(), country.upper(), language)
# Escape spaces in the query (quoting the whole URL would also encode the scheme and delimiters)
search_url = search_url.replace(" ", "+") # urllib.parse.quote(search_url) # Issue: https%3A//news.google.com/rss/search%3Fq%3Dbreitbart.com%26hl%3Den-US%26gl%3DUS%26ceid%3DUS%3Aen
# Initialize
encoded_urls = []
# Fetch feeds
feeds = feedparser.parse(search_url)
# Parse
for f in feeds.get("entries", []):
# Encoded URL
encoded_url = f.get("link", None)
'''
# Available publish date?
publish_date_parsed = f.get("published_parsed")
if (publish_date_parsed is None):
publish_date = f.get("published", None)
if (publish_date is not None):
publish_date_parsed = dateutil.parser.parse(publish_date)
# Published date
urls_publish_date.append(publish_date_parsed)
'''
# Append
encoded_urls.append(encoded_url)
# Decode
urls = decode_gnews_urls(encoded_urls)
except Exception as e:
logger.warning("Exception fetching {}: {}\n{}".format(source, str(e), traceback.format_exc()))
urls = []
return urls, source
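For reference, a short sketch of the query URL that search_googlenews_rss() builds, following the news.google.com/rss convention shown in the comment above (the keyword is illustrative):

keyword_search = "site:example.com"   # illustrative query
language, country = "en", "US"
search_url = "https://news.google.com/rss/search?q={}&hl={}&gl={}&ceid={}:{}".format(
    keyword_search, "{}-{}".format(language, country.upper()),
    country.upper(), country.upper(), language)
search_url = search_url.replace(" ", "+")
# -> https://news.google.com/rss/search?q=site:example.com&hl=en-US&gl=US&ceid=US:en
# feedparser.parse(search_url) then yields entries whose "link" fields are
# news.google.com redirect URLs, decoded afterwards by decode_gnews_urls().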

View File

@@ -4,9 +4,8 @@ logger = get_logger()
import newspaper
import time
from urllib.parse import unquote
# pip install langdetect
#import langdetect
#langdetect.DetectorFactory.seed = 0
import langdetect
langdetect.DetectorFactory.seed = 0
def get_with_protocol(url):
# http:// -> https://
@@ -76,7 +75,8 @@ def process_url(url):
"url_host": article.source_url,
"site_name": article.meta_site_name,
"publish_date": article.publish_date,
"language": article.meta_lang, # langdetect.detect(article.text)
# article.meta_lang -> Not always reliable
"language": langdetect.detect("\n".join([article.title, article.meta_description, article.text]) ),
"title": article.title,
"description": article.meta_description,
"content": article.text,