Files
matitos_news/app_urls/fetcher/src/fetch_search.py
2025-06-27 09:14:44 +02:00

63 lines
2.5 KiB
Python

from .db_utils import DB_Handler
from ..models import Search
from django.db.models import Q
import traceback
import time
import os
import random
from .fetch_search_instances import ListSearchInstances
from .logger import get_logger
# Module-level logger obtained from the project's `.logger` helper.
logger = get_logger()
class FetchSearcher():
    """Run the stored URL-host and keyword searches through every search backend.

    Searches are read from the ``Search`` model; backends are the entries of
    ``ListSearchInstances``. For each (search, backend) pair the backend is
    instantiated with a small keyword-argument dict and its ``fetch_articles``
    method is called with a ``DB_Handler`` and the search object.
    """

    def __init__(self) -> None:
        """Log creation of the searcher; no instance state is stored."""
        logger.debug("Initializing Fetcher Searcher")

    def run(self):
        """Fetch articles for all URL-host and keyword searches.

        The searches are processed in random order. Before every backend call
        the method sleeps ``FETCHER_BETWEEN_SEARCHES_SLEEP`` seconds
        (environment variable, default 5) to avoid issuing too many requests.

        A failure inside one backend call is logged as a warning and the
        remaining backends/searches are still processed. Any other
        ``Exception`` (e.g. while querying the searches or parsing the sleep
        setting) is logged as a warning and ends the run. Nothing is
        propagated to the caller for ``Exception`` subclasses.

        Returns:
            None
        """
        try:
            logger.debug("Starting FetchSearcher.run()")

            # Loop-invariant: parse the pause between requests only once.
            sleep_between_searches = float(os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", 5))

            # Get search objects of interest
            list_search_obj = Search.objects.filter(
                Q(type=Search.TYPE_ENUM.URL_HOST) | Q(type=Search.TYPE_ENUM.KEYWORD_SEARCH)
            )
            # To list, shuffle
            list_search_obj = list(list_search_obj)
            random.shuffle(list_search_obj)

            logger.debug("Fetching from search: {}".format(["{} ({})".format(e.search, e.type) for e in list_search_obj]))

            # Search
            for obj_search in list_search_obj:
                # TODO: language & country customization
                # language, country = obj_search.language_country.split("-")

                # Human-readable form of the query ("site:" prefix for host
                # searches). It is only used for logging here; the backend
                # receives obj_search itself.
                keyword_search = "{}{}".format("site:" if obj_search.type == Search.TYPE_ENUM.URL_HOST else "", obj_search.search)

                # TODO (keyword searches only): add search with intitle keyword
                # TODO: allintitle: "child abuse"
                # TODO: intitle: "child abuse"

                logger.debug("Starting keyword search: {}".format(keyword_search))
                logger.debug("Search type: {}".format(obj_search.type))

                # DB writer
                db_writer = DB_Handler()

                # Keyword arguments
                args = {
                    "language": "en",
                    "country": "US",
                    # "period": ["7d", "1d"], # TODO: List of periods to iterate
                }

                for SearchInstance in ListSearchInstances:
                    # Sleep between requests, avoid too many requests...
                    time.sleep(sleep_between_searches)
                    # TODO: Random proxy / VPN
                    try:
                        SearchInstance(args).fetch_articles(db_writer, obj_search)
                    except Exception as e:
                        # Isolate the failure so one broken backend or search
                        # does not abort all the remaining ones.
                        logger.warning(
                            "Exception in FetchSearcher.run() while fetching '{}' with {}: {}\n{}".format(
                                keyword_search,
                                getattr(SearchInstance, "__name__", SearchInstance),
                                e,
                                traceback.format_exc(),
                            )
                        )

            # TODO: https://github.com/tasos-py/Search-Engines-Scraper/tree/master
        except Exception as e:
            logger.warning("Exception in FetchSearcher.run(): {}\n{}".format(e, traceback.format_exc()))