from .db_utils import DB_Handler
from ..models import Search
from django.db.models import Q
import traceback
import time
import os
import random
from .fetch_search_instances import ListSearchInstances
from .logger import get_logger

logger = get_logger()


class FetchSearcher:
    def __init__(self) -> None:
        logger.debug("Initializing FetchSearcher")

    def run(self):
        try:
            logger.debug("Starting FetchSearcher.run()")

            # Get the search objects of interest
            list_search_obj = Search.objects.filter(
                Q(type=Search.TYPE_ENUM.URL_HOST) | Q(type=Search.TYPE_ENUM.KEYWORD_SEARCH)
            )

            # Convert to a list and shuffle so searches are not always run in the same order
            list_search_obj = list(list_search_obj)
            random.shuffle(list_search_obj)
            logger.debug("Fetching from search: {}".format(
                ["{} ({})".format(e.search, e.type) for e in list_search_obj]
            ))

            for obj_search in list_search_obj:
                # TODO: language & country customization

                # Build the search query: URL_HOST searches are restricted to the host via "site:"
                keyword_search = "{}{}".format(
                    "site:" if obj_search.type == Search.TYPE_ENUM.URL_HOST else "",
                    obj_search.search,
                )

                if obj_search.type == Search.TYPE_ENUM.KEYWORD_SEARCH:
                    # Add search with intitle keyword
                    # TODO: allintitle: "child abuse"
                    # TODO: intitle: "child abuse"
                    pass

                # language, country = obj_search.language_country.split("-")

                logger.debug("Starting keyword search: {}".format(keyword_search))
                logger.debug("Search type: {}".format(obj_search.type))

                # DB writer shared by all search instances for this search object
                db_writer = DB_Handler()

                # Keyword arguments passed to every search instance
                args = {
                    "language": "en",
                    "country": "US",
                    # "period": ["7d", "1d"],  # TODO: List of periods to iterate
                }

                for SearchInstance in ListSearchInstances:
                    # Sleep between requests to avoid hitting rate limits
                    time.sleep(float(os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", "5")))
                    # TODO: Random proxy / VPN
                    SearchInstance(args).fetch_articles(db_writer, obj_search)

            # TODO: https://github.com/tasos-py/Search-Engines-Scraper/tree/master

        except Exception as e:
            logger.warning("Exception in FetchSearcher.run(): {}\n{}".format(e, traceback.format_exc()))
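

# --- Usage sketch (illustrative, not part of the original module) ---------------
# FetchSearcher is expected to be driven by an external entry point such as a
# Django management command or a scheduled worker. The command below is a
# hypothetical example of such a caller; its module path and name are assumptions.
#
# from django.core.management.base import BaseCommand
# from myapp.fetcher.fetch_searcher import FetchSearcher  # hypothetical import path
#
# class Command(BaseCommand):
#     help = "Run keyword/host searches and store fetched articles"
#
#     def handle(self, *args, **options):
#         FetchSearcher().run()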