63 lines
2.5 KiB
Python
63 lines
2.5 KiB
Python
from .db_utils import DB_Handler
|
|
from ..models import Search
|
|
from django.db.models import Q
|
|
import traceback
|
|
import time
|
|
import os
|
|
import random
|
|
from .fetch_search_instances import ListSearchInstances
|
|
from .logger import get_logger
|
|
logger = get_logger()
|
|
|
|
class FetchSearcher:
    """Run every stored URL-host / keyword search through every search backend.

    ``run()`` loads the ``Search`` rows whose type is ``URL_HOST`` or
    ``KEYWORD_SEARCH``, shuffles them, and for each one calls
    ``fetch_articles`` on an instance of every entry in
    ``ListSearchInstances``, pausing between requests.
    """

    def __init__(self) -> None:
        logger.debug("Initializing Fetcher Searcher")

    def run(self):
        """Execute all searches once.

        Never raises: any exception is logged as a warning together with its
        traceback. A failure inside a single backend call only skips that
        call; failures during setup (DB query, invalid sleep setting,
        ``DB_Handler`` construction) end the run.
        """
        try:
            logger.debug("Starting FetchSearcher.run()")

            # Get search objects of interest
            list_search_obj = list(
                Search.objects.filter(
                    Q(type=Search.TYPE_ENUM.URL_HOST) | Q(type=Search.TYPE_ENUM.KEYWORD_SEARCH)
                )
            )
            # Shuffle so the searches are not always processed in the same order
            random.shuffle(list_search_obj)
            logger.debug("Fetching from search: {}".format(["{} ({})".format(e.search, e.type) for e in list_search_obj]))

            # Pause between requests, read once per run (seconds, default 5).
            # An invalid value raises here and is reported by the handler below.
            sleep_seconds = float(os.getenv("FETCHER_BETWEEN_SEARCHES_SLEEP", 5))

            for obj_search in list_search_obj:
                self._run_search(obj_search, sleep_seconds)

            # TODO: https://github.com/tasos-py/Search-Engines-Scraper/tree/master
        except Exception as e:
            logger.warning("Exception in FetchSearcher.run(): {}\n{}".format(e, traceback.format_exc()))

    def _run_search(self, obj_search, sleep_seconds):
        """Send one search object to every backend in ``ListSearchInstances``.

        Args:
            obj_search: The ``Search`` row to process; its ``type`` and
                ``search`` attributes are read here and the object itself is
                handed to each backend.
            sleep_seconds: Seconds to sleep before each backend request.
        """
        # TODO: language & country customization
        # language, country = obj_search.language_country.split("-")

        # Query string as it is logged below; the backends receive obj_search
        # itself, not this string.
        prefix = "site:" if obj_search.type == Search.TYPE_ENUM.URL_HOST else ""
        keyword_search = "{}{}".format(prefix, obj_search.search)

        # TODO (keyword searches only): add search with intitle keyword
        # TODO: allintitle: "child abuse"
        # TODO: intitle: "child abuse"

        logger.debug("Starting keyword search: {}".format(keyword_search))
        logger.debug("Search type: {}".format(obj_search.type))

        # DB writer, one per search object
        db_writer = DB_Handler()

        # Keyword arguments passed to every backend constructor
        args = {
            "language": "en",
            "country": "US",
            # "period": ["7d", "1d"], # TODO: List of periods to iterate
        }

        for search_cls in ListSearchInstances:
            # Sleep between requests, avoid too many requests...
            time.sleep(sleep_seconds)
            # TODO: Random proxy / VPN
            try:
                search_cls(args).fetch_articles(db_writer, obj_search)
            except Exception as e:
                # Isolate the failure: one broken backend must not cancel the
                # remaining backends and searches.
                logger.warning(
                    "Exception in FetchSearcher.run() for search '{}' with {}: {}\n{}".format(
                        keyword_search,
                        getattr(search_cls, "__name__", search_cls),
                        e,
                        traceback.format_exc(),
                    )
                )
|