Refactoring fetcher
@@ -4,7 +4,7 @@ FROM continuumio/miniconda3:25.1.1-2
COPY . /opt/app/

RUN conda install -c conda-forge curl
RUN pip install --no-cache-dir --upgrade "psycopg[binary]" git+https://github.com/ranahaani/GNews.git GoogleNews duckduckgo_search newspaper3k numpy beautifulsoup4 requests feedparser pytz redis fastapi uvicorn fastapi-utils lxml[html_clean]
RUN pip install --no-cache-dir --upgrade "psycopg[binary]" git+https://github.com/ranahaani/GNews.git GoogleNews duckduckgo_search newspaper4k numpy beautifulsoup4 requests feedparser pytz redis fastapi uvicorn fastapi-utils lxml[html_clean]
RUN pip freeze
# GoogleNews-1.6.10 Pillow-10.1.0 PyYAML-6.0.1 aiofiles-23.2.1 anyio-3.7.1 beautifulsoup4-4.9.3 bs4-0.0.1 click-8.1.7 cssselect-1.2.0 dateparser-1.2.0 dnspython-1.16.0 duckduckgo_search-3.9.8 fastapi-0.104.1 fastapi-utils-0.2.1 feedfinder2-0.0.4 feedparser-6.0.10 filelock-3.13.1 gnews-0.3.6 greenlet-3.0.1 h11-0.14.0 h2-4.1.0 hpack-4.0.0 httpcore-1.0.2 httpx-0.25.2 hyperframe-6.0.1 jieba3k-0.35.1 joblib-1.3.2 lxml-4.9.3 newspaper3k-0.2.8 nltk-3.8.1 numpy-1.26.2 psycopg-3.1.13 psycopg-binary-3.1.13 pydantic-1.10.13 pymongo-3.12.3 python-dateutil-2.8.2 python-dotenv-0.19.2 pytz-2023.3.post1 redis-5.0.1 regex-2023.10.3 requests-2.26.0 requests-file-1.5.1 sgmllib3k-1.0.0 six-1.16.0 sniffio-1.3.0 socksio-1.0.0 soupsieve-2.5 sqlalchemy-1.4.50 starlette-0.27.0 tinysegmenter-0.3 tldextract-5.1.1 typing-extensions-4.8.0 tzlocal-5.2 uvicorn-0.24.0.post1
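The only functional change in this hunk is the swap from newspaper3k to newspaper4k. A minimal parsing sketch, assuming newspaper4k keeps the drop-in newspaper3k module name and Article API (the URL is hypothetical):

import newspaper

# Download and parse a single article; same call pattern under both packages.
article = newspaper.Article("https://example.org/some-story")
article.download()
article.parse()
print(article.title, article.publish_date)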
@@ -1,43 +1,21 @@
import src.credentials as cred
import logging
from logging.handlers import RotatingFileHandler
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
logger.setLevel(logging.INFO)
from src.fetch_feed import FetchFeed
from src.fetch_parser import FetchParser
from src.fetch_search import FetchSearch

import os
os.makedirs("logs", exist_ok=True)

# To file log
fh = RotatingFileHandler(filename="logs/log_app_fetcher.log", mode="a", maxBytes=10000000, backupCount=4)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
logger.addHandler(fh)

# To file log: WARNING / ERROR
fh_ = RotatingFileHandler(filename="logs/log_app_fetcher_error.log", mode="a", maxBytes=10000000, backupCount=1)
fh_.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh_.setLevel(logging.WARNING)
logger.addHandler(fh_)

logger.info("Environment: {}".format(cred.ENVIRONMENT))

##################################################################################################
from src.news_feed import NewsFeed
from src.news_parsing import NewsSiteParsing
from src.news_search import NewsSearch
from src.news_missing_kids import NewsMissingKids
from src.missing_kids_fetch import MissingKidsFetch
from src.missing_kids_status import MissingKidsStatus
from src.url_status import UpdateErrorURLs
from src.fetcher_status import FetcherStatus

from src.url_status import UpdateErrorURLs
from src.db_utils import DB_Handler

import src.credentials as cred
from src.logger import get_logger

from fastapi import FastAPI, BackgroundTasks
# import requests
# from fastapi_utils.tasks import repeat_every
# import time
# time.sleep(10)
# import gc
##################################################################################################

logger = get_logger()
logger.info("Environment: {}".format(cred.ENVIRONMENT))

db_handler = DB_Handler(cred.db_connect_info, cred.redis_connect_info)

@@ -47,49 +25,55 @@ app = FastAPI()
def hello_world():
    return {"message": "Ok"}

@app.get("/{fetch_type}")
async def fetch(background_tasks: BackgroundTasks, fetch_type: str):
@app.get("/{process_type}")
async def process(background_tasks: BackgroundTasks, process_type: str):
    # Concurrent job running
    logger.info("Triggered fetch: {}".format(fetch_type))
    logger.info("Triggered: {}".format(process_type))

    if (fetch_type == "feeds"):
        task_run = NewsFeed(db_handler).run
    elif (fetch_type == "parser"):
        task_run = NewsSiteParsing(db_handler).run
    elif (fetch_type == "fetch_missing_kids_reduced"):
        task_run = NewsMissingKids(cred.db_connect_info, cred.redis_connect_info, num_pages=4).run
    elif (fetch_type == "fetch_missing_kids_full"):
        task_run = NewsMissingKids(cred.db_connect_info, cred.redis_connect_info, num_pages=100000).run
    elif (fetch_type == "search") or (fetch_type == "search_full"):
        task_run = NewsSearch(cred.db_connect_info, cred.redis_connect_info, full=True).run
    elif (fetch_type == "search_reduced"):
        task_run = NewsSearch(cred.db_connect_info, cred.redis_connect_info, full=False).run
    elif (fetch_type == "update_missing_kids_status_reduced"):
    if (process_type == "fetch_feeds"):
        task_run = FetchFeed(db_handler).run
    elif (process_type == "fetch_parser"):
        task_run = FetchParser(db_handler).run
    elif (process_type == "search") or (process_type == "search_full"):
        task_run = FetchSearch(cred.db_connect_info, cred.redis_connect_info, full=True).run
    elif (process_type == "search_reduced"):
        task_run = FetchSearch(cred.db_connect_info, cred.redis_connect_info, full=False).run

    # Selenium based
    elif (process_type == "fetch_missing_kids_reduced"):
        task_run = MissingKidsFetch(db_handler, num_pages=4).run
    elif (process_type == "fetch_missing_kids_full"):
        task_run = MissingKidsFetch(db_handler, num_pages=100000).run

    elif (process_type == "update_missing_kids_status_reduced"):
        task_run = MissingKidsStatus(cred.db_connect_info, cred.redis_connect_info, num_urls=50).update_missing_kids_status
    elif (fetch_type == "update_missing_kids_status_full"):
    elif (process_type == "update_missing_kids_status_full"):
        task_run = MissingKidsStatus(cred.db_connect_info, cred.redis_connect_info, num_urls=None).update_missing_kids_status
    elif (fetch_type == "update_error_urls"):

    elif (process_type == "update_error_urls"):
        task_run = UpdateErrorURLs(cred.db_connect_info, cred.redis_connect_info, num_urls=100).update_error_urls_status
    elif (fetch_type == "fetch_warning_check"):
        task_run = FetcherStatus(cred.db_connect_info, cred.redis_connect_info, last_minutes_check=180).check_warning
    else:
        return {"message": "ERROR. Unknown fetcher type!"}

    # Run task
    background_tasks.add_task(task_run)
    # Return message
    return {"message": "Started fetching {}: Ok".format(fetch_type)}
    return {"message": "Started {}: Ok".format(process_type)}

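Each branch above only selects a callable; the route then schedules it with BackgroundTasks and returns immediately. A minimal caller sketch (host and port are assumptions, not part of this diff):

import requests

BASE_URL = "http://localhost:8000"  # hypothetical host/port for the fetcher service

# Kick off the feed fetcher; the response arrives before the background task finishes.
r = requests.get("{}/fetch_feeds".format(BASE_URL), timeout=10)
print(r.json())  # {"message": "Started fetch_feeds: Ok"}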
##################################################################################################
"""
# TODO: Instead of background tasks!

###########################
'''
@app.on_event("startup")
def verify_db() -> None:
    logger.info("Testing DB connection")
    import psycopg
    with psycopg.connect(cred.db_connect_info) as conn:
        url_test_msg = "Num URLs: {}".format(conn.execute("SELECT COUNT(*) FROM URLS;").fetchall())
        logger.info(url_test_msg)
'''
###########################
import rq
import redis

# Redis connection
redis_conn = redis.Redis(host='localhost', port=6379, db=0)
queue = rq.Queue(connection=redis_conn)

# ...
# Queue the processing task
dict_args= {"db_handler": db_handler, }
queue.enqueue(task_run, **dict_args)

# https://python-rq.org/
"""
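If the rq approach sketched in the TODO above were adopted, queued jobs would be executed by a separate worker process; a minimal sketch, assuming the same local Redis instance as in the TODO:

from redis import Redis
from rq import Worker

# Consume jobs from the default queue; run this in its own process.
worker = Worker(["default"], connection=Redis(host="localhost", port=6379, db=0))
worker.work()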
@@ -6,9 +6,8 @@ import requests
import json
import os
from .url_utils import process_article
import logging
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
from .logger import get_logger
logger = get_logger()

# TODO: URL_DB_HANDLER, _get_search_list, _get_url_host, _get_url_host_list, ...
# The rest, elsewhere
@@ -40,6 +39,30 @@ class DB_Handler():
            logger.warning("Error updating URLs status: {}".format(str(e)))
            num_urls = None
        return num_urls

    def _get_url_host_list(self):
        try:
            with psycopg.connect(self.db_connect_info) as conn:
                # List of URL host
                list_url_host = [l[0] for l in conn.execute("SELECT url_host FROM WEBSITE_OF_INTEREST;").fetchall()]
                # Clean http / https from URLs
                list_url_host = [l.replace("https://", "").replace("http://", "") for l in list_url_host]
                # Clean last slash if exists
                list_url_host = [ l if not l.endswith("/") else l[:-1] for l in list_url_host]
        except Exception as e:
            logger.warning("Exception fetching URL host list: " + str(e))
            list_url_host = []
        return list_url_host

    def _get_search_list(self):
        try:
            with psycopg.connect(self.db_connect_info) as conn:
                # List of keyword searches
                list_search_text = [l[0] for l in conn.execute("SELECT keyword_search FROM SEARCH;").fetchall()]
        except Exception as e:
            logger.warning("Exception fetching searches list: " + str(e))
            list_search_text = []
        return list_search_text

    def _get_feed_urls(self):
        try:

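As a quick illustration of the host clean-up in _get_url_host_list above (sample rows are hypothetical):

hosts = ["https://example.org/", "http://news.example.com"]
hosts = [h.replace("https://", "").replace("http://", "") for h in hosts]  # strip scheme
hosts = [h[:-1] if h.endswith("/") else h for h in hosts]                  # strip trailing slash
print(hosts)  # ['example.org', 'news.example.com']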
@@ -1,11 +1,10 @@
from .db_utils import DB_Handler
import feedparser
import dateutil
import logging
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
from .logger import get_logger
logger = get_logger()

class NewsFeed():
class FetchFeed():
    def __init__(self, db_handler: DB_Handler) -> None:
        logger.debug("Initializing News feed")
        self.db_handler = db_handler
@@ -1,10 +1,9 @@
from .db_utils import DB_Handler
import newspaper
import logging
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
from .logger import get_logger
logger = get_logger()

class NewsSiteParsing():
class FetchParser():
    def __init__(self, db_handler: DB_Handler) -> None:
        logger.debug("Initializing News SiteParsing newspaper4k")
        self.db_handler = db_handler
app_fetcher/src/fetch_search.py (new file, 73 lines)
@@ -0,0 +1,73 @@
from .db_utils import DB_Handler
from .utils import get_searxng_instances
from .fetch_search_sources import FetcherDuckDuckGo, FetcherGNews, FetcherGoogleNews, FetcherSearxNews, FetcherPreSearch
from .logger import get_logger
logger = get_logger()

class FetchSearch():
    def __init__(self, db_handler: DB_Handler, full=True) -> None:
        logger.debug("Initializing News feed")
        self.db_handler = db_handler
        self.full_search = full

    def _run_fetching(self, search_text):
        logger.debug("Starting _run_fetching() for {}".format(search_text))

        # Common parameters
        lang, region = "en", "US"

        ### PreSearch
        dict_params_news = {"search": search_text}
        FetcherPreSearch(**dict_params_news).fetch_articles(self.db_handler)

        ### DuckDuckGo
        period = "d"
        dict_params_news = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "news", "period": period}
        FetcherDuckDuckGo(**dict_params_news).fetch_articles(self.db_handler)
        dict_params_general = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "general", "period": period}
        FetcherDuckDuckGo(**dict_params_general).fetch_articles(self.db_handler)

        if (self.full_search):
            # Avoid site:{} search due to G-Bypass required time
            if ("site:" not in search_text):
                ### GNews
                dict_params = {"search": search_text, "lang": "wt", "region": "wt", "period": period}
                FetcherGNews(**dict_params).fetch_articles(self.db_handler)

                ### GoogleNews
                dict_params_news = {"search": search_text, "lang": lang, "region": region, "search_category": "news", "period": period}
                FetcherGoogleNews(**dict_params_news).fetch_articles(self.db_handler)
                # dict_params_general = {"search": search_text, "lang": lang, "region": region, "search_category": "general", "period": period}

        if False:
            ### SearxNG
            period = "day"
            for searx_instance in get_searxng_instances():
                dict_params_news = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "news", "period": period}
                dict_params_general = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "general", "period": period}
                # Append thread
                FetcherSearxNews(**dict_params_news).fetch_articles(self.db_handler)
                FetcherSearxNews(**dict_params_general).fetch_articles(self.db_handler)

        logger.debug("Finished _run_fetching()")

    def run(self):
        try:
            logger.info("Fetching text searches & URL hosts of interest")

            # Get text searches of interest
            list_search_text_of_interest = self.db_handler._get_search_list()

            # Get URL host of interest
            list_url_host = self.db_handler._get_url_host_list()
            # Get text searches for URL hosts
            list_search_text_url_host = ["site:{}".format(l) for l in list_url_host]

            for search_text in list_search_text_of_interest + list_search_text_url_host:
                logger.debug("Fetching news for search: {}".format(search_text))
                self._run_fetching(search_text)

            logger.info("Finished fetching text searches & URL hosts of interest")
        except Exception as e:
            logger.warning("Exception in NewsSearch.run(): {}".format(str(e)))

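A minimal usage sketch of the new class, mirroring the constructor signature above (connection values are placeholders, not defined here):

from src.db_utils import DB_Handler
from src.fetch_search import FetchSearch

db_handler = DB_Handler(db_connect_info, redis_connect_info)  # placeholders; main.py takes these from src.credentials
FetchSearch(db_handler, full=False).run()  # reduced pass: PreSearch + DuckDuckGo only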
@@ -9,12 +9,10 @@ import time
import json
import numpy as np
import random
from .user_agents import user_agents_list
from .google_bypass import GoogleByPass
from abc import ABC, abstractmethod
import logging
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
from .logger import get_logger
logger = get_logger()

@@ -32,6 +30,71 @@ class FetcherAbstract(ABC):
        # Write to DB
        db_writer.write_batch(list_news, self.name)

# https://techblog.willshouse.com/2012/01/03/most-common-user-agents/

user_agents_list = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/111.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; rv:111.0) Gecko/20100101 Firefox/111.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0",
    "Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.44",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 OPR/96.0.0.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 OPR/97.0.0.0",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48",
    "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.34",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.39",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.2 Safari/605.1.15",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; rv:112.0) Gecko/20100101 Firefox/112.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.51",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/112.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/110.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0",
    "Mozilla/5.0 (X11; CrOS x86_64 14541.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.0.2246 Yowser/2.5 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.6.1 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 6.1; rv:102.0) Gecko/20100101 Goanna/6.0 Firefox/102.0 PaleMoon/32.0.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.41",
    "Mozilla/5.0 (Windows NT 10.0; rv:110.0) Gecko/20100101 Firefox/110.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 YaBrowser/23.1.5.708 Yowser/2.5 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
]

@@ -1,39 +0,0 @@
from .db_utils import URL_DB_Writer
import json
import logging
import requests
import os
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")

class FetcherStatus():
    def __init__(self, db_connect_info, redis_connect_info, last_minutes_check) -> None:
        self.db_connect_info = db_connect_info
        self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info)
        self.last_minutes_check = last_minutes_check

    def check_warning(self):
        try:
            logger.info("Starting fetcher check for last minutes {}".format(self.last_minutes_check))

            # Get number of URLs
            num_urls = self.db_writer.get_urls_count(last_minutes_check=self.last_minutes_check)
            logger.debug("Fetched #URLs {} during the last {} minutes".format(num_urls, self.last_minutes_check))

            webhook_token = os.environ.get("CLIQ_WEBHOOK_TOKEN")
            endpoint_message = "https://cliq.zoho.com/api/v2/channelsbyname/urlfetchwarnings/message?zapikey={}".format(webhook_token)

            if (num_urls is None):
                try:
                    payload = json.dumps({"text": "[WARNING] Error on query to DB"})
                    r = requests.post(endpoint_message, data=payload)
                except Exception as e:
                    logger.warning("Webhook failed: {}".format(str(e)))
            elif (num_urls == 0):
                try:
                    payload = json.dumps({"text": "[WARNING] No URLs fetched for {} minutes".format(self.last_minutes_check) })
                    r = requests.post(endpoint_message, data=payload)
                except Exception as e:
                    logger.warning("Webhook failed: {}".format(str(e)))
        except Exception as e:
            logger.warning("Exception in UpdateErrorURLs.run(): {}".format(str(e)))
@@ -1,8 +1,7 @@
import requests
import json
import logging
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
from .logger import get_logger
logger = get_logger()

class GoogleByPass():
    def __init__(self) -> None:

app_fetcher/src/logger.py (new file, 22 lines)
@@ -0,0 +1,22 @@
import logging
import logging.handlers

import os
os.makedirs("logs", exist_ok=True)

logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
logger.setLevel(logging.INFO)

# To file log: INFO / WARNING / ERROR
fh = logging.handlers.RotatingFileHandler(filename="logs/log_app_fetcher.log", mode="a", maxBytes=10000000, backupCount=4)
fh.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
logger.addHandler(fh)

# To file log: WARNING / ERROR
fh_ = logging.handlers.RotatingFileHandler(filename="logs/log_app_fetcher_error.log", mode="a", maxBytes=10000000, backupCount=1)
fh_.setFormatter(logging.Formatter('%(levelname)s | %(asctime)s | %(message)s'))
fh_.setLevel(logging.WARNING)
logger.addHandler(fh_)

def get_logger():
    return logger
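For reference, the modules touched in this commit consume the shared logger like so (paths and file names as in the diff):

from .logger import get_logger

logger = get_logger()
logger.info("INFO lands in the console and logs/log_app_fetcher.log")
logger.warning("WARNING also lands in logs/log_app_fetcher_error.log")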
@@ -1,30 +1,27 @@
from .db_utils import URL_DB_Writer
from .db_utils import DB_Handler
import requests
import json
import logging
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
from .logger import get_logger
logger = get_logger()

class NewsMissingKids():
    def __init__(self, db_connect_info, redis_connect_info, num_pages) -> None:
class MissingKidsFetch():
    def __init__(self, db_handler: DB_Handler, num_pages) -> None:
        logger.debug("Initializing News MissingKids")
        self.db_connect_info = db_connect_info
        self.redis_connect_info = redis_connect_info
        self.db_handler = db_handler
        self.num_pages = num_pages
        self.missingkids_fetch_endpoint = "http://selenium_app:80/get_missing_kids/?pages={}"

    def run(self):
        try:
            logger.debug("Starting NewsMissingKids.run()")
            try:
                # Missing kids fetching endpoint, parameter number of pages to fetch
                missingkids_fetch_endpoint = "http://selenium_app:80/get_missing_kids/?pages={}".format(self.num_pages)
                # Timeout
                if (self.num_pages > 15):
                    timeout = 60*90 # 1.5h
                else:
                    timeout = 60*5 # 5 min
                # Request
                r = requests.get(missingkids_fetch_endpoint, timeout=timeout)
                r = requests.get(self.missingkids_fetch_endpoint.format(self.num_pages), timeout=timeout)
                # Decode
                urls_fetched = json.loads(r.text).get("list_urls", [])
            except Exception as e:
@@ -34,7 +31,6 @@ class NewsMissingKids():
            # URL fetching source
            source = "missingkids fetcher"
            # Write to DB
            db_writer = URL_DB_Writer(self.db_connect_info, self.redis_connect_info)
            db_writer.write_batch(urls_fetched, source)
            self.db_handler.write_batch(urls_fetched, source)
        except Exception as e:
            logger.warning("Exception in NewsMissingKids.run(): {}".format(str(e)))
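The selenium_app endpoint is expected to answer with a JSON body holding the scraped URLs; the shape below is inferred from the .get("list_urls", []) decode above, and the poster URL is hypothetical:

# Inferred response shape from the decode in run(); value is illustrative only.
example_response = {"list_urls": ["https://www.missingkids.org/poster/NCMC/1234567/1"]}
urls_fetched = example_response.get("list_urls", [])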
@@ -1,10 +1,39 @@
import requests
from .db_utils import URL_DB_Writer
from .url_utils import get_missing_kid_status
import time
import logging
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
from .logger import get_logger
logger = get_logger()


def get_missing_kid_status(url, return_canonical_url=False):
    import time
    import requests

    # Sleep
    time.sleep(0.75)
    try:
        # Request
        r = requests.get(url, timeout=300)
        # Decode
        status_code = r.status_code
        # Canonical URL removing parameters
        url_canonical = r.url
    except Exception as e:
        logger.warning("Exception on get URL status request: {}. {}".format(url, str(e)))
        status_code = None
        url_canonical = url

    if (status_code == 200):
        status = "valid"
    elif (status_code == 404):
        status = "invalid"
    else:
        status = "unknown"

    logger.debug("Missing Kid URL {} status: {}".format(url, status))
    if (return_canonical_url):
        return status, url_canonical
    else:
        return status

class MissingKidsStatus():
    def __init__(self, db_connect_info, redis_connect_info, num_urls) -> None:

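A quick usage sketch of the relocated helper (the poster URL is hypothetical):

status, canonical = get_missing_kid_status("https://www.missingkids.org/poster/NCMC/1234567/1", return_canonical_url=True)
# status: "valid" on HTTP 200, "invalid" on 404, "unknown" otherwise (including request errors)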
@@ -1,181 +0,0 @@
from .db_utils import URL_DB_Writer
import psycopg
from .utils import get_searxng_instances
from .search_sources import FetcherDuckDuckGo, FetcherGNews, FetcherGoogleNews, FetcherSearxNews, FetcherPreSearch
from threading import Thread
import time
import random
import logging
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")

class NewsSearch():
    def __init__(self, db_connect_info, redis_connect_info, full=True) -> None:
        logger.debug("Initializing News feed")
        self.db_connect_info = db_connect_info
        self.redis_connect_info = redis_connect_info
        self.db_writer = URL_DB_Writer(db_connect_info, redis_connect_info)
        self.full_search = full

    def _get_url_host_list(self):
        try:
            with psycopg.connect(self.db_connect_info) as conn:
                # List of URL host
                list_url_host = [l[0] for l in conn.execute("SELECT url_host FROM WEBSITE_OF_INTEREST;").fetchall()]
                # Clean http / https from URLs
                list_url_host = [l.replace("https://", "").replace("http://", "") for l in list_url_host]
                # Clean last slash if exists
                list_url_host = [ l if not l.endswith("/") else l[:-1] for l in list_url_host]
        except Exception as e:
            logger.warning("Exception fetching URL host list: " + str(e))
            list_url_host = []
        return list_url_host

    def _get_search_list(self):
        try:
            with psycopg.connect(self.db_connect_info) as conn:
                # List of keyword searches
                list_search_text = [l[0] for l in conn.execute("SELECT keyword_search FROM SEARCH;").fetchall()]
        except Exception as e:
            logger.warning("Exception fetching searches list: " + str(e))
            list_search_text = []
        return list_search_text

    def _run_fetching(self, search_text):
        logger.debug("Starting _run_fetching() for {}".format(search_text))

        # Initialize DB Writer
        db_writer = URL_DB_Writer(self.db_connect_info, self.redis_connect_info)

        # Common parameters
        lang, region = "en", "US"

        ### PreSearch
        dict_params_news = {"search": search_text}
        FetcherPreSearch(**dict_params_news).fetch_articles(db_writer)

        ### DuckDuckGo
        period = "d"
        dict_params_news = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "news", "period": period}
        FetcherDuckDuckGo(**dict_params_news).fetch_articles(db_writer)
        dict_params_general = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "general", "period": period}
        FetcherDuckDuckGo(**dict_params_general).fetch_articles(db_writer)

        if (self.full_search):
            # Avoid site:{} search due to G-Bypass required time
            if ("site:" not in search_text):
                ### GNews
                dict_params = {"search": search_text, "lang": "wt", "region": "wt", "period": period}
                FetcherGNews(**dict_params).fetch_articles(db_writer)

                ### GoogleNews
                dict_params_news = {"search": search_text, "lang": lang, "region": region, "search_category": "news", "period": period}
                FetcherGoogleNews(**dict_params_news).fetch_articles(db_writer)
                # dict_params_general = {"search": search_text, "lang": lang, "region": region, "search_category": "general", "period": period}

        '''
        # Method run concurrently, minimize overlapping
        time.sleep(random.uniform(1, 15))
        list_threads = []

        def run_search(FetcherObject, dict_params):
            # Initialize DB Writer
            db_writer = URL_DB_Writer(self.db_connect_info, self.redis_connect_info)
            # Fetch and write to DB
            FetcherObject(**dict_params).fetch_articles(db_writer)

        """
        ### SearxNG
        period = "day"
        for searx_instance in get_searxng_instances():
            dict_params_news = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "news", "period": period}
            dict_params_general = {"search": search_text, "searx_instance": searx_instance, "lang": lang, "region": region, "search_category": "general", "period": period}
            # Append thread
            list_threads.append( Thread(target=run_search, args=(FetcherSearxNews, dict_params_news, )) )
            list_threads.append( Thread(target=run_search, args=(FetcherSearxNews, dict_params_general, )) )
        """

        ### PreSearch
        dict_params_news = {"search": search_text}
        list_threads.append( Thread(target=run_search, args=(FetcherPreSearch, dict_params_news, )) )

        ### DuckDuckGo
        period = "d"
        dict_params_news = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "news", "period": period}
        dict_params_general = {"search": search_text, "lang": "wt", "region": "wt", "search_category": "general", "period": period}
        # Append thread
        list_threads.append( Thread(target=run_search, args=(FetcherDuckDuckGo, dict_params_news, )) )
        list_threads.append( Thread(target=run_search, args=(FetcherDuckDuckGo, dict_params_general, )) )

        if (self.full_search):
            # Avoid site:{} search due to G-Bypass required time
            if ("site:" not in search_text):
                ### GNews
                for period in ["1d"]: # ["1d", "6h"]:
                    dict_params = {"search": search_text, "lang": "wt", "region": "wt", "period": period}
                    # Append thread
                    list_threads.append( Thread(target=run_search, args=(FetcherGNews, dict_params, )) )

                ### GoogleNews
                for period in ["1d"]: # ["1d", "6h"]:
                    # News
                    dict_params_news = {"search": search_text, "lang": lang, "region": region, "search_category": "news", "period": period}
                    list_threads.append( Thread(target=run_search, args=(FetcherGoogleNews, dict_params_news, )) )
                    if False:
                        dict_params_general = {"search": search_text, "lang": lang, "region": region, "search_category": "general", "period": period}
                        list_threads.append( Thread(target=run_search, args=(FetcherGoogleNews, dict_params_general, )) )

        # Run
        MULTITHREADED = False
        logger.debug("Fetching threads starting")
        if MULTITHREADED:
            for t in list_threads:
                t.start()
            # Join
            for t in list_threads:
                t.join()
        else:
            for t in list_threads:
                t.start()
                t.join()
        logger.debug("Fetching threads finished")
        '''
        logger.debug("Finished _run_fetching()")

    def run(self):
        try:
            logger.info("Fetching text searches & URL hosts of interest")

            # Get text searches of interest
            list_search_text_of_interest = self._get_search_list()

            # Get URL host of interest
            list_url_host = self._get_url_host_list()
            # Get text searches for URL hosts
            list_search_text_url_host = ["site:{}".format(l) for l in list_url_host]

            MULTITHREADED = False
            if MULTITHREADED:
                # Run fetching
                list_fetching_threads = []
                for search_text in list_search_text_of_interest + list_search_text_url_host:
                    logger.debug("Fetching news for search: {}".format(search_text))
                    # Append thread
                    list_fetching_threads.append( Thread(target=self._run_fetching, args=(search_text, )) )

                # Run
                for t in list_fetching_threads:
                    t.start()
                # Join
                for t in list_fetching_threads:
                    t.join()
            else:
                for search_text in list_search_text_of_interest + list_search_text_url_host:
                    logger.debug("Fetching news for search: {}".format(search_text))
                    self._run_fetching(search_text)

            logger.info("Finished fetching text searches & URL hosts of interest")
        except Exception as e:
            logger.warning("Exception in NewsSearch.run(): {}".format(str(e)))

@@ -1,8 +1,7 @@
from .db_utils import URL_DB_Writer
from .url_utils import process_article
import logging
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
from .logger import get_logger
logger = get_logger()

class UpdateErrorURLs():
    def __init__(self, db_connect_info, redis_connect_info, num_urls) -> None:

@@ -10,9 +10,8 @@ import json
import re
from bs4 import BeautifulSoup

import logging
logging.basicConfig(format='%(filename)s | %(levelname)s | %(asctime)s | %(message)s')
logger = logging.getLogger("news_fetcher")
from .logger import get_logger
logger = get_logger()

def get_published_date(article):
    try:

@@ -82,33 +81,7 @@ def get_status_pattern_matching(url, article_status, list_pattern_status_tuple):
    # Pattern matching not required or not found, original article status
    return article_status

def get_missing_kid_status(url, return_canonical_url=False):
    # Sleep
    time.sleep(0.75)
    try:
        # Request
        r = requests.get(url, timeout=300)
        # Decode
        status_code = r.status_code
        # Canonical URL removing parameters
        url_canonical = r.url
    except Exception as e:
        logger.warning("Exception on get URL status request: {}. {}".format(url, str(e)))
        status_code = None
        url_canonical = url

    if (status_code == 200):
        status = "valid"
    elif (status_code == 404):
        status = "invalid"
    else:
        status = "unknown"

    logger.debug("Missing Kid URL {} status: {}".format(url, status))
    if (return_canonical_url):
        return status, url_canonical
    else:
        return status

def bypass_google_link(article_url):

@@ -1,64 +0,0 @@
# https://techblog.willshouse.com/2012/01/03/most-common-user-agents/

user_agents_list = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/111.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; rv:111.0) Gecko/20100101 Firefox/111.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0",
    "Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.44",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 OPR/96.0.0.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 OPR/97.0.0.0",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48",
    "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.34",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.39",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.2 Safari/605.1.15",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; rv:112.0) Gecko/20100101 Firefox/112.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.51",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/112.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/110.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0",
    "Mozilla/5.0 (X11; CrOS x86_64 14541.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.0.2246 Yowser/2.5 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.6.1 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 6.1; rv:102.0) Gecko/20100101 Goanna/6.0 Firefox/102.0 PaleMoon/32.0.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.41",
    "Mozilla/5.0 (Windows NT 10.0; rv:110.0) Gecko/20100101 Firefox/110.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 YaBrowser/23.1.5.708 Yowser/2.5 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"
]