diff --git a/.env b/.env
index a8c2eae..217212b 100644
--- a/.env
+++ b/.env
@@ -35,6 +35,8 @@ FETCHER_URL_HOST_SLEEP=1.5
 FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=5
 FETCHER_BETWEEN_SEARCHES_SLEEP=1
 FETCHER_LANGUAGE_DETECTION_MIN_CHAR=100
+FETCHER_INSERT_URL_CACHE_TIME=86400
+FETCHER_ERROR_URL_CACHE_TIME=172800
 
 # Selenium
 SELENIUM_ENDPOINT=http://fetcher_app_selenium:80
@@ -43,4 +45,8 @@ ENDPOINT_OLLAMA=https://ollamamodel.matitos.org
 # APP: Selenium
 ARCH=amd64 # arm64, amd64
 SELENIUM_SLEEP_PER_PAGE=4
-PATH_LOGS_DIRECTORY=/opt/logs
\ No newline at end of file
+PATH_LOGS_DIRECTORY=/opt/logs
+
+# Deploy resources per App
+DEPLOY_CPUS=2
+DEPLOY_RAM=4G
diff --git a/.gitignore b/.gitignore
index 051548c..99fcf64 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,5 @@ __pycache__/
 *.pyc
 **/credentials.py
 logs/
-postgres/
\ No newline at end of file
+postgres/
+docker_data/
\ No newline at end of file
diff --git a/app_urls/fetcher/src/db_utils.py b/app_urls/fetcher/src/db_utils.py
index ae5d59f..f67cc84 100644
--- a/app_urls/fetcher/src/db_utils.py
+++ b/app_urls/fetcher/src/db_utils.py
@@ -6,16 +6,14 @@ from django.utils import timezone
 from datetime import timedelta
 from .url_processor import process_url, get_with_protocol
 import re
+import os
 import traceback
 from .logger import get_logger
 logger = get_logger()
 
 class DB_Handler():
     def __init__(self):
-        # Inserting raw URL, cache time: 1 day
-        self._cache_timeout_insert_url = 86400
-        # Processing error URL, cache time: 2 days
-        self._cache_timeout_error_url = 86400*2
+        pass
 
     def insert_raw_urls(self, urls, obj_source, obj_search):
         try:
@@ -75,8 +73,8 @@
 
             # Insert or update cache
             for url in urls_clean:
-                cache.set("insert_{}".format(url), True, timeout=self._cache_timeout_insert_url)
-                cache.set("insert_{}{}{}".format(url, obj_source.source, obj_search.search), True, timeout=self._cache_timeout_insert_url)
+                cache.set("insert_{}".format(url), True, timeout=int(os.getenv("FETCHER_INSERT_URL_CACHE_TIME", 86400)))
+                cache.set("insert_{}{}{}".format(url, obj_source.source, obj_search.search), True, timeout=int(os.getenv("FETCHER_INSERT_URL_CACHE_TIME", 86400)))
 
             logger.info("Inserted #{} raw URLs, Source-Search {} - {}".format(len(urls_to_insert), obj_source.source, obj_search.search))
 
@@ -84,7 +82,7 @@
             logger.warning("Exception inserting raw URLs: {}\n{}".format(e, traceback.format_exc()))
 
 
-    def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error):
+    def _process_single_url(self, obj_url, status_pattern_match, raise_exception_on_error, override_url=None):
 
         def set_status(obj_url, status):
             # Update status if setting a new value
@@ -102,8 +100,13 @@
                 return
 
         try:
+            # Optionally override the URL used for the request
+            if (override_url is not None):
+                url_of_interest = override_url
+            else:
+                url_of_interest = obj_url.url
             # Extract URL content
-            dict_url_data = process_url(obj_url.url)
+            dict_url_data = process_url(url_of_interest)
         except Exception as e:
             if (raise_exception_on_error):
                 # Simply raise exception, handled in a different way
@@ -232,11 +235,12 @@
             try:
                 # Process URL
-                self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True)
+                override_url = "https://marreta.pcdomanual.com/p/{}".format(obj_url.url)
+                self._process_single_url(obj_url, status_pattern_match=None, raise_exception_on_error=True, override_url=override_url)
                 num_urls_processed += 1
             except Exception as e:
                 # Error, cache to avoid re-processing for X time
-                cache.set("error_{}".format(obj_url.id), True, timeout=self._cache_timeout_insert_url)
+                cache.set("error_{}".format(obj_url.id), True, timeout=int(os.getenv("FETCHER_ERROR_URL_CACHE_TIME", 172800)))
                 num_urls_skipped += 1
 
         # Get following batch of URLs, status='error'
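Note on db_utils.py: the two timeouts that previously lived in DB_Handler.__init__ are now parsed from the environment inside each cache.set() call. Below is a minimal sketch of the same lookup hoisted into a module-level helper; the helper name _env_int is hypothetical and not part of this diff, and it additionally guards against malformed values, which a bare int(os.getenv(...)) would turn into a ValueError at call time:

    import os

    def _env_int(name, default):
        # Read an integer setting from the environment; fall back to the
        # default when the variable is unset or not a valid integer.
        try:
            return int(os.getenv(name, default))
        except (TypeError, ValueError):
            return default

    # Defaults mirror .env: 1 day for inserted URLs, 2 days for error URLs
    CACHE_TIMEOUT_INSERT_URL = _env_int("FETCHER_INSERT_URL_CACHE_TIME", 86400)
    CACHE_TIMEOUT_ERROR_URL = _env_int("FETCHER_ERROR_URL_CACHE_TIME", 172800)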
diff --git a/app_urls/fetcher/tasks.py b/app_urls/fetcher/tasks.py
index 21fed3d..0042b7a 100644
--- a/app_urls/fetcher/tasks.py
+++ b/app_urls/fetcher/tasks.py
@@ -100,6 +100,7 @@ def background_task(process_type: str):
             number_pages = int(process_type.split("_")[-1])
         except Exception as e:
             number_pages = -1
+
         FetchMissingKids().run(number_pages=number_pages)
 
     elif ("process_" in process_type):
@@ -108,6 +109,7 @@ def background_task(process_type: str):
             batch_size = int(process_type.split("_")[-1])
         except Exception as e:
             batch_size = None
+
         # Task type
         if ("process_raw_urls" in process_type):
             DB_Handler().process_raw_urls(batch_size=batch_size)
@@ -122,6 +124,7 @@ def background_task(process_type: str):
             older_than_days = float(process_type.split("_")[-1])
         except Exception as e:
             older_than_days = None
+
         DB_Handler().clean_old_url_content(older_than_days=older_than_days)
 
     else:
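The tasks.py hunks above only add separating blank lines, but the surrounding code shows the convention they set off: background_task() derives its numeric argument from the trailing token of the task name. A short illustration of that parsing, with hypothetical task names:

    # "process_raw_urls_50" yields batch_size 50; without a numeric suffix
    # the int() call fails and the code falls back to the default.
    for process_type in ("process_raw_urls_50", "process_raw_urls"):
        try:
            batch_size = int(process_type.split("_")[-1])
        except Exception:
            batch_size = None
        print(process_type, "->", batch_size)
    # process_raw_urls_50 -> 50
    # process_raw_urls -> None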
diff --git a/docker-compose-dev.yml b/docker-compose-dev.yml
index bbae0ba..56b0fae 100644
--- a/docker-compose-dev.yml
+++ b/docker-compose-dev.yml
@@ -22,8 +22,8 @@ services:
     deploy:
       resources:
         limits:
-          cpus: '4'
-          memory: 4G
+          cpus: '${DEPLOY_CPUS}'
+          memory: ${DEPLOY_RAM}
 
   fetcher_app_urls:
     image: fetcher_app_urls
@@ -56,9 +56,11 @@
       # Fetcher
       - FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP}
       - FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP}
-      - FETCHER_BETWEEN_SEARCHES_SLEEP=${FETCHER_BETWEEN_SEARCHES_SLEEP}
-      - FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP}
-      - FETCHER_LANGUAGE_DETECTION_MIN_CHAR=100
+      - FETCHER_BETWEEN_SEARCHES_SLEEP=${FETCHER_BETWEEN_SEARCHES_SLEEP} # Sleep time between each search
+      - FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP} # Sleep time between requests to the same URL host
+      - FETCHER_LANGUAGE_DETECTION_MIN_CHAR=${FETCHER_LANGUAGE_DETECTION_MIN_CHAR} # Min amount of characters to run language detection
+      - FETCHER_INSERT_URL_CACHE_TIME=${FETCHER_INSERT_URL_CACHE_TIME} # Cache time: Insert raw URL
+      - FETCHER_ERROR_URL_CACHE_TIME=${FETCHER_ERROR_URL_CACHE_TIME} # Cache time: Error on processing URL
       # Selenium
       - SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT}
       - ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA}
@@ -77,8 +79,8 @@
     deploy:
       resources:
         limits:
-          cpus: '4'
-          memory: 4G
+          cpus: '${DEPLOY_CPUS}'
+          memory: ${DEPLOY_RAM}
     #labels: # Reverse proxy sample
     #  - "traefik.enable=true"
     #  - "traefik.http.routers.fetcher.rule=Host(`urls.yourdomain.com`)"
diff --git a/docker-compose.yml b/docker-compose.yml
index ba7905e..54d04d5 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -22,8 +22,8 @@ services:
     deploy:
       resources:
         limits:
-          cpus: '4'
-          memory: 4G
+          cpus: '${DEPLOY_CPUS}'
+          memory: ${DEPLOY_RAM}
 
   fetcher_app_urls:
     image: fetcher_app_urls
@@ -56,9 +56,11 @@
       # Fetcher
       - FETCHER_GNEWS_DECODE_SLEEP=${FETCHER_GNEWS_DECODE_SLEEP}
       - FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP=${FETCHER_GOOGLE_GENERAL_PAGE_ITER_SLEEP}
-      - FETCHER_BETWEEN_SEARCHES_SLEEP=${FETCHER_BETWEEN_SEARCHES_SLEEP}
-      - FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP}
-      - FETCHER_LANGUAGE_DETECTION_MIN_CHAR=100
+      - FETCHER_BETWEEN_SEARCHES_SLEEP=${FETCHER_BETWEEN_SEARCHES_SLEEP} # Sleep time between each search
+      - FETCHER_URL_HOST_SLEEP=${FETCHER_URL_HOST_SLEEP} # Sleep time between requests to the same URL host
+      - FETCHER_LANGUAGE_DETECTION_MIN_CHAR=${FETCHER_LANGUAGE_DETECTION_MIN_CHAR} # Min amount of characters to run language detection
+      - FETCHER_INSERT_URL_CACHE_TIME=${FETCHER_INSERT_URL_CACHE_TIME} # Cache time: Insert raw URL
+      - FETCHER_ERROR_URL_CACHE_TIME=${FETCHER_ERROR_URL_CACHE_TIME} # Cache time: Error on processing URL
       # Selenium
       - SELENIUM_ENDPOINT=${SELENIUM_ENDPOINT}
       - ENDPOINT_OLLAMA=${ENDPOINT_OLLAMA}
@@ -77,8 +79,8 @@
     deploy:
       resources:
         limits:
-          cpus: '4'
-          memory: 4G
+          cpus: '${DEPLOY_CPUS}'
+          memory: ${DEPLOY_RAM}
     labels: # Reverse proxy sample
       - "traefik.enable=true"
       - "traefik.http.routers.fetcher.rule=Host(`${REVERSE_PROXY_URL}`)"
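One caveat on the parameterized limits shared by both compose files: if DEPLOY_CPUS or DEPLOY_RAM is missing from the environment and from .env, Compose substitutes an empty string, which the deploy.resources.limits section will typically reject. Compose's inline fallback syntax, e.g. cpus: '${DEPLOY_CPUS:-4}' and memory: ${DEPLOY_RAM:-4G}, would preserve the previous hard-coded values as defaults.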