From 99578cf4e80e00adc3998845ff7d3ef59ce4fa53 Mon Sep 17 00:00:00 2001 From: Luciano Resende Date: Tue, 12 Dec 2023 18:28:53 -0800 Subject: [PATCH] Expose kernel_info_timeout configuration --- docs/source/operators/config-add-env.md | 4 ++++ docs/source/users/kernel-envs.md | 4 ++++ enterprise_gateway/services/kernels/remotemanager.py | 6 ++++++ enterprise_gateway/services/processproxies/processproxy.py | 2 ++ etc/docker/docker-compose.yml | 1 + etc/docker/enterprise-gateway/start-enterprise-gateway.sh | 4 +++- .../helm/enterprise-gateway/templates/deployment.yaml | 2 ++ 7 files changed, 22 insertions(+), 1 deletion(-) diff --git a/docs/source/operators/config-add-env.md b/docs/source/operators/config-add-env.md index 519a3dcd7..14ac149da 100644 --- a/docs/source/operators/config-add-env.md +++ b/docs/source/operators/config-add-env.md @@ -32,6 +32,10 @@ Besides those environment variables associated with configurable options, the fo startup attempt will take place. If a second timeout occurs, Enterprise Gateway will report a failure to the client. + EG_KERNEL_INFO_TIMEOUT=60 + The time (in seconds) Enterprise Gateway will wait for a kernel info response + before deeming the request a failure. + EG_SENSITIVE_ENV_KEYS="" A comma separated list (e.g. "secret,pwd,auth") of sensitive environment variables. Any environment variables that contain any of the words from this diff --git a/docs/source/users/kernel-envs.md b/docs/source/users/kernel-envs.md index 304a32bbf..deba2a245 100644 --- a/docs/source/users/kernel-envs.md +++ b/docs/source/users/kernel-envs.md @@ -54,6 +54,10 @@ There are several supported `KERNEL_` variables that the Enterprise Gateway serv be submitted in the kernel startup if that particular kernel's startup time is expected to exceed that of the EG_KERNEL_LAUNCH_TIMEOUT set when Enterprise Gateway starts.
+ + KERNEL_INFO_TIMEOUT=<from user> or EG_KERNEL_INFO_TIMEOUT=60 + The time (in seconds) Enterprise Gateway will wait for a kernel info response + before deeming the request a failure. KERNEL_NAMESPACE=<from user> or KERNEL_POD_NAME or EG_NAMESPACE Kubernetes only. This indicates the name of the namespace to use or create on diff --git a/enterprise_gateway/services/kernels/remotemanager.py b/enterprise_gateway/services/kernels/remotemanager.py index b277b2d21..fce2b68f0 100644 --- a/enterprise_gateway/services/kernels/remotemanager.py +++ b/enterprise_gateway/services/kernels/remotemanager.py @@ -26,6 +26,7 @@ from ..sessions.kernelsessionmanager import KernelSessionManager default_kernel_launch_timeout = float(os.getenv("EG_KERNEL_LAUNCH_TIMEOUT", "30")) +default_kernel_info_timeout = float(os.getenv("EG_KERNEL_INFO_TIMEOUT", "60")) kernel_restart_status_poll_interval = float(os.getenv("EG_RESTART_STATUS_POLL_INTERVAL", 1.0)) @@ -437,6 +438,7 @@ def __init__(self, **kwargs: dict[str, Any] | None): self.kernel_id = None self.user_overrides = {} self.kernel_launch_timeout = default_kernel_launch_timeout + self.kernel_info_timeout = default_kernel_info_timeout self.restarting = False # need to track whether we're in a restart situation or not self._activity_stream = None @@ -513,6 +515,10 @@ def _capture_user_overrides(self, **kwargs: dict[str, Any] | None) -> None: self.kernel_launch_timeout = float( env.get("KERNEL_LAUNCH_TIMEOUT", default_kernel_launch_timeout) ) + # if KERNEL_INFO_TIMEOUT is passed in the payload, override it.
+ self.kernel_info_timeout = float( + env.get("KERNEL_INFO_TIMEOUT", default_kernel_info_timeout) + ) self.user_overrides.update( { key: value diff --git a/enterprise_gateway/services/processproxies/processproxy.py b/enterprise_gateway/services/processproxies/processproxy.py index 177c49492..3aa964a72 100644 --- a/enterprise_gateway/services/processproxies/processproxy.py +++ b/enterprise_gateway/services/processproxies/processproxy.py @@ -61,6 +61,7 @@ redaction_mask = os.getenv("EG_REDACTION_MASK", "********") default_kernel_launch_timeout = float(os.getenv("EG_KERNEL_LAUNCH_TIMEOUT", "30")) +default_kernel_info_timeout = float(os.getenv("EG_KERNEL_INFO_TIMEOUT", "60")) max_poll_attempts = int(os.getenv("EG_MAX_POLL_ATTEMPTS", "10")) poll_interval = float(os.getenv("EG_POLL_INTERVAL", "0.5")) socket_timeout = float(os.getenv("EG_SOCKET_TIMEOUT", "0.005")) @@ -430,6 +431,7 @@ def __init__(self, kernel_manager: RemoteKernelManager, proxy_config: dict): # self.kernel_id = self.kernel_manager.kernel_id self.kernel_launch_timeout = default_kernel_launch_timeout + self.kernel_info_timeout = default_kernel_info_timeout self.lower_port = 0 self.upper_port = 0 self._validate_port_range() diff --git a/etc/docker/docker-compose.yml b/etc/docker/docker-compose.yml index 5a17c5ae3..178caf68d 100644 --- a/etc/docker/docker-compose.yml +++ b/etc/docker/docker-compose.yml @@ -21,6 +21,7 @@ services: environment: - "EG_DOCKER_NETWORK=${EG_DOCKER_NETWORK:-enterprise-gateway_enterprise-gateway}" - "EG_KERNEL_LAUNCH_TIMEOUT=${EG_KERNEL_LAUNCH_TIMEOUT:-60}" + - "EG_KERNEL_INFO_TIMEOUT=${EG_KERNEL_INFO_TIMEOUT:-60}" - "EG_CULL_IDLE_TIMEOUT=${EG_CULL_IDLE_TIMEOUT:-3600}" # Use double-defaulting for B/C. 
Support for EG_KERNEL_WHITELIST will be removed in a future release - "EG_ALLOWED_KERNELS=${EG_ALLOWED_KERNELS:-${EG_KERNEL_WHITELIST:-'r_docker','python_docker','python_tf_docker','python_tf_gpu_docker','scala_docker'}}" diff --git a/etc/docker/enterprise-gateway/start-enterprise-gateway.sh b/etc/docker/enterprise-gateway/start-enterprise-gateway.sh index 1ad25bc1f..81c77afc8 100755 --- a/etc/docker/enterprise-gateway/start-enterprise-gateway.sh +++ b/etc/docker/enterprise-gateway/start-enterprise-gateway.sh @@ -29,6 +29,7 @@ export EG_CULL_CONNECTED=${EG_CULL_CONNECTED:-False} EG_ALLOWED_KERNELS=${EG_ALLOWED_KERNELS:-${EG_KERNEL_WHITELIST:-"null"}} export EG_ALLOWED_KERNELS=`echo ${EG_ALLOWED_KERNELS} | sed 's/[][]//g'` # sed is used to strip off surrounding brackets as they should no longer be included. export EG_DEFAULT_KERNEL_NAME=${EG_DEFAULT_KERNEL_NAME:-python_docker} +export EG_KERNEL_INFO_TIMEOUT=${EG_KERNEL_INFO_TIMEOUT:-60} # Determine whether the kernels-allowed list should be added to the start command. 
# This is conveyed via a 'null' value for the env - which indicates no kernel names @@ -46,4 +47,5 @@ exec jupyter enterprisegateway \ --RemoteMappingKernelManager.cull_idle_timeout=${EG_CULL_IDLE_TIMEOUT} \ --RemoteMappingKernelManager.cull_interval=${EG_CULL_INTERVAL} \ --RemoteMappingKernelManager.cull_connected=${EG_CULL_CONNECTED} \ - --RemoteMappingKernelManager.default_kernel_name=${EG_DEFAULT_KERNEL_NAME} + --RemoteMappingKernelManager.default_kernel_name=${EG_DEFAULT_KERNEL_NAME} \ + --RemoteMappingKernelManager.kernel_info_timeout=${EG_KERNEL_INFO_TIMEOUT} diff --git a/etc/kubernetes/helm/enterprise-gateway/templates/deployment.yaml b/etc/kubernetes/helm/enterprise-gateway/templates/deployment.yaml index fd6d09b27..771318a6a 100644 --- a/etc/kubernetes/helm/enterprise-gateway/templates/deployment.yaml +++ b/etc/kubernetes/helm/enterprise-gateway/templates/deployment.yaml @@ -71,6 +71,8 @@ spec: value: {{ .Values.logLevel }} - name: EG_KERNEL_LAUNCH_TIMEOUT value: !!str {{ .Values.kernel.launchTimeout }} + - name: EG_KERNEL_INFO_TIMEOUT + value: !!str {{ .Values.kernel.infoTimeout }} - name: EG_ALLOWED_KERNELS value: {{ toJson .Values.kernel.allowedKernels | squote }} - name: EG_DEFAULT_KERNEL_NAME