From be005e02f008fd525dac90f009bc80d6b4cbdb22 Mon Sep 17 00:00:00 2001 From: Artem Minyaylov Date: Wed, 6 Mar 2024 07:04:51 +0000 Subject: [PATCH] Upgrade ray version; shrink worker resource allocation --- .../rag-kaggle-ray-sql-latest.ipynb | 5 ++- .../kuberay-autopilot-values.yaml | 36 +++++++++---------- 2 files changed, 18 insertions(+), 23 deletions(-) diff --git a/applications/rag/example_notebooks/rag-kaggle-ray-sql-latest.ipynb b/applications/rag/example_notebooks/rag-kaggle-ray-sql-latest.ipynb index b0a769af1..3570e7db3 100644 --- a/applications/rag/example_notebooks/rag-kaggle-ray-sql-latest.ipynb +++ b/applications/rag/example_notebooks/rag-kaggle-ray-sql-latest.ipynb @@ -252,7 +252,7 @@ "id": "7ba6c3ff-a25a-4f4d-b58e-68f7fe7d33df", "metadata": {}, "outputs": [], - "source": [ + "source": [ "job_id = client.submit_job(\n", " entrypoint=\"python test.py\",\n", " # Path to the local directory that contains the entrypoint file.\n", @@ -278,10 +278,9 @@ " status = client.get_job_status(job_id)\n", " if status != prev_status:\n", " print(\"Job status:\", status)\n", + " print(\"Job info:\", client.get_job_info(job_id).message)\n", " prev_status = status\n", " if status.is_terminal():\n", - " if status == 'FAILED':\n", - " print(\"Job info:\", client.get_job_info(job_id))\n", " break\n", " time.sleep(5)\n" ] diff --git a/modules/kuberay-cluster/kuberay-autopilot-values.yaml b/modules/kuberay-cluster/kuberay-autopilot-values.yaml index 11a20a90e..410cb30b8 100644 --- a/modules/kuberay-cluster/kuberay-autopilot-values.yaml +++ b/modules/kuberay-cluster/kuberay-autopilot-values.yaml @@ -1,4 +1,4 @@ -# Copyright 2023 Google LLC +# Copyright 2024 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -22,7 +22,7 @@ image: # Replace this with your own image if needed. repository: rayproject/ray - tag: 2.6.1-py310-gpu + tag: 2.7.1-py310-gpu pullPolicy: IfNotPresent nameOverride: "kuberay" @@ -64,8 +64,6 @@ head: # containerEnv specifies environment variables for the Ray container, # Follows standard K8s container env schema. containerEnv: - # - name: EXAMPLE_ENV - # value: "1" - name: RAY_memory_monitor_refresh_ms value: "0" - name: RAY_GRAFANA_IFRAME_HOST @@ -90,18 +88,18 @@ head: # for further guidance. resources: limits: - cpu: "8" + cpu: "1" # To avoid out-of-memory issues, never allocate less than 2G memory for the Ray head. - memory: "20G" + memory: "8G" ephemeral-storage: 20Gi requests: - cpu: "8" - memory: "20G" + cpu: "1" + memory: "8G" ephemeral-storage: 20Gi annotations: gke-gcsfuse/volumes: "true" - gke-gcsfuse/cpu-limit: "2" - gke-gcsfuse/memory-limit: 20Gi + gke-gcsfuse/cpu-limit: "1" + gke-gcsfuse/memory-limit: 4Gi gke-gcsfuse/ephemeral-storage-limit: 20Gi nodeSelector: cloud.google.com/compute-class: "Performance" @@ -158,8 +156,6 @@ worker: disabled: true # The map's key is used as the groupName. -# For example, key:small-group in the map below -# will be used as the groupName additionalWorkerGroups: cpuGroup: # Disabled by default @@ -194,16 +190,16 @@ additionalWorkerGroups: resources: limits: cpu: 4 - memory: "20G" + memory: "16G" ephemeral-storage: 20Gi requests: cpu: 4 - memory: "20G" + memory: "16G" ephemeral-storage: 20Gi annotations: gke-gcsfuse/volumes: "true" gke-gcsfuse/cpu-limit: "2" - gke-gcsfuse/memory-limit: 20Gi + gke-gcsfuse/memory-limit: 8Gi gke-gcsfuse/ephemeral-storage-limit: 20Gi nodeSelector: cloud.google.com/compute-class: "Performance" @@ -287,19 +283,19 @@ additionalWorkerGroups: # for further guidance. resources: limits: - cpu: "8" + cpu: "4" nvidia.com/gpu: "2" - memory: "40G" + memory: "16G" ephemeral-storage: 20Gi requests: - cpu: "8" + cpu: "4" nvidia.com/gpu: "2" - memory: "40G" + memory: "16G" ephemeral-storage: 20Gi annotations: gke-gcsfuse/volumes: "true" gke-gcsfuse/cpu-limit: "2" - gke-gcsfuse/memory-limit: 20Gi + gke-gcsfuse/memory-limit: 8Gi gke-gcsfuse/ephemeral-storage-limit: 20Gi nodeSelector: cloud.google.com/compute-class: "Accelerator"