From acddb696fd36483eb5efdd650e47204d8e51e2c1 Mon Sep 17 00:00:00 2001 From: Hyunsoo Kim Date: Thu, 19 Dec 2024 21:21:03 -0500 Subject: [PATCH 1/3] support p5en instance types and update neuron CR to fully support trn2 --- .../templates/linux/neuron-monitor-daemonset.yaml | 8 +++++++- charts/amazon-cloudwatch-observability/values.yaml | 2 ++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml b/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml index 75e20561..1f9fdf7f 100644 --- a/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml +++ b/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml @@ -36,7 +36,7 @@ spec: - name: PATH value: /usr/local/bin:/usr/bin:/bin:/opt/aws/neuron/bin - name: GOMEMLIMIT - value: 160MiB + value: 320MiB ports: - name: "metrics" port: {{ .Values.neuronMonitor.service.port }} @@ -52,6 +52,9 @@ spec: - mountPath: /etc/amazon-cloudwatch-observability-neuron-cert/ name: neurontls readOnly: true + - mountPath: /opt-aws + name: "aws-config" + readOnly: true volumes: - name: neurontls secret: @@ -61,6 +64,9 @@ spec: path: server.crt - key: tls.key path: server.key + - name: "aws-config" + hostPath: + path: /opt/aws monitorConfig: | { "period": "5s", diff --git a/charts/amazon-cloudwatch-observability/values.yaml b/charts/amazon-cloudwatch-observability/values.yaml index b6ad767f..7b51125e 100644 --- a/charts/amazon-cloudwatch-observability/values.yaml +++ b/charts/amazon-cloudwatch-observability/values.yaml @@ -73,6 +73,7 @@ gpuInstances: - p4de.24xlarge - p5.48xlarge - p5e.48xlarge + - p5en.48xlarge - ml.g3.4xlarge - ml.g3.8xlarge - ml.g3.16xlarge @@ -132,6 +133,7 @@ gpuInstances: - ml.p4de.24xlarge - ml.p5.48xlarge - ml.p5e.48xlarge + - ml.p5en.48xlarge ## Tranium/Infrentia instance types neuronInstances: - trn1.2xlarge From 7ffe8f27e2a77255d97203efbc98160de7391e3d Mon Sep 17 00:00:00 2001 From: Hyunsoo Kim Date: Fri, 20 Dec 2024 11:18:05 -0500 Subject: [PATCH 2/3] remove trn2 support changes --- .../templates/linux/neuron-monitor-daemonset.yaml | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml b/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml index 1f9fdf7f..465b43c6 100644 --- a/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml +++ b/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml @@ -36,7 +36,7 @@ spec: - name: PATH value: /usr/local/bin:/usr/bin:/bin:/opt/aws/neuron/bin - name: GOMEMLIMIT - value: 320MiB + value: 160MiB ports: - name: "metrics" port: {{ .Values.neuronMonitor.service.port }} @@ -52,9 +52,6 @@ spec: - mountPath: /etc/amazon-cloudwatch-observability-neuron-cert/ name: neurontls readOnly: true - - mountPath: /opt-aws - name: "aws-config" - readOnly: true volumes: - name: neurontls secret: @@ -64,9 +61,6 @@ spec: path: server.crt - key: tls.key path: server.key - - name: "aws-config" - hostPath: - path: /opt/aws monitorConfig: | { "period": "5s", @@ -85,7 +79,7 @@ spec: }, { "type": "execution_stats" - } + }`` ] } ], From 5bde7e6f445951eb3db3fc12d8a3efaa0196a1b3 Mon Sep 17 00:00:00 2001 From: Hyunsoo Kim Date: Fri, 20 Dec 2024 11:20:04 -0500 Subject: [PATCH 3/3] typo --- .../templates/linux/neuron-monitor-daemonset.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml b/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml index 465b43c6..75e20561 100644 --- a/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml +++ b/charts/amazon-cloudwatch-observability/templates/linux/neuron-monitor-daemonset.yaml @@ -79,7 +79,7 @@ spec: }, { "type": "execution_stats" - }`` + } ] } ],