From b90bf68a41d8e97696a822960f0a1f392775c5b1 Mon Sep 17 00:00:00 2001
From: ElijahQuinones <118852979+ElijahQuinones@users.noreply.github.com>
Date: Tue, 13 Aug 2024 14:00:11 -0400
Subject: [PATCH] UPSTREAM: 2108: Fix gpus not being considered when counting
 allocatables (#2108)

* Fix gpus not being considered when counting allocatables

* Parallelize the volume limits table generating scripts, refactor the volume limits unit tests, and add a Go doc comment to GetReservedSlotsForInstanceType
---
 hack/generate-gpu-count-table.sh      | 37 ++++++++++++++
 hack/generate-instance-store-table.sh |  5 +-
 pkg/cloud/volume_limits.go            | 70 +++++++++++++++++++++++++--
 pkg/driver/node.go                    |  2 +-
 pkg/driver/node_test.go               | 50 +++++++++++++++++++
 5 files changed, 157 insertions(+), 7 deletions(-)
 create mode 100755 hack/generate-gpu-count-table.sh

diff --git a/hack/generate-gpu-count-table.sh b/hack/generate-gpu-count-table.sh
new file mode 100755
index 0000000000..00ae0e4c03
--- /dev/null
+++ b/hack/generate-gpu-count-table.sh
@@ -0,0 +1,37 @@
+# Copyright 2024 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Generates the GPU count table (`gpuInstanceGpus`) for `pkg/cloud/volume_limits.go` from the AWS API
+# Ensure you are opted into all opt-in regions before running
+# Ensure your account isn't in any private instance type betas before running
+
+set -euo pipefail
+
+BIN="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/../bin"
+
+function get_gpus_for_region() {
+  local region="${1}" # function-local so the caller's REGION loop variable is never clobbered
+  echo "Getting gpu counts for ${region}..." >&2 # progress goes to stderr; stdout carries only the jq table rows
+  "${BIN}/aws" ec2 describe-instance-types --region "${region}" --query "InstanceTypes[?GpuInfo!=null].[InstanceType, GpuInfo]" |
+    jq -r 'map("\"" + .[0] + "\": " + (.[1].Gpus | map(.Count) | add | tostring) + ",") | .[]'
+}
+
+function get_all_gpus() {
+  "${BIN}/aws" account list-regions --max-results 50 | jq -r '.Regions | map(.RegionName) | .[]' | while read -r REGION; do
+    sleep 1 # presumably staggers the per-region API calls to avoid throttling -- TODO confirm
+    get_gpus_for_region "${REGION}" & # background job: regions are queried in parallel
+  done
+}
+
+get_all_gpus | sort | uniq
diff --git a/hack/generate-instance-store-table.sh b/hack/generate-instance-store-table.sh
index b4b4417e33..f94e3cba37 100755
--- a/hack/generate-instance-store-table.sh
+++ b/hack/generate-instance-store-table.sh
@@ -24,14 +24,15 @@ BIN="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/../bin"
 
 function get_instance_stores_for_region() {
   REGION="${1}"
-  echo "Getting limits for ${REGION}..." >&2
+  echo "Getting instance store limits for ${REGION}..." >&2 # progress goes to stderr; stdout carries only the jq table rows
   "${BIN}/aws" ec2 describe-instance-types --region "${REGION}" --filters "Name=instance-storage-supported,Values=true" --query "InstanceTypes[].[InstanceType, InstanceStorageInfo]" |
     jq -r 'map("\"" + .[0] + "\": " + (.[1].Disks | map(.Count) | add | tostring) + ",") | .[]'
 }
 
 function get_all_instance_stores() {
   "${BIN}/aws" account list-regions --max-results 50 | jq -r '.Regions | map(.RegionName) | .[]' | while read REGION; do
-    get_instance_stores_for_region $REGION
+    sleep 1 # presumably staggers the per-region API calls to avoid throttling -- TODO confirm
+    get_instance_stores_for_region "${REGION}" & # background job: regions are queried in parallel
   done
 }
 
diff --git a/pkg/cloud/volume_limits.go b/pkg/cloud/volume_limits.go
index 5a90b21314..b167369c9a 100644
--- a/pkg/cloud/volume_limits.go
+++ b/pkg/cloud/volume_limits.go
@@ -138,11 +138,18 @@ func GetDedicatedLimitForInstanceType(it string) int {
 	}
 }
 
-func GetNVMeInstanceStoreVolumesForInstanceType(it string) int {
-	if v, ok := nvmeInstanceStoreVolumes[it]; ok {
-		return v
+// GetReservedSlotsForInstanceType returns how many attachment slots on instance type it are already used up by NVMe instance store volumes and GPUs (0 if it is in neither table) on shared EBS volume limit instances.
+func GetReservedSlotsForInstanceType(it string) int {
+	total := 0
+	instanceStores, ok := nvmeInstanceStoreVolumes[it]
+	if ok {
+		total += instanceStores
 	}
-	return 0
+	gpus, ok := gpuInstanceGpus[it]
+	if ok {
+		total += gpus
+	}
+	return total
 }
 
 // / https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-store-volumes.html
@@ -489,3 +496,58 @@ var nvmeInstanceStoreVolumes = map[string]int{
 	"z1d.metal":       2,
 	"z1d.xlarge":      1,
 }
+
+// gpuInstanceGpus maps each GPU instance type to its total GPU count; table rows come from hack/generate-gpu-count-table.sh. See https://aws.amazon.com/ec2/instance-types
+var gpuInstanceGpus = map[string]int{
+	"dl1.24xlarge":  8,
+	"g3.16xlarge":   4,
+	"g3.4xlarge":    1,
+	"g3.8xlarge":    2,
+	"g3s.xlarge":    1,
+	"g4ad.16xlarge": 4,
+	"g4ad.2xlarge":  1,
+	"g4ad.4xlarge":  1,
+	"g4ad.8xlarge":  2,
+	"g4ad.xlarge":   1,
+	"g4dn.12xlarge": 4,
+	"g4dn.16xlarge": 1,
+	"g4dn.2xlarge":  1,
+	"g4dn.4xlarge":  1,
+	"g4dn.8xlarge":  1,
+	"g4dn.metal":    8,
+	"g4dn.xlarge":   1,
+	"g5.12xlarge":   4,
+	"g5.16xlarge":   1,
+	"g5.24xlarge":   4,
+	"g5.2xlarge":    1,
+	"g5.48xlarge":   8,
+	"g5.4xlarge":    1,
+	"g5.8xlarge":    1,
+	"g5g.16xlarge":  2,
+	"g5g.2xlarge":   1,
+	"g5g.4xlarge":   1,
+	"g5g.8xlarge":   1,
+	"g5g.metal":     2,
+	"g5g.xlarge":    1,
+	"g5.xlarge":     1,
+	"g6.12xlarge":   4,
+	"g6.16xlarge":   1,
+	"g6.24xlarge":   4,
+	"g6.2xlarge":    1,
+	"g6.48xlarge":   8,
+	"g6.4xlarge":    1,
+	"g6.8xlarge":    1,
+	"g6.xlarge":     1,
+	"gr6.4xlarge":   1,
+	"gr6.8xlarge":   1,
+	"p2.16xlarge":   16,
+	"p2.8xlarge":    8,
+	"p2.xlarge":     1,
+	"p3.16xlarge":   8,
+	"p3.2xlarge":    1,
+	"p3.8xlarge":    4,
+	"p3dn.24xlarge": 8,
+	"p4d.24xlarge":  8,
+	"p4de.24xlarge": 8,
+	"p5.48xlarge":   8,
+}
diff --git a/pkg/driver/node.go b/pkg/driver/node.go
index a088ce3b41..fcd50c0d11 100644
--- a/pkg/driver/node.go
+++ b/pkg/driver/node.go
@@ -790,7 +790,7 @@ func (d *NodeService) getVolumesLimit() int64 {
 		availableAttachments = dedicatedLimit
 	} else if isNitro {
 		enis := d.metadata.GetNumAttachedENIs()
-		nvmeInstanceStoreVolumes := cloud.GetNVMeInstanceStoreVolumesForInstanceType(instanceType)
+		nvmeInstanceStoreVolumes := cloud.GetReservedSlotsForInstanceType(instanceType)
 		availableAttachments = availableAttachments - enis - nvmeInstanceStoreVolumes
 	}
 	availableAttachments = availableAttachments - reservedVolumeAttachments
diff --git a/pkg/driver/node_test.go b/pkg/driver/node_test.go
index 1b4856e9f9..1c72fef2ca 100644
--- a/pkg/driver/node_test.go
+++ b/pkg/driver/node_test.go
@@ -1183,6 +1183,56 @@ func TestGetVolumesLimit(t *testing.T) {
 				return m
 			},
 		},
+		{
+			name: "g4dn.xlarge_volume_attach_limit (1 GPU 1 InstanceStoreVolume)",
+			options: &Options{
+				VolumeAttachLimit:         -1,
+				ReservedVolumeAttachments: -1,
+			},
+			expectedVal: 24,
+			metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService {
+				m := metadata.NewMockMetadataService(ctrl)
+				m.EXPECT().GetRegion().Return("us-west-2")
+				m.EXPECT().GetInstanceType().Return("g4dn.xlarge")
+				m.EXPECT().GetNumBlockDeviceMappings().Return(0)
+				m.EXPECT().GetNumAttachedENIs().Return(1)
+				return m
+			},
+		},
+		// 1 gpu
+		{
+			name: "g4ad.xlarge_volume_attach_limit (1 GPU 1 InstanceStoreVolume)",
+			options: &Options{
+				VolumeAttachLimit:         -1,
+				ReservedVolumeAttachments: -1,
+			},
+			expectedVal: 24,
+			metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService {
+				m := metadata.NewMockMetadataService(ctrl)
+				m.EXPECT().GetRegion().Return("us-west-2")
+				m.EXPECT().GetInstanceType().Return("g4ad.xlarge")
+				m.EXPECT().GetNumBlockDeviceMappings().Return(0)
+				m.EXPECT().GetNumAttachedENIs().Return(1)
+				return m
+			},
+		},
+		// 4 gpus
+		{
+			name: "g4dn.12xlarge_volume_attach_limit (4 GPUS, 1 InstanceStoreVolume)",
+			options: &Options{
+				VolumeAttachLimit:         -1,
+				ReservedVolumeAttachments: -1,
+			},
+			expectedVal: 21,
+			metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService {
+				m := metadata.NewMockMetadataService(ctrl)
+				m.EXPECT().GetRegion().Return("us-west-2")
+				m.EXPECT().GetInstanceType().Return("g4dn.12xlarge")
+				m.EXPECT().GetNumBlockDeviceMappings().Return(0)
+				m.EXPECT().GetNumAttachedENIs().Return(1)
+				return m
+			},
+		},
 	}
 
 	for _, tc := range testCases {