From b90bf68a41d8e97696a822960f0a1f392775c5b1 Mon Sep 17 00:00:00 2001 From: ElijahQuinones <118852979+ElijahQuinones@users.noreply.github.com> Date: Tue, 13 Aug 2024 14:00:11 -0400 Subject: [PATCH] UPSTREAM: 2108: Fix gpus not being considered when counting allocatables (#2108) * Fix gpus not being considered when counting allocatables * Parallelize volume limits table generating scripts refactor volume limits unit tests add go doc comment to GetReservedSlotsForInstanceType --- hack/generate-gpu-count-table.sh | 37 ++++++++++++++ hack/generate-instance-store-table.sh | 5 +- pkg/cloud/volume_limits.go | 70 +++++++++++++++++++++++++-- pkg/driver/node.go | 2 +- pkg/driver/node_test.go | 50 +++++++++++++++++++ 5 files changed, 157 insertions(+), 7 deletions(-) create mode 100755 hack/generate-gpu-count-table.sh diff --git a/hack/generate-gpu-count-table.sh b/hack/generate-gpu-count-table.sh new file mode 100755 index 0000000000..00ae0e4c03 --- /dev/null +++ b/hack/generate-gpu-count-table.sh @@ -0,0 +1,37 @@ +# Copyright 2024 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Generates gpu table for `pkg/cloud/volume_limits.go` from the AWS API +# Ensure you are opted into all opt-in regions before running +# Ensure your account isn't in any private instance type betas before running + +set -euo pipefail + +BIN="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/../bin" + +function get_gpus_for_region() { + REGION="${1}" + echo "Getting gpu counts for ${REGION}..." >&2 + "${BIN}/aws" ec2 describe-instance-types --region "${REGION}" --query "InstanceTypes[?GpuInfo!=null].[InstanceType, GpuInfo]" | + jq -r 'map("\"" + .[0] + "\": " + (.[1].Gpus | map(.Count) | add | tostring) + ",") | .[]' +} + +function get_all_gpus() { + "${BIN}/aws" account list-regions --max-results 50 | jq -r '.Regions | map(.RegionName) | .[]' | while read REGION; do + sleep 1 + get_gpus_for_region $REGION & + done +} + +get_all_gpus | sort | uniq diff --git a/hack/generate-instance-store-table.sh b/hack/generate-instance-store-table.sh index b4b4417e33..f94e3cba37 100755 --- a/hack/generate-instance-store-table.sh +++ b/hack/generate-instance-store-table.sh @@ -24,14 +24,15 @@ BIN="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/../bin" function get_instance_stores_for_region() { REGION="${1}" - echo "Getting limits for ${REGION}..." >&2 + echo "Getting instance store limits for ${REGION}..." >&2 "${BIN}/aws" ec2 describe-instance-types --region "${REGION}" --filters "Name=instance-storage-supported,Values=true" --query "InstanceTypes[].[InstanceType, InstanceStorageInfo]" | jq -r 'map("\"" + .[0] + "\": " + (.[1].Disks | map(.Count) | add | tostring) + ",") | .[]' } function get_all_instance_stores() { "${BIN}/aws" account list-regions --max-results 50 | jq -r '.Regions | map(.RegionName) | .[]' | while read REGION; do - get_instance_stores_for_region $REGION + sleep 1 + get_instance_stores_for_region $REGION & done } diff --git a/pkg/cloud/volume_limits.go b/pkg/cloud/volume_limits.go index 5a90b21314..b167369c9a 100644 --- a/pkg/cloud/volume_limits.go +++ b/pkg/cloud/volume_limits.go @@ -138,11 +138,18 @@ func GetDedicatedLimitForInstanceType(it string) int { } } -func GetNVMeInstanceStoreVolumesForInstanceType(it string) int { - if v, ok := nvmeInstanceStoreVolumes[it]; ok { - return v +// GetReservedSlotsForInstanceType calculates how many attachment slots are already used up by other devices on shared EBS volume limit instances. +func GetReservedSlotsForInstanceType(it string) int { + total := 0 + nvmeInstanceStoreVolumes, ok := nvmeInstanceStoreVolumes[it] + if ok { + total += nvmeInstanceStoreVolumes } - return 0 + gpus, ok := gpuInstanceGpus[it] + if ok { + total += gpus + } + return total } // / https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-store-volumes.html @@ -489,3 +496,58 @@ var nvmeInstanceStoreVolumes = map[string]int{ "z1d.metal": 2, "z1d.xlarge": 1, } + +// / https://aws.amazon.com/ec2/instance-types +var gpuInstanceGpus = map[string]int{ + "dl1.24xlarge": 8, + "g3.16xlarge": 4, + "g3.4xlarge": 1, + "g3.8xlarge": 2, + "g3s.xlarge": 1, + "g4ad.16xlarge": 4, + "g4ad.2xlarge": 1, + "g4ad.4xlarge": 1, + "g4ad.8xlarge": 2, + "g4ad.xlarge": 1, + "g4dn.12xlarge": 4, + "g4dn.16xlarge": 1, + "g4dn.2xlarge": 1, + "g4dn.4xlarge": 1, + "g4dn.8xlarge": 1, + "g4dn.metal": 8, + "g4dn.xlarge": 1, + "g5.12xlarge": 4, + "g5.16xlarge": 1, + "g5.24xlarge": 4, + "g5.2xlarge": 1, + "g5.48xlarge": 8, + "g5.4xlarge": 1, + "g5.8xlarge": 1, + "g5g.16xlarge": 2, + "g5g.2xlarge": 1, + "g5g.4xlarge": 1, + "g5g.8xlarge": 1, + "g5g.metal": 2, + "g5g.xlarge": 1, + "g5.xlarge": 1, + "g6.12xlarge": 4, + "g6.16xlarge": 1, + "g6.24xlarge": 4, + "g6.2xlarge": 1, + "g6.48xlarge": 8, + "g6.4xlarge": 1, + "g6.8xlarge": 1, + "g6.xlarge": 1, + "gr6.4xlarge": 1, + "gr6.8xlarge": 1, + "p2.16xlarge": 16, + "p2.8xlarge": 8, + "p2.xlarge": 1, + "p3.16xlarge": 8, + "p3.2xlarge": 1, + "p3.8xlarge": 4, + "p3dn.24xlarge": 8, + "p4d.24xlarge": 8, + "p4de.24xlarge": 8, + "p5.48xlarge": 8, +} diff --git a/pkg/driver/node.go b/pkg/driver/node.go index a088ce3b41..fcd50c0d11 100644 --- a/pkg/driver/node.go +++ b/pkg/driver/node.go @@ -790,7 +790,7 @@ func (d *NodeService) getVolumesLimit() int64 { availableAttachments = dedicatedLimit } else if isNitro { enis := d.metadata.GetNumAttachedENIs() - nvmeInstanceStoreVolumes := cloud.GetNVMeInstanceStoreVolumesForInstanceType(instanceType) + nvmeInstanceStoreVolumes := cloud.GetReservedSlotsForInstanceType(instanceType) availableAttachments = availableAttachments - enis - nvmeInstanceStoreVolumes } availableAttachments = availableAttachments - reservedVolumeAttachments diff --git a/pkg/driver/node_test.go b/pkg/driver/node_test.go index 1b4856e9f9..1c72fef2ca 100644 --- a/pkg/driver/node_test.go +++ b/pkg/driver/node_test.go @@ -1183,6 +1183,56 @@ func TestGetVolumesLimit(t *testing.T) { return m }, }, + { + name: "g4dn.xlarge_volume_attach_limit (1 GPU 1 InstanceStoreVolume)", + options: &Options{ + VolumeAttachLimit: -1, + ReservedVolumeAttachments: -1, + }, + expectedVal: 24, + metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService { + m := metadata.NewMockMetadataService(ctrl) + m.EXPECT().GetRegion().Return("us-west-2") + m.EXPECT().GetInstanceType().Return("g4dn.xlarge") + m.EXPECT().GetNumBlockDeviceMappings().Return(0) + m.EXPECT().GetNumAttachedENIs().Return(1) + return m + }, + }, + // 1 gpu + { + name: "g4ad.xlarge_volume_attach_limit (1 GPU 1 InstanceStoreVolume)", + options: &Options{ + VolumeAttachLimit: -1, + ReservedVolumeAttachments: -1, + }, + expectedVal: 24, + metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService { + m := metadata.NewMockMetadataService(ctrl) + m.EXPECT().GetRegion().Return("us-west-2") + m.EXPECT().GetInstanceType().Return("g4ad.xlarge") + m.EXPECT().GetNumBlockDeviceMappings().Return(0) + m.EXPECT().GetNumAttachedENIs().Return(1) + return m + }, + }, + // 4 gpus + { + name: "g4dn.12xlarge_volume_attach_limit (4 GPUS, 1 InstanceStoreVolume)", + options: &Options{ + VolumeAttachLimit: -1, + ReservedVolumeAttachments: -1, + }, + expectedVal: 21, + metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService { + m := metadata.NewMockMetadataService(ctrl) + m.EXPECT().GetRegion().Return("us-west-2") + m.EXPECT().GetInstanceType().Return("g4dn.12xlarge") + m.EXPECT().GetNumBlockDeviceMappings().Return(0) + m.EXPECT().GetNumAttachedENIs().Return(1) + return m + }, + }, } for _, tc := range testCases {