Skip to content

Commit

Permalink
UPSTREAM: 2108: Fix gpus not being considered when counting allocatab…
Browse files Browse the repository at this point in the history
…les (kubernetes-sigs#2108)

* Fix gpus not being considered when counting allocatables

* Parallelize volume limits table generating scripts, refactor volume limits unit tests, add Go doc comment to GetReservedSlotsForInstanceType
  • Loading branch information
ElijahQuinones authored and mpatlasov committed Aug 20, 2024
1 parent 2bb5b31 commit b90bf68
Show file tree
Hide file tree
Showing 5 changed files with 157 additions and 7 deletions.
37 changes: 37 additions & 0 deletions hack/generate-gpu-count-table.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Copyright 2024 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Generates gpu table for `pkg/cloud/volume_limits.go` from the AWS API
# Ensure you are opted into all opt-in regions before running
# Ensure your account isn't in any private instance type betas before running

# Abort on any failing command (-e), on expansion of an unset variable (-u),
# and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Directory holding the `aws` executable invoked below: the `bin` directory
# that is a sibling of the directory containing this script.
BIN="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/../bin"

# Prints one Go map entry per GPU instance type offered in a single region.
#
# Arguments:
#   $1 - name of the AWS region to query.
# Outputs:
#   stdout: one line per instance type whose GpuInfo is non-null, formatted as
#           `"<instance type>": <count>,` where <count> is the sum of the
#           `Count` fields in GpuInfo.Gpus.
#   stderr: a progress message naming the region.
function get_gpus_for_region() {
  # Function-local so the assignment does not overwrite a variable of the same
  # name in the caller (the region loop uses its own loop variable).
  local region="${1}"
  echo "Getting gpu counts for ${region}..." >&2
  "${BIN}/aws" ec2 describe-instance-types --region "${region}" --query "InstanceTypes[?GpuInfo!=null].[InstanceType, GpuInfo]" |
    jq -r 'map("\"" + .[0] + "\": " + (.[1].Gpus | map(.Count) | add | tostring) + ",") | .[]'
}

# Prints the GPU table rows for every region returned by
# `aws account list-regions`, querying the regions concurrently.
#
# One background job is started per region, with a one second pause before
# each launch. Every job is then waited on, so that a failure in any region
# makes this function return non-zero instead of silently producing an
# incomplete table.
#
# Outputs:
#   stdout: the rows printed by get_gpus_for_region for all regions, in no
#           particular order and possibly containing duplicates.
# Returns:
#   0 if every per-region job succeeded, 1 otherwise.
function get_all_gpus() {
  "${BIN}/aws" account list-regions --max-results 50 | jq -r '.Regions | map(.RegionName) | .[]' | (
    pids=()
    status=0
    while read -r region; do
      sleep 1
      get_gpus_for_region "${region}" &
      pids+=("$!")
    done
    # The ${pids[@]+...} form keeps `set -u` from tripping on an empty array
    # in older bash releases.
    for pid in ${pids[@]+"${pids[@]}"}; do
      wait "${pid}" || status=1
    done
    exit "${status}"
  )
}

# The per-region jobs run in the background, so their rows arrive in no fixed
# order; sort them and drop repeated rows to get a stable table.
# NOTE(review): presumably the same instance type is reported by several
# regions, which is why duplicates are expected - confirm.
get_all_gpus | sort | uniq
5 changes: 3 additions & 2 deletions hack/generate-instance-store-table.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,15 @@ BIN="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/../bin"

function get_instance_stores_for_region() {
REGION="${1}"
echo "Getting limits for ${REGION}..." >&2
echo "Getting instance store limits for ${REGION}..." >&2
"${BIN}/aws" ec2 describe-instance-types --region "${REGION}" --filters "Name=instance-storage-supported,Values=true" --query "InstanceTypes[].[InstanceType, InstanceStorageInfo]" |
jq -r 'map("\"" + .[0] + "\": " + (.[1].Disks | map(.Count) | add | tostring) + ",") | .[]'
}

function get_all_instance_stores() {
"${BIN}/aws" account list-regions --max-results 50 | jq -r '.Regions | map(.RegionName) | .[]' | while read REGION; do
get_instance_stores_for_region $REGION
sleep 1
get_instance_stores_for_region $REGION &
done
}

Expand Down
70 changes: 66 additions & 4 deletions pkg/cloud/volume_limits.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,11 +138,18 @@ func GetDedicatedLimitForInstanceType(it string) int {
}
}

func GetNVMeInstanceStoreVolumesForInstanceType(it string) int {
if v, ok := nvmeInstanceStoreVolumes[it]; ok {
return v
// GetReservedSlotsForInstanceType calculates how many attachment slots are already used up by other devices on shared EBS volume limit instances.
func GetReservedSlotsForInstanceType(it string) int {
total := 0
nvmeInstanceStoreVolumes, ok := nvmeInstanceStoreVolumes[it]
if ok {
total += nvmeInstanceStoreVolumes
}
return 0
gpus, ok := gpuInstanceGpus[it]
if ok {
total += gpus
}
return total
}

// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-store-volumes.html
Expand Down Expand Up @@ -489,3 +496,58 @@ var nvmeInstanceStoreVolumes = map[string]int{
"z1d.metal": 2,
"z1d.xlarge": 1,
}

// gpuInstanceGpus maps an EC2 instance type name to its GPU count.
// GetReservedSlotsForInstanceType adds this count to the number of slots it
// reports as already used for the instance type.
//
// The entries are produced by hack/generate-gpu-count-table.sh, which sums
// GpuInfo.Gpus[].Count per instance type from the EC2 describe-instance-types
// API; regenerate the table with that script rather than editing it by hand.
//
// See https://aws.amazon.com/ec2/instance-types
var gpuInstanceGpus = map[string]int{
"dl1.24xlarge": 8,
"g3.16xlarge": 4,
"g3.4xlarge": 1,
"g3.8xlarge": 2,
"g3s.xlarge": 1,
"g4ad.16xlarge": 4,
"g4ad.2xlarge": 1,
"g4ad.4xlarge": 1,
"g4ad.8xlarge": 2,
"g4ad.xlarge": 1,
"g4dn.12xlarge": 4,
"g4dn.16xlarge": 1,
"g4dn.2xlarge": 1,
"g4dn.4xlarge": 1,
"g4dn.8xlarge": 1,
"g4dn.metal": 8,
"g4dn.xlarge": 1,
"g5.12xlarge": 4,
"g5.16xlarge": 1,
"g5.24xlarge": 4,
"g5.2xlarge": 1,
"g5.48xlarge": 8,
"g5.4xlarge": 1,
"g5.8xlarge": 1,
"g5g.16xlarge": 2,
"g5g.2xlarge": 1,
"g5g.4xlarge": 1,
"g5g.8xlarge": 1,
"g5g.metal": 2,
"g5g.xlarge": 1,
"g5.xlarge": 1,
"g6.12xlarge": 4,
"g6.16xlarge": 1,
"g6.24xlarge": 4,
"g6.2xlarge": 1,
"g6.48xlarge": 8,
"g6.4xlarge": 1,
"g6.8xlarge": 1,
"g6.xlarge": 1,
"gr6.4xlarge": 1,
"gr6.8xlarge": 1,
"p2.16xlarge": 16,
"p2.8xlarge": 8,
"p2.xlarge": 1,
"p3.16xlarge": 8,
"p3.2xlarge": 1,
"p3.8xlarge": 4,
"p3dn.24xlarge": 8,
"p4d.24xlarge": 8,
"p4de.24xlarge": 8,
"p5.48xlarge": 8,
}
2 changes: 1 addition & 1 deletion pkg/driver/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -790,7 +790,7 @@ func (d *NodeService) getVolumesLimit() int64 {
availableAttachments = dedicatedLimit
} else if isNitro {
enis := d.metadata.GetNumAttachedENIs()
nvmeInstanceStoreVolumes := cloud.GetNVMeInstanceStoreVolumesForInstanceType(instanceType)
nvmeInstanceStoreVolumes := cloud.GetReservedSlotsForInstanceType(instanceType)
availableAttachments = availableAttachments - enis - nvmeInstanceStoreVolumes
}
availableAttachments = availableAttachments - reservedVolumeAttachments
Expand Down
50 changes: 50 additions & 0 deletions pkg/driver/node_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1183,6 +1183,56 @@ func TestGetVolumesLimit(t *testing.T) {
return m
},
},
{
name: "g4dn.xlarge_volume_attach_limit (1 GPU 1 InstanceStoreVolume)",
options: &Options{
VolumeAttachLimit: -1,
ReservedVolumeAttachments: -1,
},
expectedVal: 24,
metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService {
m := metadata.NewMockMetadataService(ctrl)
m.EXPECT().GetRegion().Return("us-west-2")
m.EXPECT().GetInstanceType().Return("g4dn.xlarge")
m.EXPECT().GetNumBlockDeviceMappings().Return(0)
m.EXPECT().GetNumAttachedENIs().Return(1)
return m
},
},
// 1 gpu
{
name: "g4ad.xlarge_volume_attach_limit (1 GPU 1 InstanceStoreVolume)",
options: &Options{
VolumeAttachLimit: -1,
ReservedVolumeAttachments: -1,
},
expectedVal: 24,
metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService {
m := metadata.NewMockMetadataService(ctrl)
m.EXPECT().GetRegion().Return("us-west-2")
m.EXPECT().GetInstanceType().Return("g4ad.xlarge")
m.EXPECT().GetNumBlockDeviceMappings().Return(0)
m.EXPECT().GetNumAttachedENIs().Return(1)
return m
},
},
// 4 gpus
{
name: "g4dn.12xlarge_volume_attach_limit (4 GPUS, 1 InstanceStoreVolume)",
options: &Options{
VolumeAttachLimit: -1,
ReservedVolumeAttachments: -1,
},
expectedVal: 21,
metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService {
m := metadata.NewMockMetadataService(ctrl)
m.EXPECT().GetRegion().Return("us-west-2")
m.EXPECT().GetInstanceType().Return("g4dn.12xlarge")
m.EXPECT().GetNumBlockDeviceMappings().Return(0)
m.EXPECT().GetNumAttachedENIs().Return(1)
return m
},
},
}

for _, tc := range testCases {
Expand Down

0 comments on commit b90bf68

Please sign in to comment.