Skip to content

Commit

Permalink
Fix accelerators not being considered when counting allocatables
Browse files Browse the repository at this point in the history
  • Loading branch information
ElijahQuinones committed Aug 15, 2024
1 parent 911b991 commit 9c32b47
Show file tree
Hide file tree
Showing 5 changed files with 147 additions and 35 deletions.
3 changes: 2 additions & 1 deletion hack/generate-gpu-count-table.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
# Generates gpu table for `pkg/cloud/volume_limits.go` from the AWS API
# Ensure you are opted into all opt-in regions before running
# Ensure your account isn't in any private instance type betas before running
# We are exluding the g5.48xlarge instance type as it is a special case that does not comply to regular ebs volume limit calculations

set -euo pipefail

Expand All @@ -34,4 +35,4 @@ function get_all_gpus() {
done
}

get_all_gpus | sort | uniq
get_all_gpus | sort | uniq | grep -v "g5.48xlarge"
3 changes: 2 additions & 1 deletion hack/generate-instance-store-table.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
# Generates instance store table for `pkg/cloud/volume_limits.go` from the AWS API
# Ensure you are opted into all opt-in regions before running
# Ensure your account isn't in any private instance type betas before running
# We are exluding the g5.48xlarge instance type as it is a special case that does not comply to regular ebs volume limit calculations

set -euo pipefail

Expand All @@ -36,4 +37,4 @@ function get_all_instance_stores() {
done
}

get_all_instance_stores | sort | uniq
get_all_instance_stores | sort | uniq | grep -v "g5.48xlarge"
43 changes: 33 additions & 10 deletions pkg/cloud/volume_limits.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ import (
"strings"
)

// / https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-types.html#ec2-nitro-instances
// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-types.html#ec2-nitro-instances
const (
highMemoryMetalInstancesMaxVolumes = 19
highMemoryVirtualInstancesMaxVolumes = 27
Expand Down Expand Up @@ -51,7 +51,7 @@ func init() {

var dedicatedVolumeLimits = map[string]int{}

// / List of nitro instance types can be found here: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-types.html#ec2-nitro-instances
// List of nitro instance types can be found here: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-types.html#ec2-nitro-instances
var nonNitroInstanceFamilies = map[string]struct{}{
"t2": {},
"c3": {},
Expand All @@ -67,6 +67,7 @@ var nonNitroInstanceFamilies = map[string]struct{}{
"g3": {},
"d2": {},
"h1": {},
"f1": {},
}

func IsNitroInstanceType(it string) bool {
Expand All @@ -88,8 +89,8 @@ func GetMaxAttachments(nitro bool) int {
return nonNitroMaxAttachments
}

// / Some instance types have a maximum limit of EBS volumes
// / https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/volume_limits.html
// Some instance types have a maximum limit of EBS volumes
// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/volume_limits.html
var maxVolumeLimits = map[string]int{
"d3.8xlarge": 3,
"d3en.12xlarge": 3,
Expand Down Expand Up @@ -149,12 +150,16 @@ func GetReservedSlotsForInstanceType(it string) int {
if ok {
total += gpus
}
acceleratorSlots, ok := acceleratorSlotsTaken[it]
if ok {
total += acceleratorSlots
}
return total
}

// / https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-store-volumes.html
// / IMDS does not provide NVMe instance store data; we'll just list all instances here
// / TODO: See if we can get these values from DescribeInstanceTypes API
// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-store-volumes.html
// IMDS does not provide NVMe instance store data; we'll just list all instances here
// g5.48xlarge is not added to this table as it is in the maxVolumeLimits
var nvmeInstanceStoreVolumes = map[string]int{
"c1.medium": 1,
"c1.xlarge": 4,
Expand Down Expand Up @@ -242,7 +247,6 @@ var nvmeInstanceStoreVolumes = map[string]int{
"g5.16xlarge": 1,
"g5.24xlarge": 1,
"g5.2xlarge": 1,
"g5.48xlarge": 2,
"g5.4xlarge": 1,
"g5.8xlarge": 1,
"g5.xlarge": 1,
Expand Down Expand Up @@ -497,7 +501,9 @@ var nvmeInstanceStoreVolumes = map[string]int{
"z1d.xlarge": 1,
}

// / https://aws.amazon.com/ec2/instance-types
// https://aws.amazon.com/ec2/instance-types
// Despite the dl1.24xlarge having Gaudi Accelerators describe instance types considers them GPUs as such that instacne type is in this table
// g5.48xlarge is not added to this table as it is in the maxVolumeLimits
var gpuInstanceGpus = map[string]int{
"dl1.24xlarge": 8,
"g3.16xlarge": 4,
Expand All @@ -520,7 +526,6 @@ var gpuInstanceGpus = map[string]int{
"g5.16xlarge": 1,
"g5.24xlarge": 4,
"g5.2xlarge": 1,
"g5.48xlarge": 8,
"g5.4xlarge": 1,
"g5.8xlarge": 1,
"g5g.16xlarge": 2,
Expand Down Expand Up @@ -551,3 +556,21 @@ var gpuInstanceGpus = map[string]int{
"p4de.24xlarge": 8,
"p5.48xlarge": 8,
}

// Note this table is not a reflection of how many accelerators an instance has but of how many slots their combined acclerators take up
// VT instance type accelerators take two slots each with the exception of the vt1.24xlarge which takes 0 slots for its accelerators
// inf1 instance types are purposely not added to this table as they are in the maxVolumeLimits table
// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/volume_limits.html
var acceleratorSlotsTaken = map[string]int{
"vt1.3xlarge": 2,
"vt1.6xlarge": 4,
"vt1.24xlarge": 0,
"dl2q.24xlarge": 8,
"inf2.xlarge": 1,
"inf2.8xlarge": 1,
"inf2.24xlarge": 6,
"inf2.48xlarge": 12,
"trn1.2xlarge": 1,
"trn1.32xlarge": 16,
"trn1n.32xlarge": 16,
}
12 changes: 8 additions & 4 deletions pkg/driver/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -781,8 +781,8 @@ func (d *NodeService) getVolumesLimit() int64 {
}

dedicatedLimit := cloud.GetDedicatedLimitForInstanceType(instanceType)
maxEBSAttachments, ok := cloud.GetEBSLimitForInstanceType(instanceType)
if ok {
maxEBSAttachments, hasMaxVolumeLimit := cloud.GetEBSLimitForInstanceType(instanceType)
if hasMaxVolumeLimit {
availableAttachments = min(maxEBSAttachments, availableAttachments)
}
// For special dedicated limit instance types, the limit is only for EBS volumes
Expand All @@ -791,8 +791,12 @@ func (d *NodeService) getVolumesLimit() int64 {
availableAttachments = dedicatedLimit
} else if isNitro {
enis := d.metadata.GetNumAttachedENIs()
nvmeInstanceStoreVolumes := cloud.GetReservedSlotsForInstanceType(instanceType)
availableAttachments = availableAttachments - enis - nvmeInstanceStoreVolumes
reservedSlots := cloud.GetReservedSlotsForInstanceType(instanceType)
if hasMaxVolumeLimit {
availableAttachments = availableAttachments - (enis - 1) - reservedSlots
} else {
availableAttachments = availableAttachments - enis - reservedSlots
}
}
availableAttachments = availableAttachments - reservedVolumeAttachments
if availableAttachments <= 0 {
Expand Down
121 changes: 102 additions & 19 deletions pkg/driver/node_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1136,23 +1136,87 @@ func TestGetVolumesLimit(t *testing.T) {
},
},
{
name: "inf1.24xlarge_volume_attach_limit",
name: "mac1.metal_volume_attach_limit",
options: &Options{
VolumeAttachLimit: -1,
ReservedVolumeAttachments: -1,
},
expectedVal: 9,
expectedVal: 15,
metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService {
m := metadata.NewMockMetadataService(ctrl)
m.EXPECT().GetRegion().Return("us-west-2")
m.EXPECT().GetInstanceType().Return("inf1.24xlarge")
m.EXPECT().GetInstanceType().Return("mac1.metal")
m.EXPECT().GetNumBlockDeviceMappings().Return(0)
m.EXPECT().GetNumAttachedENIs().Return(1)
return m
},
},
{
name: "u-12tb1.metal_volume_attach_limit",
options: &Options{
VolumeAttachLimit: -1,
ReservedVolumeAttachments: -1,
},
expectedVal: 18,
metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService {
m := metadata.NewMockMetadataService(ctrl)
m.EXPECT().GetRegion().Return("us-west-2")
m.EXPECT().GetInstanceType().Return("u-12tb1.metal")
m.EXPECT().GetNumBlockDeviceMappings().Return(0)
m.EXPECT().GetNumAttachedENIs().Return(1)
return m
},
},
{
name: "mac1.metal_volume_attach_limit",
name: "g4dn.xlarge_volume_attach_limit (1 GPU 1 InstanceStoreVolume)",
options: &Options{
VolumeAttachLimit: -1,
ReservedVolumeAttachments: -1,
},
expectedVal: 24,
metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService {
m := metadata.NewMockMetadataService(ctrl)
m.EXPECT().GetRegion().Return("us-west-2")
m.EXPECT().GetInstanceType().Return("g4dn.xlarge")
m.EXPECT().GetNumBlockDeviceMappings().Return(0)
m.EXPECT().GetNumAttachedENIs().Return(1)
return m
},
},
{
name: "g4ad.xlarge_volume_attach_limit (1 GPU 1 InstanceStoreVolume)",
options: &Options{
VolumeAttachLimit: -1,
ReservedVolumeAttachments: -1,
},
expectedVal: 24,
metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService {
m := metadata.NewMockMetadataService(ctrl)
m.EXPECT().GetRegion().Return("us-west-2")
m.EXPECT().GetInstanceType().Return("g4ad.xlarge")
m.EXPECT().GetNumBlockDeviceMappings().Return(0)
m.EXPECT().GetNumAttachedENIs().Return(1)
return m
},
},
{
name: "g4dn.12xlarge_volume_attach_limit (4 GPUS, 1 InstanceStoreVolume)",
options: &Options{
VolumeAttachLimit: -1,
ReservedVolumeAttachments: -1,
},
expectedVal: 21,
metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService {
m := metadata.NewMockMetadataService(ctrl)
m.EXPECT().GetRegion().Return("us-west-2")
m.EXPECT().GetInstanceType().Return("g4dn.12xlarge")
m.EXPECT().GetNumBlockDeviceMappings().Return(0)
m.EXPECT().GetNumAttachedENIs().Return(1)
return m
},
},
{
name: "dl1.24xlarge_volume_attach_limit (8 Accelerator slots , 4 InstanceStoreVolume)",
options: &Options{
VolumeAttachLimit: -1,
ReservedVolumeAttachments: -1,
Expand All @@ -1161,73 +1225,92 @@ func TestGetVolumesLimit(t *testing.T) {
metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService {
m := metadata.NewMockMetadataService(ctrl)
m.EXPECT().GetRegion().Return("us-west-2")
m.EXPECT().GetInstanceType().Return("mac1.metal")
m.EXPECT().GetInstanceType().Return("dl1.24xlarge")
m.EXPECT().GetNumBlockDeviceMappings().Return(0)
m.EXPECT().GetNumAttachedENIs().Return(1)
return m
},
},
{
name: "u-12tb1.metal_volume_attach_limit",
// will and should fail if g5.48xlarge instance type is in any table other than maxVolumeLimits table
name: "g5.48xlarge_volume_attach_limit (Instance has attached GPUs and NVMe Instance Store volumes but should be ignored for EBS volume limits calculation)",
options: &Options{
VolumeAttachLimit: -1,
ReservedVolumeAttachments: -1,
},
expectedVal: 17,
expectedVal: 8,
metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService {
m := metadata.NewMockMetadataService(ctrl)
m.EXPECT().GetRegion().Return("us-west-2")
m.EXPECT().GetInstanceType().Return("u-12tb1.metal")
m.EXPECT().GetInstanceType().Return("g5.48xlarge")
m.EXPECT().GetNumBlockDeviceMappings().Return(0)
m.EXPECT().GetNumAttachedENIs().Return(1)
return m
},
},
{
name: "g4dn.xlarge_volume_attach_limit (1 GPU 1 InstanceStoreVolume)",
// Should fail if inf1.xlarge instance type is in any table other than maxVolumeLimits table
name: "inf1.xlarge_volume_attach_limit (Instance has attached Accelerators but should be ignored for EBS volume limits calculation)",
options: &Options{
VolumeAttachLimit: -1,
ReservedVolumeAttachments: -1,
},
expectedVal: 24,
expectedVal: 25,
metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService {
m := metadata.NewMockMetadataService(ctrl)
m.EXPECT().GetRegion().Return("us-west-2")
m.EXPECT().GetInstanceType().Return("g4dn.xlarge")
m.EXPECT().GetInstanceType().Return("inf1.xlarge")
m.EXPECT().GetNumBlockDeviceMappings().Return(0)
m.EXPECT().GetNumAttachedENIs().Return(1)
return m
},
},
// 1 gpu
{
name: "g4ad.xlarge_volume_attach_limit (1 GPU 1 InstanceStoreVolume)",
// Should fail if inf1.xlarge instance type is in any table other than maxVolumeLimits table
name: "inf1.2xlarge_volume_attach_limit (Instance has attached Accelerators but should be ignored for EBS volume limits calculation)",
options: &Options{
VolumeAttachLimit: -1,
ReservedVolumeAttachments: -1,
},
expectedVal: 24,
expectedVal: 25,
metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService {
m := metadata.NewMockMetadataService(ctrl)
m.EXPECT().GetRegion().Return("us-west-2")
m.EXPECT().GetInstanceType().Return("g4ad.xlarge")
m.EXPECT().GetInstanceType().Return("inf1.2xlarge")
m.EXPECT().GetNumBlockDeviceMappings().Return(0)
m.EXPECT().GetNumAttachedENIs().Return(1)
return m
},
},
// 4 gpus
{
name: "g4dn.12xlarge_volume_attach_limit (4 GPUS, 1 InstanceStoreVolume)",
// Should fail if inf1.6xlarge instance type is in any table other than maxVolumeLimits table
name: "inf1.6xlarge_volume_attach_limit (Instance has attached Accelerators but should be ignored for EBS volume limits calculation)",
options: &Options{
VolumeAttachLimit: -1,
ReservedVolumeAttachments: -1,
},
expectedVal: 21,
expectedVal: 22,
metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService {
m := metadata.NewMockMetadataService(ctrl)
m.EXPECT().GetRegion().Return("us-west-2")
m.EXPECT().GetInstanceType().Return("g4dn.12xlarge")
m.EXPECT().GetInstanceType().Return("inf1.6xlarge")
m.EXPECT().GetNumBlockDeviceMappings().Return(0)
m.EXPECT().GetNumAttachedENIs().Return(1)
return m
},
},
{
// Should fail if inf1.24xlarge instance type is in any table other than maxVolumeLimits table
name: "inf1.24xlarge_volume_attach_limit (Instance has attached Accelerators but should be ignored for EBS volume limits calculation)",
options: &Options{
VolumeAttachLimit: -1,
ReservedVolumeAttachments: -1,
},
expectedVal: 10,
metadataMock: func(ctrl *gomock.Controller) *metadata.MockMetadataService {
m := metadata.NewMockMetadataService(ctrl)
m.EXPECT().GetRegion().Return("us-west-2")
m.EXPECT().GetInstanceType().Return("inf1.24xlarge")
m.EXPECT().GetNumBlockDeviceMappings().Return(0)
m.EXPECT().GetNumAttachedENIs().Return(1)
return m
Expand Down

0 comments on commit 9c32b47

Please sign in to comment.