Skip to content

Commit

Permalink
Fix bug with MIG devices introduced when adding opaque config support
Browse files Browse the repository at this point in the history
Signed-off-by: Kevin Klues <[email protected]>
  • Loading branch information
klueska committed Sep 12, 2024
1 parent 2609cb0 commit 2eb5f4a
Show file tree
Hide file tree
Showing 5 changed files with 75 additions and 34 deletions.
3 changes: 3 additions & 0 deletions api/nvidia.com/resource/gpu/v1alpha1/migconfig.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ func DefaultMigDeviceConfig() *MigDeviceConfig {
APIVersion: GroupName + "/" + Version,
Kind: MigDeviceConfigKind,
},
Sharing: &MigDeviceSharing{
Strategy: TimeSlicingStrategy,
},
}
}

Expand Down
5 changes: 4 additions & 1 deletion api/nvidia.com/resource/gpu/v1alpha1/sharing.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,10 @@ func (s *GpuSharing) IsMps() bool {

// IsTimeSlicing checks if the TimeSlicing strategy is applied.
func (s *MigDeviceSharing) IsTimeSlicing() bool {
return false
if s == nil {
return false
}
return s.Strategy == TimeSlicingStrategy
}

// IsMps checks if the MPS strategy is applied.
Expand Down
6 changes: 5 additions & 1 deletion api/nvidia.com/resource/gpu/v1alpha1/validate.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ func (s GpuSharingStrategy) Validate() error {

// Validate ensures that MigDeviceSharingStrategy has a valid set of values.
func (s MigDeviceSharingStrategy) Validate() error {
if s == MpsStrategy {
switch s {
case TimeSlicingStrategy, MpsStrategy:
return nil
}
return fmt.Errorf("unknown GPU sharing strategy: %v", s)
Expand Down Expand Up @@ -83,6 +84,9 @@ func (s *MigDeviceSharing) Validate() error {
if err := s.Strategy.Validate(); err != nil {
return err
}
if s.IsTimeSlicing() {
return nil
}
if s.IsMps() {
return s.MpsConfig.Validate()
}
Expand Down
18 changes: 10 additions & 8 deletions cmd/nvidia-dra-plugin/device_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ func (s *DeviceState) prepareDevices(ctx context.Context, claim *resourceapi.Res
if _, ok := c.Config.(*configapi.GpuConfig); ok && device.Type() != GpuDeviceType {
return nil, fmt.Errorf("cannot apply GPU config to MIG device from request: %v", result.Request)
}
if _, ok := c.Config.(*configapi.MigDeviceConfig); ok && device.Type() == MigDeviceType {
if _, ok := c.Config.(*configapi.MigDeviceConfig); ok && device.Type() != MigDeviceType {
return nil, fmt.Errorf("cannot apply MIG device config to GPU from request: %v", result.Request)
}
configResultsMap[c.Config] = append(configResultsMap[c.Config], &result)
Expand All @@ -239,7 +239,7 @@ func (s *DeviceState) prepareDevices(ctx context.Context, claim *resourceapi.Res
if _, ok := c.Config.(*configapi.GpuConfig); ok && device.Type() != GpuDeviceType {
continue
}
if _, ok := c.Config.(*configapi.MigDeviceConfig); ok && device.Type() == MigDeviceType {
if _, ok := c.Config.(*configapi.MigDeviceConfig); ok && device.Type() != MigDeviceType {
continue
}
configResultsMap[c.Config] = append(configResultsMap[c.Config], &result)
Expand Down Expand Up @@ -317,7 +317,7 @@ func (s *DeviceState) prepareDevices(ctx context.Context, claim *resourceapi.Res
}
}

preparedDeviceGroup.Devices = append(preparedDeviceGroup.Devices, &preparedDevice)
preparedDeviceGroup.Devices = append(preparedDeviceGroup.Devices, preparedDevice)
}

preparedDevices = append(preparedDevices, &preparedDeviceGroup)
Expand All @@ -335,7 +335,7 @@ func (s *DeviceState) unprepareDevices(ctx context.Context, claimUID string, dev

// Go back to default time-slicing for all full GPUs.
tsc := configapi.DefaultGpuConfig().Sharing.TimeSlicingConfig
if err := s.tsManager.SetTimeSlice(group, tsc); err != nil {
if err := s.tsManager.SetTimeSlice(group.Devices.Gpus(), tsc); err != nil {
return fmt.Errorf("error setting timeslice for devices: %w", err)
}
}
Expand Down Expand Up @@ -367,15 +367,17 @@ func (s *DeviceState) applyConfig(ctx context.Context, config configapi.Interfac
// Declare a device group atate object to populate.
var configState DeviceConfigState

// Apply time-slicing settings.
// Apply time-slicing settings (if available).
if sharing.IsTimeSlicing() {
tsc, err := sharing.GetTimeSlicingConfig()
if err != nil {
return nil, fmt.Errorf("error getting timeslice config for requests '%v' in claim '%v': %w", requests, claim.UID, err)
}
err = s.tsManager.SetTimeSlice(allocatableDevices, tsc)
if err != nil {
return nil, fmt.Errorf("error setting timeslice config for requests '%v' in claim '%v': %w", requests, claim.UID, err)
if tsc != nil {
err = s.tsManager.SetTimeSlice(allocatableDevices, tsc)
if err != nil {
return nil, fmt.Errorf("error setting timeslice config for requests '%v' in claim '%v': %w", requests, claim.UID, err)
}
}
}

Expand Down
77 changes: 53 additions & 24 deletions cmd/nvidia-dra-plugin/prepared.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
drapbv1 "k8s.io/kubelet/pkg/apis/dra/v1alpha4"
)

type PreparedDeviceList []PreparedDevice
type PreparedDevices []*PreparedDeviceGroup
type PreparedClaims map[string]PreparedDevices

Expand All @@ -39,8 +40,8 @@ type PreparedMigDevice struct {
}

type PreparedDeviceGroup struct {
Devices []*PreparedDevice `json:"devices"`
ConfigState DeviceConfigState `json:"configState"`
Devices PreparedDeviceList `json:"devices"`
ConfigState DeviceConfigState `json:"configState"`
}

func (d PreparedDevice) Type() string {
Expand Down Expand Up @@ -73,6 +74,26 @@ func (d *PreparedDevice) CanonicalIndex() string {
panic("unexpected type for AllocatableDevice")
}

func (l PreparedDeviceList) Gpus() PreparedDeviceList {
var devices PreparedDeviceList
for _, device := range l {
if device.Type() == GpuDeviceType {
devices = append(devices, device)
}
}
return devices
}

func (l PreparedDeviceList) MigDevices() PreparedDeviceList {
var devices PreparedDeviceList
for _, device := range l {
if device.Type() == MigDeviceType {
devices = append(devices, device)
}
}
return devices
}

func (d PreparedDevices) GetDevices() []*drapbv1.Device {
var devices []*drapbv1.Device
for _, group := range d {
Expand All @@ -94,46 +115,54 @@ func (d *PreparedDeviceGroup) GetDevices() []*drapbv1.Device {
return devices
}

func (d PreparedDevices) GpuUUIDs() []string {
var uuids []string
for _, group := range d {
uuids = append(uuids, group.GpuUUIDs()...)
}
return uuids
func (l PreparedDeviceList) UUIDs() []string {
return append(l.GpuUUIDs(), l.MigDeviceUUIDs()...)
}

func (d *PreparedDeviceGroup) GpuUUIDs() []string {
func (g *PreparedDeviceGroup) UUIDs() []string {
return append(g.GpuUUIDs(), g.MigDeviceUUIDs()...)
}

func (d PreparedDevices) UUIDs() []string {
return append(d.GpuUUIDs(), d.MigDeviceUUIDs()...)
}

func (l PreparedDeviceList) GpuUUIDs() []string {
var uuids []string
for _, device := range d.Devices {
if device.Type() == GpuDeviceType {
uuids = append(uuids, device.Gpu.Info.UUID)
}
for _, device := range l.Gpus() {
uuids = append(uuids, device.Gpu.Info.UUID)
}
return uuids
}

func (d PreparedDevices) MigDeviceUUIDs() []string {
func (g *PreparedDeviceGroup) GpuUUIDs() []string {
return g.Devices.Gpus().UUIDs()
}

func (d PreparedDevices) GpuUUIDs() []string {
var uuids []string
for _, group := range d {
uuids = append(uuids, group.MigDeviceUUIDs()...)
uuids = append(uuids, group.GpuUUIDs()...)
}
return uuids
}

func (d *PreparedDeviceGroup) MigDeviceUUIDs() []string {
func (l PreparedDeviceList) MigDeviceUUIDs() []string {
var uuids []string
for _, device := range d.Devices {
if device.Type() == MigDeviceType {
uuids = append(uuids, device.Mig.Info.UUID)
}
for _, device := range l.MigDevices() {
uuids = append(uuids, device.Mig.Info.UUID)
}
return uuids
}

func (d *PreparedDeviceGroup) UUIDs() []string {
return append(d.GpuUUIDs(), d.MigDeviceUUIDs()...)
func (g *PreparedDeviceGroup) MigDeviceUUIDs() []string {
return g.Devices.MigDevices().UUIDs()
}

func (d PreparedDevices) UUIDs() []string {
return append(d.GpuUUIDs(), d.MigDeviceUUIDs()...)
func (d PreparedDevices) MigDeviceUUIDs() []string {
var uuids []string
for _, group := range d {
uuids = append(uuids, group.MigDeviceUUIDs()...)
}
return uuids
}

0 comments on commit 2eb5f4a

Please sign in to comment.