Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure each imex domain.cliqueId has a unique set of channel numbers #187

Merged
merged 2 commits into from
Oct 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 83 additions & 28 deletions cmd/nvidia-dra-controller/imex.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package main
import (
"context"
"fmt"
"strings"
"sync"
"time"

Expand All @@ -36,17 +37,20 @@ import (
)

const (
DriverName = "gpu.nvidia.com"
ImexDomainLabel = "nvidia.com/gpu.imex-domain"
ImexChannelLimit = 128
DriverName = "gpu.nvidia.com"
ImexDomainLabel = "nvidia.com/gpu.imex-domain"
ResourceSliceImexChannelLimit = 128
DriverImexChannelLimit = 2048
)

type ImexManager struct {
waitGroup sync.WaitGroup
clientset kubernetes.Interface
}

type DriverResources resourceslice.DriverResources
// imexDomainOffsets represents the offset for assigning IMEX channels
// to ResourceSlices for each <imex-domain, cliqueid> combination.
type imexDomainOffsets map[string]map[string]int

func StartIMEXManager(ctx context.Context, config *Config) (*ImexManager, error) {
// Build a client set config
Expand Down Expand Up @@ -99,29 +103,34 @@ func StartIMEXManager(ctx context.Context, config *Config) (*ImexManager, error)

// manageResourceSlices reacts to added and removed IMEX domains and triggers the creation / removal of resource slices accordingly.
func (m *ImexManager) manageResourceSlices(ctx context.Context, owner resourceslice.Owner, addedDomainsCh <-chan string, removedDomainsCh <-chan string) error {
driverResources := resourceslice.DriverResources{}
controller, err := resourceslice.StartController(ctx, m.clientset, DriverName, owner, &driverResources)
driverResources := &resourceslice.DriverResources{}
controller, err := resourceslice.StartController(ctx, m.clientset, DriverName, owner, driverResources)
if err != nil {
return fmt.Errorf("error starting resource slice controller: %w", err)
}

imexDomainOffsets := new(imexDomainOffsets)
m.waitGroup.Add(1)
go func() {
defer m.waitGroup.Done()
for {
select {
case addedDomain := <-addedDomainsCh:
klueska marked this conversation as resolved.
Show resolved Hide resolved
offset, err := imexDomainOffsets.add(addedDomain, ResourceSliceImexChannelLimit, DriverImexChannelLimit)
if err != nil {
klog.Errorf("Error calculating channel offset for IMEX domain %s: %v", addedDomain, err)
return
}
klog.Infof("Adding channels for new IMEX domain: %v", addedDomain)
newDriverResources := DriverResources(driverResources).DeepCopy()
newDriverResources.Pools[addedDomain] = generateImexChannelPool(addedDomain, ImexChannelLimit)
controller.Update(&newDriverResources)
driverResources = newDriverResources
driverResources := driverResources.DeepCopy()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this was a bug -- we actually want the following (i.e. no : because we want to overwrite the original):

driverResources = driverResources.DeepCopy()

It's fixed in my refactoring:
#189

driverResources.Pools[addedDomain] = generateImexChannelPool(addedDomain, offset, ResourceSliceImexChannelLimit)
controller.Update(driverResources)
case removedDomain := <-removedDomainsCh:
klog.Infof("Removing channels for removed IMEX domain: %v", removedDomain)
newDriverResources := DriverResources(driverResources).DeepCopy()
delete(newDriverResources.Pools, removedDomain)
controller.Update(&newDriverResources)
driverResources = newDriverResources
driverResources := driverResources.DeepCopy()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Likewise here...

delete(driverResources.Pools, removedDomain)
imexDomainOffsets.remove(removedDomain)
controller.Update(driverResources)
case <-ctx.Done():
return
}
Expand All @@ -146,17 +155,6 @@ func (m *ImexManager) Stop() error {
return nil
}

// DeepCopy will perform a deep copy of the provided DriverResources.
func (d DriverResources) DeepCopy() resourceslice.DriverResources {
driverResources := resourceslice.DriverResources{
Pools: make(map[string]resourceslice.Pool),
}
for p := range d.Pools {
driverResources.Pools[p] = generateImexChannelPool(p, ImexChannelLimit)
}
return driverResources
}

// streamImexDomains returns two channels that streams imexDomans that are added and removed from nodes over time.
func (m *ImexManager) streamImexDomains(ctx context.Context) (<-chan string, <-chan string, error) {
// Create channels to stream IMEX domain ids that are added / removed
Expand Down Expand Up @@ -249,10 +247,10 @@ func (m *ImexManager) streamImexDomains(ctx context.Context) (<-chan string, <-c
}

// generateImexChannelPool generates the contents of a ResourceSlice pool for a given IMEX domain.
func generateImexChannelPool(imexDomain string, numChannels int) resourceslice.Pool {
// Generate dchannels from 0 to numChannels
func generateImexChannelPool(imexDomain string, startChannel int, numChannels int) resourceslice.Pool {
// Generate channels from startChannel to offset+numChannels
var devices []resourceapi.Device
for i := 0; i < numChannels; i++ {
for i := startChannel; i < (startChannel + numChannels); i++ {
d := resourceapi.Device{
Name: fmt.Sprintf("imex-channel-%d", i),
Basic: &resourceapi.BasicDevice{
Expand Down Expand Up @@ -312,3 +310,60 @@ func (m *ImexManager) cleanupResourceSlices() error {

return nil
}

// add sets the offset where an IMEX domain's channels should start counting from.
func (offsets imexDomainOffsets) add(imexDomain string, resourceSliceImexChannelLimit, driverImexChannelLimit int) (int, error) {
// Split the incoming imexDomain to split off its cliqueID
id := strings.SplitN(imexDomain, ".", 2)
if len(id) != 2 {
return -1, fmt.Errorf("error adding IMEX domain %s: invalid format", imexDomain)
}
imexDomain = id[0]
cliqueID := id[1]

// Check if the IMEX domain is already in the map
if _, ok := offsets[imexDomain]; !ok {
offsets[imexDomain] = make(map[string]int)
}

// Return early if the clique is already in the map
if offset, exists := offsets[imexDomain][cliqueID]; exists {
return offset, nil
}

// Track used offsets for the current imexDomain
usedOffsets := make(map[int]struct{})
for _, v := range offsets[imexDomain] {
usedOffsets[v] = struct{}{}
}

// Look for the first unused offset, stepping by resourceSliceImexChannelLimit
var offset int
for offset = 0; offset < driverImexChannelLimit; offset += resourceSliceImexChannelLimit {
if _, exists := usedOffsets[offset]; !exists {
break
}
}

// If we reach the limit, return an error
if offset == driverImexChannelLimit {
return -1, fmt.Errorf("error adding IMEX domain %s: channel limit reached", imexDomain)
}
offsets[imexDomain][cliqueID] = offset

return offset, nil
}

func (offsets imexDomainOffsets) remove(imexDomain string) {
id := strings.SplitN(imexDomain, ".", 2)
if len(id) != 2 {
return
}
imexDomain = id[0]
cliqueID := id[1]

delete(offsets[imexDomain], cliqueID)
if len(offsets[imexDomain]) == 0 {
delete(offsets, imexDomain)
}
}
10 changes: 5 additions & 5 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ module github.com/NVIDIA/k8s-dra-driver

go 1.23.1

replace k8s.io/dynamic-resource-allocation => github.com/kubernetes/kubernetes/staging/src/k8s.io/dynamic-resource-allocation v0.0.0-20241014110620-faf89fe5e93b
replace k8s.io/dynamic-resource-allocation => github.com/kubernetes/kubernetes/staging/src/k8s.io/dynamic-resource-allocation v0.0.0-20241025114252-8b063a6a08bc

require (
github.com/Masterminds/semver v1.5.0
Expand Down Expand Up @@ -56,11 +56,11 @@ require (
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/moby/sys/mountinfo v0.7.1 // indirect
github.com/moby/sys/mountinfo v0.7.2 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/opencontainers/runc v1.1.13 // indirect
github.com/opencontainers/runc v1.1.15 // indirect
github.com/opencontainers/runtime-spec v1.2.0 // indirect
github.com/opencontainers/runtime-tools v0.9.1-0.20221107090550-2e043c6bd626 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
Expand All @@ -73,8 +73,8 @@ require (
github.com/x448/float16 v0.8.4 // indirect
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect
go.uber.org/multierr v1.11.0 // indirect
go.uber.org/zap v1.26.0 // indirect
golang.org/x/mod v0.20.0 // indirect
go.uber.org/zap v1.27.0 // indirect
golang.org/x/mod v0.21.0 // indirect
golang.org/x/net v0.30.0 // indirect
golang.org/x/oauth2 v0.23.0 // indirect
golang.org/x/term v0.25.0 // indirect
Expand Down
21 changes: 10 additions & 11 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -85,13 +85,13 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/kubernetes/kubernetes/staging/src/k8s.io/dynamic-resource-allocation v0.0.0-20241014110620-faf89fe5e93b h1:S43DzJ7/nhmV+gH1DacCljYoJu6WztK1Fjpz9iUmoPQ=
github.com/kubernetes/kubernetes/staging/src/k8s.io/dynamic-resource-allocation v0.0.0-20241014110620-faf89fe5e93b/go.mod h1:8g5MWHFI0UiuvtrclYyS4K8EA55VrU1/4lBG6HIQ2cI=
github.com/kubernetes/kubernetes/staging/src/k8s.io/dynamic-resource-allocation v0.0.0-20241025114252-8b063a6a08bc h1:7Y3cSZsYslBJhAPVsseOhrU9RxJuCdzgaMYvoIDuGUA=
github.com/kubernetes/kubernetes/staging/src/k8s.io/dynamic-resource-allocation v0.0.0-20241025114252-8b063a6a08bc/go.mod h1:dZVQfU+lUXza85oigVyOmZXj/xsFIon0O6/NamFg82M=
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
github.com/mndrix/tap-go v0.0.0-20171203230836-629fa407e90b/go.mod h1:pzzDgJWZ34fGzaAZGFW22KVZDfyrYW+QABMrWnJBnSs=
github.com/moby/sys/mountinfo v0.7.1 h1:/tTvQaSJRr2FshkhXiIpux6fQ2Zvc4j7tAhMTStAG2g=
github.com/moby/sys/mountinfo v0.7.1/go.mod h1:IJb6JQeOklcdMU9F5xQ8ZALD+CUr5VlGpwtX+VE0rpI=
github.com/moby/sys/mountinfo v0.7.2 h1:1shs6aH5s4o5H2zQLn796ADW1wMrIwHsyJ2v9KouLrg=
github.com/moby/sys/mountinfo v0.7.2/go.mod h1:1YOa8w8Ih7uW0wALDUgT1dTTSBrZ+HiBLGws92L2RU4=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
Expand All @@ -104,8 +104,8 @@ github.com/onsi/ginkgo/v2 v2.19.0 h1:9Cnnf7UHo57Hy3k6/m5k3dRfGTMXGvxhHFvkDTCTpvA
github.com/onsi/ginkgo/v2 v2.19.0/go.mod h1:rlwLi9PilAFJ8jCg9UE1QP6VBpd6/xj3SRC0d6TU0To=
github.com/onsi/gomega v1.33.1 h1:dsYjIxxSR755MDmKVsaFQTE22ChNBcuuTWgkUDSubOk=
github.com/onsi/gomega v1.33.1/go.mod h1:U4R44UsT+9eLIaYRB2a5qajjtQYn0hauxvRm16AVYg0=
github.com/opencontainers/runc v1.1.13 h1:98S2srgG9vw0zWcDpFMn5TRrh8kLxa/5OFUstuUhmRs=
github.com/opencontainers/runc v1.1.13/go.mod h1:R016aXacfp/gwQBYw2FDGa9m+n6atbLWrYY8hNMT/sA=
github.com/opencontainers/runc v1.1.15 h1:QMmSU2q1YUg3iOJX11phnaDi2A5/zhx4BR6h+XZ1DMA=
github.com/opencontainers/runc v1.1.15/go.mod h1:E4C2z+7BxR7GHXp0hAY53mek+x49X1LjPNeMTfRGvOA=
github.com/opencontainers/runtime-spec v1.0.3-0.20220825212826-86290f6a00fb/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
github.com/opencontainers/runtime-spec v1.2.0 h1:z97+pHb3uELt/yiAWD691HNHQIF07bE7dzrbT927iTk=
github.com/opencontainers/runtime-spec v1.2.0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
Expand Down Expand Up @@ -171,17 +171,17 @@ go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=
go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
go.uber.org/zap v1.26.0 h1:sI7k6L95XOKS281NhVKOFCUNIvv9e0w4BF8N3u+tCRo=
go.uber.org/zap v1.26.0/go.mod h1:dtElttAiwGvoJ/vj4IwHBS/gXsEu/pZ50mUIRWuG0so=
go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8=
go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8=
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY=
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0=
golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/mod v0.21.0 h1:vvrHzRwRfVKSiLrG+d4FMl/Qi4ukBCE6kZlTUkDYRT0=
golang.org/x/mod v0.21.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
Expand All @@ -200,7 +200,6 @@ golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191115151921-52ab43148777/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo=
golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
Expand Down
2 changes: 1 addition & 1 deletion vendor/github.com/moby/sys/mountinfo/mounted_linux.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion vendor/go.uber.org/zap/.golangci.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 9 additions & 1 deletion vendor/go.uber.org/zap/.readme.tmpl

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading