diff --git a/api/nvidia.com/resource/gpu/v1alpha1/driverconfig.go b/api/nvidia.com/resource/gpu/v1alpha1/driverconfig.go new file mode 100644 index 00000000..2dd59390 --- /dev/null +++ b/api/nvidia.com/resource/gpu/v1alpha1/driverconfig.go @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package v1alpha1 + +import "fmt" + +// GpuDriver encodes the gpu driver as a string. +type GpuDriver string + +const ( + NvidiaDriver GpuDriver = "nvidia" + VfioPciDriver GpuDriver = "vfio-pci" +) + +// GpuDriverConfig holds the set of parameters for configuring a GPU with a driver. +type GpuDriverConfig struct { + Driver GpuDriver `json:"driver"` +} + +// DefaultGpuDriverConfig provides the default configuration of a GPU with a driver. +func DefaultGpuDriverConfig() *GpuDriverConfig { + return &GpuDriverConfig{ + Driver: NvidiaDriver, + } +} + +// Normalize updates a GpuDriverConfig config with implied default values based on other settings. +func (c *GpuDriverConfig) Normalize() error { + if c.Driver == "" { + c.Driver = NvidiaDriver + } + return nil +} + +// Validate ensures that GpuDriverConfig has a valid set of values. 
+func (c *GpuDriverConfig) Validate() error { + switch c.Driver { + case NvidiaDriver: + fallthrough + case VfioPciDriver: + break + default: + return fmt.Errorf("invalid driver '%s' specified in gpu driver configuration", c.Driver) + } + return nil +} diff --git a/api/nvidia.com/resource/gpu/v1alpha1/gpuconfig.go b/api/nvidia.com/resource/gpu/v1alpha1/gpuconfig.go index d14699fd..d61db48c 100644 --- a/api/nvidia.com/resource/gpu/v1alpha1/gpuconfig.go +++ b/api/nvidia.com/resource/gpu/v1alpha1/gpuconfig.go @@ -29,7 +29,8 @@ import ( // GpuConfig holds the set of parameters for configuring a GPU. type GpuConfig struct { metav1.TypeMeta `json:",inline"` - Sharing *GpuSharing `json:"sharing,omitempty"` + Sharing *GpuSharing `json:"sharing,omitempty"` + DriverConfig *GpuDriverConfig `json:"driverConfig,omitempty"` } // DefaultGpuConfig provides the default GPU configuration. @@ -45,11 +46,27 @@ func DefaultGpuConfig() *GpuConfig { Interval: ptr.To(DefaultTimeSlice), }, }, + DriverConfig: &GpuDriverConfig{ + Driver: NvidiaDriver, + }, } } // Normalize updates a GpuConfig config with implied default values based on other settings. func (c *GpuConfig) Normalize() error { + if c.DriverConfig == nil { + c.DriverConfig = DefaultGpuDriverConfig() + } + + if err := c.DriverConfig.Normalize(); err != nil { + return err + } + + // If sharing is not supported, don't proceed with normalizing its configuration. + if !c.SupportsSharing() { + return nil + } + if c.Sharing == nil { c.Sharing = &GpuSharing{ Strategy: TimeSlicingStrategy, @@ -68,8 +85,23 @@ func (c *GpuConfig) Normalize() error { // Validate ensures that GpuConfig has a valid set of values. 
func (c *GpuConfig) Validate() error { - if c.Sharing == nil { - return fmt.Errorf("no sharing strategy set") + if err := c.DriverConfig.Validate(); err != nil { + return err } - return c.Sharing.Validate() + + if c.SupportsSharing() { + if c.Sharing == nil { + return fmt.Errorf("no sharing strategy set") + } + if err := c.Sharing.Validate(); err != nil { + return err + } + } else if c.Sharing != nil { + return fmt.Errorf("sharing strategy cannot be provided while using non-nvidia driver") + } + return nil +} + +func (c *GpuConfig) SupportsSharing() bool { + return c.DriverConfig.Driver == NvidiaDriver } diff --git a/api/nvidia.com/resource/gpu/v1alpha1/zz_generated.deepcopy.go b/api/nvidia.com/resource/gpu/v1alpha1/zz_generated.deepcopy.go index 86a9f407..7cf34e00 100644 --- a/api/nvidia.com/resource/gpu/v1alpha1/zz_generated.deepcopy.go +++ b/api/nvidia.com/resource/gpu/v1alpha1/zz_generated.deepcopy.go @@ -1,4 +1,5 @@ //go:build !ignore_autogenerated +// +build !ignore_autogenerated /* * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. @@ -33,6 +34,11 @@ func (in *GpuConfig) DeepCopyInto(out *GpuConfig) { *out = new(GpuSharing) (*in).DeepCopyInto(*out) } + if in.DriverConfig != nil { + in, out := &in.DriverConfig, &out.DriverConfig + *out = new(GpuDriverConfig) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GpuConfig. @@ -53,6 +59,21 @@ func (in *GpuConfig) DeepCopyObject() runtime.Object { return nil } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GpuDriverConfig) DeepCopyInto(out *GpuDriverConfig) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GpuDriverConfig. 
+func (in *GpuDriverConfig) DeepCopy() *GpuDriverConfig { + if in == nil { + return nil + } + out := new(GpuDriverConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *GpuSharing) DeepCopyInto(out *GpuSharing) { *out = *in diff --git a/cmd/nvidia-dra-plugin/allocatable.go b/cmd/nvidia-dra-plugin/allocatable.go index 3350716c..68a54241 100644 --- a/cmd/nvidia-dra-plugin/allocatable.go +++ b/cmd/nvidia-dra-plugin/allocatable.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -106,3 +106,14 @@ func (d AllocatableDevices) UUIDs() []string { slices.Sort(uuids) return uuids } + +func (d AllocatableDevices) PciAddresses() []string { + var pciAddresses []string + for _, device := range d { + if device.Type() == GpuDeviceType { + pciAddresses = append(pciAddresses, device.Gpu.PciAddress) + } + } + slices.Sort(pciAddresses) + return pciAddresses +} diff --git a/cmd/nvidia-dra-plugin/device_state.go b/cmd/nvidia-dra-plugin/device_state.go index 0ecd3423..9e7407cb 100644 --- a/cmd/nvidia-dra-plugin/device_state.go +++ b/cmd/nvidia-dra-plugin/device_state.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -38,17 +38,19 @@ type OpaqueDeviceConfig struct { } type DeviceConfigState struct { - MpsControlDaemonID string `json:"mpsControlDaemonID"` + MpsControlDaemonID string `json:"mpsControlDaemonID"` + GpuConfig *configapi.GpuConfig `json:"gpuConfig,omitempty"` containerEdits *cdiapi.ContainerEdits } type DeviceState struct { sync.Mutex - cdi *CDIHandler - tsManager *TimeSlicingManager - mpsManager *MpsManager - allocatable AllocatableDevices - config *Config + cdi *CDIHandler + tsManager *TimeSlicingManager + mpsManager *MpsManager + vfioPciManager *VfioPciManager + allocatable AllocatableDevices + config *Config nvdevlib *deviceLib checkpointManager checkpointmanager.CheckpointManager @@ -87,6 +89,8 @@ func NewDeviceState(ctx context.Context, config *Config) (*DeviceState, error) { tsManager := NewTimeSlicingManager(nvdevlib) mpsManager := NewMpsManager(config, nvdevlib, MpsRoot, hostDriverRoot, MpsControlDaemonTemplatePath) + vfioPciManager := NewVfioPciManager() + if err := cdi.CreateStandardDeviceSpecFile(allocatable); err != nil { return nil, fmt.Errorf("unable to create base CDI spec file: %v", err) } @@ -100,12 +104,18 @@ func NewDeviceState(ctx context.Context, config *Config) (*DeviceState, error) { cdi: cdi, tsManager: tsManager, mpsManager: mpsManager, + vfioPciManager: vfioPciManager, allocatable: allocatable, config: config, nvdevlib: nvdevlib, checkpointManager: checkpointManager, } + // Initialize the vfio-pci driver manager. 
+	if err := vfioPciManager.Init(); err != nil {
+		return nil, fmt.Errorf("unable to initialize vfio-pci manager: %v", err)
+	}
+
 	checkpoints, err := state.checkpointManager.ListCheckpoints()
 	if err != nil {
 		return nil, fmt.Errorf("unable to list checkpoints: %v", err)
 	}
@@ -349,35 +359,67 @@ func (s *DeviceState) prepareDevices(ctx context.Context, claim *resourceapi.Res
 func (s *DeviceState) unprepareDevices(ctx context.Context, claimUID string, devices PreparedDevices) error {
 	for _, group := range devices {
+		if group.ConfigState.GpuConfig != nil {
+			err := s.unprepareGpus(ctx, group.ConfigState.GpuConfig, group.Devices.Gpus())
+			if err != nil {
+				return err
+			}
+		}
 		// Stop any MPS control daemons started for each group of prepared devices.
 		mpsControlDaemon := s.mpsManager.NewMpsControlDaemon(claimUID, group)
 		if err := mpsControlDaemon.Stop(ctx); err != nil {
 			return fmt.Errorf("error stopping MPS control daemon: %w", err)
 		}
 
-		// Go back to default time-slicing for all full GPUs.
 		tsc := configapi.DefaultGpuConfig().Sharing.TimeSlicingConfig
-		if err := s.tsManager.SetTimeSlice(group.Devices.Gpus(), tsc); err != nil {
+		if err := s.tsManager.SetTimeSlice(group.Devices.Gpus(), tsc); err != nil {
 			return fmt.Errorf("error setting timeslice for devices: %w", err)
 		}
 	}
 
 	return nil
 }
 
+func (s *DeviceState) unprepareGpus(ctx context.Context, config *configapi.GpuConfig, devices PreparedDeviceList) error {
+	if config.DriverConfig.Driver == configapi.VfioPciDriver {
+		for _, device := range devices {
+			if err := s.vfioPciManager.Unconfigure(device.Gpu.Info); err != nil {
+				return fmt.Errorf("error unconfiguring vfio-pci device: %w", err)
+			}
+		}
+	}
+	return nil
+}
 func (s *DeviceState) applyConfig(ctx context.Context, config configapi.Interface, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult) (*DeviceConfigState, error) {
+	var configState DeviceConfigState
 	switch castConfig := config.(type) {
 	case *configapi.GpuConfig:
-		return 
s.applySharingConfig(ctx, castConfig.Sharing, claim, results) + configState.GpuConfig = castConfig + return s.applyGpuConfig(ctx, castConfig, claim, results, &configState) case *configapi.MigDeviceConfig: - return s.applySharingConfig(ctx, castConfig.Sharing, claim, results) + return s.applySharingConfig(ctx, castConfig.Sharing, claim, results, &configState) case *configapi.ImexChannelConfig: - return s.applyImexChannelConfig(ctx, castConfig, claim, results) + return s.applyImexChannelConfig(ctx, castConfig, claim, results, &configState) default: return nil, fmt.Errorf("unknown config type: %T", castConfig) } } -func (s *DeviceState) applySharingConfig(ctx context.Context, config configapi.Sharing, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult) (*DeviceConfigState, error) { +func (s *DeviceState) applyGpuConfig(ctx context.Context, config *configapi.GpuConfig, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult, configState *DeviceConfigState) (*DeviceConfigState, error) { + var err error + configState, err = s.applyGpuDriverConfig(ctx, config.DriverConfig, results, configState) + if err != nil { + return nil, err + } + if config.SupportsSharing() { + configState, err = s.applySharingConfig(ctx, config.Sharing, claim, results, configState) + if err != nil { + return nil, err + } + } + return configState, nil +} + +func (s *DeviceState) applySharingConfig(ctx context.Context, config configapi.Sharing, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult, configState *DeviceConfigState) (*DeviceConfigState, error) { // Get the list of claim requests this config is being applied over. var requests []string for _, r := range results { @@ -390,9 +432,6 @@ func (s *DeviceState) applySharingConfig(ctx context.Context, config configapi.S allocatableDevices[r.Device] = s.allocatable[r.Device] } - // Declare a device group state object to populate. 
- var configState DeviceConfigState - // Apply time-slicing settings (if available). if config.IsTimeSlicing() { tsc, err := config.GetTimeSlicingConfig() @@ -424,13 +463,10 @@ func (s *DeviceState) applySharingConfig(ctx context.Context, config configapi.S configState.containerEdits = mpsControlDaemon.GetCDIContainerEdits() } - return &configState, nil + return configState, nil } -func (s *DeviceState) applyImexChannelConfig(ctx context.Context, config *configapi.ImexChannelConfig, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult) (*DeviceConfigState, error) { - // Declare a device group state object to populate. - var configState DeviceConfigState - +func (s *DeviceState) applyImexChannelConfig(ctx context.Context, config *configapi.ImexChannelConfig, claim *resourceapi.ResourceClaim, results []*resourceapi.DeviceRequestAllocationResult, configState *DeviceConfigState) (*DeviceConfigState, error) { // Create any necessary IMEX channels and gather their CDI container edits. for _, r := range results { imexChannel := s.allocatable[r.Device].ImexChannel @@ -440,7 +476,25 @@ func (s *DeviceState) applyImexChannelConfig(ctx context.Context, config *config configState.containerEdits = configState.containerEdits.Append(s.cdi.GetImexChannelContainerEdits(imexChannel)) } - return &configState, nil + return configState, nil +} + +func (s *DeviceState) applyGpuDriverConfig(ctx context.Context, config *configapi.GpuDriverConfig, results []*resourceapi.DeviceRequestAllocationResult, configState *DeviceConfigState) (*DeviceConfigState, error) { + if config.Driver != configapi.VfioPciDriver { + return configState, nil + } + + // Apply vfio-pci driver settings. 
+ for _, r := range results { + info := s.allocatable[r.Device] + err := s.vfioPciManager.Configure(info.Gpu) + if err != nil { + return nil, err + } + configState.containerEdits = configState.containerEdits.Append(s.vfioPciManager.GetCDIContainerEdits(info.Gpu)) + } + + return configState, nil } // GetOpaqueDeviceConfigs returns an ordered list of the configs contained in possibleConfigs for this driver. diff --git a/cmd/nvidia-dra-plugin/deviceinfo.go b/cmd/nvidia-dra-plugin/deviceinfo.go index cc899c44..194742ea 100644 --- a/cmd/nvidia-dra-plugin/deviceinfo.go +++ b/cmd/nvidia-dra-plugin/deviceinfo.go @@ -29,6 +29,7 @@ import ( type GpuInfo struct { UUID string `json:"uuid"` + PciAddress string `json:"pciAddress"` index int minor int migEnabled bool @@ -130,6 +131,9 @@ func (d *GpuInfo) GetDevice() resourceapi.Device { "cudaDriverVersion": { VersionValue: ptr.To(semver.MustParse(d.cudaDriverVersion).String()), }, + "pciAddress": { + StringValue: &d.PciAddress, + }, }, Capacity: map[resourceapi.QualifiedName]resource.Quantity{ "memory": *resource.NewQuantity(int64(d.memoryBytes), resource.BinarySI), diff --git a/cmd/nvidia-dra-plugin/mutex.go b/cmd/nvidia-dra-plugin/mutex.go new file mode 100644 index 00000000..e98fed15 --- /dev/null +++ b/cmd/nvidia-dra-plugin/mutex.go @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package main + +import ( + "sync" +) + +type PerGPUMutex struct { + sync.Mutex + submutex map[string]*sync.Mutex +} + +var perGpuLock *PerGPUMutex + +func init() { + perGpuLock = &PerGPUMutex{ + submutex: make(map[string]*sync.Mutex), + } +} + +func (pgm *PerGPUMutex) Get(gpu string) *sync.Mutex { + pgm.Mutex.Lock() + defer pgm.Mutex.Unlock() + if pgm.submutex[gpu] == nil { + pgm.submutex[gpu] = &sync.Mutex{} + } + return pgm.submutex[gpu] +} diff --git a/cmd/nvidia-dra-plugin/nvlib.go b/cmd/nvidia-dra-plugin/nvlib.go index 421e7a50..37639e92 100644 --- a/cmd/nvidia-dra-plugin/nvlib.go +++ b/cmd/nvidia-dra-plugin/nvlib.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -240,7 +240,10 @@ func (l deviceLib) getGpuInfo(index int, device nvdev.Device) (*GpuInfo, error) if ret != nvml.SUCCESS { return nil, fmt.Errorf("error getting CUDA driver version: %w", err) } - + pciAddress, err := device.GetPCIBusID() + if err != nil { + return nil, err + } var migProfiles []*MigProfileInfo for i := 0; i < nvml.GPU_INSTANCE_PROFILE_COUNT; i++ { giProfileInfo, ret := device.GetGpuInstanceProfileInfo(i) @@ -307,6 +310,7 @@ func (l deviceLib) getGpuInfo(index int, device nvdev.Device) (*GpuInfo, error) driverVersion: driverVersion, cudaDriverVersion: fmt.Sprintf("%v.%v", cudaDriverVersion/1000, (cudaDriverVersion%1000)/10), migProfiles: migProfiles, + PciAddress: pciAddress, } return gpuInfo, nil diff --git a/cmd/nvidia-dra-plugin/prepared.go b/cmd/nvidia-dra-plugin/prepared.go index edb369fe..f939e77b 100644 --- a/cmd/nvidia-dra-plugin/prepared.go +++ b/cmd/nvidia-dra-plugin/prepared.go @@ -203,3 +203,23 @@ func (d PreparedDevices) MigDeviceUUIDs() []string { slices.Sort(uuids) return uuids } + +func (l PreparedDeviceList) 
PciAddresses() []string { + var pciAddresses []string + for _, device := range l.Gpus() { + pciAddresses = append(pciAddresses, device.Gpu.Info.PciAddress) + } + return pciAddresses +} + +func (g *PreparedDeviceGroup) PciAddresses() []string { + return g.Devices.Gpus().PciAddresses() +} + +func (d PreparedDevices) PciAddresses() []string { + var pciAddresses []string + for _, group := range d { + pciAddresses = append(pciAddresses, group.PciAddresses()...) + } + return pciAddresses +} diff --git a/cmd/nvidia-dra-plugin/vfio-device.go b/cmd/nvidia-dra-plugin/vfio-device.go new file mode 100644 index 00000000..d4c8fc8e --- /dev/null +++ b/cmd/nvidia-dra-plugin/vfio-device.go @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package main + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + + cdiapi "tags.cncf.io/container-device-interface/pkg/cdi" + cdispec "tags.cncf.io/container-device-interface/specs-go" +) + +const ( + hostNamespaceMount = "/proc/1/ns/mnt" + vfioPciModule = "vfio_pci" + vfioPciDriver = "vfio-pci" + nvidiaDriver = "nvidia" + unbindFromDriverScript = "/usr/bin/unbind_from_driver.sh" + bindToDriverScript = "/usr/bin/bind_to_driver.sh" + driverResetRetries = "5" +) + +type VfioPciManager struct { + pciDevicesRoot string + vfioDevicesRoot string + sysModulesRoot string + driver string + vfioPciModule string +} + +func NewVfioPciManager() *VfioPciManager { + return &VfioPciManager{ + pciDevicesRoot: "/sys/bus/pci/devices", + vfioDevicesRoot: "/dev/vfio", + sysModulesRoot: "/sys/module", + driver: vfioPciDriver, + vfioPciModule: vfioPciModule, + } +} + +// Init ensures the vfio-pci module is loaded on the host. +func (vm *VfioPciManager) Init() error { + if !vm.isVfioPCIModuleLoaded() { + err := vm.loadVfioPciModule() + if err != nil { + return err + } + } + return nil +} + +func (vm *VfioPciManager) isVfioPCIModuleLoaded() bool { + modules, err := os.ReadDir(vm.sysModulesRoot) + if err != nil { + return false + } + + for _, module := range modules { + if module.Name() == vm.vfioPciModule { + return true + } + } + + return false + +} + +func (vm *VfioPciManager) loadVfioPciModule() error { + _, err := execCommandInHostNamespace("modprobe", []string{vm.vfioPciModule}) //nolint:gosec + if err != nil { + return err + } + + return nil +} + +// Configure binds the GPU to the vfio-pci driver. 
+func (vm *VfioPciManager) Configure(info *GpuInfo) error {
+	perGpuLock.Get(info.PciAddress).Lock()
+	defer perGpuLock.Get(info.PciAddress).Unlock()
+
+	driver, err := getDriver(vm.pciDevicesRoot, info.PciAddress)
+	if err != nil {
+		return err
+	}
+	if driver == vm.driver {
+		return nil
+	}
+	err = changeDriver(info.PciAddress, vm.driver)
+	if err != nil {
+		return err
+	}
+	return nil
+}
+
+// Unconfigure binds the GPU to the nvidia driver.
+func (vm *VfioPciManager) Unconfigure(info *GpuInfo) error {
+	perGpuLock.Get(info.PciAddress).Lock()
+	defer perGpuLock.Get(info.PciAddress).Unlock()
+
+	driver, err := getDriver(vm.pciDevicesRoot, info.PciAddress)
+	if err != nil {
+		return err
+	}
+	if driver == nvidiaDriver {
+		return nil
+	}
+	err = changeDriver(info.PciAddress, nvidiaDriver)
+	if err != nil {
+		return err
+	}
+	return nil
+}
+
+func getDriver(pciDevicesRoot, pciAddress string) (string, error) {
+	driverPath, err := os.Readlink(filepath.Join(pciDevicesRoot, pciAddress, "driver"))
+	if err != nil {
+		return "", err
+	}
+	_, driver := filepath.Split(driverPath)
+	return driver, nil
+}
+
+func changeDriver(pciAddress, driver string) error {
+	err := unbindFromDriver(pciAddress, driverResetRetries)
+	if err != nil {
+		return err
+	}
+	err = bindToDriver(pciAddress, driver)
+	if err != nil {
+		return err
+	}
+	return nil
+}
+
+func unbindFromDriver(pciAddress, driverResetRetries string) error {
+	_, err := execCommandInHostNamespace(unbindFromDriverScript, []string{pciAddress, driverResetRetries}) //nolint:gosec
+	if err != nil {
+		return err
+	}
+	return nil
+}
+
+func bindToDriver(pciAddress, driver string) error {
+	_, err := execCommandInHostNamespace(bindToDriverScript, []string{pciAddress, driver}) //nolint:gosec
+	if err != nil {
+		return err
+	}
+	return nil
+}
+
+func (vm *VfioPciManager) getIommuGroupForVfioPciDevice(pciAddress string) string {
+	iommuGroup, err := os.Readlink(filepath.Join(vm.pciDevicesRoot, pciAddress, "iommu_group"))
+	if err != nil {
+		
return "" + } + _, file := filepath.Split(iommuGroup) + return file + +} + +// GetCDIContainerEdits returns the CDI spec for a container to have access to the GPU while bound on vfio-pci driver. +func (vm *VfioPciManager) GetCDIContainerEdits(info *GpuInfo) *cdiapi.ContainerEdits { + iommuGroup := vm.getIommuGroupForVfioPciDevice(info.PciAddress) + vfioDevicePath := filepath.Join(vm.vfioDevicesRoot, iommuGroup) + return &cdiapi.ContainerEdits{ + ContainerEdits: &cdispec.ContainerEdits{ + DeviceNodes: []*cdispec.DeviceNode{ + { + Path: vfioDevicePath, + }, + }, + }, + } +} + +func execCommandInHostNamespace(cmd string, args []string) ([]byte, error) { + nsenterArgs := []string{fmt.Sprintf("--mount=%s", hostNamespaceMount), "--", cmd} + nsenterArgs = append(nsenterArgs, args...) + return exec.Command("nsenter", nsenterArgs...).CombinedOutput() +} diff --git a/demo/clusters/kind/install-dra-driver.sh b/demo/clusters/kind/install-dra-driver.sh index ece8cdf1..b48aafd6 100755 --- a/demo/clusters/kind/install-dra-driver.sh +++ b/demo/clusters/kind/install-dra-driver.sh @@ -24,7 +24,7 @@ source "${CURRENT_DIR}/scripts/common.sh" kubectl label node -l node-role.x-k8s.io/worker --overwrite nvidia.com/gpu.present=true -deviceClasses=${1:-"gpu,mig,imex"} +deviceClasses=${1:-"gpu,mig,imex,vfiopci"} helm upgrade -i --create-namespace --namespace nvidia nvidia-dra-driver ${PROJECT_DIR}/deployments/helm/k8s-dra-driver \ --set deviceClasses="{${deviceClasses}}" \ ${NVIDIA_CTK_PATH:+--set nvidiaCtkPath=${NVIDIA_CTK_PATH}} \ diff --git a/demo/clusters/kind/scripts/kind-cluster-config.yaml b/demo/clusters/kind/scripts/kind-cluster-config.yaml index f1a34a1c..890b0b71 100644 --- a/demo/clusters/kind/scripts/kind-cluster-config.yaml +++ b/demo/clusters/kind/scripts/kind-cluster-config.yaml @@ -66,3 +66,5 @@ nodes: # on the kind nodes. 
- hostPath: /usr/bin/nvidia-ctk containerPath: /usr/bin/nvidia-ctk + - hostPath: /sys + containerPath: /sys \ No newline at end of file diff --git a/demo/specs/quickstart/gpu-test-vfiopci.yaml b/demo/specs/quickstart/gpu-test-vfiopci.yaml new file mode 100644 index 00000000..75f6cfbe --- /dev/null +++ b/demo/specs/quickstart/gpu-test-vfiopci.yaml @@ -0,0 +1,41 @@ +# One pod, one container asking for 1 distinct GPU + +--- +apiVersion: v1 +kind: Namespace +metadata: + name: gpu-test-vfiopci + +--- +apiVersion: resource.k8s.io/v1alpha3 +kind: ResourceClaimTemplate +metadata: + namespace: gpu-test-vfiopci + name: single-gpu +spec: + spec: + devices: + requests: + - name: gpu + deviceClassName: vfiopci.nvidia.com + +--- +apiVersion: v1 +kind: Pod +metadata: + namespace: gpu-test-vfiopci + name: pod1 + labels: + app: pod +spec: + containers: + - name: ctr + image: ubuntu:22.04 + command: ["bash", "-c"] + args: ["sleep 9999 & wait"] + resources: + claims: + - name: gpu + resourceClaims: + - name: gpu + resourceClaimTemplateName: single-gpu diff --git a/deployments/container/Dockerfile.ubi8 b/deployments/container/Dockerfile.ubi8 index c365f123..ab3b2715 100644 --- a/deployments/container/Dockerfile.ubi8 +++ b/deployments/container/Dockerfile.ubi8 @@ -19,7 +19,7 @@ ARG BASE_DIST=ubi8 FROM --platform=${TARGETARCH} nvidia/cuda:${CUDA_VERSION}-base-${BASE_DIST} as build RUN yum install -y \ - wget make git gcc \ + wget make git gcc kmod \ && \ rm -rf /var/cache/yum/* @@ -59,9 +59,12 @@ LABEL org.opencontainers.image.description "NVIDIA GPU DRA driver for Kubernetes RUN mkdir /licenses && mv /NGC-DL-CONTAINER-LICENSE /licenses/NGC-DL-CONTAINER-LICENSE -COPY --from=build /artifacts/nvidia-dra-controller /usr/bin/nvidia-dra-controller -COPY --from=build /artifacts/nvidia-dra-plugin /usr/bin/nvidia-dra-plugin -COPY --from=build /build/templates /templates +COPY --from=build /artifacts/nvidia-dra-plugin /usr/bin/nvidia-dra-plugin +COPY --from=build /build/templates /templates 
+COPY --from=build /build/scripts/bind_to_driver.sh /usr/bin/bind_to_driver.sh +COPY --from=build /build/scripts/unbind_from_driver.sh /usr/bin/unbind_from_driver.sh +COPY --from=build /usr/bin/nsenter /usr/bin/nsenter +COPY --from=build /usr/sbin/modprobe /usr/sbin/modprobe # Install / upgrade packages here that are required to resolve CVEs ARG CVE_UPDATES diff --git a/deployments/container/Dockerfile.ubuntu b/deployments/container/Dockerfile.ubuntu index ea5d58a3..3559e5c0 100644 --- a/deployments/container/Dockerfile.ubuntu +++ b/deployments/container/Dockerfile.ubuntu @@ -19,7 +19,7 @@ ARG BASE_DIST=ubuntu20.04 FROM --platform=${BUILDOS}/amd64 nvidia/cuda:${CUDA_VERSION}-base-${BASE_DIST} as build RUN apt-get update && \ - apt-get install -y wget make git gcc-aarch64-linux-gnu gcc \ + apt-get install -y wget make git gcc-aarch64-linux-gnu gcc kmod \ && \ rm -rf /var/lib/apt/lists/* @@ -64,9 +64,12 @@ LABEL org.opencontainers.image.description "NVIDIA GPU DRA driver for Kubernetes RUN mkdir /licenses && mv /NGC-DL-CONTAINER-LICENSE /licenses/NGC-DL-CONTAINER-LICENSE -COPY --from=build /artifacts/nvidia-dra-controller /usr/bin/nvidia-dra-controller -COPY --from=build /artifacts/nvidia-dra-plugin /usr/bin/nvidia-dra-plugin -COPY --from=build /build/templates /templates +COPY --from=build /artifacts/nvidia-dra-plugin /usr/bin/nvidia-dra-plugin +COPY --from=build /build/templates /templates +COPY --from=build /build/scripts/bind_to_driver.sh /usr/bin/bind_to_driver.sh +COPY --from=build /build/scripts/unbind_from_driver.sh /usr/bin/unbind_from_driver.sh +COPY --from=build /usr/bin/nsenter /usr/bin/nsenter +COPY --from=build /usr/sbin/modprobe /usr/sbin/modprobe # Install / upgrade packages here that are required to resolve CVEs ARG CVE_UPDATES diff --git a/deployments/helm/k8s-dra-driver/templates/_helpers.tpl b/deployments/helm/k8s-dra-driver/templates/_helpers.tpl index 7cf4ea01..8fff1579 100644 --- a/deployments/helm/k8s-dra-driver/templates/_helpers.tpl +++ 
b/deployments/helm/k8s-dra-driver/templates/_helpers.tpl @@ -127,3 +127,21 @@ Filter a list by a set of valid values {{- end }} {{- $result -}} {{- end -}} + +{{- define "k8s-dra-driver.vfiopciDeviceClassVolumes" -}} +- name: sysfs + hostPath: + path: /sys +- name: dev-vfio + hostPath: + path: /dev/vfio +{{- end -}} + +{{- define "k8s-dra-driver.vfiopciDeviceClassVolumeMounts" -}} +- name: sysfs + mountPath: /sys + readOnly: false +- name: dev-vfio + mountPath: /dev/vfio + readOnly: false +{{- end -}} \ No newline at end of file diff --git a/deployments/helm/k8s-dra-driver/templates/deviceclass-vfiopci.yaml b/deployments/helm/k8s-dra-driver/templates/deviceclass-vfiopci.yaml new file mode 100644 index 00000000..9fb1cb31 --- /dev/null +++ b/deployments/helm/k8s-dra-driver/templates/deviceclass-vfiopci.yaml @@ -0,0 +1,19 @@ +{{- if include "k8s-dra-driver.listHas" (list $.Values.deviceClasses "vfiopci") }} +--- +apiVersion: resource.k8s.io/v1alpha3 +kind: DeviceClass +metadata: + name: vfiopci.nvidia.com +spec: + config: + - opaque: + driver: gpu.nvidia.com + parameters: + apiVersion: gpu.nvidia.com/v1alpha1 + kind: GpuConfig + driverConfig: + driver: vfio-pci + selectors: + - cel: + expression: "device.driver == 'gpu.nvidia.com' && device.attributes['gpu.nvidia.com'].type == 'gpu'" +{{- end }} diff --git a/deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml b/deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml index 0b9b09b0..161d78a0 100644 --- a/deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml +++ b/deployments/helm/k8s-dra-driver/templates/kubeletplugin.yaml @@ -103,6 +103,9 @@ spec: - name: driver-root mountPath: /driver-root readOnly: true + {{- if include "k8s-dra-driver.listHas" (list $.Values.deviceClasses "vfiopci") }} + {{- include "k8s-dra-driver.vfiopciDeviceClassVolumeMounts" . 
| nindent 8 }} + {{- end }} volumes: - name: plugins-registry hostPath: @@ -116,6 +119,9 @@ spec: - name: driver-root hostPath: path: {{ .Values.nvidiaDriverRoot }} + {{- if include "k8s-dra-driver.listHas" (list $.Values.deviceClasses "vfiopci") }} + {{- include "k8s-dra-driver.vfiopciDeviceClassVolumes" . | nindent 6}} + {{- end }} {{- with .Values.kubeletPlugin.nodeSelector }} nodeSelector: {{- toYaml . | nindent 8 }} diff --git a/deployments/helm/k8s-dra-driver/templates/validation.yaml b/deployments/helm/k8s-dra-driver/templates/validation.yaml index ce2dbe68..f93e7dd3 100644 --- a/deployments/helm/k8s-dra-driver/templates/validation.yaml +++ b/deployments/helm/k8s-dra-driver/templates/validation.yaml @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -{{- $validDeviceClasses := list "gpu" "mig" "imex" }} +{{- $validDeviceClasses := list "gpu" "mig" "imex" "vfiopci" }} {{- if not (kindIs "slice" .Values.deviceClasses) }} {{- $error := "" }} diff --git a/deployments/helm/k8s-dra-driver/values.yaml b/deployments/helm/k8s-dra-driver/values.yaml index 76ff38ca..a5c7ee91 100644 --- a/deployments/helm/k8s-dra-driver/values.yaml +++ b/deployments/helm/k8s-dra-driver/values.yaml @@ -34,7 +34,7 @@ selectorLabelsOverride: {} allowDefaultNamespace: false -deviceClasses: ["gpu", "mig", "imex"] +deviceClasses: ["gpu", "mig", "imex", "vfiopci"] # Masking of the params file is typically done to allow nvkind to # selectively exclude certain GPUs from being visible to the @@ -96,6 +96,10 @@ kubeletPlugin: plugin: securityContext: privileged: true + allowPrivilegeEscalation: true + runAsNonRoot: false + runAsUser: 0 + runAsGroup: 0 resources: {} affinity: nodeAffinity: diff --git a/scripts/bind_to_driver.sh b/scripts/bind_to_driver.sh new file mode 100644 index 00000000..26840cd4 --- /dev/null +++ b/scripts/bind_to_driver.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +# Usage: ./bind_to_driver.sh +# Bind the GPU 
specified by the PCI_ID=ssss:bb:dd.f to the given driver. + +bind_to_driver() +{ + local gpu=$1 + local driver=$2 + local drivers_path="/sys/bus/pci/drivers" + local driver_override_file="/sys/bus/pci/devices/$gpu/driver_override" + local bind_file="$drivers_path/$driver/bind" + + if [ ! -e "$driver_override_file" ]; then + echo "'$driver_override_file' file does not exist" >&2 + return 1 + fi + + echo "$driver" > "$driver_override_file" + if [ $? -ne 0 ]; then + echo "failed to write '$driver' to $driver_override_file" >&2 + return 1 + fi + + if [ ! -e "$bind_file" ]; then + echo "'$bind_file' file does not exist" >&2 + return 1 + fi + + echo "$gpu" > "$bind_file" + if [ $? -ne 0 ]; then + echo "failed to write '$gpu' to $bind_file" >&2 + echo "" > "$driver_override_file" + return 1 + fi +} + +bind_to_driver "$1" "$2" || exit 1 \ No newline at end of file diff --git a/scripts/unbind_from_driver.sh b/scripts/unbind_from_driver.sh new file mode 100644 index 00000000..c7653679 --- /dev/null +++ b/scripts/unbind_from_driver.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# Usage: ./unbind_from_driver.sh +# Unbind the GPU specified by the PCI_ID=ssss:bb:dd.f from the driver its bound to. +# Attempt to acquire the unbindLock within the retries specified before unbinding the device from its driver. + +acquire_unbind_lock() +{ + local gpu=$1 + local lock_retries=5 + local unbind_lock_file="/proc/driver/nvidia/gpus/$gpu/unbindLock" + local unbind_lock=0 + local attempt=1 + + if [ ! 
-e "${unbind_lock_file}" ]; then
+        return 0
+    fi
+
+    while [[ $attempt -le ${lock_retries} ]]; do
+        echo "[retry $attempt/${lock_retries}] Attempting to acquire unbindLock for $gpu" >&1
+
+        echo 1 > "${unbind_lock_file}"
+        read -r unbind_lock < "${unbind_lock_file}"
+        if [ ${unbind_lock} -eq 1 ]; then
+            echo "UnbindLock acquired for $gpu" >&1
+            return 0
+        fi
+
+        sleep $attempt
+        attempt=$((attempt + 1))
+    done
+
+    echo "cannot obtain unbindLock for $gpu" >&2
+    return 1
+}
+
+unbind_from_driver()
+{
+    local gpu=$1
+    local existing_driver
+    local existing_driver_name
+
+    [ -e "/sys/bus/pci/devices/$gpu/driver" ] || return 0
+    existing_driver=$(readlink -f "/sys/bus/pci/devices/$gpu/driver")
+    existing_driver_name=$(basename "${existing_driver}")
+    if [ "${existing_driver_name}" == "nvidia" ]; then
+        acquire_unbind_lock "$gpu" || return 1
+    fi
+    echo "$gpu" > "${existing_driver}/unbind"
+    return 0
+}
+
+unbind_from_driver "$1" || exit 1
\ No newline at end of file