Skip to content

Commit

Permalink
Merge pull request #506 from elezar/use-var-run-nvidia-mps
Browse files Browse the repository at this point in the history
Allow mps root to be specified
  • Loading branch information
elezar authored Feb 21, 2024
2 parents 4d9e794 + 283cc30 commit c675243
Show file tree
Hide file tree
Showing 9 changed files with 126 additions and 39 deletions.
3 changes: 3 additions & 0 deletions api/config/v1/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ type Flags struct {
type CommandLineFlags struct {
MigStrategy *string `json:"migStrategy" yaml:"migStrategy"`
FailOnInitError *bool `json:"failOnInitError" yaml:"failOnInitError"`
MpsRoot *string `json:"mpsRoot,omitempty" yaml:"mpsRoot,omitempty"`
NvidiaDriverRoot *string `json:"nvidiaDriverRoot,omitempty" yaml:"nvidiaDriverRoot,omitempty"`
GDSEnabled *bool `json:"gdsEnabled" yaml:"gdsEnabled"`
MOFEDEnabled *bool `json:"mofedEnabled" yaml:"mofedEnabled"`
Expand Down Expand Up @@ -116,6 +117,8 @@ func (f *Flags) UpdateFromCLIFlags(c *cli.Context, flags []cli.Flag) {
updateFromCLIFlag(&f.MigStrategy, c, n)
case "fail-on-init-error":
updateFromCLIFlag(&f.FailOnInitError, c, n)
case "mps-root":
updateFromCLIFlag(&f.MpsRoot, c, n)
case "nvidia-driver-root":
updateFromCLIFlag(&f.NvidiaDriverRoot, c, n)
case "gds-enabled":
Expand Down
29 changes: 14 additions & 15 deletions cmd/mps-control-daemon/mps/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ import (
"io"
"os"
"os/exec"
"path/filepath"

"k8s.io/klog/v2"

Expand All @@ -46,14 +45,14 @@ type Daemon struct {
rm rm.ResourceManager
// root represents the root at which the files and folders controlled by the
// daemon are created. These include the log and pipe directories.
root string
root Root
}

// NewDaemon creates an MPS daemon instance.
func NewDaemon(rm rm.ResourceManager) *Daemon {
func NewDaemon(rm rm.ResourceManager, root Root) *Daemon {
return &Daemon{
rm: rm,
root: "/mps",
root: root,
}
}

Expand All @@ -77,8 +76,8 @@ func (e envvars) toSlice() []string {
// TODO: Set CUDA_VISIBLE_DEVICES to include only the devices for this resource type.
func (d *Daemon) Envvars() envvars {
return map[string]string{
"CUDA_MPS_PIPE_DIRECTORY": d.pipeDir(),
"CUDA_MPS_LOG_DIRECTORY": d.logDir(),
"CUDA_MPS_PIPE_DIRECTORY": d.PipeDir(),
"CUDA_MPS_LOG_DIRECTORY": d.LogDir(),
}
}

Expand All @@ -90,12 +89,12 @@ func (d *Daemon) Start() error {

klog.InfoS("Staring MPS daemon", "resource", d.rm.Resource())

pipeDir := d.pipeDir()
pipeDir := d.PipeDir()
if err := os.MkdirAll(pipeDir, 0755); err != nil {
return fmt.Errorf("error creating directory %v: %w", pipeDir, err)
}

logDir := d.logDir()
logDir := d.LogDir()
if err := os.MkdirAll(logDir, 0755); err != nil {
return fmt.Errorf("error creating directory %v: %w", logDir, err)
}
Expand Down Expand Up @@ -147,20 +146,20 @@ func (d *Daemon) Stop() error {
return nil
}

func (d *Daemon) resourceRoot() string {
return filepath.Join(d.root, string(d.rm.Resource()))
func (d *Daemon) LogDir() string {
return d.root.LogDir(d.rm.Resource())
}

func (d *Daemon) pipeDir() string {
return filepath.Join(d.resourceRoot(), "pipe")
func (d *Daemon) PipeDir() string {
return d.root.PipeDir(d.rm.Resource())
}

func (d *Daemon) logDir() string {
return filepath.Join(d.resourceRoot(), "log")
func (d *Daemon) ShmDir() string {
return "/dev/shm"
}

func (d *Daemon) startedFile() string {
return filepath.Join(d.resourceRoot(), ".started")
return d.root.startedFile(d.rm.Resource())
}

// AssertHealthy checks that the MPS control daemon is healthy.
Expand Down
2 changes: 1 addition & 1 deletion cmd/mps-control-daemon/mps/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ func (m *manager) Daemons() ([]*Daemon, error) {
klog.InfoS("Resource is not shared", "resource", "resource", resourceManager.Resource())
continue
}
daemon := NewDaemon(resourceManager)
daemon := NewDaemon(resourceManager, ContainerRoot)
daemons = append(daemons, daemon)
}

Expand Down
59 changes: 59 additions & 0 deletions cmd/mps-control-daemon/mps/root.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/**
# Copyright 2024 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/

package mps

import (
"path/filepath"

spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1"
)

const (
ContainerRoot = Root("/mps")
)

// Root represents an MPS root.
// This is where per-resource pipe and log directories are created.
// For containerised applications the host root is typically mounted to /mps in the container.
type Root string

// LogDir returns the per-resource pipe dir for the specified root.
func (r Root) LogDir(resourceName spec.ResourceName) string {
return r.Path(string(resourceName), "log")
}

// PipeDir returns the per-resource pipe dir for the specified root.
func (r Root) PipeDir(resourceName spec.ResourceName) string {
return r.Path(string(resourceName), "pipe")
}

// ShmDir returns the shm dir associated with the root.
// Note that the shm dir is the same for all resources.
func (r Root) ShmDir(resourceName spec.ResourceName) string {
return r.Path("shm")
}

// startedFile returns the per-resource .started file name for the specified root.
func (r Root) startedFile(resourceName spec.ResourceName) string {
return r.Path(string(resourceName), ".started")
}

// Path returns a path relative to the MPS root.
func (r Root) Path(parts ...string) string {
pathparts := append([]string{string(r)}, parts...)
return filepath.Join(pathparts...)
}
8 changes: 8 additions & 0 deletions cmd/nvidia-device-plugin/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,11 @@ func main() {
Usage: "the path where the NVIDIA driver root is mounted in the container; used for generating CDI specifications",
EnvVars: []string{"CONTAINER_DRIVER_ROOT"},
},
&cli.StringFlag{
Name: "mps-root",
Usage: "the path on the host where MPS-specific mounts and files are created by the MPS control daemon manager",
EnvVars: []string{"MPS_ROOT"},
},
}

err := c.Run(os.Args)
Expand Down Expand Up @@ -148,6 +153,9 @@ func validateFlags(config *spec.Config) error {
if *config.Flags.MigStrategy == spec.MigStrategyMixed {
return fmt.Errorf("using --mig-strategy=mixed is not supported with MPS")
}
if config.Flags.MpsRoot == nil || *config.Flags.MpsRoot == "" {
return fmt.Errorf("using MPS requires --mps-root to be specified")
}
}

return nil
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,8 @@ spec:
name: nvidia-device-plugin-ctr
command: ["nvidia-device-plugin"]
env:
- name: MPS_ROOT
value: "{{ .Values.mps.root }}"
{{- if typeIs "string" .Values.migStrategy }}
- name: MIG_STRATEGY
value: "{{ .Values.migStrategy }}"
Expand Down Expand Up @@ -215,12 +217,11 @@ spec:
path: /var/lib/kubelet/device-plugins
- name: mps-root
hostPath:
# TODO: This should be /var/run/nvidia/mps
path: /var/lib/kubelet/device-plugins/mps
path: {{ .Values.mps.root }}
type: DirectoryOrCreate
- name: mps-shm
hostPath:
path: /var/lib/kubelet/device-plugins/mps/shm
path: {{ .Values.mps.root }}/shm
{{- if typeIs "string" .Values.nvidiaDriverRoot }}
- name: driver-root
hostPath:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,12 +194,11 @@ spec:
volumes:
- name: mps-root
hostPath:
# TODO: This should be /var/run/nvidia/mps
path: /var/lib/kubelet/device-plugins/mps
path: {{ .Values.mps.root }}
type: DirectoryOrCreate
- name: mps-shm
hostPath:
path: /var/lib/kubelet/device-plugins/mps/shm
path: {{ .Values.mps.root }}/shm
{{- if eq $hasConfigMap "true" }}
- name: available-configs
configMap:
Expand Down
7 changes: 7 additions & 0 deletions deployments/helm/nvidia-device-plugin/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -145,3 +145,10 @@ nfd:
- "0302"
deviceLabelFields:
- vendor

mps:
# root specifies the location where files and folders for managing MPS will
# be created. This includes a daemon-specific /dev/shm and pipe and log
# directories.
# Pipe directories will be created at {{ mps.root }}/{{ .ResourceName }}
root: "/run/nvidia/mps"
45 changes: 28 additions & 17 deletions internal/plugin/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ type NvidiaDevicePlugin struct {
server *grpc.Server
health chan *rm.Device
stop chan interface{}

mpsDaemon *mps.Daemon
mpsHostRoot mps.Root
}

// NewNvidiaDevicePlugin returns an initialized NvidiaDevicePlugin
Expand All @@ -74,6 +77,13 @@ func NewNvidiaDevicePlugin(config *spec.Config, resourceManager rm.ResourceManag
pluginName := "nvidia-" + name
pluginPath := filepath.Join(pluginapi.DevicePluginPath, pluginName)

var mpsDaemon *mps.Daemon
var mpsHostRoot mps.Root
if config.Sharing.SharingStrategy() != spec.SharingStrategyMPS {
mpsDaemon = mps.NewDaemon(resourceManager, mps.ContainerRoot)
mpsHostRoot = mps.Root(*config.Flags.CommandLineFlags.MpsRoot)
}

return &NvidiaDevicePlugin{
rm: resourceManager,
config: config,
Expand All @@ -83,6 +93,9 @@ func NewNvidiaDevicePlugin(config *spec.Config, resourceManager rm.ResourceManag
cdiHandler: cdiHandler,
cdiAnnotationPrefix: *config.Flags.Plugin.CDIAnnotationPrefix,

mpsDaemon: mpsDaemon,
mpsHostRoot: mpsHostRoot,

// These will be reinitialized every
// time the plugin server is restarted.
server: nil,
Expand Down Expand Up @@ -148,11 +161,12 @@ func (plugin *NvidiaDevicePlugin) waitForMPSDaemon() error {
if plugin.config.Sharing.SharingStrategy() != spec.SharingStrategyMPS {
return nil
}
// TODO: Check the started file here.
// TODO: Check the .ready file here.
// TODO: Have some retry strategy here.
if err := mps.NewDaemon(plugin.rm).AssertHealthy(); err != nil {
if err := plugin.mpsDaemon.AssertHealthy(); err != nil {
return fmt.Errorf("error checking MPS daemon health: %w", err)
}
klog.InfoS("MPS daemon is healthy", "resource", plugin.rm.Resource())
return nil
}

Expand Down Expand Up @@ -329,7 +343,6 @@ func (plugin *NvidiaDevicePlugin) getAllocateResponse(requestIds []string) (*plu
response := &pluginapi.ContainerAllocateResponse{
Envs: make(map[string]string),
}

if plugin.deviceListStrategies.IsCDIEnabled() {
responseID := uuid.New().String()
if err := plugin.updateResponseForCDI(response, responseID, deviceIDs...); err != nil {
Expand Down Expand Up @@ -361,26 +374,24 @@ func (plugin *NvidiaDevicePlugin) getAllocateResponse(requestIds []string) (*plu
// This includes per-resource pipe and log directories as well as a global daemon-specific shm
// and assumes that an MPS control daemon has already been started.
func (plugin NvidiaDevicePlugin) updateResponseForMPS(response *pluginapi.ContainerAllocateResponse) {
pipeDir := filepath.Join("/mps", string(plugin.rm.Resource()), "pipe")
response.Envs["CUDA_MPS_PIPE_DIRECTORY"] = pipeDir
// TODO: We should check that the deviceIDs are shared using MPS.
for k, v := range plugin.mpsDaemon.Envvars() {
response.Envs[k] = v
}

resourceName := plugin.rm.Resource()
response.Mounts = append(response.Mounts,
&pluginapi.Mount{
ContainerPath: pipeDir,
HostPath: filepath.Join("/var/lib/kubelet/device-plugins", pipeDir),
ContainerPath: plugin.mpsDaemon.PipeDir(),
HostPath: plugin.mpsHostRoot.PipeDir(resourceName),
},
)
logDir := filepath.Join("/mps", string(plugin.rm.Resource()), "log")
response.Envs["CUDA_MPS_LOG_DIRECTORY"] = logDir
response.Mounts = append(response.Mounts,
&pluginapi.Mount{
ContainerPath: logDir,
HostPath: filepath.Join("/var/lib/kubelet/device-plugins", logDir),
ContainerPath: plugin.mpsDaemon.PipeDir(),
HostPath: plugin.mpsHostRoot.LogDir(resourceName),
},
)
response.Mounts = append(response.Mounts,
&pluginapi.Mount{
ContainerPath: "/dev/shm",
HostPath: "/var/lib/kubelet/device-plugins/mps/shm",
ContainerPath: plugin.mpsDaemon.ShmDir(),
HostPath: plugin.mpsHostRoot.ShmDir(resourceName),
},
)
}
Expand Down

0 comments on commit c675243

Please sign in to comment.