Skip to content

Commit

Permalink
Allow mps root to be specified
Browse files Browse the repository at this point in the history
This change allows the MPS root on the host to be specified
and uses /run/nvidia/mps by default.

Signed-off-by: Evan Lezar <[email protected]>
  • Loading branch information
elezar committed Feb 21, 2024
1 parent 35c1393 commit 0405ec9
Show file tree
Hide file tree
Showing 8 changed files with 55 additions and 31 deletions.
3 changes: 3 additions & 0 deletions api/config/v1/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ type Flags struct {
type CommandLineFlags struct {
MigStrategy *string `json:"migStrategy" yaml:"migStrategy"`
FailOnInitError *bool `json:"failOnInitError" yaml:"failOnInitError"`
MpsRoot *string `json:"mpsRoot,omitempty" yaml:"mpsRoot,omitempty"`
NvidiaDriverRoot *string `json:"nvidiaDriverRoot,omitempty" yaml:"nvidiaDriverRoot,omitempty"`
GDSEnabled *bool `json:"gdsEnabled" yaml:"gdsEnabled"`
MOFEDEnabled *bool `json:"mofedEnabled" yaml:"mofedEnabled"`
Expand Down Expand Up @@ -116,6 +117,8 @@ func (f *Flags) UpdateFromCLIFlags(c *cli.Context, flags []cli.Flag) {
updateFromCLIFlag(&f.MigStrategy, c, n)
case "fail-on-init-error":
updateFromCLIFlag(&f.FailOnInitError, c, n)
case "mps-root":
updateFromCLIFlag(&f.MpsRoot, c, n)
case "nvidia-driver-root":
updateFromCLIFlag(&f.NvidiaDriverRoot, c, n)
case "gds-enabled":
Expand Down
16 changes: 8 additions & 8 deletions cmd/mps-control-daemon/mps/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,10 @@ type Daemon struct {
}

// NewDaemon creates an MPS daemon instance.
func NewDaemon(rm rm.ResourceManager) *Daemon {
func NewDaemon(rm rm.ResourceManager, root string) *Daemon {
return &Daemon{
rm: rm,
root: "/mps",
root: root,
}
}

Expand All @@ -77,8 +77,8 @@ func (e envvars) toSlice() []string {
// TODO: Set CUDA_VISIBLE_DEVICES to include only the devices for this resource type.
func (d *Daemon) Envvars() envvars {
return map[string]string{
"CUDA_MPS_PIPE_DIRECTORY": d.pipeDir(),
"CUDA_MPS_LOG_DIRECTORY": d.logDir(),
"CUDA_MPS_PIPE_DIRECTORY": d.PipeDir(),
"CUDA_MPS_LOG_DIRECTORY": d.LogDir(),
}
}

Expand All @@ -90,12 +90,12 @@ func (d *Daemon) Start() error {

klog.InfoS("Staring MPS daemon", "resource", d.rm.Resource())

pipeDir := d.pipeDir()
pipeDir := d.PipeDir()
if err := os.MkdirAll(pipeDir, 0755); err != nil {
return fmt.Errorf("error creating directory %v: %w", pipeDir, err)
}

logDir := d.logDir()
logDir := d.LogDir()
if err := os.MkdirAll(logDir, 0755); err != nil {
return fmt.Errorf("error creating directory %v: %w", logDir, err)
}
Expand Down Expand Up @@ -151,11 +151,11 @@ func (d *Daemon) resourceRoot() string {
return filepath.Join(d.root, string(d.rm.Resource()))
}

func (d *Daemon) pipeDir() string {
func (d *Daemon) PipeDir() string {
return filepath.Join(d.resourceRoot(), "pipe")
}

func (d *Daemon) logDir() string {
func (d *Daemon) LogDir() string {
return filepath.Join(d.resourceRoot(), "log")
}

Expand Down
2 changes: 1 addition & 1 deletion cmd/mps-control-daemon/mps/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ func (m *manager) Daemons() ([]*Daemon, error) {
klog.InfoS("Resource is not shared", "resource", "resource", resourceManager.Resource())
continue
}
daemon := NewDaemon(resourceManager)
daemon := NewDaemon(resourceManager, "/mps")
daemons = append(daemons, daemon)
}

Expand Down
8 changes: 8 additions & 0 deletions cmd/nvidia-device-plugin/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,11 @@ func main() {
Usage: "the path where the NVIDIA driver root is mounted in the container; used for generating CDI specifications",
EnvVars: []string{"CONTAINER_DRIVER_ROOT"},
},
&cli.StringFlag{
Name: "mps-root",
Usage: "the path on the host where MPS-specific mounts and files are created by the MPS control daemon manager",
EnvVars: []string{"MPS_ROOT"},
},
}

err := c.Run(os.Args)
Expand Down Expand Up @@ -148,6 +153,9 @@ func validateFlags(config *spec.Config) error {
if *config.Flags.MigStrategy == spec.MigStrategyMixed {
return fmt.Errorf("using --mig-strategy=mixed is not supported with MPS")
}
if config.Flags.MpsRoot == nil || *config.Flags.MpsRoot == "" {
return fmt.Errorf("using MPS requires --mps-root to be specified")
}
}

return nil
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,8 @@ spec:
name: nvidia-device-plugin-ctr
command: ["nvidia-device-plugin"]
env:
- name: MPS_ROOT
value: "{{ .Values.mps.root }}"
{{- if typeIs "string" .Values.migStrategy }}
- name: MIG_STRATEGY
value: "{{ .Values.migStrategy }}"
Expand Down Expand Up @@ -215,12 +217,11 @@ spec:
path: /var/lib/kubelet/device-plugins
- name: mps-root
hostPath:
# TODO: This should be /var/run/nvidia/mps
path: /var/lib/kubelet/device-plugins/mps
path: {{ .Values.mps.root }}
type: DirectoryOrCreate
- name: mps-shm
hostPath:
path: /var/lib/kubelet/device-plugins/mps/shm
path: {{ .Values.mps.root }}/shm
{{- if typeIs "string" .Values.nvidiaDriverRoot }}
- name: driver-root
hostPath:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,12 +194,11 @@ spec:
volumes:
- name: mps-root
hostPath:
# TODO: This should be /var/run/nvidia/mps
path: /var/lib/kubelet/device-plugins/mps
path: {{ .Values.mps.root }}
type: DirectoryOrCreate
- name: mps-shm
hostPath:
path: /var/lib/kubelet/device-plugins/mps/shm
path: {{ .Values.mps.root }}/shm
{{- if eq $hasConfigMap "true" }}
- name: available-configs
configMap:
Expand Down
7 changes: 7 additions & 0 deletions deployments/helm/nvidia-device-plugin/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -145,3 +145,10 @@ nfd:
- "0302"
deviceLabelFields:
- vendor

mps:
# root specifies the location where files and folders for managing MPS will
# be created. This includes a daemon-specific /dev/shm and pipe and log
# directories.
# Per-resource pipe and log directories will be created at
# {{ mps.root }}/{{ .ResourceName }}/pipe and {{ mps.root }}/{{ .ResourceName }}/log
root: "/run/nvidia/mps"
38 changes: 22 additions & 16 deletions internal/plugin/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,11 +148,13 @@ func (plugin *NvidiaDevicePlugin) waitForMPSDaemon() error {
if plugin.config.Sharing.SharingStrategy() != spec.SharingStrategyMPS {
return nil
}
// TODO: Check the started file here.
// TODO: Check the .ready file here.
// TODO: Have some retry strategy here.
if err := mps.NewDaemon(plugin.rm).AssertHealthy(); err != nil {
mpsDaemon := mps.NewDaemon(plugin.rm, "/mps")
if err := mpsDaemon.AssertHealthy(); err != nil {
return fmt.Errorf("error checking MPS daemon health: %w", err)
}
klog.InfoS("MPS daemon is healthy", "resource", plugin.rm.Resource())
return nil
}

Expand Down Expand Up @@ -329,7 +331,6 @@ func (plugin *NvidiaDevicePlugin) getAllocateResponse(requestIds []string) (*plu
response := &pluginapi.ContainerAllocateResponse{
Envs: make(map[string]string),
}

if plugin.deviceListStrategies.IsCDIEnabled() {
responseID := uuid.New().String()
if err := plugin.updateResponseForCDI(response, responseID, deviceIDs...); err != nil {
Expand Down Expand Up @@ -361,26 +362,31 @@ func (plugin *NvidiaDevicePlugin) getAllocateResponse(requestIds []string) (*plu
// This includes per-resource pipe and log directories as well as a global daemon-specific shm
// and assumes that an MPS control daemon has already been started.
func (plugin NvidiaDevicePlugin) updateResponseForMPS(response *pluginapi.ContainerAllocateResponse) {
pipeDir := filepath.Join("/mps", string(plugin.rm.Resource()), "pipe")
response.Envs["CUDA_MPS_PIPE_DIRECTORY"] = pipeDir
// TODO: We should check that the deviceIDs are shared using MPS.

// TODO: We use the Daemon here just to construct the pipe and log dirs for the specified resource.
containerMpsDaemon := mps.NewDaemon(plugin.rm, "/mps")

containerPipeDir := containerMpsDaemon.PipeDir()
containerLogDir := containerMpsDaemon.LogDir()

response.Envs["CUDA_MPS_PIPE_DIRECTORY"] = containerPipeDir
response.Envs["CUDA_MPS_LOG_DIRECTORY"] = containerLogDir

// TODO: We use the Daemon here just to construct the pipe and log dirs for the specified resource on the host.
hostMpsDaemon := mps.NewDaemon(plugin.rm, *plugin.config.Flags.MpsRoot)
response.Mounts = append(response.Mounts,
&pluginapi.Mount{
ContainerPath: pipeDir,
HostPath: filepath.Join("/var/lib/kubelet/device-plugins", pipeDir),
ContainerPath: containerPipeDir,
HostPath: hostMpsDaemon.PipeDir(),
},
)
logDir := filepath.Join("/mps", string(plugin.rm.Resource()), "log")
response.Envs["CUDA_MPS_LOG_DIRECTORY"] = logDir
response.Mounts = append(response.Mounts,
&pluginapi.Mount{
ContainerPath: logDir,
HostPath: filepath.Join("/var/lib/kubelet/device-plugins", logDir),
ContainerPath: containerLogDir,
HostPath: hostMpsDaemon.LogDir(),
},
)
response.Mounts = append(response.Mounts,
&pluginapi.Mount{
ContainerPath: "/dev/shm",
HostPath: "/var/lib/kubelet/device-plugins/mps/shm",
HostPath: filepath.Join(*plugin.config.Flags.MpsRoot, "shm"),
},
)
}
Expand Down

0 comments on commit 0405ec9

Please sign in to comment.