Skip to content

Commit

Permalink
Merge pull request #211 from lengrongfu/feat/add-user-uuid
Browse files Browse the repository at this point in the history
add use can specify use or not use device id feature
  • Loading branch information
archlitchi authored Mar 27, 2024
2 parents a375ed0 + af0ba46 commit 1f8df63
Show file tree
Hide file tree
Showing 11 changed files with 274 additions and 0 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ English version|[中文版](README_cn.md)

***Device Type Specification***: You can specify the type of device to use or avoid for a particular task by setting annotations, such as "nvidia.com/use-gputype" or "nvidia.com/nouse-gputype".

***Device UUID Specification***: You can specify the UUIDs of the devices to use or avoid for a particular task by setting annotations such as "nvidia.com/use-gpuuuid" or "nvidia.com/nouse-gpuuuid"; multiple UUIDs are separated by commas.

***Easy to use***: You don't need to modify your task YAML to use our scheduler. All your jobs will be automatically supported after installation. Additionally, you can specify a resource name other than "nvidia.com/gpu" if you prefer.

## Major Features
Expand Down
2 changes: 2 additions & 0 deletions README_cn.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@

***指定设备型号***:当前任务可以通过设置annotation的方式,来选择使用或者不使用某些具体型号的设备

***指定设备UUID***:当前任务可以通过设置`annotation`的方式,来选择使用或者不使用指定UUID的设备,比如:"nvidia.com/use-gpuuuid" 或 "nvidia.com/nouse-gpuuuid",多个UUID之间用逗号分隔

***无侵入***: vGPU调度器兼容nvidia官方插件的显卡分配方式,所以安装完毕后,你不需要修改原有的任务文件就可以使用vGPU的功能。当然,你也可以自定义的资源名称

## 使用场景
Expand Down
14 changes: 14 additions & 0 deletions examples/nvidia/specify_uuid_not_use.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Example Pod that asks the scheduler to avoid specific GPUs.
# The nvidia.com/nouse-gpuuuid annotation holds a comma-separated list of
# device UUIDs; a device whose ID appears in the list is skipped.
# "GPU-123456" is presumably a placeholder - replace it with a real device UUID.
apiVersion: v1
kind: Pod
metadata:
name: gpu-pod
annotations:
nvidia.com/nouse-gpuuuid: "GPU-123456"
spec:
containers:
- name: ubuntu-container
image: ubuntu:18.04
command: ["bash", "-c", "sleep 86400"]
resources:
limits:
nvidia.com/gpu: 2 # requesting 2 vGPUs
14 changes: 14 additions & 0 deletions examples/nvidia/specify_uuid_to_use.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Example Pod that restricts scheduling to specific GPUs.
# The nvidia.com/use-gpuuuid annotation holds a comma-separated list of device
# UUIDs; only a device whose ID appears in the list is accepted.
# "GPU-123456" is presumably a placeholder - replace it with a real device UUID.
# NOTE(review): a single UUID is listed while 2 vGPUs are requested below;
# confirm that this request can be satisfied by one device.
apiVersion: v1
kind: Pod
metadata:
name: gpu-pod
annotations:
nvidia.com/use-gpuuuid: "GPU-123456"
spec:
containers:
- name: ubuntu-container
image: ubuntu:18.04
command: ["bash", "-c", "sleep 86400"]
resources:
limits:
nvidia.com/gpu: 2 # requesting 2 vGPUs
33 changes: 33 additions & 0 deletions pkg/device/cambricon/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ const (
MluMemSplitEnable = "CAMBRICON_SPLIT_ENABLE"
MLUInUse = "cambricon.com/use-mlutype"
MLUNoUse = "cambricon.com/nouse-mlutype"
// MLUUseUUID is the annotation in which a user lists the UUIDs of the MLU devices that may be used.
MLUUseUUID = "cambricon.com/use-gpuuuid"
// MLUNoUseUUID is the annotation in which a user lists the UUIDs of the MLU devices that must not be used.
MLUNoUseUUID = "cambricon.com/nouse-gpuuuid"
)

var (
Expand Down Expand Up @@ -134,6 +138,35 @@ func (dev *CambriconDevices) CheckType(annos map[string]string, d util.DeviceUsa
return false, false, false
}

func (dev *CambriconDevices) CheckUUID(annos map[string]string, d util.DeviceUsage) bool {
userUUID, ok := annos[MLUUseUUID]
if ok {
klog.V(5).Infof("check uuid for mlu user uuid [%s], device id is %s", userUUID, d.Id)

Check failure on line 144 in pkg/device/cambricon/device.go

View workflow job for this annotation

GitHub Actions / Unit test

d.Id undefined (type "github.com/Project-HAMi/HAMi/pkg/util".DeviceUsage has no field or method Id)

Check failure on line 144 in pkg/device/cambricon/device.go

View workflow job for this annotation

GitHub Actions / build-dev-image

d.Id undefined (type "github.com/Project-HAMi/HAMi/pkg/util".DeviceUsage has no field or method Id)

Check failure on line 144 in pkg/device/cambricon/device.go

View workflow job for this annotation

GitHub Actions / lint

d.Id undefined (type "github.com/Project-HAMi/HAMi/pkg/util".DeviceUsage has no field or method Id)

Check failure on line 144 in pkg/device/cambricon/device.go

View workflow job for this annotation

GitHub Actions / lint

d.Id undefined (type "github.com/Project-HAMi/HAMi/pkg/util".DeviceUsage has no field or method Id)
// use , symbol to connect multiple uuid
userUUIDs := strings.Split(userUUID, ",")
for _, uuid := range userUUIDs {
if d.Id == uuid {

Check failure on line 148 in pkg/device/cambricon/device.go

View workflow job for this annotation

GitHub Actions / Unit test

d.Id undefined (type "github.com/Project-HAMi/HAMi/pkg/util".DeviceUsage has no field or method Id)

Check failure on line 148 in pkg/device/cambricon/device.go

View workflow job for this annotation

GitHub Actions / build-dev-image

d.Id undefined (type "github.com/Project-HAMi/HAMi/pkg/util".DeviceUsage has no field or method Id)

Check failure on line 148 in pkg/device/cambricon/device.go

View workflow job for this annotation

GitHub Actions / lint

d.Id undefined (type "github.com/Project-HAMi/HAMi/pkg/util".DeviceUsage has no field or method Id)

Check failure on line 148 in pkg/device/cambricon/device.go

View workflow job for this annotation

GitHub Actions / lint

d.Id undefined (type "github.com/Project-HAMi/HAMi/pkg/util".DeviceUsage has no field or method Id)
return true
}
}
return false
}

noUserUUID, ok := annos[MLUNoUseUUID]
if ok {
klog.V(5).Infof("check uuid for mlu not user uuid [%s], device id is %s", noUserUUID, d.Id)

Check failure on line 157 in pkg/device/cambricon/device.go

View workflow job for this annotation

GitHub Actions / Unit test

d.Id undefined (type "github.com/Project-HAMi/HAMi/pkg/util".DeviceUsage has no field or method Id)

Check failure on line 157 in pkg/device/cambricon/device.go

View workflow job for this annotation

GitHub Actions / build-dev-image

d.Id undefined (type "github.com/Project-HAMi/HAMi/pkg/util".DeviceUsage has no field or method Id)

Check failure on line 157 in pkg/device/cambricon/device.go

View workflow job for this annotation

GitHub Actions / lint

d.Id undefined (type "github.com/Project-HAMi/HAMi/pkg/util".DeviceUsage has no field or method Id)

Check failure on line 157 in pkg/device/cambricon/device.go

View workflow job for this annotation

GitHub Actions / lint

d.Id undefined (type "github.com/Project-HAMi/HAMi/pkg/util".DeviceUsage has no field or method Id)
// use , symbol to connect multiple uuid
noUserUUIDs := strings.Split(noUserUUID, ",")
for _, uuid := range noUserUUIDs {
if d.Id == uuid {

Check failure on line 161 in pkg/device/cambricon/device.go

View workflow job for this annotation

GitHub Actions / Unit test

d.Id undefined (type "github.com/Project-HAMi/HAMi/pkg/util".DeviceUsage has no field or method Id)

Check failure on line 161 in pkg/device/cambricon/device.go

View workflow job for this annotation

GitHub Actions / build-dev-image

d.Id undefined (type "github.com/Project-HAMi/HAMi/pkg/util".DeviceUsage has no field or method Id)

Check failure on line 161 in pkg/device/cambricon/device.go

View workflow job for this annotation

GitHub Actions / lint

d.Id undefined (type "github.com/Project-HAMi/HAMi/pkg/util".DeviceUsage has no field or method Id) (typecheck)

Check failure on line 161 in pkg/device/cambricon/device.go

View workflow job for this annotation

GitHub Actions / lint

d.Id undefined (type "github.com/Project-HAMi/HAMi/pkg/util".DeviceUsage has no field or method Id)) (typecheck)
return false
}
}
return true
}
return true
}

func (dev *CambriconDevices) GenerateResourceRequests(ctr *corev1.Container) util.ContainerDeviceRequest {
klog.Info("Counting mlu devices")
mluResourceCount := corev1.ResourceName(MLUResourceCount)
Expand Down
2 changes: 2 additions & 0 deletions pkg/device/devices.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ type Devices interface {
NodeCleanUp(nn string) error
GetNodeDevices(n v1.Node) ([]*api.DeviceInfo, error)
CheckType(annos map[string]string, d util.DeviceUsage, n util.ContainerDeviceRequest) (bool, bool, bool)
// CheckUUID checks the device ID against the use-UUID and no-use-UUID annotations; it returns true when the device may be used.
CheckUUID(annos map[string]string, d util.DeviceUsage) bool
GenerateResourceRequests(ctr *v1.Container) util.ContainerDeviceRequest
PatchAnnotations(annoinput *map[string]string, pd util.PodDevices) map[string]string
ParseConfig(fs *flag.FlagSet)
Expand Down
33 changes: 33 additions & 0 deletions pkg/device/hygon/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ const (
HygonDCUCommonWord = "DCU"
DCUInUse = "hygon.com/use-dcutype"
DCUNoUse = "hygon.com/nouse-dcutype"
// DCUUseUUID is the annotation in which a user lists the UUIDs of the DCU devices that may be used.
DCUUseUUID = "hygon.com/use-gpuuuid"
// DCUNoUseUUID is the annotation in which a user lists the UUIDs of the DCU devices that must not be used.
DCUNoUseUUID = "hygon.com/nouse-gpuuuid"
)

var (
Expand Down Expand Up @@ -111,6 +115,35 @@ func (dev *DCUDevices) CheckType(annos map[string]string, d util.DeviceUsage, n
return false, false, false
}

func (dev *DCUDevices) CheckUUID(annos map[string]string, d util.DeviceUsage) bool {
userUUID, ok := annos[DCUUseUUID]
if ok {
klog.V(5).Infof("check uuid for dcu user uuid [%s], device id is %s", userUUID, d.Id)

Check failure on line 121 in pkg/device/hygon/device.go

View workflow job for this annotation

GitHub Actions / Unit test

d.Id undefined (type "github.com/Project-HAMi/HAMi/pkg/util".DeviceUsage has no field or method Id)

Check failure on line 121 in pkg/device/hygon/device.go

View workflow job for this annotation

GitHub Actions / build-dev-image

d.Id undefined (type "github.com/Project-HAMi/HAMi/pkg/util".DeviceUsage has no field or method Id)
// use , symbol to connect multiple uuid
userUUIDs := strings.Split(userUUID, ",")
for _, uuid := range userUUIDs {
if d.Id == uuid {

Check failure on line 125 in pkg/device/hygon/device.go

View workflow job for this annotation

GitHub Actions / Unit test

d.Id undefined (type "github.com/Project-HAMi/HAMi/pkg/util".DeviceUsage has no field or method Id)

Check failure on line 125 in pkg/device/hygon/device.go

View workflow job for this annotation

GitHub Actions / build-dev-image

d.Id undefined (type "github.com/Project-HAMi/HAMi/pkg/util".DeviceUsage has no field or method Id)
return true
}
}
return false
}

noUserUUID, ok := annos[DCUNoUseUUID]
if ok {
klog.V(5).Infof("check uuid for dcu not user uuid [%s], device id is %s", noUserUUID, d.Id)

Check failure on line 134 in pkg/device/hygon/device.go

View workflow job for this annotation

GitHub Actions / Unit test

d.Id undefined (type "github.com/Project-HAMi/HAMi/pkg/util".DeviceUsage has no field or method Id)
// use , symbol to connect multiple uuid
noUserUUIDs := strings.Split(noUserUUID, ",")
for _, uuid := range noUserUUIDs {
if d.Id == uuid {

Check failure on line 138 in pkg/device/hygon/device.go

View workflow job for this annotation

GitHub Actions / Unit test

d.Id undefined (type "github.com/Project-HAMi/HAMi/pkg/util".DeviceUsage has no field or method Id)
return false
}
}
return true
}
return true
}

func (dev *DCUDevices) GenerateResourceRequests(ctr *corev1.Container) util.ContainerDeviceRequest {
klog.Info("Counting dcu devices")
dcuResourceCount := corev1.ResourceName(HygonResourceCount)
Expand Down
33 changes: 33 additions & 0 deletions pkg/device/iluvatar/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ const (
IluvatarGPUDevice = "Iluvatar"
IluvatarGPUCommonWord = "Iluvatar"
IluvatarDeviceSelection = "iluvatar.ai/predicate-gpu-idx-"
// IluvatarUseUUID is the annotation in which a user lists the UUIDs of the Iluvatar devices that may be used.
IluvatarUseUUID = "iluvatar.ai/use-gpuuuid"
// IluvatarNoUseUUID is the annotation in which a user lists the UUIDs of the Iluvatar devices that must not be used.
IluvatarNoUseUUID = "iluvatar.ai/nouse-gpuuuid"
)

var (
Expand Down Expand Up @@ -101,6 +105,35 @@ func (dev *IluvatarDevices) CheckType(annos map[string]string, d util.DeviceUsag
return false, false, false
}

func (dev *IluvatarDevices) CheckUUID(annos map[string]string, d util.DeviceUsage) bool {
userUUID, ok := annos[IluvatarUseUUID]
if ok {
klog.V(5).Infof("check uuid for Iluvatar user uuid [%s], device id is %s", userUUID, d.Id)

Check failure on line 111 in pkg/device/iluvatar/device.go

View workflow job for this annotation

GitHub Actions / Unit test

d.Id undefined (type "github.com/Project-HAMi/HAMi/pkg/util".DeviceUsage has no field or method Id)
// use , symbol to connect multiple uuid
userUUIDs := strings.Split(userUUID, ",")
for _, uuid := range userUUIDs {
if d.Id == uuid {

Check failure on line 115 in pkg/device/iluvatar/device.go

View workflow job for this annotation

GitHub Actions / Unit test

d.Id undefined (type "github.com/Project-HAMi/HAMi/pkg/util".DeviceUsage has no field or method Id)
return true
}
}
return false
}

noUserUUID, ok := annos[IluvatarNoUseUUID]
if ok {
klog.V(5).Infof("check uuid for Iluvatar not user uuid [%s], device id is %s", noUserUUID, d.Id)
// use , symbol to connect multiple uuid
noUserUUIDs := strings.Split(noUserUUID, ",")
for _, uuid := range noUserUUIDs {
if d.Id == uuid {
return false
}
}
return true
}
return true
}

// CheckHealth delegates the health check for devType on node n to
// util.CheckHealth and returns its two results unchanged.
func (dev *IluvatarDevices) CheckHealth(devType string, n *corev1.Node) (bool, bool) {
	first, second := util.CheckHealth(devType, n)
	return first, second
}
Expand Down
34 changes: 34 additions & 0 deletions pkg/device/nvidia/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ const (
GPUInUse = "nvidia.com/use-gputype"
GPUNoUse = "nvidia.com/nouse-gputype"
NumaBind = "nvidia.com/numa-bind"
// GPUUseUUID is the annotation in which a user lists the UUIDs of the GPU devices that may be used.
GPUUseUUID = "nvidia.com/use-gpuuuid"
// GPUNoUseUUID is the annotation in which a user lists the UUIDs of the GPU devices that must not be used.
GPUNoUseUUID = "nvidia.com/nouse-gpuuuid"
)

var (
Expand Down Expand Up @@ -161,6 +165,36 @@ func (dev *NvidiaGPUDevices) CheckType(annos map[string]string, d util.DeviceUsa
return false, false, false
}

func (dev *NvidiaGPUDevices) CheckUUID(annos map[string]string, d util.DeviceUsage) bool {
userUUID, ok := annos[GPUUseUUID]
if ok {
klog.V(5).Infof("check uuid for nvidia user uuid [%s], device id is %s", userUUID, d.Id)

Check failure on line 171 in pkg/device/nvidia/device.go

View workflow job for this annotation

GitHub Actions / build-dev-image

d.Id undefined (type "github.com/Project-HAMi/HAMi/pkg/util".DeviceUsage has no field or method Id)
// use , symbol to connect multiple uuid
userUUIDs := strings.Split(userUUID, ",")
for _, uuid := range userUUIDs {
if d.Id == uuid {

Check failure on line 175 in pkg/device/nvidia/device.go

View workflow job for this annotation

GitHub Actions / build-dev-image

d.Id undefined (type "github.com/Project-HAMi/HAMi/pkg/util".DeviceUsage has no field or method Id)
return true
}
}
return false
}

noUserUUID, ok := annos[GPUNoUseUUID]
if ok {
klog.V(5).Infof("check uuid for nvidia not user uuid [%s], device id is %s", noUserUUID, d.Id)

Check failure on line 184 in pkg/device/nvidia/device.go

View workflow job for this annotation

GitHub Actions / build-dev-image

d.Id undefined (type "github.com/Project-HAMi/HAMi/pkg/util".DeviceUsage has no field or method Id)
// use , symbol to connect multiple uuid
noUserUUIDs := strings.Split(noUserUUID, ",")
for _, uuid := range noUserUUIDs {
if d.Id == uuid {

Check failure on line 188 in pkg/device/nvidia/device.go

View workflow job for this annotation

GitHub Actions / build-dev-image

d.Id undefined (type "github.com/Project-HAMi/HAMi/pkg/util".DeviceUsage has no field or method Id)
return false
}
}
return true
}

return true
}

func (dev *NvidiaGPUDevices) PatchAnnotations(annoinput *map[string]string, pd util.PodDevices) map[string]string {
devlist, ok := pd[NvidiaGPUDevice]
if ok && len(devlist) > 0 {
Expand Down
92 changes: 92 additions & 0 deletions pkg/device/nvidia/device_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package nvidia
import (
"testing"

"github.com/Project-HAMi/HAMi/pkg/util"
"gotest.tools/v3/assert"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
Expand Down Expand Up @@ -96,3 +97,94 @@ func Test_MutateAdmission(t *testing.T) {
})
}
}

func Test_CheckUUID(t *testing.T) {
gpuDevices := &NvidiaGPUDevices{}
tests := []struct {
name string
args struct {
annos map[string]string
d util.DeviceUsage
}
want bool
}{
{
name: "don't set GPUUseUUID and GPUNoUseUUID annotation",
args: struct {
annos map[string]string
d util.DeviceUsage
}{
annos: make(map[string]string),
d: util.DeviceUsage{},
},
want: true,
},
{
name: "use set GPUUseUUID don't set GPUNoUseUUID annotation,device match",
args: struct {
annos map[string]string
d util.DeviceUsage
}{
annos: map[string]string{
GPUUseUUID: "abc,123",
},
d: util.DeviceUsage{
Id: "abc",
},
},
want: true,
},
{
name: "use set GPUUseUUID don't set GPUNoUseUUID annotation,device don't match",
args: struct {
annos map[string]string
d util.DeviceUsage
}{
annos: map[string]string{
GPUUseUUID: "abc,123",
},
d: util.DeviceUsage{
Id: "1abc",
},
},
want: false,
},
{
name: "use don't set GPUUseUUID set GPUNoUseUUID annotation,device match",
args: struct {
annos map[string]string
d util.DeviceUsage
}{
annos: map[string]string{
GPUNoUseUUID: "abc,123",
},
d: util.DeviceUsage{
Id: "abc",
},
},
want: false,
},
{
name: "use don't set GPUUseUUID set GPUNoUseUUID annotation,device don't match",
args: struct {
annos map[string]string
d util.DeviceUsage
}{
annos: map[string]string{
GPUNoUseUUID: "abc,123",
},
d: util.DeviceUsage{
Id: "1abc",
},
},
want: true,
},
}

for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
got := gpuDevices.CheckUUID(test.args.annos, test.args.d)
assert.Equal(t, test.want, got)
})
}
}
15 changes: 15 additions & 0 deletions pkg/scheduler/score.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,17 @@ func checkType(annos map[string]string, d util.DeviceUsage, n util.ContainerDevi
return false, false
}

// checkUUID looks up the device implementation registered for the request's
// type and returns the result of its CheckUUID for device d. It logs an error
// and returns false when no implementation is registered for that type.
func checkUUID(annos map[string]string, d util.DeviceUsage, n util.ContainerDeviceRequest) bool {
	impl, found := device.GetDevices()[n.Type]
	if found {
		matched := impl.CheckUUID(annos, d)
		klog.V(2).Infof("checkUUID result is %v for %s type", matched, n.Type)
		return matched
	}
	klog.Errorf("can not get device for %s type", n.Type)
	return false
}

func fitInCertainDevice(node *NodeUsage, request util.ContainerDeviceRequest, annos map[string]string, pod *v1.Pod) (bool, map[string]util.ContainerDevices) {
k := request
originReq := k.Nums
Expand All @@ -104,6 +115,10 @@ func fitInCertainDevice(node *NodeUsage, request util.ContainerDeviceRequest, an
prevnuma = node.Devices[i].Numa
tmpDevs = make(map[string]util.ContainerDevices)
}
if !checkUUID(annos, *node.Devices[i], k) {
klog.InfoS("card uuid mismatch,", "pod", klog.KObj(pod), "current device info is:", node.Devices[i])
continue
}

memreq := int32(0)
if node.Devices[i].Count <= node.Devices[i].Used {
Expand Down

0 comments on commit 1f8df63

Please sign in to comment.