Skip to content

Commit

Permalink
add support for other devices to kwok-gpu-device
Browse files Browse the repository at this point in the history
  • Loading branch information
enoodle committed Nov 26, 2024
1 parent c07ef90 commit 3573371
Show file tree
Hide file tree
Showing 5 changed files with 115 additions and 15 deletions.
2 changes: 1 addition & 1 deletion internal/common/topology/const.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
package topology

const (
cmTopologyKey = "topology.yml"
CmTopologyKey = "topology.yml"
)
8 changes: 4 additions & 4 deletions internal/common/topology/kubernetes.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ func GetClusterTopologyFromCM(kubeclient kubernetes.Interface) (*ClusterTopology

func FromClusterTopologyCM(cm *corev1.ConfigMap) (*ClusterTopology, error) {
var clusterTopology ClusterTopology
err := yaml.Unmarshal([]byte(cm.Data[cmTopologyKey]), &clusterTopology)
err := yaml.Unmarshal([]byte(cm.Data[CmTopologyKey]), &clusterTopology)
if err != nil {
return nil, err
}
Expand All @@ -88,7 +88,7 @@ func FromClusterTopologyCM(cm *corev1.ConfigMap) (*ClusterTopology, error) {

func FromNodeTopologyCM(cm *corev1.ConfigMap) (*NodeTopology, error) {
var nodeTopology NodeTopology
err := yaml.Unmarshal([]byte(cm.Data[cmTopologyKey]), &nodeTopology)
err := yaml.Unmarshal([]byte(cm.Data[CmTopologyKey]), &nodeTopology)
if err != nil {
return nil, err
}
Expand All @@ -110,7 +110,7 @@ func ToClusterTopologyCM(clusterTopology *ClusterTopology) (*corev1.ConfigMap, e
return nil, err
}

cm.Data[cmTopologyKey] = string(topologyData)
cm.Data[CmTopologyKey] = string(topologyData)

return cm, nil
}
Expand All @@ -134,7 +134,7 @@ func ToNodeTopologyCM(nodeTopology *NodeTopology, nodeName string) (*corev1.Conf
return nil, nil, err
}

cm.Data[cmTopologyKey] = string(topologyData)
cm.Data[CmTopologyKey] = string(topologyData)

cmApplyConfig = cmApplyConfig.WithData(cm.Data)

Expand Down
4 changes: 2 additions & 2 deletions internal/common/topology/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,15 @@ type NodePoolTopology struct {
GpuCount int `yaml:"gpuCount"`
GpuMemory int `yaml:"gpuMemory"`
GpuProduct string `yaml:"gpuProduct"`
OtherDevices []GenericDevice `yaml:"otherDevices,omitempty"`
OtherDevices []GenericDevice `yaml:"otherDevices"`
}

type NodeTopology struct {
GpuMemory int `yaml:"gpuMemory"`
GpuProduct string `yaml:"gpuProduct"`
Gpus []GpuDetails `yaml:"gpus"`
MigStrategy string `yaml:"migStrategy"`
OtherDevices []GenericDevice `yaml:"otherDevices,omitempty"`
OtherDevices []GenericDevice `yaml:"otherDevices"`
}

type GpuDetails struct {
Expand Down
36 changes: 28 additions & 8 deletions internal/kwok-gpu-device-plugin/handlers/configmap/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,15 @@ package configmap

import (
"context"
"encoding/json"
"fmt"
"log"

"github.com/run-ai/fake-gpu-operator/internal/common/constants"
"github.com/run-ai/fake-gpu-operator/internal/common/topology"

v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/kubernetes"
Expand Down Expand Up @@ -41,16 +44,33 @@ func (p *ConfigMapHandler) HandleAdd(cm *v1.ConfigMap) error {
}
nodeName := cm.Labels[constants.LabelTopologyCMNodeName]

return p.applyFakeDevicePlugin(len(nodeTopology.Gpus), nodeName)
return p.applyFakeDevicePlugin(nodeTopology, nodeName)
}

func (p *ConfigMapHandler) applyFakeDevicePlugin(gpuCount int, nodeName string) error {
patch := fmt.Sprintf(
`{"status": {"capacity": {"%s": "%d"}, "allocatable": {"%s": "%d"}}}`,
constants.GpuResourceName, gpuCount, constants.GpuResourceName, gpuCount,
)
_, err := p.kubeClient.CoreV1().Nodes().Patch(
context.TODO(), nodeName, types.MergePatchType, []byte(patch), metav1.PatchOptions{}, "status",
func (p *ConfigMapHandler) applyFakeDevicePlugin(nodeTopology *topology.NodeTopology, nodeName string) error {
nodePatch := &v1.Node{
Status: v1.NodeStatus{
Capacity: v1.ResourceList{
v1.ResourceName(constants.GpuResourceName): *resource.NewQuantity(int64(len(nodeTopology.Gpus)), resource.DecimalSI),
},
Allocatable: v1.ResourceList{
v1.ResourceName(constants.GpuResourceName): *resource.NewQuantity(int64(len(nodeTopology.Gpus)), resource.DecimalSI),
},
},
}

for _, otherDevice := range nodeTopology.OtherDevices {
nodePatch.Status.Capacity[v1.ResourceName(otherDevice.Name)] = *resource.NewQuantity(int64(otherDevice.Count), resource.DecimalSI)
nodePatch.Status.Allocatable[v1.ResourceName(otherDevice.Name)] = *resource.NewQuantity(int64(otherDevice.Count), resource.DecimalSI)
}

patchBytes, err := json.Marshal(nodePatch)
if err != nil {
return fmt.Errorf("failed to update node: failed to marshal patch: %v", err)
}

_, err = p.kubeClient.CoreV1().Nodes().Patch(
context.TODO(), nodeName, types.MergePatchType, patchBytes, metav1.PatchOptions{}, "status",
)
if err != nil {
return fmt.Errorf("failed to update node capacity and allocatable: %v", err)
Expand Down
80 changes: 80 additions & 0 deletions internal/kwok-gpu-device-plugin/handlers/configmap/handler_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
package configmap

import (
"testing"

"gopkg.in/yaml.v3"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"github.com/run-ai/fake-gpu-operator/internal/common/constants"
"github.com/run-ai/fake-gpu-operator/internal/common/topology"
"golang.org/x/net/context"

v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes/fake"
)

func TestKWOKGPUDevicePlugin(t *testing.T) {
RegisterFailHandler(Fail)
RunSpecs(t, "KWOK GPU DevicePlugin Suite")
}

var _ = Describe("HandleAdd", func() {
It("should update the node capacity and allocatable", func() {
nodeName := "node1"
nodeTopology := &topology.NodeTopology{
Gpus: []topology.GpuDetails{
{ID: "0"},
},
OtherDevices: []topology.GenericDevice{
{Name: "device1", Count: 2},
},
}

topologyData, err := yaml.Marshal(nodeTopology)
if err != nil {
Fail("Failed to marshal topology data")
}

configMap := &v1.ConfigMap{
ObjectMeta: metav1.ObjectMeta{
Name: nodeName,
Labels: map[string]string{
constants.LabelTopologyCMNodeName: nodeName,
},
},
Data: map[string]string{
topology.CmTopologyKey: string(topologyData),
},
}

node := &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: nodeName,
},
}

fakeClient := fake.NewSimpleClientset(node, configMap)

fakeNodeCMHandler := NewConfigMapHandler(fakeClient, nil)
err = fakeNodeCMHandler.HandleAdd(configMap)
Expect(err).ToNot(HaveOccurred())

updateNode, err := fakeClient.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{})
Expect(err).ToNot(HaveOccurred())
Expect(testResourceListCondition(updateNode.Status.Capacity, v1.ResourceName(constants.GpuResourceName), 1)).To(BeTrue())
Expect(testResourceListCondition(updateNode.Status.Allocatable, v1.ResourceName(constants.GpuResourceName), 1)).To(BeTrue())
Expect(testResourceListCondition(updateNode.Status.Capacity, v1.ResourceName("device1"), 2)).To(BeTrue())
Expect(testResourceListCondition(updateNode.Status.Allocatable, v1.ResourceName("device1"), 2)).To(BeTrue())
})
})

func testResourceListCondition(resourceList v1.ResourceList, resourceName v1.ResourceName, value int64) bool {
quantity, found := resourceList[resourceName]
if !found {
return false
}
return quantity.Value() == value
}

0 comments on commit 3573371

Please sign in to comment.