diff --git a/internal/common/topology/const.go b/internal/common/topology/const.go index dd7120d..cdc645f 100644 --- a/internal/common/topology/const.go +++ b/internal/common/topology/const.go @@ -1,5 +1,5 @@ package topology const ( - cmTopologyKey = "topology.yml" + CmTopologyKey = "topology.yml" ) diff --git a/internal/common/topology/kubernetes.go b/internal/common/topology/kubernetes.go index 1246fe5..39a0662 100644 --- a/internal/common/topology/kubernetes.go +++ b/internal/common/topology/kubernetes.go @@ -78,7 +78,7 @@ func GetClusterTopologyFromCM(kubeclient kubernetes.Interface) (*ClusterTopology func FromClusterTopologyCM(cm *corev1.ConfigMap) (*ClusterTopology, error) { var clusterTopology ClusterTopology - err := yaml.Unmarshal([]byte(cm.Data[cmTopologyKey]), &clusterTopology) + err := yaml.Unmarshal([]byte(cm.Data[CmTopologyKey]), &clusterTopology) if err != nil { return nil, err } @@ -88,7 +88,7 @@ func FromClusterTopologyCM(cm *corev1.ConfigMap) (*ClusterTopology, error) { func FromNodeTopologyCM(cm *corev1.ConfigMap) (*NodeTopology, error) { var nodeTopology NodeTopology - err := yaml.Unmarshal([]byte(cm.Data[cmTopologyKey]), &nodeTopology) + err := yaml.Unmarshal([]byte(cm.Data[CmTopologyKey]), &nodeTopology) if err != nil { return nil, err } @@ -110,7 +110,7 @@ func ToClusterTopologyCM(clusterTopology *ClusterTopology) (*corev1.ConfigMap, e return nil, err } - cm.Data[cmTopologyKey] = string(topologyData) + cm.Data[CmTopologyKey] = string(topologyData) return cm, nil } @@ -134,7 +134,7 @@ func ToNodeTopologyCM(nodeTopology *NodeTopology, nodeName string) (*corev1.Conf return nil, nil, err } - cm.Data[cmTopologyKey] = string(topologyData) + cm.Data[CmTopologyKey] = string(topologyData) cmApplyConfig = cmApplyConfig.WithData(cm.Data) diff --git a/internal/common/topology/types.go b/internal/common/topology/types.go index 1dbcf7c..5a343a0 100644 --- a/internal/common/topology/types.go +++ b/internal/common/topology/types.go @@ -17,7 +17,7 @@ type NodePoolTopology struct { GpuCount int `yaml:"gpuCount"` GpuMemory int `yaml:"gpuMemory"` GpuProduct string `yaml:"gpuProduct"` - OtherDevices []GenericDevice `yaml:"otherDevices,omitempty"` + OtherDevices []GenericDevice `yaml:"otherDevices"` } type NodeTopology struct { @@ -25,7 +25,7 @@ type NodeTopology struct { GpuProduct string `yaml:"gpuProduct"` Gpus []GpuDetails `yaml:"gpus"` MigStrategy string `yaml:"migStrategy"` - OtherDevices []GenericDevice `yaml:"otherDevices,omitempty"` + OtherDevices []GenericDevice `yaml:"otherDevices"` } type GpuDetails struct { diff --git a/internal/kwok-gpu-device-plugin/handlers/configmap/handler.go b/internal/kwok-gpu-device-plugin/handlers/configmap/handler.go index 3c342f2..7e922ce 100644 --- a/internal/kwok-gpu-device-plugin/handlers/configmap/handler.go +++ b/internal/kwok-gpu-device-plugin/handlers/configmap/handler.go @@ -2,12 +2,15 @@ package configmap import ( "context" + "encoding/json" "fmt" "log" "github.com/run-ai/fake-gpu-operator/internal/common/constants" "github.com/run-ai/fake-gpu-operator/internal/common/topology" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes" @@ -41,16 +44,33 @@ func (p *ConfigMapHandler) HandleAdd(cm *v1.ConfigMap) error { } nodeName := cm.Labels[constants.LabelTopologyCMNodeName] - return p.applyFakeDevicePlugin(len(nodeTopology.Gpus), nodeName) + return p.applyFakeDevicePlugin(nodeTopology, nodeName) } -func (p *ConfigMapHandler) applyFakeDevicePlugin(gpuCount int, nodeName string) error { - patch := fmt.Sprintf( - `{"status": {"capacity": {"%s": "%d"}, "allocatable": {"%s": "%d"}}}`, - constants.GpuResourceName, gpuCount, constants.GpuResourceName, gpuCount, - ) - _, err := p.kubeClient.CoreV1().Nodes().Patch( - context.TODO(), nodeName, types.MergePatchType, []byte(patch), metav1.PatchOptions{}, "status", +func (p *ConfigMapHandler) applyFakeDevicePlugin(nodeTopology *topology.NodeTopology, nodeName string) error { + nodePatch := &v1.Node{ + Status: v1.NodeStatus{ + Capacity: v1.ResourceList{ + v1.ResourceName(constants.GpuResourceName): *resource.NewQuantity(int64(len(nodeTopology.Gpus)), resource.DecimalSI), + }, + Allocatable: v1.ResourceList{ + v1.ResourceName(constants.GpuResourceName): *resource.NewQuantity(int64(len(nodeTopology.Gpus)), resource.DecimalSI), + }, + }, + } + + for _, otherDevice := range nodeTopology.OtherDevices { + nodePatch.Status.Capacity[v1.ResourceName(otherDevice.Name)] = *resource.NewQuantity(int64(otherDevice.Count), resource.DecimalSI) + nodePatch.Status.Allocatable[v1.ResourceName(otherDevice.Name)] = *resource.NewQuantity(int64(otherDevice.Count), resource.DecimalSI) + } + + patchBytes, err := json.Marshal(nodePatch) + if err != nil { + return fmt.Errorf("failed to update node: failed to marshal patch: %v", err) + } + + _, err = p.kubeClient.CoreV1().Nodes().Patch( + context.TODO(), nodeName, types.MergePatchType, patchBytes, metav1.PatchOptions{}, "status", ) if err != nil { return fmt.Errorf("failed to update node capacity and allocatable: %v", err) diff --git a/internal/kwok-gpu-device-plugin/handlers/configmap/handler_test.go b/internal/kwok-gpu-device-plugin/handlers/configmap/handler_test.go new file mode 100644 index 0000000..6c91b75 --- /dev/null +++ b/internal/kwok-gpu-device-plugin/handlers/configmap/handler_test.go @@ -0,0 +1,80 @@ +package configmap + +import ( + "testing" + + "gopkg.in/yaml.v3" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "github.com/run-ai/fake-gpu-operator/internal/common/constants" + "github.com/run-ai/fake-gpu-operator/internal/common/topology" + "golang.org/x/net/context" + + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/fake" +) + +func TestKWOKGPUDevicePlugin(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "KWOK GPU DevicePlugin Suite") +} + +var _ = Describe("HandleAdd", func() { + It("should update the node capacity and allocatable", func() { + nodeName := "node1" + nodeTopology := &topology.NodeTopology{ + Gpus: []topology.GpuDetails{ + {ID: "0"}, + }, + OtherDevices: []topology.GenericDevice{ + {Name: "device1", Count: 2}, + }, + } + + topologyData, err := yaml.Marshal(nodeTopology) + if err != nil { + Fail("Failed to marshal topology data") + } + + configMap := &v1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Labels: map[string]string{ + constants.LabelTopologyCMNodeName: nodeName, + }, + }, + Data: map[string]string{ + topology.CmTopologyKey: string(topologyData), + }, + } + + node := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + }, + } + + fakeClient := fake.NewSimpleClientset(node, configMap) + + fakeNodeCMHandler := NewConfigMapHandler(fakeClient, nil) + err = fakeNodeCMHandler.HandleAdd(configMap) + Expect(err).ToNot(HaveOccurred()) + + updateNode, err := fakeClient.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{}) + Expect(err).ToNot(HaveOccurred()) + Expect(testResourceListCondition(updateNode.Status.Capacity, v1.ResourceName(constants.GpuResourceName), 1)).To(BeTrue()) + Expect(testResourceListCondition(updateNode.Status.Allocatable, v1.ResourceName(constants.GpuResourceName), 1)).To(BeTrue()) + Expect(testResourceListCondition(updateNode.Status.Capacity, v1.ResourceName("device1"), 2)).To(BeTrue()) + Expect(testResourceListCondition(updateNode.Status.Allocatable, v1.ResourceName("device1"), 2)).To(BeTrue()) + }) +}) + +func testResourceListCondition(resourceList v1.ResourceList, resourceName v1.ResourceName, value int64) bool { + quantity, found := resourceList[resourceName] + if !found { + return false + } + return quantity.Value() == value +}