-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
adding status updater / device plugin for kwok nodes
- Loading branch information
Showing
15 changed files
with
381 additions
and
162 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
package main | ||
|
||
import ( | ||
"github.com/run-ai/fake-gpu-operator/internal/common/app" | ||
"github.com/run-ai/fake-gpu-operator/internal/common/config" | ||
"github.com/run-ai/fake-gpu-operator/internal/common/constants" | ||
status_updater "github.com/run-ai/fake-gpu-operator/internal/kwok-status-updater" | ||
) | ||
|
||
func main() { | ||
requiredEnvVars := []string{constants.EnvTopologyCmName, constants.EnvTopologyCmNamespace, constants.EnvFakeGpuOperatorNs} | ||
config.ValidateConfig(requiredEnvVars) | ||
|
||
appRunner := app.NewAppRunner(&status_updater.StatusUpdaterApp{}) | ||
appRunner.Run() | ||
} |
26 changes: 26 additions & 0 deletions
26
deploy/fake-gpu-operator/templates/kwok-status-updater/clusterrole.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
apiVersion: rbac.authorization.k8s.io/v1 | ||
kind: ClusterRole | ||
metadata: | ||
name: fake-kwok-status-updater | ||
rules: | ||
- apiGroups: | ||
- "" | ||
resources: | ||
- nodes | ||
- nodes/status | ||
verbs: | ||
- update | ||
- list | ||
- get | ||
- watch | ||
- patch | ||
- apiGroups: | ||
- "" | ||
resources: | ||
- configmaps | ||
verbs: | ||
- get | ||
- update | ||
- create | ||
- list | ||
- delete |
12 changes: 12 additions & 0 deletions
12
deploy/fake-gpu-operator/templates/kwok-status-updater/clusterrolebinding.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
apiVersion: rbac.authorization.k8s.io/v1 | ||
kind: ClusterRoleBinding | ||
metadata: | ||
name: fake-kwok-status-updater | ||
roleRef: | ||
kind: ClusterRole | ||
apiGroup: rbac.authorization.k8s.io | ||
name: fake-kwok-status-updater | ||
subjects: | ||
- kind: ServiceAccount | ||
name: kwok-status-updater | ||
namespace: "{{ .Release.Namespace }}" |
39 changes: 39 additions & 0 deletions
39
deploy/fake-gpu-operator/templates/kwok-status-updater/deployment.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
apiVersion: apps/v1 | ||
kind: Deployment | ||
metadata: | ||
name: kwok-status-updater | ||
annotations: | ||
checksum/topology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }} | ||
labels: | ||
app: kwok-status-updater | ||
spec: | ||
selector: | ||
matchLabels: | ||
app: kwok-status-updater | ||
component: kwok-status-updater | ||
replicas: 1 | ||
template: | ||
metadata: | ||
annotations: | ||
checksum/topology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }} | ||
labels: | ||
app: kwok-status-updater | ||
component: kwok-status-updater | ||
spec: | ||
containers: | ||
- name: kwok-status-updater | ||
image: "{{ .Values.kwokStatusUpdater.image.repository }}:{{ .Values.kwokStatusUpdater.image.tag }}" | ||
imagePullPolicy: "{{ .Values.kwokStatusUpdater.image.pullPolicy }}" | ||
resources: | ||
{{- toYaml .Values.kwokStatusUpdater.resources | nindent 12 }} | ||
env: | ||
- name: TOPOLOGY_CM_NAME | ||
value: topology | ||
- name: TOPOLOGY_CM_NAMESPACE | ||
value: "{{ .Release.Namespace }}" | ||
- name: FAKE_GPU_OPERATOR_NAMESPACE | ||
value: "{{ .Release.Namespace }}" | ||
restartPolicy: Always | ||
serviceAccountName: kwok-status-updater | ||
imagePullSecrets: | ||
- name: gcr-secret |
4 changes: 4 additions & 0 deletions
4
deploy/fake-gpu-operator/templates/kwok-status-updater/serviceaccount.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
apiVersion: v1 | ||
kind: ServiceAccount | ||
metadata: | ||
name: kwok-status-updater |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
package status_updater | ||
|
||
import ( | ||
"sync" | ||
|
||
"k8s.io/client-go/dynamic" | ||
"k8s.io/client-go/kubernetes" | ||
"k8s.io/client-go/rest" | ||
|
||
ctrl "sigs.k8s.io/controller-runtime" | ||
|
||
nodecontroller "github.com/run-ai/fake-gpu-operator/internal/kwok-status-updater/controllers/node" | ||
"github.com/run-ai/fake-gpu-operator/internal/status-updater/controllers" | ||
) | ||
|
||
var InClusterConfigFn = ctrl.GetConfigOrDie | ||
var KubeClientFn = func(c *rest.Config) kubernetes.Interface { | ||
return kubernetes.NewForConfigOrDie(c) | ||
} | ||
|
||
var DynamicClientFn = func(c *rest.Config) dynamic.Interface { | ||
return dynamic.NewForConfigOrDie(c) | ||
} | ||
|
||
type StatusUpdaterAppConfiguration struct { | ||
TopologyCmName string `mapstructure:"TOPOLOGY_CM_NAME" validate:"required"` | ||
TopologyCmNamespace string `mapstructure:"TOPOLOGY_CM_NAMESPACE" validate:"required"` | ||
} | ||
|
||
type StatusUpdaterApp struct { | ||
Controllers []controllers.Interface | ||
kubeClient kubernetes.Interface | ||
stopCh chan struct{} | ||
wg *sync.WaitGroup | ||
} | ||
|
||
func (app *StatusUpdaterApp) Run() { | ||
app.wg.Add(len(app.Controllers)) | ||
for _, controller := range app.Controllers { | ||
go func(controller controllers.Interface) { | ||
defer app.wg.Done() | ||
controller.Run(app.stopCh) | ||
}(controller) | ||
} | ||
|
||
app.wg.Wait() | ||
} | ||
|
||
func (app *StatusUpdaterApp) Init(stopCh chan struct{}) { | ||
app.stopCh = stopCh | ||
|
||
clusterConfig := InClusterConfigFn() | ||
clusterConfig.QPS = 100 | ||
clusterConfig.Burst = 200 | ||
|
||
app.wg = &sync.WaitGroup{} | ||
|
||
app.kubeClient = KubeClientFn(clusterConfig) | ||
|
||
app.Controllers = append(app.Controllers, nodecontroller.NewNodeController(app.kubeClient, app.wg)) | ||
} | ||
|
||
func (app *StatusUpdaterApp) Name() string { | ||
return "StatusUpdater" | ||
} | ||
|
||
func (app *StatusUpdaterApp) GetConfig() interface{} { | ||
var config StatusUpdaterAppConfiguration | ||
|
||
return config | ||
} |
86 changes: 86 additions & 0 deletions
86
internal/kwok-status-updater/controllers/node/controller.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
package node | ||
|
||
import ( | ||
"log" | ||
"sync" | ||
|
||
"github.com/run-ai/fake-gpu-operator/internal/common/constants" | ||
"github.com/run-ai/fake-gpu-operator/internal/common/topology" | ||
"github.com/run-ai/fake-gpu-operator/internal/status-updater/controllers" | ||
"github.com/run-ai/fake-gpu-operator/internal/status-updater/controllers/util" | ||
|
||
nodehandler "github.com/run-ai/fake-gpu-operator/internal/kwok-status-updater/handlers/node" | ||
|
||
v1 "k8s.io/api/core/v1" | ||
|
||
"k8s.io/client-go/informers" | ||
"k8s.io/client-go/kubernetes" | ||
"k8s.io/client-go/tools/cache" | ||
) | ||
|
||
type NodeController struct { | ||
kubeClient kubernetes.Interface | ||
informer cache.SharedIndexInformer | ||
handler nodehandler.Interface | ||
|
||
clusterTopology *topology.ClusterTopology | ||
} | ||
|
||
var _ controllers.Interface = &NodeController{} | ||
|
||
func NewNodeController(kubeClient kubernetes.Interface, wg *sync.WaitGroup) *NodeController { | ||
clusterTopology, err := topology.GetClusterTopologyFromCM(kubeClient) | ||
if err != nil { | ||
log.Fatalf("Failed to get cluster topology: %v", err) | ||
} | ||
|
||
c := &NodeController{ | ||
kubeClient: kubeClient, | ||
informer: informers.NewSharedInformerFactory(kubeClient, 0).Core().V1().Nodes().Informer(), | ||
handler: nodehandler.NewNodeHandler(kubeClient, clusterTopology), | ||
clusterTopology: clusterTopology, | ||
} | ||
|
||
_, err = c.informer.AddEventHandler(cache.FilteringResourceEventHandler{ | ||
FilterFunc: func(obj interface{}) bool { | ||
switch node := obj.(type) { | ||
case *v1.Node: | ||
return c.isFakeGpuKWOKNode(node) | ||
default: | ||
return false | ||
} | ||
}, | ||
Handler: cache.ResourceEventHandlerFuncs{ | ||
AddFunc: func(obj interface{}) { | ||
go func() { | ||
node := obj.(*v1.Node) | ||
util.LogErrorIfExist(c.handler.HandleAdd(node), "Failed to handle node addition") | ||
}() | ||
}, | ||
DeleteFunc: func(obj interface{}) { | ||
go func() { | ||
node := obj.(*v1.Node) | ||
util.LogErrorIfExist(c.handler.HandleDelete(node), "Failed to handle node deletion") | ||
}() | ||
}, | ||
}, | ||
}) | ||
if err != nil { | ||
log.Fatalf("Failed to add node event handler: %v", err) | ||
} | ||
|
||
return c | ||
} | ||
|
||
func (c *NodeController) Run(stopCh <-chan struct{}) { | ||
log.Println("Starting node controller") | ||
c.informer.Run(stopCh) | ||
} | ||
|
||
func (c *NodeController) isFakeGpuKWOKNode(node *v1.Node) bool { | ||
if node == nil || node.Labels == nil { | ||
return false | ||
} | ||
_, isNodeAssignedToNodePool := node.Labels[c.clusterTopology.NodePoolLabelKey] | ||
return isNodeAssignedToNodePool && node.Annotations[constants.AnnotationKwokNode] == "fake" | ||
} |
Oops, something went wrong.