Skip to content

Commit

Permalink
adding kwok cm controller retries
Browse files (browse the repository at this point in the history)
  • Loading branch information
enoodle committed Aug 18, 2024
1 parent 2034f8b commit 65838fd
Showing 1 changed file with 9 additions and 3 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -2,6 +2,7 @@ package configmamp

import (
"log"
"time"

"github.com/run-ai/fake-gpu-operator/internal/common/constants"
"github.com/run-ai/fake-gpu-operator/internal/common/topology"
Expand Down Expand Up @@ -61,7 +62,7 @@ func NewConfigMapController(
Handler: cache.ResourceEventHandlerFuncs{
AddFunc: func(obj interface{}) {
go func() {
c.callConfigMapHandler(obj.(*v1.ConfigMap))
c.callConfigMapHandler(obj.(*v1.ConfigMap), 0)
}()
},
},
Expand All @@ -86,11 +87,16 @@ func (c *ConfigMapController) isFakeGpuKWOKNodeConfigMap(cm *v1.ConfigMap) bool
return cm.Annotations[constants.AnnotationKwokNode] == "fake"
}

func (c *ConfigMapController) callConfigMapHandler(cm *v1.ConfigMap) {
func (c *ConfigMapController) callConfigMapHandler(cm *v1.ConfigMap, retryCount int) {
nodeName := cm.Labels[constants.LabelTopologyCMNodeName]
node, err := c.nodeLister.Get(nodeName)
if err != nil {
log.Printf("Failed to get node %s: %v", nodeName, err)
delay := time.Millisecond * 100 * (1 << retryCount)
log.Printf("Failed to get node %s: %v. retry in %v", nodeName, err, delay)
time.Sleep(delay)
if retryCount < 5 {
c.callConfigMapHandler(cm, retryCount+1)
}
return
}
util.LogErrorIfExist(c.handler.HandleAdd(cm, node), "Failed to handle cm addition")
Expand Down

0 comments on commit 65838fd

Please sign in to comment.