
Commit

Address comments and make node pool deletion delay configurable to allow tests to pass
nstogner committed Mar 12, 2024
1 parent 1039c27 commit b9fdef5
Showing 8 changed files with 57 additions and 49 deletions.
5 changes: 4 additions & 1 deletion tpu-provisioner/cmd/main.go
@@ -84,6 +84,8 @@ func main() {
// the node to become Ready and for a pending Pod to be scheduled on it.
NodeMinLifespan time.Duration `envconfig:"NODE_MIN_LIFESPAN" default:"3m"`

NodepoolDeletionDelay time.Duration `envconfig:"NODEPOOL_DELETION_DELAY" default:"30s"`

PodResourceType string `envconfig:"POD_RESOURCE_TYPE" default:"google.com/tpu"`

Concurrency int `envconfig:"CONCURRENCY" default:"3"`
@@ -229,7 +231,8 @@ func main() {
Recorder: mgr.GetEventRecorderFor("tpu-provisioner"),
Provider: provider,
NodeCriteria: controller.NodeCriteria{
MinLifetime: cfg.NodeMinLifespan,
MinLifetime: cfg.NodeMinLifespan,
PoolDeletionDelay: cfg.NodepoolDeletionDelay,
},
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "DeletionReconciler")
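For context, the fields above are ordinary envconfig-tagged struct fields. Below is a minimal sketch of how such a config might be loaded, assuming the github.com/kelseyhightower/envconfig package that the struct tags suggest; the repository's actual main.go is not reproduced here and the names are illustrative.

package main

import (
	"log"
	"time"

	"github.com/kelseyhightower/envconfig"
)

// config mirrors the shape of the fields shown in the diff above.
type config struct {
	NodeMinLifespan       time.Duration `envconfig:"NODE_MIN_LIFESPAN" default:"3m"`
	NodepoolDeletionDelay time.Duration `envconfig:"NODEPOOL_DELETION_DELAY" default:"30s"`
}

func main() {
	var cfg config
	// Process reads the environment into cfg, falling back to the default tags,
	// so setting NODEPOOL_DELETION_DELAY=5s overrides the 30s default.
	if err := envconfig.Process("", &cfg); err != nil {
		log.Fatalf("loading config: %v", err)
	}
	log.Printf("node pool deletion delay: %s", cfg.NodepoolDeletionDelay)
}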
@@ -1,5 +1,14 @@
package cloud

import (
"errors"
"time"

corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"
)

const (
keyPrefix = "google.com/"

@@ -23,3 +32,24 @@ const (

EventNodePoolNotFound = "NodePoolNotFound"
)

type Provider interface {
NodePoolLabelKey() string
EnsureNodePoolForPod(*corev1.Pod, string) error
DeleteNodePoolForNode(*corev1.Node, string) error
DeleteNodePool(string, client.Object, string) error
ListNodePools() ([]NodePoolRef, error)
}

var ErrDuplicateRequest = errors.New("duplicate request")

type NodePoolRef struct {
Name string

CreationTime time.Time

CreatedForPod types.NamespacedName

Error bool
Message string
}
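To make the Provider interface above concrete, here is an illustrative in-memory stub of the kind a test suite could wire into the controllers. This is a sketch under that assumption, not the repository's actual test provider, and the label key it returns is made up.

package cloud

import (
	"sync"

	corev1 "k8s.io/api/core/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// fakeProvider satisfies Provider with a simple in-memory map of node pools.
type fakeProvider struct {
	mu    sync.Mutex
	pools map[string]NodePoolRef
}

func (f *fakeProvider) NodePoolLabelKey() string { return "example.com/node-pool" }

func (f *fakeProvider) EnsureNodePoolForPod(pod *corev1.Pod, reason string) error {
	f.mu.Lock()
	defer f.mu.Unlock()
	if f.pools == nil {
		f.pools = map[string]NodePoolRef{}
	}
	f.pools[pod.Name] = NodePoolRef{Name: pod.Name}
	return nil
}

func (f *fakeProvider) DeleteNodePoolForNode(node *corev1.Node, reason string) error {
	return f.DeleteNodePool(node.Labels[f.NodePoolLabelKey()], node, reason)
}

func (f *fakeProvider) DeleteNodePool(name string, owner client.Object, reason string) error {
	f.mu.Lock()
	defer f.mu.Unlock()
	delete(f.pools, name)
	return nil
}

func (f *fakeProvider) ListNodePools() ([]NodePoolRef, error) {
	f.mu.Lock()
	defer f.mu.Unlock()
	refs := make([]NodePoolRef, 0, len(f.pools))
	for _, ref := range f.pools {
		refs = append(refs, ref)
	}
	return refs, nil
}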
6 changes: 3 additions & 3 deletions tpu-provisioner/internal/cloud/gke.go
@@ -114,9 +114,9 @@ func (g *GKE) ListNodePools() ([]NodePoolRef, error) {

for _, np := range resp.NodePools {
refs = append(refs, NodePoolRef{
Name: np.Name,
Error: np.Status == "ERROR",
ErrorMsg: np.StatusMessage,
Name: np.Name,
Error: np.Status == "ERROR",
Message: np.StatusMessage,
CreatedForPod: types.NamespacedName{
Name: np.Config.Labels[LabelPodName],
Namespace: np.Config.Labels[LabelPodNamespace],
31 changes: 0 additions & 31 deletions tpu-provisioner/internal/cloud/interface.go

This file was deleted.

18 changes: 9 additions & 9 deletions tpu-provisioner/internal/controller/deletion_controller.go
@@ -23,13 +23,6 @@ import (
"sigs.k8s.io/controller-runtime/pkg/source"
)

// nodePoolDeletionCheckInterval is the interval between the first and
// second node pool deletion checks. Once the node pool deletion check
// has passed twice, the node pool can be safely deleted. This second
// check is to ensure the node pool is not prematurely deleted, in the case
// where a JobSet is restarted, but no pods have been created yet.
var nodePoolDeletionCheckInterval = 30 * time.Second

// DeletionReconciler watches Pods and Nodes and deletes Node Pools.
type DeletionReconciler struct {
client.Client
@@ -43,6 +36,13 @@ type DeletionReconciler struct {

type NodeCriteria struct {
MinLifetime time.Duration

// PoolDeletionDelay is the interval between the first and
// second node pool deletion checks. Once the node pool deletion check
// has passed twice, the node pool can be safely deleted. This second
// check is to ensure the node pool is not prematurely deleted, in the case
// where a JobSet is restarted, but no pods have been created yet.
PoolDeletionDelay time.Duration
}

//+kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch;create;update;patch;delete
@@ -128,14 +128,14 @@ func (r *DeletionReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
if !exists {
lg.Info(fmt.Sprintf("Node pool %q passed deletion check once", nodePoolName))
r.NodePoolsMarkedForDeletion.Store(nodePoolName, time.Now())
return ctrl.Result{RequeueAfter: nodePoolDeletionCheckInterval}, nil
return ctrl.Result{RequeueAfter: r.NodeCriteria.PoolDeletionDelay}, nil
}

// If we haven't reached the node pool deletion check interval, this reconcile was
// caused by something else; we can return early and wait for the manually requeued
// reconcile we did after the first deletion check passed.
firstDeletionCheckTime := value.(time.Time)
if time.Now().Sub(firstDeletionCheckTime) < nodePoolDeletionCheckInterval {
if time.Now().Sub(firstDeletionCheckTime) < r.NodeCriteria.PoolDeletionDelay {
return ctrl.Result{}, nil
}

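The two-pass deletion flow that PoolDeletionDelay now controls can be condensed into a small helper. The sketch below mirrors the Load/Store logic in the hunk above with simplified, standalone names; it is not code from the repository.

package controller

import (
	"sync"
	"time"
)

// twoPassDelete condenses the deletion-delay logic: the first call for a node
// pool records a timestamp and asks the caller to requeue after delay; only a
// later call, made once delay has elapsed, reports that deletion may proceed.
// The marked map plays the role of NodePoolsMarkedForDeletion.
func twoPassDelete(marked *sync.Map, nodePoolName string, delay time.Duration, now time.Time) (requeueAfter time.Duration, deleteNow bool) {
	value, exists := marked.Load(nodePoolName)
	if !exists {
		// First pass: remember when the pool was marked and come back later.
		marked.Store(nodePoolName, now)
		return delay, false
	}
	// If the delay has not elapsed yet, this call was triggered by something
	// else; keep waiting for the requeued check from the first pass.
	if now.Sub(value.(time.Time)) < delay {
		return 0, false
	}
	// Second check after the delay: the node pool can be deleted.
	return 0, true
}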
@@ -67,7 +67,7 @@ var _ = Describe("Deletion controller", func() {

By("Checking the first deletion attempt only occurred after the node had existed for >= nodeDeletionInterval")
actualDuration := deletionTimestamp.Sub(createdNode.CreationTimestamp.Time)
requiredDuration := nodePoolDeletionCheckInterval + minNodeLifetime
requiredDuration := nodepoolDeletionDelay + minNodeLifetime
Expect(actualDuration).Should(BeNumerically(">=", requiredDuration))

By("Checking that other Nodes were ignored")
@@ -12,6 +12,10 @@ import (
ctrllog "sigs.k8s.io/controller-runtime/pkg/log"
)

// NodePoolGarbageCollector deletes node pools that have no Nodes,
// are in an errored state, and where the Pod that created the node pool
// no longer exists (the deletion reconciler would not see these b/c there
// are no Node objects).
type NodePoolGarbageCollector struct {
Interval time.Duration
client.Client
@@ -84,7 +88,7 @@ func (g *NodePoolGarbageCollector) Run(ctx context.Context) {
log.Info("garbage collecting node pool in error state")
// TODO: Lookup namespace from env with downward API.
whyDelete := fmt.Sprintf("the node pool has no corresponding Nodes, the Pod (%s/%s) that triggered its creation no longer exists, and node pool is in an error state: %s",
np.CreatedForPod.Namespace, np.CreatedForPod.Name, np.ErrorMsg)
np.CreatedForPod.Namespace, np.CreatedForPod.Name, np.Message)
if err := g.Provider.DeleteNodePool(np.Name, &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "tpu-provisioner-system"}}, whyDelete); err != nil {
log.Error(err, "failed to garbage collect node pool")
continue
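A rough sketch of how this garbage collector might be started next to the controller manager follows. The Interval value, the example.com import paths, and the type of the Provider field are assumptions; the repository's actual wiring is not shown in this diff.

package main

import (
	"context"
	"time"

	ctrl "sigs.k8s.io/controller-runtime"

	"example.com/tpu-provisioner/internal/cloud"
	"example.com/tpu-provisioner/internal/controller"
)

// startGC launches the garbage collector in its own goroutine; it is expected
// to tick on Interval and stop when ctx (e.g. the manager's signal-aware
// context) is cancelled.
func startGC(ctx context.Context, mgr ctrl.Manager, provider cloud.Provider) {
	gc := &controller.NodePoolGarbageCollector{
		Interval: time.Minute, // placeholder cadence, not taken from the repository
		Client:   mgr.GetClient(),
		Provider: provider,
	}
	go gc.Run(ctx)
}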
8 changes: 5 additions & 3 deletions tpu-provisioner/internal/controller/suite_test.go
@@ -51,8 +51,9 @@
)

const (
resourceName = "test.com/tpu"
minNodeLifetime = time.Second
resourceName = "test.com/tpu"
minNodeLifetime = time.Second
nodepoolDeletionDelay = 5 * time.Second
)

func TestAPIs(t *testing.T) {
@@ -103,7 +104,8 @@ var _ = BeforeSuite(func() {
Recorder: mgr.GetEventRecorderFor("tpu-provisioner-deleter"),
Provider: provider,
NodeCriteria: NodeCriteria{
MinLifetime: minNodeLifetime,
MinLifetime: minNodeLifetime,
PoolDeletionDelay: nodepoolDeletionDelay,
},
}).SetupWithManager(mgr)
Expect(err).ToNot(HaveOccurred())
