diff --git a/tpu-provisioner/internal/cloud/gke.go b/tpu-provisioner/internal/cloud/gke.go index 35bdd4846..7c614fd1f 100644 --- a/tpu-provisioner/internal/cloud/gke.go +++ b/tpu-provisioner/internal/cloud/gke.go @@ -104,7 +104,7 @@ func (g *GKE) EnsureNodePoolForPod(p *corev1.Pod, why string) error { } func (g *GKE) ListNodePools() ([]NodePoolRef, error) { - var names []NodePoolRef + var refs []NodePoolRef resp, err := g.Service.Projects.Locations.Clusters.NodePools.List(g.ClusterContext.ClusterName()).Do() if err != nil { @@ -113,7 +113,7 @@ func (g *GKE) ListNodePools() ([]NodePoolRef, error) { } for _, np := range resp.NodePools { - names = append(names, NodePoolRef{ + refs = append(refs, NodePoolRef{ Name: np.Name, Error: np.Status == "ERROR", ErrorMsg: np.StatusMessage, @@ -124,7 +124,7 @@ func (g *GKE) ListNodePools() ([]NodePoolRef, error) { }) } - return names, nil + return refs, nil } func (g *GKE) DeleteNodePoolForNode(node *corev1.Node, why string) error { diff --git a/tpu-provisioner/internal/controller/nodepool_garbage_collector.go b/tpu-provisioner/internal/controller/nodepool_garbage_collector.go index 1a2270d82..abea43deb 100644 --- a/tpu-provisioner/internal/controller/nodepool_garbage_collector.go +++ b/tpu-provisioner/internal/controller/nodepool_garbage_collector.go @@ -18,7 +18,7 @@ type NodePoolGarbageCollector struct { } func (g *NodePoolGarbageCollector) Run(ctx context.Context) { - log := ctrllog.Log + log := ctrllog.Log.WithName("nodepool-garbage-collector") t := time.NewTicker(g.Interval) @@ -39,15 +39,21 @@ func (g *NodePoolGarbageCollector) Run(ctx context.Context) { } for _, np := range nodepools { + log = log.WithValues("nodepool", np.Name) + if !np.Error { continue } + if np.CreatedForPod.Name == "" || np.CreatedForPod.Namespace == "" { + log.Info("skipping garbage collection of node pool, no pod reference") + continue + } + // Check if the Pod that triggered the Node Pool creation still exists. err := g.Get(ctx, np.CreatedForPod, &v1.Pod{}) if err == nil { log.Info("skipping garbage collection of node pool, pod still exists", - "nodepool", np.Name, "podName", np.CreatedForPod.Name, "podNamespace", np.CreatedForPod.Namespace, ) @@ -66,17 +72,15 @@ func (g *NodePoolGarbageCollector) Run(ctx context.Context) { continue } if len(nodes.Items) > 0 { - log.Info("skipping garbage collection of node pool, nodes exist", - "nodepool", np.Name, - ) + log.Info("skipping garbage collection of node pool, nodes exist") continue } - log.Info("garbage collecting node pool in error state", "nodepool", np.Name) + log.Info("garbage collecting node pool in error state") // TODO: Lookup namespace from env with downward API. if err := g.Provider.DeleteNodePool(np.Name, &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "tpu-provisioner-system"}}, "the node pool has no corresponding Nodes, the Pod that triggered its creation no longer exists, and node pool is in an error state: "+np.ErrorMsg); err != nil { - log.Error(err, "failed to garbage colelct node pool", "nodepool", np.Name) + log.Error(err, "failed to garbage colelct node pool") continue } }